In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

In [2]:
# Read the train and test CSV files from the working directory,
# then preview the first rows of the training frame

df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

df_train.head()

Unnamed: 0,id,gravity,ph,osmo,cond,urea,calc,target
0,0,1.013,6.19,443,14.8,124,1.45,0
1,1,1.025,5.4,703,23.6,394,4.18,0
2,2,1.009,6.13,371,24.5,159,9.04,0
3,3,1.021,4.91,442,20.8,398,6.63,1
4,4,1.021,5.53,874,17.8,385,2.21,1


In [3]:
# Round the pH column to zero decimals in both frames, then list the
# distinct rounded values present in the test frame

df_train["ph"], df_test["ph"] = (
    df_train["ph"].round(decimals=0),
    df_test["ph"].round(decimals=0),
)
df_test["ph"].unique()

array([5., 6., 7., 8.])

In [4]:
# Split the training frame into labels and features; the test frame has
# no "target" column to remove here, so only its "id" column is dropped

y_train = df_train["target"]
X_train = df_train.drop(columns=["id", "target"])

X_test = df_test.drop(columns=["id"])

In [5]:
# Perform cross-validation and calculate the AUC ROC

def evaluate_model(model, X, y, n_splits=5, random_state=5):
    """Score ``model`` with shuffled K-fold cross-validation using ROC AUC.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``. The same
        instance is refit on every fold, so it is left fitted on the last
        training fold when this function returns.
    X
        Feature table; rows are selected positionally through ``.iloc``.
    y
        Labels aligned row-for-row with ``X``; also indexed through ``.iloc``.
    n_splits : int, default 5
        Number of folds handed to ``KFold``.
    random_state : int, default 5
        Seed for the ``KFold`` shuffle. The default reproduces the value that
        was previously hard-coded, so existing calls produce the same folds.

    Returns
    -------
    list
        One ROC AUC value per fold, in fold order, each rounded to 5 decimals.
        Any statistic a caller derives from this list is therefore computed
        on the rounded values.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    auc_roc_scores = []

    for train_index, test_index in kf.split(X):
        model.fit(X.iloc[train_index], y.iloc[train_index])

        # Column 1 of predict_proba is used as the positive-class probability
        y_pred_proba_cv = model.predict_proba(X.iloc[test_index])[:, 1]

        fold_auc = roc_auc_score(y.iloc[test_index], y_pred_proba_cv)
        auc_roc_scores.append(round(fold_auc, 5))

    return auc_roc_scores

In [6]:
# Candidate classifiers keyed by the name used in the printed reports.
# The first four are created with random_state=5; KNN uses its defaults.

models = dict(
    LightGBM=lgb.LGBMClassifier(random_state=5),
    xGBoost=xgb.XGBClassifier(random_state=5),
    CatBoost=CatBoostClassifier(silent=True, random_state=5),
    RandomForest=RandomForestClassifier(random_state=5),
    KNN=KNeighborsClassifier(),
)

In [7]:
# Cross-validate every candidate with the default number of splits and
# print its per-fold scores, their mean and their standard deviation
for name, model in models.items():
    auc_roc_scores = evaluate_model(model, X_train, y_train)
    mean_roc_auc, std = np.mean(auc_roc_scores), np.std(auc_roc_scores)

    # The trailing "\n" plus print's own newline leaves one blank line
    # between consecutive model reports
    print(
        f"Model: {name}\n"
        f"AUC ROC Scores: {auc_roc_scores}\n"
        f"Average AUC ROC: {mean_roc_auc:.5f}\n"
        f"Std Deviation: {std:.5F}\n"
    )

Model: LightGBM
AUC ROC Scores: [0.74498, 0.75597, 0.76005, 0.77321, 0.68352]
Average AUC ROC: 0.74355
Std Deviation: 0.03134

Model: xGBoost
AUC ROC Scores: [0.763, 0.74403, 0.75887, 0.75147, 0.66391]
Average AUC ROC: 0.73626
Std Deviation: 0.03675

Model: CatBoost
AUC ROC Scores: [0.75946, 0.77296, 0.73641, 0.81727, 0.76256]
Average AUC ROC: 0.76973
Std Deviation: 0.02660

Model: RandomForest
AUC ROC Scores: [0.76773, 0.79182, 0.74911, 0.82638, 0.75613]
Average AUC ROC: 0.77823
Std Deviation: 0.02811

Model: KNN
AUC ROC Scores: [0.60697, 0.65503, 0.61584, 0.68155, 0.55576]
Average AUC ROC: 0.62303
Std Deviation: 0.04309



### OBSERVATIONS FOR 5 SPLITS (mean AUC ROC and std dev across folds)
1. Baseline LightGBM is 0.74355 with Std Dev of 0.03134
2. Baseline xGBoost is 0.73626 with Std Dev of 0.03675
3. Baseline CatBoost is 0.76973 with Std Dev of 0.02660
4. Baseline RandomForest is 0.77823 with Std Dev of 0.02811
5. Baseline KNN is 0.62303 with Std Dev of 0.04309

Run time ~ 8 seconds

In [8]:
# Repeat the comparison with 10 folds and print, per model, the fold
# scores, their mean and their standard deviation
for name, model in models.items():
    auc_roc_scores = evaluate_model(model, X_train, y_train, n_splits=10)
    mean_roc_auc, std = np.mean(auc_roc_scores), np.std(auc_roc_scores)

    # The trailing "\n" plus print's own newline leaves one blank line
    # between consecutive model reports
    print(
        f"Model: {name}\n"
        f"AUC ROC Scores: {auc_roc_scores}\n"
        f"Average AUC ROC: {mean_roc_auc:.5f}\n"
        f"Std Deviation: {std:.5F}\n"
    )

Model: LightGBM
AUC ROC Scores: [0.63657, 0.79861, 0.74444, 0.80941, 0.68095, 0.7375, 0.77941, 0.83095, 0.67647, 0.71078]
Average AUC ROC: 0.74051
Std Deviation: 0.06097

Model: xGBoost
AUC ROC Scores: [0.68287, 0.77778, 0.72222, 0.83294, 0.67143, 0.7175, 0.76225, 0.8119, 0.65441, 0.68873]
Average AUC ROC: 0.73220
Std Deviation: 0.05819

Model: CatBoost
AUC ROC Scores: [0.67824, 0.83565, 0.73889, 0.86353, 0.69048, 0.7225, 0.79167, 0.86667, 0.76471, 0.76716]
Average AUC ROC: 0.77195
Std Deviation: 0.06390

Model: RandomForest
AUC ROC Scores: [0.63079, 0.8588, 0.76667, 0.84824, 0.73214, 0.7125, 0.82966, 0.83929, 0.72304, 0.7451]
Average AUC ROC: 0.76862
Std Deviation: 0.07022

Model: KNN
AUC ROC Scores: [0.45949, 0.75926, 0.60694, 0.75176, 0.66905, 0.54375, 0.67525, 0.75833, 0.47059, 0.69853]
Average AUC ROC: 0.63929
Std Deviation: 0.10856



### OBSERVATIONS FOR 10 SPLITS (mean AUC ROC and std dev across folds)
1. Baseline LightGBM is 0.74051 with Std Dev of 0.06097
2. Baseline xGBoost is 0.73220 with Std Dev of 0.05819
3. Baseline CatBoost is 0.77195 with Std Dev of 0.06390
4. Baseline RandomForest is 0.76862 with Std Dev of 0.07022
5. Baseline KNN is 0.63929 with Std Dev of 0.10856

Run time ~ 22 seconds

In [9]:
# Fit a fresh Random Forest (random_state=5) on the full training features
# and labels; this is the model used for the Random Forest submission
rf = RandomForestClassifier(random_state=5)
rf.fit(X_train, y_train)

In [10]:
# Predict the final submission with Random Forest

# Column 1 of predict_proba is used as the positive-class probability
y_final = rf.predict_proba(X_test)[:, 1]

# Build the submission from a new frame instead of writing "target" into
# df_test: mutating df_test would make re-running the feature-split cell
# pick up "target" as a test feature. The CSV content is unchanged
# (columns "id" and "target", predictions rounded to one decimal).
df_test[["id"]].assign(target=y_final.round(1)).to_csv("submissionrf.csv", index=False)

In [11]:
# Fit a fresh CatBoost classifier (silent, random_state=5) on the full
# training features and labels; used for the CatBoost submission
cat = CatBoostClassifier(silent=True, random_state=5)
cat.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x226f68348b0>

In [12]:
# Predict the final submission with CatBoost

# Column 1 of predict_proba is used as the positive-class probability
y_final = cat.predict_proba(X_test)[:, 1]

# Build the submission from a new frame instead of writing "target" into
# df_test: mutating df_test would make re-running the feature-split cell
# pick up "target" as a test feature. The CSV content is unchanged
# (columns "id" and "target", predictions rounded to one decimal).
df_test[["id"]].assign(target=y_final.round(1)).to_csv("submissioncat.csv", index=False)