In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

In [2]:
# Read the train and test splits from CSV files in the working directory

df_train, df_test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Show the first rows of the training frame as the cell output
df_train.head()

Unnamed: 0,id,gravity,ph,osmo,cond,urea,calc,target
0,0,1.013,6.19,443,14.8,124,1.45,0
1,1,1.025,5.4,703,23.6,394,4.18,0
2,2,1.009,6.13,371,24.5,159,9.04,0
3,3,1.021,4.91,442,20.8,398,6.63,1
4,4,1.021,5.53,874,17.8,385,2.21,1


In [3]:
# Split the training frame into features and target; the test frame has no
# target column to remove, so only the "id" column is dropped there.

y_train = df_train["target"]
X_train = df_train.drop(columns=["id", "target"])

X_test = df_test.drop(columns=["id"])

In [4]:
# Perform cross-validation and calculate the AUC ROC

def evaluate_model(model, X, y, n_splits=5, random_state=5):
    """Score a classifier with shuffled K-fold cross-validation using AUC ROC.

    For every fold, ``model.fit`` is called on the training part and
    ``model.predict_proba`` on the held-out part; column 1 of the returned
    probabilities is scored against the held-out labels with
    ``roc_auc_score``.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``. ``fit`` is
        called on this very object, so it is left fitted on the last fold.
    X
        Feature table; must support positional row selection via ``.iloc``.
    y
        Target labels aligned with ``X``; must support ``.iloc`` as well.
    n_splits : int, default 5
        Number of folds passed to ``KFold``.
    random_state : int, default 5
        Seed passed to ``KFold`` for the shuffle. The default reproduces the
        previously hard-coded value, so existing calls yield the same folds.

    Returns
    -------
    list of float
        One AUC ROC score per fold, in the order the folds were generated.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    auc_roc_scores = []

    for train_index, test_index in kf.split(X):
        X_train_cv, X_test_cv = X.iloc[train_index], X.iloc[test_index]
        y_train_cv, y_test_cv = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train_cv, y_train_cv)

        # Keep only column 1 of predict_proba: the probability of the positive class
        y_pred_proba_cv = model.predict_proba(X_test_cv)[:, 1]

        auc_roc_scores.append(roc_auc_score(y_test_cv, y_pred_proba_cv))

    return auc_roc_scores

In [5]:
# Candidate classifiers, all with library-default hyperparameters.
# Every estimator that accepts a seed gets random_state=5; CatBoost is
# additionally silenced so it does not log during training.

models = dict(
    LightGBM=lgb.LGBMClassifier(random_state=5),
    xGBoost=xgb.XGBClassifier(random_state=5),
    CatBoost=CatBoostClassifier(silent=True, random_state=5),
    RandomForest=RandomForestClassifier(random_state=5),
    KNN=KNeighborsClassifier(),
)

In [6]:
# Cross-validate every candidate with the default number of splits and
# report the per-fold scores together with their mean and spread.
for name, model in models.items():
    auc_roc_scores = evaluate_model(model, X_train, y_train)
    mean_roc_auc, std = np.mean(auc_roc_scores), np.std(auc_roc_scores)

    # The trailing empty string produces the blank separator line between models
    print(
        f"Model: {name}",
        f"AUC ROC Scores: {auc_roc_scores}",
        f"Average AUC ROC: {mean_roc_auc:.5f}",
        f"Std Deviation: {std:.5F}",
        "",
        sep="\n",
    )

Model: LightGBM
AUC ROC Scores: [0.7565011820330969, 0.7672955974842768, 0.7641843971631206, 0.782608695652174, 0.6868872549019608]
Average AUC ROC: 0.75150
Std Deviation: 0.03340

Model: xGBoost
AUC ROC Scores: [0.7452718676122931, 0.7628930817610062, 0.7446808510638298, 0.8025851938895417, 0.6948529411764706]
Average AUC ROC: 0.75006
Std Deviation: 0.03471

Model: CatBoost
AUC ROC Scores: [0.7777777777777778, 0.7742138364779874, 0.743498817966903, 0.826674500587544, 0.7555147058823529]
Average AUC ROC: 0.77554
Std Deviation: 0.02846

Model: RandomForest
AUC ROC Scores: [0.7724586288416075, 0.8254716981132075, 0.7632978723404256, 0.8116921269095182, 0.7677696078431372]
Average AUC ROC: 0.78814
Std Deviation: 0.02540

Model: KNN
AUC ROC Scores: [0.610224586288416, 0.6550314465408805, 0.6202718676122931, 0.6815511163337251, 0.5557598039215687]
Average AUC ROC: 0.62457
Std Deviation: 0.04274



### OBSERVATION FOR 5 SPLITS
1. Baseline LightGBM is 0.75150 with Std Dev of 0.03340
2. Baseline xGBoost is 0.75006 with Std Dev of 0.03471
3. Baseline CatBoost is 0.77554 with Std Dev of 0.02846
4. Baseline RandomForest is 0.78814 with Std Dev of 0.02540
5. Baseline KNN is 0.62457 with Std Dev of 0.04274

Run time ~ 8 seconds

In [7]:
# Repeat the comparison with 10 folds to see how the estimates and their
# spread change when each held-out fold is smaller.
for name, model in models.items():
    auc_roc_scores = evaluate_model(model, X_train, y_train, n_splits=10)
    mean_roc_auc, std = np.mean(auc_roc_scores), np.std(auc_roc_scores)

    # The trailing empty string produces the blank separator line between models
    print(
        f"Model: {name}",
        f"AUC ROC Scores: {auc_roc_scores}",
        f"Average AUC ROC: {mean_roc_auc:.5f}",
        f"Std Deviation: {std:.5F}",
        "",
        sep="\n",
    )

Model: LightGBM
AUC ROC Scores: [0.673611111111111, 0.787037037037037, 0.7749999999999999, 0.8141176470588236, 0.730952380952381, 0.71, 0.7794117647058824, 0.8595238095238096, 0.625, 0.7132352941176471]
Average AUC ROC: 0.74679
Std Deviation: 0.06606

Model: xGBoost
AUC ROC Scores: [0.6805555555555556, 0.7847222222222222, 0.7611111111111111, 0.8188235294117647, 0.7357142857142857, 0.705, 0.7843137254901961, 0.8380952380952381, 0.6642156862745098, 0.6740196078431373]
Average AUC ROC: 0.74466
Std Deviation: 0.05908

Model: CatBoost
AUC ROC Scores: [0.6875, 0.837962962962963, 0.7416666666666667, 0.8352941176470589, 0.7119047619047619, 0.7275, 0.8088235294117647, 0.8714285714285714, 0.7450980392156863, 0.7769607843137255]
Average AUC ROC: 0.77441
Std Deviation: 0.05826

Model: RandomForest
AUC ROC Scores: [0.6712962962962963, 0.8645833333333333, 0.7888888888888889, 0.8423529411764705, 0.7547619047619047, 0.70875, 0.823529411764706, 0.8607142857142857, 0.7181372549019608, 0.7916666666666667

### OBSERVATION FOR 10 SPLITS
1. Baseline LightGBM is 0.74679 with Std Dev of 0.06606
2. Baseline xGBoost is 0.74466 with Std Dev of 0.05908
3. Baseline CatBoost is 0.77441 with Std Dev of 0.05826
4. Baseline RandomForest is 0.78247 with Std Dev of 0.06400
5. Baseline KNN is 0.63987 with Std Dev of 0.10762

Run time ~ 22 seconds