In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

In [2]:
# Load the dataset

# Read both CSV files from the current working directory, train first, then test.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Leave the training frame as the last expression so it is rendered as the cell output.
df_train

Unnamed: 0,id,gravity,ph,osmo,cond,urea,calc,target
0,0,1.013,6.19,443,14.8,124,1.45,0
1,1,1.025,5.40,703,23.6,394,4.18,0
2,2,1.009,6.13,371,24.5,159,9.04,0
3,3,1.021,4.91,442,20.8,398,6.63,1
4,4,1.021,5.53,874,17.8,385,2.21,1
...,...,...,...,...,...,...,...,...
409,409,1.011,5.21,527,21.4,75,1.53,0
410,410,1.024,5.53,577,19.7,224,0.77,0
411,411,1.018,6.28,455,22.2,270,7.68,1
412,412,1.008,7.12,325,12.6,75,1.03,1


In [3]:
# One-hot encode "gravity" with pd.get_dummies over train and test together,
# so both frames end up with the same indicator columns, then split them apart again.

# Mark every row with the frame it came from.
for frame, origin in ((df_train, "train"), (df_test, "test")):
    frame["dataset"] = origin

# Stack train on top of test under a fresh 0..n-1 index.
df_combined = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# Replace "gravity" by one indicator column per distinct value found in the combined data.
df_encoded = pd.get_dummies(df_combined, columns=["gravity"])

# Recover the two frames from the origin marker and discard the helper column;
# the test rows additionally lose the "target" column.
from_train = df_encoded["dataset"] == "train"
from_test = df_encoded["dataset"] == "test"
df_train = df_encoded.loc[from_train].drop(columns=["dataset"])
df_test = df_encoded.loc[from_test].drop(columns=["dataset", "target"])

In [4]:
# Separate features and target for the training data, and build the test feature matrix

# Training features: everything except the row identifier and the label.
X_train = df_train.drop(columns=["id", "target"])
# Training label.
y_train = df_train["target"]

# Test features: everything except the row identifier.
X_test = df_test.drop(columns=["id"])

In [5]:
# Perform cross-validation and calculate the AUC ROC

def evaluate_model(model, X, y, n_splits=5):
    """Score ``model`` with shuffled K-fold cross-validation using ROC AUC.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``. It is refit
        in place on every fold, so it is left fitted on the last training split.
    X
        Feature table that supports positional row selection through ``.iloc``.
    y
        Labels aligned with ``X``; also selected through ``.iloc``.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    list
        One ROC AUC value per fold, in fold order, each rounded to 5 decimals.
    """
    # Fixed seed so the shuffled fold assignment is the same on every call.
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=5)
    fold_scores = []

    for fit_rows, holdout_rows in splitter.split(X):
        model.fit(X.iloc[fit_rows], y.iloc[fit_rows])

        # Keep only column 1 of predict_proba: the probability of the positive class.
        holdout_proba = model.predict_proba(X.iloc[holdout_rows])[:, 1]

        fold_auc = roc_auc_score(y.iloc[holdout_rows], holdout_proba)
        fold_scores.append(round(fold_auc, 5))

    return fold_scores

In [6]:
# Initialize the models

# Candidate classifiers keyed by the label used in the printed report.
# Every estimator that is given a seed uses random_state=5; insertion order is the report order.
models = dict(
    LightGBM=lgb.LGBMClassifier(random_state=5),
    xGBoost=xgb.XGBClassifier(random_state=5),
    CatBoost=CatBoostClassifier(silent=True, random_state=5),
    RandomForest=RandomForestClassifier(random_state=5),
    KNN=KNeighborsClassifier(),
)

In [7]:
# Cross-validate every candidate with the default number of folds and print a short report.
for name, model in models.items():
    auc_roc_scores = evaluate_model(model, X_train, y_train)
    # Mean and spread are taken over the per-fold scores returned by evaluate_model.
    mean_roc_auc, std = np.mean(auc_roc_scores), np.std(auc_roc_scores)

    report_lines = [
        f"Model: {name}",
        f"AUC ROC Scores: {auc_roc_scores}",
        f"Average AUC ROC: {mean_roc_auc:.5f}",
        f"Std Deviation: {std:.5F}",
        "",  # blank separator line between models
    ]
    print("\n".join(report_lines))

Model: LightGBM
AUC ROC Scores: [0.75296, 0.78679, 0.74882, 0.8114, 0.66728]
Average AUC ROC: 0.75345
Std Deviation: 0.04882

Model: xGBoost
AUC ROC Scores: [0.74113, 0.80503, 0.73641, 0.81786, 0.66422]
Average AUC ROC: 0.75293
Std Deviation: 0.05516

Model: CatBoost
AUC ROC Scores: [0.78369, 0.77987, 0.7234, 0.84606, 0.75858]
Average AUC ROC: 0.77832
Std Deviation: 0.04006

Model: RandomForest
AUC ROC Scores: [0.77246, 0.80723, 0.7435, 0.81052, 0.75398]
Average AUC ROC: 0.77754
Std Deviation: 0.02724

Model: KNN
AUC ROC Scores: [0.61022, 0.66164, 0.62116, 0.68155, 0.55576]
Average AUC ROC: 0.62607
Std Deviation: 0.04373



### OBSERVATION FOR 5 SPLITS
1. Baseline LightGBM mean AUC ROC is 0.75345 with Std Dev of 0.04882
2. Baseline xGBoost mean AUC ROC is 0.75293 with Std Dev of 0.05516
3. Baseline CatBoost mean AUC ROC is 0.77832 with Std Dev of 0.04006
4. Baseline RandomForest mean AUC ROC is 0.77754 with Std Dev of 0.02724
5. Baseline KNN mean AUC ROC is 0.62607 with Std Dev of 0.04373

Run time ~ 15 seconds

In [8]:
# Repeat the comparison with 10 folds and print the same report per model.
for name, model in models.items():
    auc_roc_scores = evaluate_model(model, X_train, y_train, n_splits=10)
    # Mean and spread are taken over the per-fold scores returned by evaluate_model.
    mean_roc_auc, std = np.mean(auc_roc_scores), np.std(auc_roc_scores)

    report_lines = [
        f"Model: {name}",
        f"AUC ROC Scores: {auc_roc_scores}",
        f"Average AUC ROC: {mean_roc_auc:.5f}",
        f"Std Deviation: {std:.5F}",
        "",  # blank separator line between models
    ]
    print("\n".join(report_lines))

Model: LightGBM
AUC ROC Scores: [0.66204, 0.80556, 0.81944, 0.79529, 0.77619, 0.6975, 0.77696, 0.87381, 0.61765, 0.70343]
Average AUC ROC: 0.75279
Std Deviation: 0.07537

Model: xGBoost
AUC ROC Scores: [0.68056, 0.76157, 0.75278, 0.77647, 0.71667, 0.6875, 0.77451, 0.88095, 0.63725, 0.63725]
Average AUC ROC: 0.73055
Std Deviation: 0.07076

Model: CatBoost
AUC ROC Scores: [0.68056, 0.86111, 0.77778, 0.82588, 0.70714, 0.71, 0.81127, 0.88571, 0.76225, 0.78676]
Average AUC ROC: 0.78085
Std Deviation: 0.06426

Model: RandomForest
AUC ROC Scores: [0.66319, 0.87384, 0.78889, 0.84706, 0.74762, 0.715, 0.81985, 0.8619, 0.72549, 0.75123]
Average AUC ROC: 0.77941
Std Deviation: 0.06663

Model: KNN
AUC ROC Scores: [0.46528, 0.75926, 0.60694, 0.76, 0.67381, 0.54375, 0.67525, 0.75833, 0.47059, 0.69853]
Average AUC ROC: 0.64117
Std Deviation: 0.10863



### OBSERVATION FOR 10 SPLITS
1. Baseline LightGBM mean AUC ROC is 0.75279 with Std Dev of 0.07537
2. Baseline xGBoost mean AUC ROC is 0.73055 with Std Dev of 0.07076
3. Baseline CatBoost mean AUC ROC is 0.78085 with Std Dev of 0.06426
4. Baseline RandomForest mean AUC ROC is 0.77941 with Std Dev of 0.06663
5. Baseline KNN mean AUC ROC is 0.64117 with Std Dev of 0.10863

Run time ~ 40 seconds

In [9]:
# Train a fresh Random Forest on the full training set for the final submission.
# Only the seed is set (random_state=5); every other hyper-parameter is the library default.
rf = RandomForestClassifier(random_state=5)
# `fit` is the last expression of the cell, so its return value is what the cell displays.
rf.fit(X_train, y_train)

In [10]:
# Predict the final submission with Random Forest

# Probability of the positive class (column 1 of predict_proba) for every test row.
y_final = rf.predict_proba(X_test)[:, 1]

# Build the submission in its own frame instead of writing a "target" column into
# df_test: mutating df_test would leak that column into X_test if the feature cell
# is re-run. The probabilities are written unrounded because the notebook scores
# models by ROC AUC, which depends only on the ranking of the predictions; rounding
# to one decimal collapses them into at most 11 distinct values and creates ties.
submission_rf = df_test[["id"]].assign(target=y_final)

submission_rf.to_csv("submissionrf.csv", index=False)

In [11]:
# Train a fresh CatBoost classifier on the full training set for the final submission.
# silent=True and random_state=5 are the only arguments passed; everything else is the library default.
cat = CatBoostClassifier(silent=True, random_state=5)
# `fit` is the last expression of the cell, so its return value is what the cell displays.
cat.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x1d85f7e82e0>

In [12]:
# Predict the final submission with CatBoost

# Probability of the positive class (column 1 of predict_proba) for every test row.
y_final = cat.predict_proba(X_test)[:, 1]

# Build the submission in its own frame instead of writing a "target" column into
# df_test: mutating df_test would leak that column into X_test if the feature cell
# is re-run. The probabilities are written unrounded because the notebook scores
# models by ROC AUC, which depends only on the ranking of the predictions; rounding
# to one decimal collapses them into at most 11 distinct values and creates ties.
submission_cat = df_test[["id"]].assign(target=y_final)

submission_cat.to_csv("submissioncat.csv", index=False)