In [1]:
# Import necessary libraries

import os

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

In [2]:
# Read the train and test CSVs from the working directory,
# then preview the first rows of the training frame.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

df_train.head()

Unnamed: 0,id,gravity,ph,osmo,cond,urea,calc,target
0,0,1.013,6.19,443,14.8,124,1.45,0
1,1,1.025,5.4,703,23.6,394,4.18,0
2,2,1.009,6.13,371,24.5,159,9.04,0
3,3,1.021,4.91,442,20.8,398,6.63,1
4,4,1.021,5.53,874,17.8,385,2.21,1


In [3]:
# Hold out 20% of the training rows for validation. Stratifying on "target"
# keeps the class distribution the same in both parts; the fixed random_state
# makes the split repeatable.
train_train, train_val = train_test_split(
    df_train,
    test_size=0.2,
    stratify=df_train["target"],
    random_state=5,
)

In [4]:
# Save the train_train and train_val splits to disk as CSV files.
# DataFrame.to_csv does not create missing parent directories, so make sure
# "validation/" exists first; otherwise a fresh Restart & Run All fails here.
os.makedirs("validation", exist_ok=True)

# index=False: do not write the (shuffled) row index as an extra column.
train_train.to_csv("validation/train.csv", index=False)
train_val.to_csv("validation/val.csv", index=False)

In [5]:
# Read the saved splits back from the "validation/" folder.
# Note: from this cell on, df_train is the training split and df_test is the
# validation split (val.csv), replacing the frames loaded earlier.
train_path, test_path = "validation/train.csv", "validation/val.csv"

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

df_train.head()

Unnamed: 0,id,gravity,ph,osmo,cond,urea,calc,target
0,278,1.013,6.19,577,27.5,620,2.45,1
1,121,1.021,5.53,775,29.0,302,3.34,0
2,46,1.02,5.16,702,19.7,199,1.38,0
3,166,1.007,4.9,680,25.3,282,0.17,0
4,139,1.018,7.38,704,23.6,270,1.27,0


In [6]:
# Build the model inputs: every column except "id" and "target" is a feature,
# and "target" is the label. Done for both the training and validation frames.
X_train, y_train = df_train.drop(columns=["id", "target"]), df_train["target"]
X_test, y_test = df_test.drop(columns=["id", "target"]), df_test["target"]

In [7]:
# Baseline candidates keyed by display name. Apart from the fixed seed
# (and silent=True to mute CatBoost's training log) no hyperparameters are set.
# KNeighborsClassifier is constructed without a random_state.
# Keyword order is the dict's iteration order.
models = dict(
    LightGBM=lgb.LGBMClassifier(random_state=5),
    xGBoost=xgb.XGBClassifier(random_state=5),
    CatBoost=CatBoostClassifier(silent=True, random_state=5),
    RandomForest=RandomForestClassifier(random_state=5),
    KNN=KNeighborsClassifier(),
)

In [8]:
# Fit each candidate on the training split and report its AUC-ROC on the
# held-out split.
for name, model in models.items():
    # fit() returns the fitted estimator, so scoring can be chained onto it.
    # Column 1 of predict_proba is the probability of the positive class.
    y_pred_proba = model.fit(X_train, y_train).predict_proba(X_test)[:, 1]

    # AUC-ROC is computed from the scores, not from thresholded labels.
    auc_roc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)

    print(f"Model: {name}")
    print(f"AUC ROC: {auc_roc:.5f}")
    print()

Model: LightGBM
AUC ROC: 0.73972

Model: xGBoost
AUC ROC: 0.72209

Model: CatBoost
AUC ROC: 0.77086

Model: RandomForest
AUC ROC: 0.75705

Model: KNN
AUC ROC: 0.65335



### OBSERVATION
Baseline AUC-ROC on the held-out 20% validation split (default hyperparameters):
1. Baseline LightGBM is 0.73972
2. Baseline XGBoost is 0.72209
3. Baseline CatBoost is 0.77086 (best)
4. Baseline RandomForest is 0.75705
5. Baseline KNN is 0.65335 (worst)

Run time ~ 2 seconds