In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.utils.class_weight import compute_class_weight


import joblib
import os


In [3]:
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier


In [4]:
df = pd.read_csv("data/data_clean.csv")

In [6]:
TARGET = "default.payment.next.month"

X = df.drop(columns=[TARGET])
y = df[TARGET]

In [7]:
print(X.shape, y.value_counts())

(30000, 35) default.payment.next.month
0    23364
1     6636
Name: count, dtype: int64


In [8]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [9]:
X_train.shape

(24000, 35)

In [11]:
X_val.shape

(6000, 35)

In [12]:
y_train.value_counts(normalize=True)

default.payment.next.month
0    0.778792
1    0.221208
Name: proportion, dtype: float64

In [13]:
y_val.value_counts(normalize=True)

default.payment.next.month
0    0.778833
1    0.221167
Name: proportion, dtype: float64

In [14]:
classes = np.unique(y_train)

In [15]:
weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=y_train
)

In [16]:
class_weights = dict(zip(classes, weights))

In [17]:
class_weights 

{np.int64(0): np.float64(0.6420202236370446),
 np.int64(1): np.float64(2.2603126765869277)}

In [18]:
lgb_model = lgb.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight=class_weights,
    random_state=42
)

lgb_model.fit(X_train, y_train)

lgb_pred = lgb_model.predict_proba(X_val)[:, 1]
lgb_auc = roc_auc_score(y_val, lgb_pred)

print("LightGBM ROC-AUC:", lgb_auc)

[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023552 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6009
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
LightGBM ROC-AUC: 0.770880546279828


In [19]:
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

xgb_model = xgb.XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    eval_metric="auc",
    random_state=42
)

xgb_model.fit(X_train, y_train)

xgb_pred = xgb_model.predict_proba(X_val)[:, 1]
xgb_auc = roc_auc_score(y_val, xgb_pred)

print("XGBoost ROC-AUC:", xgb_auc)


XGBoost ROC-AUC: 0.7707481497954144


In [20]:
cat_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    loss_function="Logloss",
    eval_metric="AUC",
    class_weights=class_weights,
    verbose=0,
    random_state=42
)

cat_model.fit(X_train, y_train)

cat_pred = cat_model.predict_proba(X_val)[:, 1]
cat_auc = roc_auc_score(y_val, cat_pred)

print("CatBoost ROC-AUC:", cat_auc)


CatBoost ROC-AUC: 0.7739492097413496


In [21]:
os.makedirs("models", exist_ok=True)

joblib.dump(lgb_model, "models/lightgbm_model.pkl")
joblib.dump(xgb_model, "models/xgboost_model.pkl")
joblib.dump(cat_model, "models/catboost_model.pkl")

['models/catboost_model.pkl']

In [30]:
stacking_df = pd.DataFrame({
    "lgb_pred": lgb_pred,
    "xgb_pred": xgb_pred,
    "cat_pred": cat_pred,
    "target": y_val.values
})


stacking_df.to_csv("data/outputs/stacking_level1.csv", index=False)

In [31]:
print("=== MODEL PERFORMANCE ===")
print(f"LightGBM AUC : {lgb_auc:.4f}")
print(f"XGBoost AUC  : {xgb_auc:.4f}")
print(f"CatBoost AUC : {cat_auc:.4f}")

=== MODEL PERFORMANCE ===
LightGBM AUC : 0.7709
XGBoost AUC  : 0.7707
CatBoost AUC : 0.7739
