In [2]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb

train = pd.read_csv("data/diabetic/train.csv")
test  = pd.read_csv("data/diabetic/test.csv")

X = train.drop("diagnosed_diabetes", axis=1)
y = train["diagnosed_diabetes"]

In [3]:
# --- Simple categorical encoding ---
cat_cols = X.select_dtypes(include="object").columns.tolist()
for col in cat_cols:
    X[col] = X[col].astype("category").cat.codes
    test[col] = test[col].astype("category").cat.codes

In [4]:
# --- CV Setup ---
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

oof_lgb = np.zeros(len(X))
oof_xgb = np.zeros(len(X))
pred_lgb = np.zeros(len(test))
pred_xgb = np.zeros(len(test))

In [5]:
# --- LightGBM ---
lgb_params = dict(
    objective="binary",
    metric="auc",
    learning_rate=0.03,
    num_leaves=63,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    n_estimators=2000,
)

for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
    X_tr, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(X_tr, y_tr)

    oof_lgb[val_idx] = model.predict_proba(X_val)[:,1]
    pred_lgb += model.predict_proba(test)[:,1] / 10

print("LGB OOF AUC:", roc_auc_score(y, oof_lgb))

[LightGBM] [Info] Number of positive: 392676, number of negative: 237324
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007262 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1891
[LightGBM] [Info] Number of data points in the train set: 630000, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623295 -> initscore=0.503559
[LightGBM] [Info] Start training from score 0.503559
[LightGBM] [Info] Number of positive: 392676, number of negative: 237324
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005805 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1896
[LightGBM] [Info] Number of data points in the train set: 630000, number of used features: 25
[LightGBM] [In

In [6]:
# --- XGBoost ---
xgb_model = xgb.XGBClassifier(
    n_estimators=2000,
    max_depth=5,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="auc",
    random_state=42
)

for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
    X_tr, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    xgb_model.fit(X_tr, y_tr)
    oof_xgb[val_idx] = xgb_model.predict_proba(X_val)[:,1]
    pred_xgb += xgb_model.predict_proba(test)[:,1] / 10

print("XGB OOF AUC:", roc_auc_score(y, oof_xgb))

XGB OOF AUC: 0.7271229489358009


In [None]:
test_pred = 0.5 * pred_lgb + 0.5 * pred_xgb

submission = pd.DataFrame({
    "id": test["id"],
    "diagnosed_diabetes": test_pred
})

submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")