# Imports & Configuration

In [None]:
import catboost as catb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [51]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier , Pool
import optuna
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from lightgbm import early_stopping
from lightgbm import early_stopping, log_evaluation

# Data Loading & Basic Preprocessing

In [22]:
data = pd.read_csv('train.csv')
data.columns

Index(['id', 'age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'education_level',
       'income_level', 'smoking_status', 'employment_status',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'diagnosed_diabetes'],
      dtype='object')

In [24]:
test = pd.read_csv('test.csv')

In [25]:
data.head()
data=data.set_index('id')

test = test.set_index('id')

# Categorical Feature Handling

In [26]:
cat_features = [
    "gender", "ethnicity", "education_level",
    "income_level", "smoking_status", "employment_status",
]

In [27]:
X = data.drop(columns='diagnosed_diabetes')
y = data['diagnosed_diabetes'].copy()
y.sum()/len(y)

0.6232957142857143

In [28]:
for col in cat_features:
    X[col] = X[col].astype("category")
    test[col] = test[col].astype("category")
X.dtypes

age                                      int64
alcohol_consumption_per_week             int64
physical_activity_minutes_per_week       int64
diet_score                             float64
sleep_hours_per_day                    float64
screen_time_hours_per_day              float64
bmi                                    float64
waist_to_hip_ratio                     float64
systolic_bp                              int64
diastolic_bp                             int64
heart_rate                               int64
cholesterol_total                        int64
hdl_cholesterol                          int64
ldl_cholesterol                          int64
triglycerides                            int64
gender                                category
ethnicity                             category
education_level                       category
income_level                          category
smoking_status                        category
employment_status                     category
family_histor

In [29]:
X = data.drop(columns='diagnosed_diabetes')
y = data['diagnosed_diabetes'].copy()
y.sum()/len(y)

0.6232957142857143

# CatBoost Hyperparameter Optimization (Optuna)

In [10]:

# Optuna optimization for CatBoost using GPU.
# Objective: maximize ROC-AUC on validation set.


def objective(trial):

    params = {
        "iterations": 5000,
        "depth": trial.suggest_categorical("depth", [4, 5, 6, 7, 8, 9, 10]),
        "learning_rate": trial.suggest_categorical("learning_rate", [ 0.03,0.05,0.07,0.1]),
        "l2_leaf_reg": trial.suggest_categorical("l2_leaf_reg", [18,19, 20,21,22]),
        "bagging_temperature": trial.suggest_categorical("bagging_temperature", [0.1, 0.5, 1.0, 2.0, 5.0]),
        "random_strength": trial.suggest_categorical("random_strength", [ 2,2.5, 3]),
        "border_count": trial.suggest_categorical("border_count", [ 254,400,300]),
        "grow_policy": trial.suggest_categorical("grow_policy", ["SymmetricTree", "Lossguide"]),
        
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "bootstrap_type": "Bayesian",
        "task_type": "GPU",     # GPU tuning
        "devices": "0",
        "random_seed": 42,
    }

    model = CatBoostClassifier(**params)

    model.fit(
        train_pool,
        eval_set=valid_pool,
        use_best_model=True,
        verbose=500
    )

    preds = model.predict_proba(valid_pool)[:, 1]
    auc = roc_auc_score(y_test, preds)

    return auc



study = optuna.create_study(direction="maximize")

study.optimize(objective, n_trials=50)

print("Best AUC:", study.best_value)
print("Best params:", study.best_params)


# Out-of-Fold Training – CatBoost

In [12]:
# Generate Out-of-Fold (OOF) predictions for CatBoost.
# These predictions will later be used for stacking.
import gc
from sklearn.model_selection import StratifiedKFold , KFold
pred_cat = np.zeros(len(test))
oof_cat = np.zeros(len(X))
k = 5
startK = StratifiedKFold(n_splits=k , shuffle = True , random_state=88)
for fold,(tr_idx , vl_idx)  in enumerate(startK.split(X,y)):
    print('fold ---------------- '+str(fold))
    X_tr_raw = X.iloc[tr_idx].reset_index(drop=True)
    y_tr_raw=y.iloc[tr_idx].reset_index(drop=True)

    X_vl_raw  = X.iloc[vl_idx].reset_index(drop=True)
    y_vl=y.iloc[vl_idx].reset_index(drop=True)
    X_tr , X_vl  = X_tr_raw.copy(), X_vl_raw.copy()
    X_ts = test.copy(deep=True)
    print('creation model ===============')
    model_cat =   CatBoostClassifier(
            cat_features=cat_features,
            iterations=5000,
            learning_rate=0.05,
            grow_policy='Lossguide',
            l2_leaf_reg=21,
            depth=4,
            bagging_temperature=1.0,
            random_strength=3,
            border_count=254,
            loss_function='Logloss',
            eval_metric='AUC',
            bootstrap_type='Bayesian',   
            task_type='GPU',
            devices='0',                
            random_seed=42,
            verbose=False               
        )
    print('=================fitting=========model')
    model_cat.fit(
    X_tr,
    y_tr_raw,
    eval_set=(X_vl, y_vl),
    early_stopping_rounds=400,
    use_best_model=True
    )

    y_vl_pred_cat = model_cat.predict_proba(X_vl)[:,1]
    pred_cat += model_cat.predict_proba(X_ts)[:,1]
    oof_cat[vl_idx] = model_cat.predict_proba(X_vl)[:,1]
    auc = roc_auc_score(y_vl,y_vl_pred_cat)
    print('auc ============='+str(fold) + '==========' +str(auc))
    del  X_tr_raw , y_tr_raw,X_vl_raw 
    # y_vl ,X_tr , X_vl ,X_ts
pred_cat /= k
score_final_cat = roc_auc_score(y , oof_cat)
print("CatBoost OOF AUC:", roc_auc_score(y, oof_cat))


fold ---------------- 0


Default metric period is 5 because AUC is/are not implemented for GPU


fold ---------------- 1


Default metric period is 5 because AUC is/are not implemented for GPU


fold ---------------- 2


Default metric period is 5 because AUC is/are not implemented for GPU


fold ---------------- 3


Default metric period is 5 because AUC is/are not implemented for GPU


fold ---------------- 4


Default metric period is 5 because AUC is/are not implemented for GPU


CatBoost OOF AUC: 0.7288216594177134


# Target Encoding (Leakage-Safe)

In [30]:

# Target encoding with leakage prevention.
# Encoding is done inside CV folds only.

def target_encoder(df_train, df_val, col, target):
    mean = df_train.groupby(col)[target].mean()
    global_mean = df_train[target].mean()
    col_name = f'{col}_mean'
    df_val = df_val.copy()
    df_val[col_name] = df_val[col].map(mean).fillna(global_mean).astype(float)
    return df_val


In [32]:
# """
# High-order categorical interactions.
# These capture complex non-linear relationships.
# """

selected_interactions = [
#     # ===== 2-WAY INTERACTIONS =====
  ['age', 'bmi', 'systolic_bp'],
    ['age', 'bmi', 'cholesterol_total'],
    ['age', 'waist_to_hip_ratio', 'systolic_bp'],

      # ===== 3-WAY INTERACTIONS =====
    ['family_history_diabetes', 'bmi', 'cholesterol_total'],
    ['family_history_diabetes', 'age', 'triglycerides'],
    ['family_history_diabetes', 'waist_to_hip_ratio', 'ldl_cholesterol'],
 # ===== 4-WAY INTERACTIONS =====

 ['family_history_diabetes', 'bmi', 'waist_to_hip_ratio', 'cholesterol_total'],

    ['age', 'bmi', 'cholesterol_total', 'triglycerides'],

    ['bmi', 'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol'],

    ['waist_to_hip_ratio', 'cholesterol_total', 'triglycerides', 'hdl_cholesterol'],
]


In [33]:
interaction_features =[]
for cols in selected_interactions :
    if len(cols) == 2 :
        name = cols[0] + '__' + cols[1]
        X[name] = X[cols[0]].astype(str) + '__'+X[cols[1]].astype(str)
        test[name] = test[cols[0]].astype(str) + '__'+test[cols[1]].astype(str)
    if len(cols) == 3:
        name = cols[0] + '__' + cols[1]+'__'+cols[2]
        X[name] = X[cols[0]].astype(str) + '__'+X[cols[1]].astype(str)+'__'+X[cols[2]].astype(str)
        test[name] = test[cols[0]].astype(str) + '__'+test[cols[1]].astype(str)+'__'+test[cols[2]].astype(str)
    if len(cols) == 4 :
        name = cols[0] + '__' + cols[1]+'__'+cols[2]+'__'+cols[3]
        X[name] = X[cols[0]].astype(str) + '__'+X[cols[1]].astype(str)+'__'+X[cols[2]].astype(str)+'__'+X[cols[3]].astype(str)
        test[name] = test[cols[0]].astype(str) + '__'+test[cols[1]].astype(str)+'__'+test[cols[2]].astype(str)+'__'+test[cols[3]].astype(str)

    interaction_features.append(name)
    

X.head(3)

Unnamed: 0_level_0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,...,family_history_diabetes__ldl_cholesterol__triglycerides,family_history_diabetes__diet_score__triglycerides,age__physical_activity_minutes_per_week__waist_to_hip_ratio,age__physical_activity_minutes_per_week__triglycerides,age__hdl_cholesterol__triglycerides,age__ldl_cholesterol__triglycerides,bmi__physical_activity_minutes_per_week__triglycerides,bmi__diet_score__triglycerides,waist_to_hip_ratio__hdl_cholesterol__triglycerides,family_history_diabetes__age__triglycerides__hdl_cholesterol
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,70,...,0__114__102,0__7.7__102,31__45__0.93,31__45__102,31__58__102,31__114__102,33.4__45__102,33.4__7.7__102,0.93__58__102,0__31__102__58
1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,77,...,0__121__124,0__5.7__124,50__73__0.83,50__73__124,50__50__124,50__121__124,23.8__73__124,23.8__5.7__124,0.83__50__124,0__50__124__50
2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,89,...,0__114__108,0__8.5__108,32__158__0.83,32__158__108,32__59__108,32__114__108,24.1__158__108,24.1__8.5__108,0.83__59__108,0__32__108__59


# XGBoost with Target-Encoding Interactions

In [44]:

# 
# XGBoost trained on numerical + target-encoded interaction features.
# GPU acceleration + early stopping.
 # Optimized with Optuna.

pred_xgb = np.zeros(len(test))
oof_xgb = np.zeros(len(X))
k = 5
startK = StratifiedKFold(n_splits=k , shuffle = True , random_state=88)
for fold,(tr_idx , vl_idx)  in enumerate(startK.split(X,y)):
    print('fold ---------------- '+str(fold))
    X_tr_raw = X.iloc[tr_idx].reset_index(drop=True)
    y_tr_raw=y.iloc[tr_idx].reset_index(drop=True)

    X_vl_raw  = X.iloc[vl_idx].reset_index(drop=True)
    y_vl=y.iloc[vl_idx].reset_index(drop=True)
    X_tr , X_vl  = X_tr_raw.copy(), X_vl_raw.copy()
    X_ts = test.copy(deep=True)
    for col in cat_features:
        X_tr[col] = X_tr[col].astype("category")
        X_vl[col] = X_vl[col].astype("category")
        X_ts[col] = X_ts[col].astype("category")

    
    for col in interaction_features:
        X_tr[f'{col}_mean'] = np.nan
        X_vl[f'{col}_mean'] = np.nan
        X_ts[f'{col}_mean'] = np.nan


    inner_startkfold = KFold(n_splits = k , shuffle=True , random_state=88)
    

    for _ ,(tr_idx_in,vl_idx_in) in enumerate(inner_startkfold.split(X_tr_raw)):
        in_tr = pd.concat([X_tr_raw.iloc[tr_idx_in],y_tr_raw.iloc[tr_idx_in]],axis=1)
        in_vl = X_tr_raw.iloc[vl_idx_in]
        for col in interaction_features :
           

            
            te_temp = target_encoder(in_tr, in_vl.copy(),col,'diagnosed_diabetes')
            te_col = f'{col}_mean'
            
            X_tr.loc[vl_idx_in,te_col] = te_temp[te_col].values
    assert not X_tr[[f'{c}_mean' for c in interaction_features]].isnull().any().any(), \
    "NaN detected in X_tr after OOF target encoding"
    tr_with_y = pd.concat([X_tr_raw, y_tr_raw], axis=1)

    for col in interaction_features :
       
        
        te_col=f'{col}_mean'
        tmp = target_encoder(tr_with_y, X_vl[[col]].copy(), col, 'diagnosed_diabetes')
        X_vl[f'{col}_mean'] = tmp[f'{col}_mean'].values

        tmp = target_encoder(tr_with_y, X_ts[[col]].copy(), col, 'diagnosed_diabetes')
        X_ts[f'{col}_mean'] = tmp[f'{col}_mean'].values

    X_tr.drop(interaction_features ,axis = 1 , inplace =True)
    X_vl.drop(interaction_features ,axis = 1 , inplace =True)
    X_ts.drop(interaction_features ,axis = 1 , inplace =True)
    print('=================fitting=========model')
    xgb_int = xgb.XGBClassifier(
            n_estimators = 6000,
            max_depth=5,
            min_child_weight=2,
            gamma=0,
            reg_alpha=5,
            reg_lambda=10,
            learning_rate=0.02,
            subsample=0.9,
            colsample_bytree=0.5,
            grow_policy='lossguide',
            booster='gbtree',
            tree_method='gpu_hist',      
            predictor='gpu_predictor',
            enable_categorical=True,     
            objective='binary:logistic',
            eval_metric='auc',
            random_state=42,
            early_stopping_rounds=400
    
        )
    xgb_int.fit(X_tr,y_tr_raw , eval_set = [(X_vl,y_vl)],verbose=False  )
    y_vl_pred = xgb_int.predict_proba(X_vl)[:,1]
    pred_xgb += xgb_int.predict_proba(X_ts)[:,1]
    oof_xgb[vl_idx] = xgb_int.predict_proba(X_vl)[:,1]
    auc = roc_auc_score(y_vl,y_vl_pred)
    print('auc ============='+str(fold) + '==========' +str(auc))
    del  X_tr_raw ,X_vl_raw 
pred_xgb /= k
score_final_xgb = roc_auc_score(y , oof_xgb)


fold ---------------- 0
fold ---------------- 1
fold ---------------- 2
fold ---------------- 3
fold ---------------- 4


In [39]:
X_tr.dtypes[:50]

age                                                                   int64
alcohol_consumption_per_week                                          int64
physical_activity_minutes_per_week                                    int64
diet_score                                                          float64
sleep_hours_per_day                                                 float64
screen_time_hours_per_day                                           float64
bmi                                                                 float64
waist_to_hip_ratio                                                  float64
systolic_bp                                                           int64
diastolic_bp                                                          int64
heart_rate                                                            int64
cholesterol_total                                                     int64
hdl_cholesterol                                                       int64
ldl_choleste

# LightGBM Model

In [53]:

# LightGBM model trained on interaction-enhanced dataset.
# Optimized with Optuna.

pred_lightgbm = np.zeros(len(test))
oof_lightgbm = np.zeros(len(X))
k = 5
startK = StratifiedKFold(n_splits=k , shuffle = True , random_state=88)
for fold,(tr_idx , vl_idx)  in enumerate(startK.split(X,y)):
    print('fold ---------------- '+str(fold))
    X_tr_raw = X.iloc[tr_idx].reset_index(drop=True)
    y_tr_raw=y.iloc[tr_idx].reset_index(drop=True)

    X_vl_raw  = X.iloc[vl_idx].reset_index(drop=True)
    y_vl=y.iloc[vl_idx].reset_index(drop=True)
    X_tr , X_vl  = X_tr_raw.copy(), X_vl_raw.copy()
    X_ts = test.copy(deep=True)
    for col in cat_features:
        X_tr[col] = X_tr[col].astype("category")
        X_vl[col] = X_vl[col].astype("category")
        X_ts[col] = X_ts[col].astype("category")

    for col in interaction_features:
        X_tr[f'{col}_mean'] = np.nan
        X_vl[f'{col}_mean'] = np.nan
        X_ts[f'{col}_mean'] = np.nan


    inner_startkfold = KFold(n_splits = k , shuffle=True , random_state=88)
    

    for _ ,(tr_idx_in,vl_idx_in) in enumerate(inner_startkfold.split(X_tr_raw)):
        in_tr = pd.concat([X_tr_raw.iloc[tr_idx_in],y_tr_raw.iloc[tr_idx_in]],axis=1)
        in_vl = X_tr_raw.iloc[vl_idx_in]
        for col in interaction_features :
           

            
            te_temp = target_encoder(in_tr, in_vl.copy(),col,'diagnosed_diabetes')
            te_col = f'{col}_mean'
            
            X_tr.loc[vl_idx_in,te_col] = te_temp[te_col].values
    assert not X_tr[[f'{c}_mean' for c in interaction_features]].isnull().any().any(), \
    "NaN detected in X_tr after OOF target encoding"
    tr_with_y = pd.concat([X_tr_raw, y_tr_raw], axis=1)

    for col in interaction_features :
       
        
        te_col=f'{col}_mean'
        tmp = target_encoder(tr_with_y, X_vl[[col]].copy(), col, 'diagnosed_diabetes')
        X_vl[f'{col}_mean'] = tmp[f'{col}_mean'].values

        tmp = target_encoder(tr_with_y, X_ts[[col]].copy(), col, 'diagnosed_diabetes')
        X_ts[f'{col}_mean'] = tmp[f'{col}_mean'].values

    X_tr.drop(interaction_features ,axis = 1 , inplace =True)
    X_vl.drop(interaction_features ,axis = 1 , inplace =True)
    X_ts.drop(interaction_features ,axis = 1 , inplace =True)
    print('creation model ===============')
    model_light = LGBMClassifier( learning_rate = 0.005 , num_leaves =  64 , max_depth =  6 , min_child_samples= 200 , min_gain_to_split = 1.43 ,
                       colsample_bytree = 0.447 , reg_alpha = 1.5829 , reg_lambda = 65.66 , max_bin=98 ,
                       objective = 'binary', n_estimators = 10000  ,n_jobs = -1,
                        random_state = 42 ,    verbosity= -1)
    print('=================fitting=========model')
    model_light.fit(X_tr,y_tr_raw , eval_set = [(X_vl,y_vl)] , callbacks=[
        early_stopping(stopping_rounds=400),
        log_evaluation(period=0)  
    ] , eval_metric="auc",)
    y_vl_pred = model_light.predict_proba(X_vl)[:,1]
    pred_lightgbm += model_light.predict_proba(X_ts)[:,1]
    oof_lightgbm[vl_idx] = model_light.predict_proba(X_vl)[:,1]
    auc = roc_auc_score(y_vl,y_vl_pred)
    print('auc ============='+str(fold) + '==========' +str(auc))
    del  X_tr_raw , y_tr_raw,X_vl_raw 
    # y_vl ,X_tr , X_vl ,X_ts
pred_lightgbm /= k
score_final = roc_auc_score(y , oof_lightgbm)
X_tr.head(3)


fold ---------------- 0
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[6334]	valid_0's auc: 0.728344	valid_0's binary_logloss: 0.581127
fold ---------------- 1
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[4877]	valid_0's auc: 0.730891	valid_0's binary_logloss: 0.579493
fold ---------------- 2
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[4778]	valid_0's auc: 0.731956	valid_0's binary_logloss: 0.579095
fold ---------------- 3
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[6672]	valid_0's auc: 0.730095	valid_0's binary_logloss: 0.580098
fold ---------------- 4
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[5840]	valid_0's auc: 0.728864	valid_0's binary_logloss: 0.581312


Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,...,family_history_diabetes__ldl_cholesterol__triglycerides_mean,family_history_diabetes__diet_score__triglycerides_mean,age__physical_activity_minutes_per_week__waist_to_hip_ratio_mean,age__physical_activity_minutes_per_week__triglycerides_mean,age__hdl_cholesterol__triglycerides_mean,age__ldl_cholesterol__triglycerides_mean,bmi__physical_activity_minutes_per_week__triglycerides_mean,bmi__diet_score__triglycerides_mean,waist_to_hip_ratio__hdl_cholesterol__triglycerides_mean,family_history_diabetes__age__triglycerides__hdl_cholesterol_mean
0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,70,...,0.710145,0.623377,0.623917,0.0,0.666667,0.623917,0.623917,0.623917,1.0,0.5
1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,77,...,0.645833,0.555556,0.5,1.0,0.5,1.0,0.0,1.0,0.818182,0.5
2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,89,...,0.619469,0.392857,0.5,0.622795,0.0,0.622795,0.622795,1.0,0.5,0.0


In [54]:

# Check correlation between base models.
# High correlation -> diminishing returns for stacking.


corr = np.corrcoef(oof_cat, oof_xgb)[0,1]
print("OOF correlation:", corr)


OOF correlation: 0.9732682150389157


# Stacking (Final Ensemble)

In [55]:

from sklearn.linear_model import LogisticRegressionCV


# Final stacking using Logistic Regression.
# Optimized directly for ROC-AUC.


dataset_stack = np.column_stack((oof_cat, oof_lightgbm, oof_xgb))

stacker = LogisticRegressionCV(cv=5, scoring="roc_auc")
stacker.fit(dataset_stack, y)

final_oof = stacker.predict_proba(dataset_stack)[:,1]
print("Stacked OOF AUC:", roc_auc_score(y, final_oof))


Stacked OOF AUC: 0.730941176121932


# Submission Generation

In [56]:
test_stack = np.column_stack((pred_cat, pred_lightgbm, pred_xgb))
final_pred = stacker.predict_proba(test_stack)[:,1]

submission = pd.DataFrame({
    "id": test.index,
    "diagnosed_diabetes": final_pred
})

submission.to_csv("submission.csv", index=False)
