In [1]:
# Install required packages (run this in Kaggle notebook with internet enabled)
!pip install -q xgboost lightgbm catboost

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder # Ensure you have category_encoders installed
import warnings

# Silence every library warning for the rest of the session.
warnings.filterwarnings('ignore')

# --- Configuration ---
SEED = 42                      # base random seed
N_SPLITS = 10                  # number of cross-validation folds
TARGET = 'diagnosed_diabetes'  # name of the label column

# --- Data loading ---
# The training frame's row identifier is discarded straight after reading.
train = pd.read_csv('/kaggle/input/playground-series-s5e12/train.csv').drop(columns=['id'])

# The test identifiers are set aside (for the submission file) before the
# column is removed from the feature frame.
test = pd.read_csv('/kaggle/input/playground-series-s5e12/test.csv')
test_ids = test['id']
test = test.drop(columns=['id'])

# --- 1. Robust Mapping ---
def clean_and_map(df):
    """Return a copy of ``df`` with the fixed categorical columns made numeric.

    * ``smoking_status`` -> Never=0, Former=1, Current=2
    * ``gender``         -> Female=0, Male=1
    * the three ``*_history`` flag columns are cast to ``int``

    Any smoking/gender value that is missing or not in the tables above is
    replaced by 0, i.e. it is coded the same as 'Never' / 'Female'.
    The input frame is not modified. The integer cast fails if a history
    column contains missing values.
    """
    # Ordinal codes per column; the loop below applies them in this order.
    ordinal_codes = {
        'smoking_status': {'Never': 0, 'Former': 1, 'Current': 2},
        'gender': {'Female': 0, 'Male': 1},
    }
    flag_columns = ('family_history_diabetes', 'hypertension_history', 'cardiovascular_history')

    out = df.copy()
    for column, codes in ordinal_codes.items():
        # map() leaves NaN for unknown labels; those fall back to code 0.
        out[column] = out[column].map(codes).fillna(0)

    # Cast all history flags to int in one call so they are treated as numeric.
    return out.astype({column: int for column in flag_columns})

# Run the same categorical clean-up on both splits so they share one coding.
train, test = clean_and_map(train), clean_and_map(test)

# --- 2. Metabolic Feature Engineering ---
def engineer_metabolic_features(df):
    """Return a copy of ``df`` with seven derived columns appended.

    Added columns, in order:

    * ``MAP``                      - (systolic + 2 * diastolic) / 3
    * ``Pulse_Pressure``           - systolic - diastolic
    * ``Non_HDL``                  - total cholesterol - HDL
    * ``Total_HDL_Ratio``          - total cholesterol / (HDL + 1e-5)
    * ``TG_HDL_Ratio``             - triglycerides / (HDL + 1e-5)
    * ``Metabolic_Syndrome_Index`` - BMI * waist-to-hip ratio
    * ``Active_Balance``           - daily activity hours - daily screen hours

    The input frame is not modified.
    """
    systolic, diastolic = df['systolic_bp'], df['diastolic_bp']
    total_chol, hdl = df['cholesterol_total'], df['hdl_cholesterol']

    # A tiny offset keeps the ratio denominators away from an exact zero.
    hdl_denominator = hdl + 1e-5

    # Weekly activity minutes -> hours per day, the same unit as screen time.
    activity_hours_per_day = (df['physical_activity_minutes_per_week'] / 7) / 60

    return df.assign(
        # Blood-pressure summaries
        MAP=(systolic + 2 * diastolic) / 3,
        Pulse_Pressure=systolic - diastolic,
        # Lipid interactions
        Non_HDL=total_chol - hdl,
        Total_HDL_Ratio=total_chol / hdl_denominator,
        TG_HDL_Ratio=df['triglycerides'] / hdl_denominator,
        # Body-composition interaction
        Metabolic_Syndrome_Index=df['bmi'] * df['waist_to_hip_ratio'],
        # Lifestyle balance: active time minus sedentary screen time
        Active_Balance=activity_hours_per_day - df['screen_time_hours_per_day'],
    )

# Derive the engineered columns on both splits with the same function.
train, test = (engineer_metabolic_features(frame) for frame in (train, test))

# Separate the label from the predictors; the test frame is used as-is.
X = train.drop(columns=[TARGET])
y = train[TARGET]
X_test = test

# --- 3. Advanced Encoding Logic ---
# Every column that still has object dtype is treated as a nominal feature.
# These columns are target-encoded inside the cross-validation loop rather
# than here, so each encoder is fitted on that fold's training rows only.
nominal_cols = list(X.select_dtypes(include=['object']).columns)

# --- 4. CV Strategy with Target Encoding ---
# Shuffled, label-stratified folds with a fixed seed.
skf = StratifiedKFold(shuffle=True, n_splits=N_SPLITS, random_state=SEED)

# Accumulators: one out-of-fold prediction per training row and one
# fold-averaged prediction per test row.
oof_preds = np.zeros(X.shape[0])
test_preds = np.zeros(X_test.shape[0])

# Hyperparameters shared by every fold; only the seed differs per fold.
XGB_PARAMS = dict(
    n_estimators=3500,
    learning_rate=0.008,        # very small step size, paired with many trees
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.4,
    reg_lambda=2.0,
    tree_method='hist',
    early_stopping_rounds=200,  # monitored on the eval_set passed to fit()
)
CB_PARAMS = dict(
    iterations=3500,
    learning_rate=0.01,
    depth=5,
    l2_leaf_reg=5,
    auto_class_weights='Balanced',
    verbose=False,
    early_stopping_rounds=200,
)

# Blend weights applied to each model's positive-class probability.
W_XGB, W_CB = 0.6, 0.4

print("Starting Cross-Validation with Metabolic Features...")

# NOTE(review): the validation fold drives early stopping and is also the fold
# that is scored, so the reported AUCs may be slightly optimistic.
for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
    X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Fit the target encoder on this fold's training rows only, then apply it
    # to the validation rows and to the full test set.
    te = TargetEncoder(cols=nominal_cols)
    X_trn_encoded = te.fit_transform(X_trn, y_trn)
    X_val_encoded = te.transform(X_val)
    X_test_fold = te.transform(X_test)

    # --- XGBoost: trained on the target-encoded frame ---
    model_xgb = xgb.XGBClassifier(random_state=SEED + fold, **XGB_PARAMS)
    model_xgb.fit(X_trn_encoded, y_trn, eval_set=[(X_val_encoded, y_val)], verbose=False)

    # --- CatBoost: trained on the un-encoded frame, with the nominal columns
    # declared through cat_features ---
    model_cb = cb.CatBoostClassifier(random_seed=SEED + fold, **CB_PARAMS)
    model_cb.fit(X_trn, y_trn, cat_features=nominal_cols, eval_set=(X_val, y_val))

    # Out-of-fold blend for the validation rows of this fold.
    fold_p_xgb = model_xgb.predict_proba(X_val_encoded)[:, 1]
    fold_p_cb = model_cb.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = (W_XGB * fold_p_xgb) + (W_CB * fold_p_cb)

    # Test-set blend; each fold contributes 1/N_SPLITS of the final average.
    test_p_xgb = model_xgb.predict_proba(X_test_fold)[:, 1]
    test_p_cb = model_cb.predict_proba(X_test)[:, 1]
    test_preds += ((W_XGB * test_p_xgb) + (W_CB * test_p_cb)) / N_SPLITS

    print(f"Fold {fold+1} AUC: {roc_auc_score(y_val, oof_preds[val_idx]):.5f}")

print(f"\nOverall OOF AUC: {roc_auc_score(y, oof_preds):.5f}")

# Submit: pair the saved test ids with the fold-averaged blended probabilities.
submission = pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': test_preds})
submission.to_csv('submission.csv', index=False)
print("Submission file optimized for metabolism metrics saved.")

Starting Cross-Validation with Metabolic Features...
Fold 1 AUC: 0.72402
Fold 2 AUC: 0.72532
Fold 3 AUC: 0.72272
Fold 4 AUC: 0.72219
Fold 5 AUC: 0.72369
Fold 6 AUC: 0.72331
Fold 7 AUC: 0.72235
Fold 8 AUC: 0.72681
Fold 9 AUC: 0.72616
Fold 10 AUC: 0.72266

Overall OOF AUC: 0.72392
Submission file optimized for metabolism metrics saved.
