In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import warnings
import mlflow
import os

# Report which tracking backend MLflow is currently pointed at
print(mlflow.get_tracking_uri())

# Optional: point MLflow at a specific tracking server (local or remote)
# mlflow.set_tracking_uri("http://localhost:5000")

# Turn on autologging for the two boosters MLflow supports here;
# model artifacts are not stored (log_models=False) and autolog chatter is muted.
mlflow.xgboost.autolog(log_models=False, silent=True)
mlflow.lightgbm.autolog(log_models=False, silent=True)

# Silence every Python warning for the rest of the session
warnings.simplefilter('ignore')

# Load data - ADJUST PATHS TO YOUR LOCAL FILES
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'diabetes/train.csv',
        'diabetes/test.csv',
        'diabetes/sample_submission.csv',
    )
)

print(f"Train shape: {train.shape}", f"Test shape: {test.shape}", sep="\n")

In [None]:

# Locate the label: the first column that exists in train, is not the row id,
# and is missing from test.
train_only_cols = [name for name in train.columns
                   if name != 'id' and name not in test.columns]
target_col = train_only_cols[0]
print(f"Target column: {target_col}")

# Split the training frame into label and predictors; strip the id from test too
y = train[target_col]
X = train.drop(columns=['id', target_col])
X_test = test.drop(columns=['id'])

print(f"\nFeatures shape: {X.shape}")
print(f"Target distribution:\n{y.value_counts()}")

In [None]:

# Feature Engineering
def create_features(df):
    """Return a copy of ``df`` with engineered clinical and lifestyle features.

    The input frame is not modified. The following columns are read and must
    be present: ``bmi``, ``systolic_bp``, ``diastolic_bp``,
    ``cholesterol_total``, ``hdl_cholesterol``, ``ldl_cholesterol``,
    ``triglycerides``, ``physical_activity_minutes_per_week``,
    ``sleep_hours_per_day``, ``screen_time_hours_per_day``,
    ``alcohol_consumption_per_week``, ``age``, ``diet_score``,
    ``waist_to_hip_ratio``, ``heart_rate`` and ``family_history_diabetes``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature frame (train or test).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the derived columns appended.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    df = df.copy()

    # BMI bands are left-closed ([30, 35) is band 3) so that the band boundary
    # agrees with the `bmi >= 30` test used by metabolic_score below.
    # Open-ended outer edges keep extreme values in the first/last band instead
    # of producing NaN, and `.cat.codes` maps a missing BMI to -1 rather than
    # raising on the integer cast.
    bmi_bands = pd.cut(
        df['bmi'],
        bins=[-np.inf, 18.5, 25, 30, 35, np.inf],
        labels=[0, 1, 2, 3, 4],
        right=False,
    )
    df['bmi_category'] = bmi_bands.cat.codes.astype(int)

    # Blood pressure summaries
    df['bp_mean'] = (df['systolic_bp'] + df['diastolic_bp']) / 2
    df['bp_pulse_pressure'] = df['systolic_bp'] - df['diastolic_bp']
    df['hypertension'] = ((df['systolic_bp'] >= 140) | (df['diastolic_bp'] >= 90)).astype(int)

    # Lipid ratios; the +1 in the denominator guards against division by zero
    df['cholesterol_hdl_ratio'] = df['cholesterol_total'] / (df['hdl_cholesterol'] + 1)
    df['ldl_hdl_ratio'] = df['ldl_cholesterol'] / (df['hdl_cholesterol'] + 1)
    df['triglycerides_hdl_ratio'] = df['triglycerides'] / (df['hdl_cholesterol'] + 1)
    df['non_hdl_cholesterol'] = df['cholesterol_total'] - df['hdl_cholesterol']

    # Count of threshold flags tripped (0-4)
    df['metabolic_score'] = (
        (df['bmi'] >= 30).astype(int) +
        (df['systolic_bp'] >= 130).astype(int) +
        (df['triglycerides'] >= 150).astype(int) +
        (df['hdl_cholesterol'] < 40).astype(int)
    )

    # Binary lifestyle flags
    df['activity_low'] = (df['physical_activity_minutes_per_week'] < 150).astype(int)
    df['sleep_poor'] = (df['sleep_hours_per_day'] < 7).astype(int)
    df['screen_high'] = (df['screen_time_hours_per_day'] > 4).astype(int)
    df['alcohol_high'] = (df['alcohol_consumption_per_week'] > 7).astype(int)

    # Age interactions
    df['age_bmi'] = df['age'] * df['bmi']
    df['age_systolic'] = df['age'] * df['systolic_bp']
    df['age_cholesterol'] = df['age'] * df['cholesterol_total']

    # Activity rescalings; sedentary_score divides screen hours by (active hours + 1)
    df['activity_per_day'] = df['physical_activity_minutes_per_week'] / 7
    df['sedentary_score'] = df['screen_time_hours_per_day'] / (df['physical_activity_minutes_per_week'] / 60 + 1)

    # Weighted sum of diet score and the "good habit" counterparts of the flags above
    df['health_score'] = (
        df['diet_score'] * 2 +
        (df['sleep_hours_per_day'] >= 7).astype(int) * 2 +
        (df['physical_activity_minutes_per_week'] >= 150).astype(int) * 3 +
        (df['alcohol_consumption_per_week'] <= 7).astype(int)
    )

    # Plain copy of waist_to_hip_ratio under a second name
    df['whr_value'] = df['waist_to_hip_ratio']
    df['heart_risk_score'] = df['heart_rate'] / 100

    # Family-history interactions
    df['family_and_bmi'] = df['family_history_diabetes'] * df['bmi']
    df['family_and_age'] = df['family_history_diabetes'] * df['age']
    return df

In [None]:

print("\nCreating features...")
X = create_features(X)
X_test = create_features(X_test)

# Encode categorical variables
categorical_cols = ['gender', 'ethnicity', 'education_level', 'income_level',
                    'smoking_status', 'employment_status']

le_dict = {}
for col in categorical_cols:
    le = LabelEncoder()
    # Fit on the labels of train AND test: a category that appears only in the
    # test set would otherwise make transform() raise. When test holds no unseen
    # labels the learned classes (sorted unique strings) are identical to
    # fitting on X alone, so the integer codes do not change.
    le.fit(pd.concat([X[col], X_test[col]], ignore_index=True).astype(str))
    X[col] = le.transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))
    le_dict[col] = le

# Integer codes of the labels the post-encoding features compare against.
# Gender lookups are strict: transform() raises ValueError if 'Male'/'Female'
# is not among the encoder's classes.
male_code = le_dict['gender'].transform(['Male'])[0]
female_code = le_dict['gender'].transform(['Female'])[0]
# Smoking lookups fall back to -1 (a code no row can have) when the label is absent,
# in which case the corresponding term simply contributes 0.
smoking_current = le_dict['smoking_status'].transform(['current'])[0] if 'current' in le_dict['smoking_status'].classes_ else -1
smoking_never = le_dict['smoking_status'].transform(['never'])[0] if 'never' in le_dict['smoking_status'].classes_ else -1


def _add_post_encoding_features(frame):
    """Return a copy of ``frame`` with the features that need encoded categoricals.

    Adds ``whr_high_risk``, ``cardiovascular_risk`` and ``lifestyle_risk`` and
    adds a 2-point bonus to ``health_score`` for rows whose smoking status is
    'never'. Used for both train and test so the two frames cannot drift apart.
    """
    frame = frame.copy()

    # Sex-specific waist-to-hip cut-offs: > 0.90 for 'Male', > 0.85 for 'Female'
    male_high = (frame['gender'] == male_code) & (frame['whr_value'] > 0.90)
    female_high = (frame['gender'] == female_code) & (frame['whr_value'] > 0.85)
    frame['whr_high_risk'] = (male_high | female_high).astype(int)

    frame['cardiovascular_risk'] = (
        frame['cardiovascular_history'] * 3 +
        frame['hypertension_history'] * 2 +
        (frame['systolic_bp'] >= 140).astype(int) +
        (frame['diastolic_bp'] >= 90).astype(int)
    )

    frame['lifestyle_risk'] = (
        frame['alcohol_high'] +
        frame['activity_low'] +
        frame['sleep_poor'] +
        frame['screen_high'] +
        (frame['smoking_status'] == smoking_current).astype(int)
    )

    frame['health_score'] = (
        frame['health_score'] +
        (frame['smoking_status'] == smoking_never).astype(int) * 2
    )
    return frame


# Post-encoding features
X = _add_post_encoding_features(X)
X_test = _add_post_encoding_features(X_test)

print(f"Final feature shape: {X.shape}")

In [None]:

# Cross-validation setup
N_FOLDS = 10
SEED = 42  # single seed shared by the splitter and all three boosters
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# Model parameters
lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'learning_rate': 0.02,
    'num_leaves': 31,
    'max_depth': 8,
    'min_child_samples': 20,
    'subsample': 0.8,
    # LightGBM ignores `subsample` (bagging_fraction) unless the bagging
    # frequency is non-zero; without this line the 0.8 above had no effect.
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'random_state': SEED,
    'n_jobs': -1,
    'verbose': -1
}

xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'learning_rate': 0.02,
    'max_depth': 7,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'reg_alpha': 0.1,
    'reg_lambda': 1,
    'random_state': SEED,
    'tree_method': 'hist'  # Uses CPU; 'gpu_hist' if NVIDIA GPU available
}

cat_params = {
    'iterations': 3000,
    'learning_rate': 0.02,
    'depth': 8,
    'l2_leaf_reg': 3,
    'random_seed': SEED,
    'verbose': 0,
    'task_type': 'CPU',  # Change to 'GPU' if NVIDIA GPU and CatBoost GPU support enabled
    'eval_metric': 'AUC'
}

In [None]:

# Training with cross-validation inside MLflow run
#
# For each stratified fold, LightGBM, XGBoost and CatBoost are trained with
# early stopping on the held-out part. Out-of-fold (OOF) predictions give each
# model's CV AUC; test predictions are averaged over folds. The final blend
# weights each model by its CV AUC (normalised to sum to 1).
#
# NOTE(review): LightGBM/XGBoost autologging is enabled earlier in the notebook,
# so presumably every train() call below also logs into this single run -
# confirm that per-fold params/metrics do not collide in the tracking UI.
MAX_BOOST_ROUNDS = 3000       # upper bound on trees for LightGBM / XGBoost
EARLY_STOPPING_ROUNDS = 100   # patience on the validation AUC
LOG_EVERY = 500               # evaluation print interval

with mlflow.start_run(run_name="Diabetes_Ensemble_GBDT"):
    lgb_oof = np.zeros(len(X))
    xgb_oof = np.zeros(len(X))
    cat_oof = np.zeros(len(X))

    lgb_preds = np.zeros(len(X_test))
    xgb_preds = np.zeros(len(X_test))
    cat_preds = np.zeros(len(X_test))

    # The test matrix is the same for every fold, so build it once
    xgb_test = xgb.DMatrix(X_test)

    print("\n" + "=" * 60)
    print("Starting Cross-Validation Training")
    print("=" * 60)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"\n{'=' * 60}")
        print(f"Fold {fold + 1}/{N_FOLDS}")
        print(f"{'=' * 60}")

        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # LightGBM
        print("Training LightGBM...")
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

        lgb_model = lgb.train(
            lgb_params,
            lgb_train,
            num_boost_round=MAX_BOOST_ROUNDS,
            valid_sets=[lgb_val],
            callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS), lgb.log_evaluation(LOG_EVERY)]
        )

        lgb_oof[val_idx] = lgb_model.predict(X_val)
        lgb_preds += lgb_model.predict(X_test) / N_FOLDS

        # XGBoost
        print("\nTraining XGBoost...")
        xgb_train = xgb.DMatrix(X_train, label=y_train)
        xgb_val = xgb.DMatrix(X_val, label=y_val)

        xgb_model = xgb.train(
            xgb_params,
            xgb_train,
            num_boost_round=MAX_BOOST_ROUNDS,
            evals=[(xgb_val, 'eval')],
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            verbose_eval=LOG_EVERY
        )

        # xgb.train returns the booster from the LAST round, and the native
        # Booster.predict uses every tree by default - including the rounds
        # added after the best validation score. Restrict prediction to the
        # best iteration found by early stopping.
        best_rounds = (0, xgb_model.best_iteration + 1)
        xgb_oof[val_idx] = xgb_model.predict(xgb_val, iteration_range=best_rounds)
        xgb_preds += xgb_model.predict(xgb_test, iteration_range=best_rounds) / N_FOLDS

        # CatBoost
        print("\nTraining CatBoost...")
        cat_model = CatBoostClassifier(**cat_params)
        cat_model.fit(
            X_train, y_train,
            eval_set=(X_val, y_val),
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            verbose=LOG_EVERY
        )

        cat_oof[val_idx] = cat_model.predict_proba(X_val)[:, 1]
        cat_preds += cat_model.predict_proba(X_test)[:, 1] / N_FOLDS

    # Final scores
    lgb_cv_score = roc_auc_score(y, lgb_oof)
    xgb_cv_score = roc_auc_score(y, xgb_oof)
    cat_cv_score = roc_auc_score(y, cat_oof)

    mlflow.log_metrics({
        "lgb_cv_auc": lgb_cv_score,
        "xgb_cv_auc": xgb_cv_score,
        "cat_cv_auc": cat_cv_score
    })

    # Blend weights proportional to each model's CV AUC
    weights = np.array([lgb_cv_score, xgb_cv_score, cat_cv_score])
    weights = weights / weights.sum()

    final_preds = (weights[0] * lgb_preds +
                   weights[1] * xgb_preds +
                   weights[2] * cat_preds)

    # Score the same blend on the out-of-fold predictions
    ensemble_cv = roc_auc_score(y,
                                weights[0] * lgb_oof +
                                weights[1] * xgb_oof +
                                weights[2] * cat_oof)

    mlflow.log_metric("ensemble_cv_auc", ensemble_cv)
    print(f"\nðŸŽ¯ Ensemble CV AUC: {ensemble_cv:.6f}")

    # Save submission
    submission[target_col] = final_preds
    submission.to_csv('submission.csv', index=False)

    # Log submission as artifact
    mlflow.log_artifact("submission.csv")

    print("\n" + "=" * 60)
    print("âœ… SUBMISSION FILE CREATED & LOGGED TO MLFLOW!")
    print("=" * 60)

