In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from catboost import CatBoostClassifier
import lightgbm as lgb
import warnings

warnings.filterwarnings("ignore")

# -------------------- Load Data --------------------
# Read the raw CSV (latin1-encoded) and discard identifier-like columns.
# NOTE(review): the test is a plain substring match on the lowercased name,
# so any column whose name merely contains "id" (e.g. "Width") is dropped too.
df = pd.read_csv("D:\\jilo_new.csv", encoding='latin1')
id_like_columns = [name for name in df.columns if 'id' in name.lower()]
df = df.drop(columns=id_like_columns)

# -------------------- Gender Column Processing --------------------
# Encode the first column whose name contains "gender" as 1 (male / m) or
# 0 (female / f) after lowercasing and stripping; any other value becomes NaN.
gender_col = [name for name in df.columns if 'gender' in name.lower()]
if gender_col:
    gender_col = gender_col[0]
    gender_codes = {'male': 1, 'm': 1, 'female': 0, 'f': 0}
    normalized_gender = df[gender_col].astype(str).str.lower().str.strip()
    df[gender_col] = normalized_gender.apply(
        lambda value: gender_codes.get(value, np.nan)
    )

# -------------------- Target Cleaning --------------------
# Keep only rows where every risk target holds a usable label: first drop
# missing values, then drop rows whose label (case-insensitively) is one of
# the placeholder strings below.
target_cols = ['Overall Risk', 'Cardiac Risk', 'Diabetic Risk', 'Hypertension Risk']
placeholder_labels = ['unknown', 'not assessed', 'na', '']

df = df.dropna(subset=target_cols)
for col in target_cols:
    lowered_labels = df[col].astype(str).str.lower()
    is_placeholder = lowered_labels.isin(placeholder_labels)
    df = df[~is_placeholder]

# -------------------- Fill Missing with Mode --------------------
# Impute every column that still contains gaps with its most frequent value.
# NOTE(review): the mode is computed on the whole frame, i.e. before the
# train/test split performed later, so test rows influence the imputed value.
empty_cols = []
for col in df.columns:
    if not df[col].isnull().any():
        continue
    modes = df[col].mode(dropna=True)
    if modes.empty:
        # The column has no observed value at all, so there is no mode and
        # indexing the empty result would raise. Such a column carries no
        # information; remember it and drop it after the loop.
        empty_cols.append(col)
        continue
    df[col] = df[col].fillna(modes.iloc[0])

if empty_cols:
    print(f"Dropping columns with no observed values: {empty_cols}")
    df = df.drop(columns=empty_cols)

# -------------------- Preprocessing --------------------
# Features are everything except the targets. Numeric columns are
# standardised; object-dtype columns are one-hot encoded (categories unseen
# during fit are ignored at transform time). Columns of any other dtype are
# not listed in either transformer.
# NOTE(review): the recorded output lists features such as
# "cat__WaistHeightRatio_0.533333333", which suggests WaistHeightRatio is read
# as object dtype and one-hot encoded - confirm whether it should be numeric.
X_raw = df.drop(columns=target_cols)
numeric_cols = list(X_raw.select_dtypes(include=np.number).columns)
categorical_cols = list(X_raw.select_dtypes(include='object').columns)

scaling_step = ('num', StandardScaler(), numeric_cols)
encoding_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    categorical_cols,
)
preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

# -------------------- Models --------------------
# Candidate classifiers, keyed by the display name used in the result tables.
# The train/test split is seeded, so every estimator with a stochastic
# component gets the same fixed seed as well; otherwise the reported metrics
# and feature rankings change from run to run.
RANDOM_STATE = 42

base_models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Random Forest": RandomForestClassifier(
        n_estimators=150, max_depth=15, class_weight='balanced', random_state=RANDOM_STATE
    ),
    "XGBoost": xgb.XGBClassifier(
        use_label_encoder=False, eval_metric='mlogloss', max_depth=6, random_state=RANDOM_STATE
    ),
    # probability=True is required so that predict_proba (used for ROC AUC) exists.
    "SVC": SVC(probability=True, kernel='rbf', class_weight='balanced', random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=150, learning_rate=0.1, random_state=RANDOM_STATE
    ),
    "HistGradientBoosting": HistGradientBoostingClassifier(max_iter=150, random_state=RANDOM_STATE),
    "CatBoost": CatBoostClassifier(verbose=0, random_seed=RANDOM_STATE),
    "LightGBM": lgb.LGBMClassifier(random_state=RANDOM_STATE),
}

# -------------------- Evaluation + Feature Importance --------------------
# For every risk target: encode the labels as integer category codes, make a
# stratified 80/20 split, fit the preprocessor on the training part only, then
# train each base model wrapped in OneVsRestClassifier and store accuracy,
# macro precision/recall/F1 and ROC AUC in results[target][model_name].
results = {}

for target in target_cols:
    print(f"\n=== Evaluating: {target} ===")
    y = df[target].astype('category').cat.codes
    X_train, X_test, y_train, y_test = train_test_split(
        X_raw, y, test_size=0.2, stratify=y, random_state=42
    )

    # Fit on the training rows only so the test rows do not leak into the
    # scaler statistics or the one-hot vocabulary.
    X_train_prep = preprocessor.fit_transform(X_train)
    X_test_prep = preprocessor.transform(X_test)
    feature_names = preprocessor.get_feature_names_out()

    results[target] = {}

    for name, model_base in base_models.items():
        model = OneVsRestClassifier(model_base)
        model.fit(X_train_prep, y_train)
        y_pred = model.predict(X_test_prep)

        try:
            y_proba = model.predict_proba(X_test_prep)
            if y_proba.shape[1] == 2:
                # Two-class target: roc_auc_score expects the 1-D score of
                # the positive class. Passing the (n, 2) matrix raises, which
                # used to be swallowed and reported as NaN.
                auc = roc_auc_score(y_test, y_proba[:, 1])
            else:
                # Passing the label set keeps the score defined even when a
                # class is absent from the test fold.
                auc = roc_auc_score(
                    y_test, y_proba, multi_class='ovr', labels=model.classes_
                )
        except (AttributeError, ValueError) as err:
            # Best effort: AUC is optional, but say why it is missing.
            print(f"ROC AUC unavailable for {name} ({target}): {err}")
            auc = np.nan

        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
        rec = recall_score(y_test, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

        results[target][name] = {
            "Accuracy": acc,
            "Precision": prec,
            "Recall": rec,
            "F1-Score": f1,
            "ROC AUC": auc
        }

        # ----------- EXPLANATORY PART: Top Features --------------
        # Only the first per-class binary estimator is inspected, so the
        # ranking describes a single one-vs-rest sub-problem.
        # The former XGBoost / LightGBM special cases were removed: they sat
        # in elif branches that only run when feature_importances_ is absent,
        # so they could never execute for models that expose the attribute.
        estimator = model.estimators_[0]
        importances = getattr(estimator, "feature_importances_", None)

        if importances is not None:
            top_indices = np.argsort(importances)[-10:]
            print(f"\n>> Top Features for {name} ({target})")
            for i in reversed(top_indices):
                print(f"{feature_names[i]}: {importances[i]:.4f}")

# -------------------- View Results --------------------
# One table per target: models as rows, metrics as columns, best macro F1
# first, values rounded to three decimals.
for target in target_cols:
    print(f"\n=== {target} Model Comparison ===")
    metrics_by_model = pd.DataFrame(results[target])
    df_result = metrics_by_model.transpose()
    ranked = df_result.sort_values("F1-Score", ascending=False)
    print(ranked.round(3))



=== Evaluating: Overall Risk ===

>> Top Features for Random Forest (Overall Risk)
num__Diastolic BP (mmHg): 0.1528
num__Systolic BP (mmHg): 0.1334
num__BMI: 0.0959
num__Weight (kg): 0.0914
num__Blood Glucose (mg/dL): 0.0747
num__Heart Rate (bpm): 0.0561
num__Height (cm): 0.0334
num__Age: 0.0329
num__Temperature (°C): 0.0278
num__Waist: 0.0210

>> Top Features for XGBoost (Overall Risk)
num__Height (cm): 0.1403
num__Diastolic BP (mmHg): 0.1192
num__Alcohol: 0.1101
num__Systolic BP (mmHg): 0.1047
num__Weight (kg): 0.0808
num__Diet: 0.0757
num__Stress Level: 0.0390
num__Blood Glucose (mg/dL): 0.0369
num__Waist: 0.0367
num__CVD_Family: 0.0308

>> Top Features for Gradient Boosting (Overall Risk)
num__Diastolic BP (mmHg): 0.3481
num__Systolic BP (mmHg): 0.2213
num__Blood Glucose (mg/dL): 0.1012
num__Weight (kg): 0.0583
num__Waist: 0.0373
cat__WaistHeightRatio_0.533333333: 0.0337
num__BMI: 0.0318
cat__WaistHeightRatio_0.511627907: 0.0222
num__Age: 0.0197
num__Height (cm): 0.0189

>> Top Fe