In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from catboost import CatBoostClassifier
import lightgbm as lgb
import warnings

warnings.filterwarnings("ignore")

# -------------------- Load Data --------------------
df = pd.read_csv("D:\\jilo_new.csv", encoding='latin1')

# Drop ID columns
df.drop(columns=[col for col in df.columns if 'id' in col.lower()], inplace=True)

# -------------------- Gender Column Processing --------------------
gender_col = [col for col in df.columns if 'gender' in col.lower()]
if gender_col:
    gender_col = gender_col[0]
    df[gender_col] = df[gender_col].astype(str).str.lower().str.strip()
    df[gender_col] = df[gender_col].apply(lambda x: 1 if x in ['male', 'm'] else (0 if x in ['female', 'f'] else np.nan))

# -------------------- Target Cleaning --------------------
target_cols = ['Overall Risk', 'Cardiac Risk', 'Diabetic Risk', 'Hypertension Risk']
df = df.dropna(subset=target_cols)
for col in target_cols:
    df = df[~df[col].astype(str).str.lower().isin(['unknown', 'not assessed', 'na', ''])]

# -------------------- Fill Missing with Mode --------------------
for col in df.columns:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].mode(dropna=True)[0])

# -------------------- Preprocessing --------------------
X_raw = df.drop(columns=target_cols)
categorical_cols = X_raw.select_dtypes(include='object').columns.tolist()
numeric_cols = X_raw.select_dtypes(include=np.number).columns.tolist()

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
])

# -------------------- Models --------------------
base_models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Random Forest": RandomForestClassifier(n_estimators=150, max_depth=15, class_weight='balanced'),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', max_depth=6),
    "SVC": SVC(probability=True, kernel='rbf', class_weight='balanced'),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=150, learning_rate=0.1),
    "HistGradientBoosting": HistGradientBoostingClassifier(max_iter=150),
    "CatBoost": CatBoostClassifier(verbose=0),
    "LightGBM": lgb.LGBMClassifier()
}

# -------------------- Evaluation --------------------
results = {}

for target in target_cols:
    print(f"\n=== Evaluating: {target} ===")
    y = df[target].astype('category').cat.codes
    X_train, X_test, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, stratify=y, random_state=42)

    X_train_prep = preprocessor.fit_transform(X_train)
    X_test_prep = preprocessor.transform(X_test)

    results[target] = {}
    for name, model_base in base_models.items():
        model = OneVsRestClassifier(model_base)
        model.fit(X_train_prep, y_train)
        y_pred = model.predict(X_test_prep)

        try:
            y_proba = model.predict_proba(X_test_prep)
            auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
        except:
            auc = np.nan

        results[target][name] = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, average='macro', zero_division=0),
            "Recall": recall_score(y_test, y_pred, average='macro', zero_division=0),
            "F1-Score": f1_score(y_test, y_pred, average='macro', zero_division=0),
            "ROC AUC": auc
        }

# -------------------- View Results --------------------
for target in target_cols:
    print(f"\n=== {target} Model Comparison ===")
    df_result = pd.DataFrame(results[target]).T
    print(df_result.sort_values("F1-Score", ascending=False).round(3))



=== Evaluating: Overall Risk ===
[LightGBM] [Info] Number of positive: 91, number of negative: 60
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000581 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 151, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.602649 -> initscore=0.416515
[LightGBM] [Info] Start training from score 0.416515
[LightGBM] [Info] Number of positive: 27, number of negative: 124
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000596 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Number of data points in the train set: 151, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.178808 -> initscore=-1.524445
[LightGBM] [Info] Start training from score -1.5244