In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import warnings

warnings.filterwarnings("ignore")

# -------------------- Step 1: Load Data --------------------
df = pd.read_csv("D:\\jilo_new.csv", encoding='latin1')
df.drop(columns=[col for col in df.columns if 'id' in col.lower()], inplace=True)

target_cols = ['Overall Risk', 'Cardiac Risk', 'Diabetic Risk', 'Hypertension Risk']
df = df.dropna(subset=target_cols)
for col in target_cols:
    df = df[~df[col].str.lower().isin(['unknown'])]

X_raw = df.drop(columns=target_cols)
for col in X_raw.select_dtypes(include='object').columns:
    X_raw[col] = LabelEncoder().fit_transform(X_raw[col].astype(str))
X_raw = X_raw.fillna(X_raw.median(numeric_only=True))
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_raw)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    "CatBoost": CatBoostClassifier(verbose=0),
    "Gradient Boosting": GradientBoostingClassifier(),
    "LightGBM": lgb.LGBMClassifier()
}

results = {}

for target in target_cols:
    print(f"\n=== Model Performance for {target} ===")
    le = LabelEncoder()
    y_full = le.fit_transform(df[target])
    labels = np.unique(y_full)

    # Manually split each class into train/test
    X_train_list, y_train_list, X_test_list, y_test_list = [], [], [], []
    for label in labels:
        idx = np.where(y_full == label)[0]
        np.random.shuffle(idx)
        split = int(0.8 * len(idx))
        train_idx, test_idx = idx[:split], idx[split:]

        X_train_list.append(X_scaled[train_idx])
        y_train_list.append(y_full[train_idx])
        X_test_list.append(X_scaled[test_idx])
        y_test_list.append(y_full[test_idx])

    X_train = np.vstack(X_train_list)
    y_train = np.concatenate(y_train_list)
    X_test = np.vstack(X_test_list)
    y_test = np.concatenate(y_test_list)

    results[target] = {}

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
        recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

        try:
            auc = roc_auc_score(y_test, y_proba, multi_class='ovr', labels=labels) if y_proba is not None else np.nan
        except:
            auc = np.nan

        print(f"{name}:")
        print(f"  Accuracy     : {accuracy:.3f}")
        print(f"  Precision    : {precision:.3f}")
        print(f"  Recall       : {recall:.3f}")
        print(f"  F1-Score     : {f1:.3f}")
        print(f"  ROC AUC (OVR): {auc:.3f}")
        print()

        results[target][name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1,
            "ROC AUC": auc
        }

# Convert results to DataFrames
final_results = {target: pd.DataFrame(results[target]).T for target in target_cols}

# Print final summary
for target, df_result in final_results.items():
    print(f"\n== Final Summary for {target} ==")
    print(df_result.round(3))



=== Model Performance for Overall Risk ===
Logistic Regression:
  Accuracy     : 0.625
  Precision    : 0.384
  Recall       : 0.415
  F1-Score     : 0.390
  ROC AUC (OVR): 0.786

Random Forest:
  Accuracy     : 0.600
  Precision    : 0.359
  Recall       : 0.387
  F1-Score     : 0.367
  ROC AUC (OVR): 0.848

XGBoost:
  Accuracy     : 0.575
  Precision    : 0.375
  Recall       : 0.410
  F1-Score     : 0.373
  ROC AUC (OVR): 0.842

CatBoost:
  Accuracy     : 0.600
  Precision    : 0.372
  Recall       : 0.404
  F1-Score     : 0.372
  ROC AUC (OVR): 0.800

Gradient Boosting:
  Accuracy     : 0.525
  Precision    : 0.349
  Recall       : 0.362
  F1-Score     : 0.349
  ROC AUC (OVR): 0.829

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000452 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 448
[LightGBM] [Info] Number of data points in the train set: 151, number of used features: 30
[LightGBM] [Info