In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import warnings

warnings.filterwarnings("ignore")

# -------------------- Step 1: Load Data --------------------
df = pd.read_csv("D:\\jilo_new.csv", encoding='latin1')

# Drop patient ID columns
df.drop(columns=[col for col in df.columns if 'id' in col.lower()], inplace=True)

# Define target columns
target_cols = ['Overall Risk', 'Cardiac Risk', 'Diabetic Risk', 'Hypertension Risk']

# Remove rows with missing/unknown targets
df = df.dropna(subset=target_cols)
for col in target_cols:
    df = df[~df[col].str.lower().isin(['unknown'])]

# -------------------- Step 2: Preprocessing --------------------
X = df.drop(columns=target_cols)
y_dict = {col: df[col] for col in target_cols}

# Encode categorical features
for col in X.select_dtypes(include='object').columns:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# Fill numeric missing values
X = X.fillna(X.median(numeric_only=True))

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# -------------------- Step 3: Model Dictionary --------------------
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    "CatBoost": CatBoostClassifier(verbose=0),
    "Gradient Boosting": GradientBoostingClassifier(),
    "LightGBM": lgb.LGBMClassifier()
}

# -------------------- Step 4: Train + Evaluate --------------------
results = {}

for target in target_cols:
    print(f"\n=== Model Performance for {target} ===")
    y = LabelEncoder().fit_transform(y_dict[target])

    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, stratify=y, random_state=42
    )

    results[target] = {}

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
        recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

        # Safe AUC computation
        if y_proba is not None and len(np.unique(y_test)) == y_proba.shape[1]:
            try:
                auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
            except ValueError:
                auc = np.nan
        else:
            auc = np.nan

        print(f"{name}:")
        print(f"  Accuracy     : {accuracy:.3f}")
        print(f"  Precision    : {precision:.3f}")
        print(f"  Recall       : {recall:.3f}")
        print(f"  F1-Score     : {f1:.3f}")
        print(f"  ROC AUC (OVR): {auc:.3f}")
        print()

        results[target][name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1,
            "ROC AUC": auc
        }

# -------------------- Step 5: Results to DataFrame --------------------
final_results = {target: pd.DataFrame(results[target]).T for target in target_cols}

# Optional: Display results for each target
for target, df_result in final_results.items():
    print(f"\n== Final Summary for {target} ==")
    print(df_result.round(3))



=== Model Performance for Overall Risk ===
Logistic Regression:
  Accuracy     : 0.564
  Precision    : 0.326
  Recall       : 0.345
  F1-Score     : 0.333
  ROC AUC (OVR): 0.763

Random Forest:
  Accuracy     : 0.692
  Precision    : 0.443
  Recall       : 0.465
  F1-Score     : 0.448
  ROC AUC (OVR): 0.903

XGBoost:
  Accuracy     : 0.667
  Precision    : 0.454
  Recall       : 0.475
  F1-Score     : 0.441
  ROC AUC (OVR): 0.859

CatBoost:
  Accuracy     : 0.615
  Precision    : 0.371
  Recall       : 0.408
  F1-Score     : 0.377
  ROC AUC (OVR): 0.737

Gradient Boosting:
  Accuracy     : 0.667
  Precision    : 0.452
  Recall       : 0.479
  F1-Score     : 0.459
  ROC AUC (OVR): 0.749

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000494 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 463
[LightGBM] [Info] Number of data points in the train set: 152, number of used features: 30
[LightGBM] [Info