In [1]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
# ============================================
# Stroke Prediction - Feature Engineering & Models
# ============================================

import pandas as pd
import numpy as np
import pickle, os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# =====================
# 1. Load Data
# =====================
df = pd.read_csv("healthcare-dataset-stroke-data.csv")

# =====================
# 2. Handle Missing Values
# =====================
# Fill numeric missing with median
for col in df.select_dtypes(include=[np.number]).columns:
    df[col] = df[col].fillna(df[col].median())

# Fill categorical missing with mode
for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].fillna(df[col].mode()[0])

# =====================
# 3. Feature Engineering
# =====================
# BMI bands
if "bmi" in df.columns:
    df["BMI_Band"] = pd.cut(df["bmi"], bins=[0, 18.5, 25, 30, 40, 100], labels=False)

# Age bands
df["AgeBand"] = pd.cut(df["age"], bins=[0, 18, 30, 45, 60, 120], labels=False)

# Glucose ratio (average glucose level per age)
if "avg_glucose_level" in df.columns:
    df["GlucosePerAge"] = df["avg_glucose_level"] / (df["age"] + 1)

# Income proxy: BMI * glucose (arbitrary feature interaction)
if "bmi" in df.columns and "avg_glucose_level" in df.columns:
    df["BMI_Glucose"] = df["bmi"] * df["avg_glucose_level"]

# Scale numeric features
scaler = MinMaxScaler()
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Encode categorical variables
le = LabelEncoder()
for col in df.select_dtypes(include="object").columns:
    df[col] = le.fit_transform(df[col])

# =====================
# 4. Split Data
# =====================
X = df.drop("stroke", axis=1)
y = df["stroke"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# =====================
# 5. Define Models
# =====================
models = {
    "Decision Tree": DecisionTreeClassifier(max_depth=6, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(
        n_estimators=100, use_label_encoder=False, eval_metric="logloss", random_state=42
    ),
    "CatBoost": CatBoostClassifier(iterations=100, verbose=0, random_state=42),
}

# =====================
# 6. Train, Evaluate, Save
# =====================
os.makedirs("models_stroke", exist_ok=True)
results = {}
best_model_name = None
best_model = None
best_acc = 0.0

for name, model in models.items():
    print(f"\n================= {name} =================")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"✅ Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))
    # Save individual model
    with open(f"models_stroke/{name.replace(' ', '_')}.pkl", "wb") as f:
        pickle.dump(model, f)
    if acc > best_acc:
        best_acc = acc
        best_model = model
        best_model_name = name

# Save best model
with open("models_stroke/best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

# =====================
# 7. Summary
# =====================
print("\n🔎 Model Performance Summary:")
for model, acc in results.items():
    print(f"{model:15s} -> Accuracy: {acc:.4f}")

print(f"\n🏆 Best Model: {best_model_name} with Accuracy: {best_acc:.4f}")
print("✅ Best model saved as models_stroke/best_model.pkl")



✅ Accuracy: 0.9452
              precision    recall  f1-score   support

         0.0       0.95      0.99      0.97       972
         1.0       0.12      0.02      0.03        50

    accuracy                           0.95      1022
   macro avg       0.54      0.51      0.50      1022
weighted avg       0.91      0.95      0.93      1022


✅ Accuracy: 0.9491
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97       972
         1.0       0.00      0.00      0.00        50

    accuracy                           0.95      1022
   macro avg       0.48      0.50      0.49      1022
weighted avg       0.90      0.95      0.93      1022


✅ Accuracy: 0.9521
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.98       972
         1.0       1.00      0.02      0.04        50

    accuracy                           0.95      1022
   macro avg       0.98      0.51      0.51      1022
weighted avg  

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Accuracy: 0.9442
              precision    recall  f1-score   support

         0.0       0.95      0.99      0.97       972
         1.0       0.23      0.06      0.10        50

    accuracy                           0.94      1022
   macro avg       0.59      0.52      0.53      1022
weighted avg       0.92      0.94      0.93      1022


✅ Accuracy: 0.9501
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97       972
         1.0       0.00      0.00      0.00        50

    accuracy                           0.95      1022
   macro avg       0.48      0.50      0.49      1022
weighted avg       0.90      0.95      0.93      1022


🔎 Model Performance Summary:
Decision Tree   -> Accuracy: 0.9452
Random Forest   -> Accuracy: 0.9491
AdaBoost        -> Accuracy: 0.9521
XGBoost         -> Accuracy: 0.9442
CatBoost        -> Accuracy: 0.9501

🏆 Best Model: AdaBoost with Accuracy: 0.9521
✅ Best model saved as models_stroke/best_model.pkl
