In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# -----------------------------
# Load dataset
# -----------------------------
df = pd.read_csv("Dataset/healthcare-dataset-stroke-data.csv")

# -----------------------------
# Drop ID column (not useful)
# -----------------------------
# Reassign rather than mutate in place so the step is an explicit
# transformation of the frame that was just loaded.
df = df.drop(columns=["id"])

# -----------------------------
# Handle missing BMI values
# -----------------------------
# Assign the filled column back to the frame. Calling
# `df["bmi"].fillna(..., inplace=True)` works on an intermediate Series:
# recent pandas 2.x releases warn about it, and with Copy-on-Write enabled
# (the pandas 3.0 default) `df` is left unchanged, so NaNs would silently
# flow into scaling and model fitting.
# NOTE(review): the median is computed over the full dataset, i.e. before the
# train/test split. Consider deriving it from the training rows only to avoid
# leaking test-set statistics into preprocessing.
df["bmi"] = df["bmi"].fillna(df["bmi"].median())
assert df["bmi"].notna().all(), "bmi still contains missing values after imputation"

# -----------------------------
# Encode categorical columns
# -----------------------------
categorical_cols = [
    "gender",
    "ever_married",
    "work_type",
    "Residence_type",
    "smoking_status"
]

# Keep every fitted encoder, keyed by column name. Without this the
# category -> integer mapping of each column is lost as soon as the loop moves
# on, and new raw records cannot be encoded the same way at inference time.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # Cast to str first so every value (including any missing ones) is
    # encoded as a plain string label.
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# -----------------------------
# Features & Target
# -----------------------------
# "stroke" is the label; every remaining column is a model input.
y = df["stroke"]
X = df.drop(columns=["stroke"])

# -----------------------------
# Train-test split (IMPORTANT: imbalance aware)
# -----------------------------
# Stratify on the label so both partitions keep the same class ratio;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# Feature scaling
# -----------------------------
# Learn the scaling statistics from the training rows only, then apply the
# same transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Quick summary of the prepared data.
print("✅ Stroke dataset cleaned correctly")
print("Train shape:", X_train_scaled.shape)
print("Test shape:", X_test_scaled.shape)
print("Stroke distribution:\n", y.value_counts())


✅ Stroke dataset cleaned correctly
Train shape: (4088, 10)
Test shape: (1022, 10)
Stroke distribution:
 stroke
0    4861
1     249
Name: count, dtype: int64


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# -----------------------------
# Train Logistic Regression with class weights
# -----------------------------
# class_weight="balanced" reweights the classes to compensate for how rare
# the positive (stroke) class is in the training labels.
stroke_model = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
).fit(X_train_scaled, y_train)

# -----------------------------
# Predictions
# -----------------------------
# Column 1 of predict_proba is the score for the second class label
# (stroke = 1 here); it is what ROC-AUC is computed from below.
y_prob = stroke_model.predict_proba(X_test_scaled)[:, 1]
y_pred = stroke_model.predict(X_test_scaled)

# -----------------------------
# Evaluation
# -----------------------------
# Accuracy and the per-class report use the hard predictions; ROC-AUC uses
# the predicted probabilities.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


Accuracy: 0.7514677103718199

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.75      0.85       972
           1       0.14      0.80      0.24        50

    accuracy                           0.75      1022
   macro avg       0.56      0.77      0.55      1022
weighted avg       0.95      0.75      0.82      1022

ROC-AUC: 0.8386831275720164


In [12]:
import joblib

# Persist the fitted classifier and the scaler it was trained with; both are
# needed to score new data the same way.
for artifact, filename in (
    (stroke_model, "stroke_model.pkl"),
    (scaler, "stroke_scaler.pkl"),
):
    joblib.dump(artifact, filename)

print("✅ Stroke model saved successfully")


✅ Stroke model saved successfully
