
**This report was generated using AI under general human direction. At the time of generation, the contents have not been comprehensively reviewed by a human analyst.**

# Stroke Prediction Analysis (Leak-Proof Pipeline)

## 1. Imports & Setup



In [None]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


## 2. Data Loading & Initial Cleanup



In [None]:
# Read the raw CSV, discard the "id" column, and exclude rows whose gender is 'Other'
df = (
    pd.read_csv("healthcare-dataset-stroke-data.csv")
    .drop(columns=["id"])
    .loc[lambda frame: frame["gender"] != "Other"]
)

# Split the frame into the target series and the predictor columns
y = df["stroke"]
X = df.drop(columns="stroke")

# Stratified 80/20 split; the test portion is held out for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


## 3. Feature Engineering Transformer



In [None]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives age-based features.

    ``transform`` returns a copy of the input frame with two extra columns:

    * ``age_group`` -- categorical bucket of ``age``:
      ``child`` [0, 18], ``adult`` (18, 45], ``senior`` (45, 65],
      ``elderly`` (65, inf).
    * ``glucose_age_interaction`` -- ``avg_glucose_level * age``.

    The input must provide ``age`` and ``avg_glucose_level`` columns.
    """

    def fit(self, X, y=None):
        """No-op fit; nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns appended.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing ``age`` and ``avg_glucose_level``.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` plus ``age_group`` and ``glucose_age_interaction``.
            The caller's frame is not modified.
        """
        X = X.copy()
        # include_lowest=True keeps age == 0 in the "child" bucket and the
        # open-ended top edge keeps ages above 100 in "elderly"; with the
        # previous closed (0, 100] range both cases silently became NaN.
        # Negative or missing ages still map to NaN.
        X["age_group"] = pd.cut(
            X["age"],
            bins=[0, 18, 45, 65, float("inf")],
            labels=["child", "adult", "senior", "elderly"],
            include_lowest=True,
        )
        X["glucose_age_interaction"] = X["avg_glucose_level"] * X["age"]
        return X


## 4. Pipeline Construction



In [None]:
# Column groups routed through the ColumnTransformer. "glucose_age_interaction"
# and "age_group" do not exist in the raw data; the FeatureEngineer step adds them.
numeric_features = [
    "age",
    "avg_glucose_level",
    "bmi",
    "glucose_age_interaction",
]
categorical_features = [
    "gender",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "Residence_type",
    "smoking_status",
    "age_group",
]

# Numeric branch: standardise first, then fill missing values from the
# 5 nearest neighbours (so neighbour distances use the standardised values).
numeric_transformer = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("imputer", KNNImputer(n_neighbors=5)),
    ]
)

# Categorical branch: dense one-hot encoding; categories unseen during fit are ignored.
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Apply each branch to its column group; any column not listed is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)

# Full model: feature engineering -> preprocessing -> SMOTE oversampling -> random forest.
model_pipeline = ImbPipeline(
    steps=[
        ("engineer", FeatureEngineer()),
        ("preprocessor", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)


## 5. Cross-Validation & Model Evaluation



In [None]:
# Stratified, shuffled 5-fold cross-validation on the training split only,
# scored with ROC AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model_pipeline, X_train, y_train, cv=cv, scoring="roc_auc")
print(f"Cross-Validation AUC Scores: {scores}")
print(f"Mean AUC: {scores.mean():.4f}")

# Refit on the whole training split, then evaluate on the hold-out set.
model_pipeline.fit(X_train, y_train)

# Pipeline.score reports the classifier's accuracy; kept because later cells
# read `final_score`.
final_score = model_pipeline.score(X_test, y_test)
print(f"Hold-out Test Accuracy: {final_score:.4f}")

# Accuracy is not comparable with the AUC used in cross-validation, and it says
# little when one class dominates (the pipeline oversamples with SMOTE, so the
# target is presumably imbalanced). Report hold-out ROC AUC as well, using the
# predicted probability of the second (greater-label) class.
test_auc = roc_auc_score(y_test, model_pipeline.predict_proba(X_test)[:, 1])
print(f"Hold-out Test AUC: {test_auc:.4f}")


## 6. Results



In [None]:
# Recap of the metrics computed in the evaluation cell, one "label value" line each
summary_rows = (
    ("Cross-Validation AUC Scores:", scores),
    ("Mean AUC:", scores.mean()),
    ("Hold-out Test Accuracy:", final_score),
)
for caption, metric in summary_rows:
    print(caption, metric)


---

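This notebook performs all fitted preprocessing steps (scaling, imputation, encoding) and the SMOTE resampling inside the pipeline, so during cross-validation they are re-fit on each training fold only and never see the corresponding validation fold or the hold-out test set. This prevents data leakage and makes the reported model evaluation more reliable.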
