# 🩺 Stroke Prediction: End-to-End Machine Learning Pipeline

# 1. Import Libraries
Import all necessary libraries for data processing, modeling, and visualization

In [1]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score,cross_val_predict
from sklearn.preprocessing import LabelEncoder,StandardScaler,RobustScaler
from sklearn.metrics import f1_score, classification_report,ConfusionMatrixDisplay, confusion_matrix,roc_auc_score,roc_curve, auc
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import StackingClassifier,RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
import shap
import joblib
import seaborn as sns
from ydata_profiling import ProfileReport
from sklearn.metrics import classification_report, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings
warnings.filterwarnings('ignore')

**Tensorflow imports**

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.utils.class_weight import compute_class_weight

In [5]:
# Configuration file for paths and constants
sys.path.append(os.path.abspath(".."))
from config import DATA_RAW_PATH, DATA_PREDICTED_PATH,DATA_PROCESSED_PATH, MODEL_DIR, TRAIN_FILE, TEST_FILE,TRAIN_PREPROCESSED_FILE,TEST_PREPROCESSED_FILE, SAMPLE_SUBMISSION_FILE, RANDOM_STATE,SCALING_FILE_NAME
from src.file_handler import FileHandler
from src.data_processing import DataPreprocessor

## 2. Dataset Meta Information

**Files:**
- `train.csv`: Data containing all available features and the stroke response.
- `test.csv`: Unseen stroke patients with all available predictors but missing stroke. Used for model prediction.
- `sample_submission.csv`: A sample submission file in the correct format.

**Data Dictionary:**
| Column            | Type     | Description                                                                                 | Values/Range                        |
|-------------------|----------|---------------------------------------------------------------------------------------------|-------------------------------------|
| `id`              | Integer  | Unique identifier for each record                                                            | Any integer                         |
| `gender`          | String   | Gender of the patient                                                                       | `Male`, `Female`                    |
| `age`             | Float    | Age of the patient in years                                                                 | Any positive float                  |
| `hypertension`    | Integer  | Indicates whether the patient has hypertension                                              | `0`: No, `1`: Yes                   |
| `heart_disease`   | Integer  | Indicates whether the patient has heart disease                                             | `0`: No, `1`: Yes                   |
| `ever_married`    | String   | Marital status of the patient                                                               | `Yes`, `No`                         |
| `work_type`       | String   | Type of employment                                                                          | `Private`, `Self-employed`, `Govt_job`, `Children`, etc. |
| `Residence_type`  | String   | Type of residence                                                                          | `Urban`, `Rural`                    |
| `avg_glucose_level`| Float   | Average glucose level of the patient (mg/dL)                                                | Any positive float                  |
| `bmi`             | Float    | Body Mass Index (BMI) of the patient                                                        | Any positive float                  |
| `smoking_status`  | String   | Smoking status                                                                             | `never smoked`, `formerly smoked`, `smokes`, `Unknown` |
| `stroke`          | Integer  | Target variable indicating whether the patient had a stroke (**[TARGET]**)                  | `0`: No, `1`: Yes                   |

# 3. Load Data
 Load training and test datasets

In [6]:
# Load the training data
train = FileHandler.read_data(DATA_RAW_PATH, TRAIN_FILE)
# Load the test data
test = FileHandler.read_data(DATA_RAW_PATH, TEST_FILE)
# Load the sample submission file
sample_solution = FileHandler.read_data(DATA_RAW_PATH, SAMPLE_SUBMISSION_FILE)

# Create copies of the original train and test data
# This is useful for keeping the original data intact for future reference or comparisons
orgnigal_train=train.copy(deep=True)
orgnigal_test=test.copy(deep=True)

# 4. Categorizing features based on data types

In [8]:
# Categorical and numerical columns
# These lists categorize the features based on their data types
cat_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
cat_binary_cols = ['hypertension', 'heart_disease']
num_cols= ['age', 'avg_glucose_level', 'bmi']

# 5. Data Preprocessing
 

In [10]:
# Combine train and test data for preprocessing
train['is_train'] = 1
test['is_train'] = 0
test['stroke'] = np.nan
full_data = pd.concat([train, test], ignore_index=True)

In [11]:
# Fill missing values in 'bmi' with the median and 'smoking_status' with 'Unknown'
full_data['bmi'] = full_data['bmi'].fillna(full_data['bmi'].median())
full_data['smoking_status'] = full_data['smoking_status'].fillna('Unknown')

# 6. Feature Engineering
**Encode categorical variables using One HOT Encodering**

In [12]:
# Map binary categorical variables to numerical values
binary_map = {'Yes': 1, 'No': 0, 'Urban': 1, 'Rural': 0}
#,'Male':1,'Female':0,'Other':0
full_data['ever_married'] = full_data['ever_married'].map(binary_map)
full_data['Residence_type'] = full_data['Residence_type'].map(binary_map)
# full_data['gender'] = full_data['gender'].map(binary_map)

# one_hot_encode_features=['work_type', 'smoking_status']
one_hot_encode_features=['gender', 'work_type', 'smoking_status']
# One-hot encode categorical variables
full_data = pd.get_dummies(full_data, columns=one_hot_encode_features, drop_first=True)

In [13]:
# Display the first few rows of the processed data
full_data.head()

Unnamed: 0,id,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,is_train,gender_Male,gender_Other,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1,50.0,0,0,1,0,88.1,29.1,0.0,1,True,False,False,True,False,False,True,False,False
1,2,52.0,0,0,1,0,80.07,38.9,0.0,1,False,False,False,True,False,False,False,False,True
2,3,26.0,0,0,0,1,89.11,23.3,0.0,1,False,False,False,False,False,False,False,False,True
3,4,37.0,0,0,1,0,81.36,36.1,0.0,1,False,False,False,True,False,False,False,True,False
4,5,59.0,0,0,1,0,82.59,29.6,1.0,1,True,False,False,True,False,False,False,True,False


In [14]:
# Split the combined data back into train and test sets.len. 'is_train' column indicates whether the row is from the training set (1) or
train = full_data[full_data['is_train'] == 1].drop(['is_train'], axis=1)
test = full_data[full_data['is_train'] == 0].drop(['is_train', 'stroke'], axis=1)

# 7. Feature Selection
**Select features for modeling (exclude 'id' and 'stroke')**

In [15]:
features = [col for col in train.columns if col not in ['id', 'stroke']]
X = train[features]
y = train['stroke'].astype(int)

# 8. Handle Class Imbalance
**Apply SMOTE to balance the target classes**

In [16]:
smote = SMOTE(random_state=RANDOM_STATE, sampling_strategy='minority')
X_resampled, y_resampled = smote.fit_resample(X, y)

# 9. functions

In [17]:
def scale_features(scaler,X_train, X_val, X_test,features=['bmi', 'avg_glucose_level']):
    """
    Scales features using defined scaler. Returns scaled train, val, and optionally test sets.
    """
    # scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train[features])
    X_val_scaled = scaler.transform(X_val[features])
    # if X_test is not None:
    X_test_scaled = scaler.transform(X_test[features])

    return X_train_scaled, X_val_scaled, X_test_scaled, scaler

# 10. Train/Validation Split
**Split the resampled data into training and validation sets**

In [18]:
X_train, X_val, y_train, y_val=DataPreprocessor.split_data(X_resampled, y_resampled, test_size=0.2,stratify=y_resampled, random_state=RANDOM_STATE)

In [19]:
def build_model(input_dim,learning_rate=0.0005):
    """
    Build and compile a simple deep neural network for binary classification.
    """
    model = Sequential()
    model.add(Dense(256, input_dim=input_dim))
    model.add(LeakyReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    model.add(Dense(128))
    model.add(LeakyReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    model.add(Dense(64))
    model.add(LeakyReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.3))

    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['AUC', 'accuracy','f1_score'])
    return model

In [20]:
def train_model(model, X_train, y_train, X_val, y_val,
                epochs=50, batch_size=32, verbose=1):
    """
    Train the Keras model on the provided data.
    """
    history = model.fit(X_train, y_train,
                        validation_data=(X_val, y_val),
                        epochs=epochs,
                        batch_size=batch_size,
                        verbose=verbose)
    return model, history


In [None]:
# def evaluate_model(model, X_val, y_val):
#     """
#     Evaluate the trained model and return F1 and ROC-AUC scores.
#     """
#     y_pred = (model.predict(X_val) > 0.5).astype("int32")
#     f1 = f1_score(y_val, y_pred)
#     roc_auc = roc_auc_score(y_val, y_pred)
#     return f1, roc_auc

In [None]:
def train_deep_model(X_train,y_train, X_val,y_val):

    # Class Weights
    weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
    class_weight = dict(zip(np.unique(y_train), weights))

    # Callbacks
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)
    model = build_model(X_train.shape[1])
    history=model.fit(X_train, y_train,
              validation_data=(X_val, y_val),
              epochs=100,
              batch_size=32,
              class_weight=class_weight,
              callbacks=[early_stop, lr_scheduler],
              verbose=1)

    # Print losses
    # print("\nTraining/Validation Loss per Epoch:")
    # for epoch, (train_loss, val_loss) in enumerate(
    #     zip(history.history['loss'], history.history['val_loss']), 1):
    #     print(f"Epoch {epoch:03d}: train_loss={train_loss:.4f} | val_loss={val_loss:.4f}")
    return model

In [30]:
def evaludate_model(model,X_val,y_val):
    y_pred_proba = model.predict(X_val).ravel()
    thresholds = np.arange(0.1, 0.9, 0.01)
    best_f1, best_thresh = 0, 0.5
    for t in thresholds:
        f1 = f1_score(y_val, (y_pred_proba > t).astype(int))
        if f1 > best_f1:
            best_f1, best_thresh = f1, t

    final_f1 = f1_score(y_val, (y_pred_proba > best_thresh).astype(int))
    roc = roc_auc_score(y_val, y_pred_proba)
    print(f"Best Threshold: {best_thresh:.2f} | F1 Score: {final_f1:.4f} | ROC AUC: {roc:.4f}")
    return model, best_thresh,final_f1,roc

# Train and validate

In [31]:
model=train_deep_model(X_train, y_train,X_val, y_val)
final_model, best_thresh,final_f1,roc=evaludate_model(model,X_val,y_val)

Epoch 1/100
[1m587/587[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - AUC: 0.8500 - accuracy: 0.7744 - f1_score: 0.6685 - loss: 0.5112 - val_AUC: 0.9325 - val_accuracy: 0.8149 - val_f1_score: 0.6743 - val_loss: 0.4052 - learning_rate: 5.0000e-04
Epoch 2/100
[1m587/587[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - AUC: 0.9258 - accuracy: 0.8484 - f1_score: 0.6726 - loss: 0.3516 - val_AUC: 0.9491 - val_accuracy: 0.8202 - val_f1_score: 0.6743 - val_loss: 0.4025 - learning_rate: 5.0000e-04
Epoch 3/100
[1m587/587[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - AUC: 0.9354 - accuracy: 0.8562 - f1_score: 0.6636 - loss: 0.3254 - val_AUC: 0.9555 - val_accuracy: 0.8807 - val_f1_score: 0.6743 - val_loss: 0.2722 - learning_rate: 5.0000e-04
Epoch 4/100
[1m587/587[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - AUC: 0.9424 - accuracy: 0.8673 - f1_score: 0.6681 - loss: 0.3059 - val_AUC: 0.9532 - val_accuracy: 0.8354 - val_f1_s

In [32]:
# Scale features before training models
robust_scaler = RobustScaler()
standard_scaler = StandardScaler()
X_train_scaled, X_val_scaled,X_test,feature_scaler = scale_features(standard_scaler,X_train, X_val,test,features=['bmi', 'avg_glucose_level'])
# save_pickle_file(feature_scaler, SCALING_FILE_NAME,MODEL_DIR)

In [33]:
# on the scaled data, train the deep learning model
# This function will train the model and return the trained model, best threshold, final F1 score, and ROC AUC score
model=train_deep_model(X_train_scaled, y_train,X_val_scaled, y_val)
final_model, best_thresh,final_f1,roc=evaludate_model(model,X_val_scaled,y_val)

Epoch 1/100
[1m587/587[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - AUC: 0.6210 - accuracy: 0.5760 - f1_score: 0.6622 - loss: 0.7299 - val_AUC: 0.6769 - val_accuracy: 0.6053 - val_f1_score: 0.6743 - val_loss: 0.6255 - learning_rate: 5.0000e-04
Epoch 2/100
[1m587/587[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - AUC: 0.6336 - accuracy: 0.5819 - f1_score: 0.6700 - loss: 0.6700 - val_AUC: 0.6880 - val_accuracy: 0.6300 - val_f1_score: 0.6743 - val_loss: 0.6170 - learning_rate: 5.0000e-04
Epoch 3/100
[1m587/587[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - AUC: 0.6593 - accuracy: 0.5996 - f1_score: 0.6640 - loss: 0.6463 - val_AUC: 0.6869 - val_accuracy: 0.6207 - val_f1_score: 0.6743 - val_loss: 0.6186 - learning_rate: 5.0000e-04
Epoch 4/100
[1m587/587[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - AUC: 0.6556 - accuracy: 0.6008 - f1_score: 0.6613 - loss: 0.6427 - val_AUC: 0.6876 - val_accuracy: 0.6247 - val_f1_s

# Summary of the model
* F1 score is greater than 90%, but we can improve it by tuning the hyperparameters, adding more layers, or using different activation functions.
* ROC AUC score is good, which indicates that the model is able to distinguish between the two classes well.