# **Stroke Prediction Model**

## EDA

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve
from sklearn import metrics


# Load the stroke dataset; `data` stays as the untouched original, `df` is the working copy.
data = pd.read_csv('./train_strokes.csv')
df = data.copy()
df.head()

In [None]:
# summary of column dtypes and non-null counts
df.info()

In [None]:
# normalise every column label to lower case
df.columns = [label.lower() for label in df.columns]

# number of rows whose ['id'] repeats an earlier row
df.duplicated(subset='id').sum()

In [None]:
# ['id'] is only a row identifier, so it is removed before analysis
df.drop(columns=['id'], inplace=True)
df.columns

In [None]:
# number of missing values per column
df.isnull().sum()

In [None]:
# continuous features; everything else (including the target) is treated as categorical
numeric_cols = ['age', 'avg_glucose_level', 'bmi']
categorical_cols = df.columns.drop(numeric_cols)

# list the distinct values of each categorical variable
for col in categorical_cols:
    distinct_values = df[col].unique()
    print(f'{col} : {distinct_values}')

In [None]:
# absolute and percentage class counts of the target ['stroke']
stroke_counts = df['stroke'].value_counts()
stroke_percent = df['stroke'].value_counts(normalize=True) * 100
print(stroke_counts, stroke_percent, sep='\n\n')

# Imbalanced data for target variable ['stroke']

## Data Visualization

### Stroke by Age, Average Glucose Level, and BMI

In [None]:
# numeric_cols = ['age', 'avg_glucose_level', 'bmi']
# categorical_cols = ['gender', 'hypertension', 'heart_disease', 'ever_married',
#                     'work_type', 'residence_type', 'smoking_status', 'stroke']

# Set the theme BEFORE creating the figure: set_theme() changes matplotlib's
# defaults, which are only picked up by axes created afterwards.
sns.set_theme()

# one row per numeric feature: distribution by stroke | box plot | feature vs. stroke
fig, axs = plt.subplots(len(numeric_cols), 3, figsize=(15, 10), squeeze=False)

for i, col in enumerate(numeric_cols):
    # `data=` is passed by keyword so the calls do not depend on seaborn's positional signature
    sns.histplot(data=df, x=col, hue='stroke', kde=True, bins=25, multiple='stack', ax=axs[i, 0])
    sns.boxplot(data=df, x=col, ax=axs[i, 1])
    sns.scatterplot(data=df, x=col, y='stroke', ax=axs[i, 2])

plt.tight_layout()

### Stroke by Categorical Variables

In [None]:
# Grid of count plots, one per categorical variable, split by stroke outcome.
n_plot_cols = 4
# ceiling division: a partially filled last row still gets its own row of axes
n_plot_rows = -(-len(categorical_cols) // n_plot_cols)
# squeeze=False keeps `axs` two-dimensional even when there is a single row
fig, axs = plt.subplots(n_plot_rows, n_plot_cols, figsize=(15, 10), squeeze=False)

for col, ax in zip(categorical_cols, axs.flat):
    sns.countplot(data=df, x=col, hue='stroke', ax=ax)
    ax.set_title(col)
    ax.tick_params(axis='x', rotation=30)

    # annotate every bar with its count
    for container in ax.containers:
        ax.bar_label(container)

# hide any axes left over in the last row
for unused_ax in axs.flat[len(categorical_cols):]:
    unused_ax.set_visible(False)

plt.tight_layout()

## Modeling

### Data Preprocessing

In [None]:
# check for missing values again before preprocessing
df.isnull().sum()

In [None]:
# check values in ['gender']
print(df['gender'].value_counts())

# drop 'Other' values in ['gender'] due to insufficient data
# .copy() makes the filtered frame independent of the original, so the in-place
# column assignments done later do not raise SettingWithCopyWarning
df = df[df['gender'] != 'Other'].copy()

df['gender'].value_counts()

In [None]:
# box plot and summary statistics of ['bmi']
bmi_values = df['bmi']
sns.boxplot(x=bmi_values)
print(bmi_values.describe())

# the median is less sensitive to the outliers than the mean, so it is used for imputation
median = bmi_values.median()
print(f'\nmedian: {median}')

In [None]:
# replace missing data in ['bmi'] with median
# Assign the result back instead of `df[col].fillna(..., inplace=True)`: the
# chained in-place form operates on a temporary Series and is not guaranteed
# to modify `df` (it is deprecated and a no-op under pandas Copy-on-Write).
df['bmi'] = df['bmi'].fillna(median)

# reassign missing values of ['smoking_status'] as 'unknown' instead of dropping
df['smoking_status'] = df['smoking_status'].fillna('unknown')

df.isnull().sum()

In [None]:
# categorize numerical variables based on criteria by CDC
def categorize_num_var(df):
    """Replace ``avg_glucose_level`` and ``bmi`` with labelled categories, in place.

    Bins are left-closed (``right=False``) so that a value sitting exactly on a
    threshold falls into the higher category, matching the CDC definitions:

    * glucose: ``< 100`` normal, ``100 - <126`` prediabetic, ``>= 126`` diabetic
    * BMI: ``< 18.5`` underweight, ``18.5 - <25`` normal, ``25 - <30`` overweight,
      ``>= 30`` obese

    NOTE(review): the CDC glucose thresholds are defined for fasting blood
    sugar; confirm they are appropriate for ``avg_glucose_level``.

    Args:
        df: DataFrame with numeric ``avg_glucose_level`` and ``bmi`` columns.
            Both columns are overwritten with categorical values.

    Returns:
        None. ``df`` is modified in place.
    """
    # categorize ['avg_glucose_level']
    df['avg_glucose_level'] = pd.cut(x=df['avg_glucose_level'],
                                     bins=[0, 100, 126, np.inf],
                                     labels=['normal', 'prediabetic', 'diabetic'],
                                     right=False)
    # categorize ['bmi']
    df['bmi'] = pd.cut(x=df['bmi'],
                       bins=[0, 18.5, 25, 30, np.inf],
                       labels=['underweight', 'normal', 'overweight', 'obese'],
                       right=False)
    
# bin ['avg_glucose_level'] and ['bmi'] in place
categorize_num_var(df)

# check unique values of every variable except the continuous ['age']
for col in df.columns.drop('age'):
    distinct_values = df[col].unique()
    print(f'{col} : {distinct_values}')

In [None]:
# change data into adequate data types
def convert_dtypes(df):
    """Cast the columns of ``df`` in place and report the resulting dtypes.

    Every column except ``'age'`` is converted to pandas ``category``;
    ``'age'`` is cast to ``int``.

    Args:
        df: DataFrame containing an ``'age'`` column; modified in place.

    Returns:
        The ``dtypes`` Series of ``df`` after the conversion.
    """
    non_age_columns = df.columns.drop('age')
    for column_name in non_age_columns:
        df[column_name] = df[column_name].astype('category')

    df['age'] = df['age'].astype('int')
    return df.dtypes

# convert `df` in place; the call evaluates to the resulting dtypes
convert_dtypes(df)

### Modeling - Without Resampling

In [None]:
# separate the target ['stroke'] from the predictor columns
y = df['stroke']
X = df.drop(columns='stroke')


"""It is important to split data before resampling to retain original data distribution.
Resampling to adjust data imbalance is only applied to the train data to improve the accuracy of ML models, not the test data."""
# stratified 75/25 split with a fixed seed, so both parts keep the class ratio of `y`
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=0,
)

# class counts, then class percentages, of both splits
print(f"Train dataset-{y_train.value_counts()} \n\nTest dataset-{y_test.value_counts()} \n")
print(f"Train dataset-{y_train.value_counts(normalize=True)*100} \n\nTest dataset-{y_test.value_counts(normalize=True)*100}")

In [None]:
# ['age'] is min-max scaled; every other predictor is ordinal-encoded
encoded_columns = X_train.columns.drop('age')
transformer = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), ['age']),
        ('cat', OrdinalEncoder(), encoded_columns),
    ],
    remainder='passthrough',
)

# fit on the training split only, then reuse the fitted transformer for the test split
train_matrix = transformer.fit_transform(X_train)
test_matrix = transformer.transform(X_test)
feature_names = transformer.get_feature_names_out()

X_train_transformed = pd.DataFrame(train_matrix, columns=feature_names)
X_test_transformed = pd.DataFrame(test_matrix, columns=feature_names)

X_train_transformed

In [None]:
# Candidate classifiers; every estimator that accepts a seed shares the same one.
SEED = 0
models = {
    "SVC": SVC(probability=True, random_state=SEED),
    "LogisticRegression": LogisticRegression(random_state=SEED),
    "GaussianNB": GaussianNB(),
    "DecisionTree": DecisionTreeClassifier(random_state=SEED),
    "RandomForest": RandomForestClassifier(random_state=SEED, n_jobs=-1),
    "KNearestNeighbors": KNeighborsClassifier(n_jobs=-1),
    "MLPClassifier": MLPClassifier(random_state=SEED),
    "XGBoosting": XGBClassifier(eval_metric='auc', random_state=SEED, n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(random_state=SEED),
    "SGDClassifier": SGDClassifier(loss='log_loss', random_state=SEED, n_jobs=-1),
    "AdaBoost": AdaBoostClassifier(random_state=SEED),
}

# fit each candidate on the transformed training split
for name, model in models.items():
    model.fit(X_train_transformed, y_train)
    print(f"{name} trained.")

In [None]:
# Print a classification report and draw a confusion matrix for every fitted model.
separator = '-' * 80

for name, model in models.items():
    y_pred = model.predict(X_test_transformed)
    report = classification_report(y_test, y_pred, zero_division=0)

    print(separator)
    print(f"Model Performance - {name} \n\n {report}")

    # confusion matrix of test labels vs. predictions
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title(name, fontsize=15, weight='bold')
    plt.show()

In [None]:
# One ROC curve per model, followed by its AUC score.
for name, model in models.items():
    # probability of the positive class (second column of predict_proba)
    y_pred_proba = model.predict_proba(X_test_transformed)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    auc_score = metrics.roc_auc_score(y_test, y_pred_proba)

    plt.plot(fpr, tpr)
    plt.title(name, fontsize=15, weight='bold')
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.show()

    print(f"{name} - AUC Score: {auc_score}")

In [None]:
# Overlay the ROC curves of all models in a single figure for comparison.
for name, model in models.items():
    # probability of the positive class (second column of predict_proba)
    y_pred_proba = model.predict_proba(X_test_transformed)[:, 1]
    fpr, tpr, _thresholds = roc_curve(y_test, y_pred_proba)
    # built-in round() works whether the score is a Python float or a NumPy scalar
    auc = round(metrics.roc_auc_score(y_test, y_pred_proba), 4)
    plt.plot(fpr, tpr, label=f"{name}, AUC={auc}")

plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('AUC-ROC Curve Performance', fontsize=15, weight='bold')
# place the legend to the right of the axes so it does not cover the curves
plt.legend(loc=(1.04, 0))
# the layout must be adjusted BEFORE the figure is shown; afterwards it has no effect on it
plt.tight_layout()
plt.show()

### Modeling - With Resampling(SMOTE)

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE creates synthetic minority samples by interpolating between neighbours,
# so it needs a numeric matrix. The raw X_train still holds string categories
# (e.g. gender, smoking_status, the binned glucose/bmi labels), therefore the
# encoded/scaled training matrix is resampled instead. Only the training split
# is resampled; the test split keeps its original distribution.
# NOTE(review): the ordinal codes are interpolated like continuous values here;
# consider imblearn's SMOTENC if categorical features should stay valid categories.
X_train_sampled, y_train_sampled = SMOTE(random_state=0).fit_resample(X_train_transformed, y_train)

# class counts before and after resampling
print(f"Before SMOTE:\n{y_train.value_counts()}\n")
print(f"After SMOTE:\n{y_train_sampled.value_counts()}")

In [None]:
# separate the target ['stroke'] from the predictor columns
y = df['stroke']
X = df.drop(columns='stroke')


"""It is important to split data before resampling to retain original data distribution.
Resampling to adjust data imbalance is only applied to the train data to improve the accuracy of ML models, not the test data."""
# stratified 75/25 split with a fixed seed, so both parts keep the class ratio of `y`
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=0,
)

# class counts, then class percentages, of both splits
print(f"Train dataset-{y_train.value_counts()} \n\nTest dataset-{y_test.value_counts()} \n")
print(f"Train dataset-{y_train.value_counts(normalize=True)*100} \n\nTest dataset-{y_test.value_counts(normalize=True)*100}")