In [78]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
#Importing utilized libraries

In [80]:
data = pd.read_csv('C:\\Users\\Wellington\\Downloads\\Stroke\\healthcare-dataset-stroke-data.csv')
#Reading the dataset
data
#Viewing the dataset

Unnamed: 0,id,Gender,Age,Hypertension,HeartDisease,EverMarried,WorkType,ResidenceType,AvgGlucoseLevel,BMI,SmokingStatus,Stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [82]:
data.isnull().sum()
#Checking for missing data

id                   0
Gender               0
Age                  0
Hypertension         0
HeartDisease         0
EverMarried          0
WorkType             0
ResidenceType        0
AvgGlucoseLevel      0
BMI                201
SmokingStatus        0
Stroke               0
dtype: int64

In [84]:
data.columns = data.columns.str.lower()
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())
data['stroke'].value_counts()
#Checking how many patients within the dataset have suffered a stroke, and imputing missing BMI data

stroke
0    4861
1     249
Name: count, dtype: int64

In [86]:
data.isnull().sum()
#Double checking for missing data

id                 0
gender             0
age                0
hypertension       0
heartdisease       0
evermarried        0
worktype           0
residencetype      0
avgglucoselevel    0
bmi                0
smokingstatus      0
stroke             0
dtype: int64

In [88]:
for feature in data.columns:
  print(f'{feature}:', data[feature].unique())
#Viewing all unique values possible within the dataset

id: [ 9046 51676 31112 ... 19723 37544 44679]
gender: ['Male' 'Female' 'Other']
age: [6.70e+01 6.10e+01 8.00e+01 4.90e+01 7.90e+01 8.10e+01 7.40e+01 6.90e+01
 5.90e+01 7.80e+01 5.40e+01 5.00e+01 6.40e+01 7.50e+01 6.00e+01 5.70e+01
 7.10e+01 5.20e+01 8.20e+01 6.50e+01 5.80e+01 4.20e+01 4.80e+01 7.20e+01
 6.30e+01 7.60e+01 3.90e+01 7.70e+01 7.30e+01 5.60e+01 4.50e+01 7.00e+01
 6.60e+01 5.10e+01 4.30e+01 6.80e+01 4.70e+01 5.30e+01 3.80e+01 5.50e+01
 1.32e+00 4.60e+01 3.20e+01 1.40e+01 3.00e+00 8.00e+00 3.70e+01 4.00e+01
 3.50e+01 2.00e+01 4.40e+01 2.50e+01 2.70e+01 2.30e+01 1.70e+01 1.30e+01
 4.00e+00 1.60e+01 2.20e+01 3.00e+01 2.90e+01 1.10e+01 2.10e+01 1.80e+01
 3.30e+01 2.40e+01 3.40e+01 3.60e+01 6.40e-01 4.10e+01 8.80e-01 5.00e+00
 2.60e+01 3.10e+01 7.00e+00 1.20e+01 6.20e+01 2.00e+00 9.00e+00 1.50e+01
 2.80e+01 1.00e+01 1.80e+00 3.20e-01 1.08e+00 1.90e+01 6.00e+00 1.16e+00
 1.00e+00 1.40e+00 1.72e+00 2.40e-01 1.64e+00 1.56e+00 7.20e-01 1.88e+00
 1.24e+00 8.00e-01 4.00e-01 8.00e-02 1.

In [90]:
df = data.copy()
bins = [0, 18, 30, 40, 50, 100]  #Defining age groups
labels = ['0-18', '19-30', '31-40', '41-50', '51+']  #Labels for the age group bins
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)
df['age_group']

0         51+
1         51+
2         51+
3       41-50
4         51+
        ...  
5105      51+
5106      51+
5107    31-40
5108      51+
5109    41-50
Name: age_group, Length: 5110, dtype: category
Categories (5, object): ['0-18' < '19-30' < '31-40' < '41-50' < '51+']

In [92]:
label_encoder = LabelEncoder()

df.columns = df.columns.str.lower()

cat_columns = ['gender','evermarried','worktype','residencetype','smokingstatus','age_group']
for column in cat_columns:
    df[column] = label_encoder.fit_transform(df[column])

Y = df['stroke']
X = df.drop(columns = ['stroke','id','age'])
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42) #Splitting the dataset into testing and training
#Initialize SMOTE
smote = SMOTE(random_state=42)

#Fitting and applying SMOTE to the training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [94]:
X_train_resampled

Unnamed: 0,gender,hypertension,heartdisease,evermarried,worktype,residencetype,avgglucoselevel,bmi,smokingstatus,age_group
0,1,0,0,0,4,0,90.420000,16.200000,0,0
1,1,0,0,0,2,1,207.580000,22.800000,3,1
2,1,1,0,1,2,0,91.280000,26.500000,2,3
3,0,1,0,1,2,1,150.740000,40.300000,0,4
4,0,0,0,1,2,0,82.570000,36.000000,1,4
...,...,...,...,...,...,...,...,...,...,...
6829,1,0,0,0,1,0,217.896120,28.320992,0,4
6830,1,1,0,1,2,0,247.301388,27.934177,2,4
6831,0,0,1,0,2,0,60.668965,29.473962,1,4
6832,0,0,0,1,1,1,110.723110,25.791870,0,4


In [98]:
#Defining Classifiers
classifiers = {
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
   # 'Neural Network': MLPClassifier(max_iter=500)  # Setting higher iterations for convergence
    #The Neural network code was commented out due to being absurdly computationally expensive. 
}

#Establishing our models' parameters
param_grid = {
    'Random Forest': {'n_estimators': [10, 50, 100], 'max_depth': [10, 20]},

    'XGBoost': {
        'n_estimators': [50, 100, 200], 
        'learning_rate': [0.01, 0.1, 0.2], 
        'max_depth': [3, 6, 9]
    }
#Add a comma above if you want to include the Neural Network model in the results
    #'Neural Network': {
       # 'hidden_layer_sizes': [(50,), (100,), (50, 50)],  # Different NN architectures
       # 'activation': ['relu', 'tanh'],  # Activation functions
       # 'alpha': [0.0001, 0.001, 0.01],  # Regularization strength
       # 'learning_rate_init': [0.001, 0.01]  # Learning rate
    #}
}


best_estimators = {}

for clf_name, clf in classifiers.items():
    print(f"Performing GridSearchCV for {clf_name}...")

    # Use GridSearchCV for parameter tuning
    grid_search = GridSearchCV(clf, param_grid[clf_name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train_resampled, y_train_resampled)

    # Store the best estimator
    best_estimators[clf_name] = grid_search.best_estimator_

    # Evaluate on test data
    y_pred = grid_search.best_estimator_.predict(X_test)
    print(f"Best Model for {clf_name}: {grid_search.best_params_}")
    print(f"Accuracy for {clf_name}: {accuracy_score(y_test, y_pred)}")
    print(f"Classification Report for {clf_name}:\n{classification_report(y_test, y_pred)}")

    print("-" * 50)
best_model_name = max(best_estimators, key=lambda clf: accuracy_score(y_test, best_estimators[clf].predict(X_test)))
print(f"Overall best model: {best_model_name}")

Performing GridSearchCV for Random Forest...
Best Model for Random Forest: {'max_depth': 20, 'n_estimators': 50}
Accuracy for Random Forest: 0.8858447488584474
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      1444
           1       0.15      0.21      0.18        89

    accuracy                           0.89      1533
   macro avg       0.55      0.57      0.56      1533
weighted avg       0.90      0.89      0.89      1533

--------------------------------------------------
Performing GridSearchCV for XGBoost...


Parameters: { "use_label_encoder" } are not used.



Best Model for XGBoost: {'learning_rate': 0.2, 'max_depth': 9, 'n_estimators': 200}
Accuracy for XGBoost: 0.8851924331376386
Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      1444
           1       0.13      0.18      0.15        89

    accuracy                           0.89      1533
   macro avg       0.54      0.55      0.55      1533
weighted avg       0.90      0.89      0.89      1533

--------------------------------------------------
Overall best model: Random Forest
