In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

# load dataset
df = pd.read_csv('data.csv')

# drop the column id
df = df.drop(columns=['id'])

# Fill bmi with mean
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

# For each of the categorical attribtues, encode the set of categories to be 0 ~ (n_classes - 1)
cats = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
encoder = OrdinalEncoder()
df[cats] = encoder.fit_transform(df[cats])
df = pd.get_dummies(df, columns = ["work_type", "Residence_type"], dtype=int) 

# normalize numerical data using the min-max normalization technique
num = ["age", "avg_glucose_level", "bmi"]
scaler = MinMaxScaler(feature_range=(0, 1))
df[num] = scaler.fit_transform(df[num])

# Split data into features and target
X = df.drop('stroke', axis=1)
y = df['stroke']

# Apply SMOTE only on training data
smote = SMOTE(random_state=42)
x_os, y_os = smote.fit_resample(X, y)
print(y_os.value_counts())
X.head(5)

stroke
1    4861
0    4861
Name: count, dtype: int64


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,smoking_status,work_type_0.0,work_type_1.0,work_type_2.0,work_type_3.0,work_type_4.0,Residence_type_0.0,Residence_type_1.0
0,1.0,0.816895,0,1,1.0,0.801265,0.30126,1.0,0,0,1,0,0,0,1
1,0.0,0.743652,0,0,1.0,0.679023,0.212981,2.0,0,0,0,1,0,1,0
2,1.0,0.975586,0,1,1.0,0.234512,0.254296,2.0,0,0,1,0,0,1,0
3,0.0,0.597168,0,0,1.0,0.536008,0.27606,3.0,0,0,1,0,0,0,1
4,0.0,0.963379,1,0,1.0,0.549349,0.15693,2.0,0,0,0,1,0,1,0


In [3]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth = 10, n_estimators = 14, random_state=42)

CV = cross_validate(rf, x_os, y_os, cv=10, scoring=['accuracy', 'f1'])
print('accuracy =', CV['test_accuracy'].mean())
print('f1 =', CV['test_f1'].mean())

accuracy = 0.8656663029364868
f1 = 0.8734046858076084


In [4]:
from sklearn.linear_model import LogisticRegression

cls = LogisticRegression(max_iter=1000)
CV = cross_validate(cls, x_os, y_os, cv=10, scoring=['accuracy', 'f1'])
print('accuracy =', CV['test_accuracy'].mean())
print('f1 =', CV['test_f1'].mean())

accuracy = 0.7870814459543477
f1 = 0.7962077909993641


In [5]:
from sklearn.svm import SVC

svc = SVC(C = 100, gamma = 1, kernel = "rbf")
CV = cross_validate(svc, x_os, y_os, cv=10, scoring=['accuracy', 'f1'])
print('accuracy =', CV['test_accuracy'].mean())
print('f1 =', CV['test_f1'].mean())

accuracy = 0.9017721272717276
f1 = 0.906471194472276


In [6]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes = (15, 10), max_iter = 1000)
CV = cross_validate(mlp, x_os, y_os, cv=10, scoring=['accuracy', 'f1'])
print('accuracy =', CV['test_accuracy'].mean())
print('f1 =', CV['test_f1'].mean())

accuracy = 0.8425234415642089
f1 = 0.8522716060066087


In [7]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB(var_smoothing = 0.001)
CV = cross_validate(gnb, x_os, y_os, cv=10, scoring=['accuracy', 'f1'])
print('accuracy =', CV['test_accuracy'].mean())
print('f1 =', CV['test_f1'].mean())

accuracy = 0.7272169565934554
f1 = 0.7781942324493178


In [8]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(metric = 'manhattan', n_neighbors = 3, weights = 'distance')
CV = cross_validate(gnb, x_os, y_os, cv=10, scoring=['accuracy', 'f1'])
print('accuracy =', CV['test_accuracy'].mean())
print('f1 =', CV['test_f1'].mean())

accuracy = 0.7272169565934554
f1 = 0.7781942324493178


In [9]:
from sklearn.ensemble import StackingClassifier
base_models = [
    ('rf', RandomForestClassifier(max_depth = 10, n_estimators = 14, random_state=42)),  # Random Forest
    ('cls', LogisticRegression(max_iter=1000)),  # Logistic Regression
    ('svc', SVC(C = 100, gamma = 1, kernel = "rbf")),  # SVM with linear kernel
    ('gnb', GaussianNB(var_smoothing = 0.001)),  # Gaussian Naive Bayes
    ('knn', KNeighborsClassifier(metric = 'manhattan', n_neighbors = 3, weights = 'distance')),  # k-Nearest Neighbors
    ('mlp', MLPClassifier(hidden_layer_sizes = (15, 10), max_iter = 1000))  # Multilayer Perceptron
]
final_estimator = RandomForestClassifier(max_depth = 10, n_estimators = 14, random_state=42)
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=final_estimator)
CV = cross_validate(stacking_classifier, x_os, y_os, cv=10, scoring=['accuracy', 'f1'])
print('accuracy =', CV['test_accuracy'].mean())
print('f1 =', CV['test_f1'].mean())

accuracy = 0.9330409746277052
f1 = 0.9343115562116943


In [4]:
#prepare
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
rf = RandomForestClassifier(max_depth = 10, n_estimators = 14, random_state=42)
cls = LogisticRegression(max_iter=1000)
svc = SVC(C = 100, gamma = 1, kernel = "rbf")
mlp = MLPClassifier(hidden_layer_sizes = (15, 10), max_iter = 1000)
gnb = GaussianNB(var_smoothing = 0.001)
knn = KNeighborsClassifier(metric = 'manhattan', n_neighbors = 3, weights = 'distance')
base_models = [
    ('rf', RandomForestClassifier(max_depth = 10, n_estimators = 14, random_state=42)),  # Random Forest
    ('cls', LogisticRegression(max_iter=1000)),  # Logistic Regression
    ('svc', SVC(C = 100, gamma = 1, kernel = "rbf")),  # SVM with linear kernel
    ('gnb', GaussianNB(var_smoothing = 0.001)),  # Gaussian Naive Bayes
    ('knn', KNeighborsClassifier(metric = 'manhattan', n_neighbors = 3, weights = 'distance')),  # k-Nearest Neighbors
    ('mlp', MLPClassifier(hidden_layer_sizes = (15, 10), max_iter = 1000))  # Multilayer Perceptron
]
final_estimator = RandomForestClassifier(max_depth = 10, n_estimators = 14, random_state=42)
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=final_estimator)
#fitting
rf = rf.fit(x_os,y_os)
cls = cls.fit(x_os,y_os)
svc = svc.fit(x_os,y_os)
mlp = mlp.fit(x_os,y_os)
gnb = gnb.fit(x_os,y_os)
knn = knn.fit(x_os,y_os)
stacking_classifier = stacking_classifier.fit(x_os,y_os)

In [10]:
from joblib import dump

dump(rf, 'models/rf_model.joblib')
dump(cls, 'models/cls_model.joblib')
dump(svc, 'models/svc_model.joblib')
dump(mlp, 'models/mlp_model.joblib')
dump(gnb, 'models/gnb_model.joblib')
dump(knn, 'models/knn_model.joblib')
dump(stacking_classifier, 'models/stacking_classifier_model.joblib')

dump(scaler, 'scaler/scaler.joblib')

dump(encoder, 'encoder/encoder.joblib')

['encoder/encoder.joblib']