In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score

In [2]:
! pip install imbalanced-learn



In [3]:
en_dataset = pd.read_csv('clean_data/StrokeDataEncoded.csv')
en_dataset

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,61.0,0,0,1,3,0,202.21,29.035926,2,1
1,1,80.0,0,1,1,2,0,105.92,32.500000,2,1
2,0,49.0,0,0,1,2,1,171.23,34.400000,3,1
3,1,81.0,0,0,1,2,1,186.21,29.000000,1,1
4,0,69.0,0,0,0,2,1,94.39,22.800000,2,1
...,...,...,...,...,...,...,...,...,...,...,...
5028,0,80.0,1,0,1,2,1,83.75,29.035926,2,0
5029,0,81.0,0,0,1,3,1,125.20,40.000000,2,0
5030,0,35.0,0,0,1,3,0,82.99,30.600000,2,0
5031,1,51.0,0,0,1,2,0,166.29,25.600000,1,0


In [4]:
from imblearn.over_sampling import SMOTE
#Define indpendent and dependent variables - and remove the variable to be predicted 
X = en_dataset.drop('stroke', axis=1)
y = en_dataset['stroke']
smote = SMOTE()
X,y = smote.fit_resample(X,y)

In [5]:
print(X.shape)
print(y.shape)

(9674, 10)
(9674,)


In [6]:
y.value_counts()

1    4837
0    4837
Name: stroke, dtype: int64

In [7]:
X

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,0,61.000000,0,0,1,3,0,202.210000,29.035926,2
1,1,80.000000,0,1,1,2,0,105.920000,32.500000,2
2,0,49.000000,0,0,1,2,1,171.230000,34.400000,3
3,1,81.000000,0,0,1,2,1,186.210000,29.000000,1
4,0,69.000000,0,0,0,2,1,94.390000,22.800000,2
...,...,...,...,...,...,...,...,...,...,...
9669,0,68.157456,0,0,1,2,0,211.287261,39.153041,0
9670,0,79.630000,0,0,1,2,1,76.338750,33.081517,1
9671,1,78.535262,1,0,1,2,0,75.159421,28.594684,1
9672,0,75.315225,0,0,1,0,0,62.508370,29.035926,0


In [8]:
smote_dataset = pd.concat([X,y], axis=1)

In [9]:
smote_dataset.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,61.0,0,0,1,3,0,202.21,29.035926,2,1
1,1,80.0,0,1,1,2,0,105.92,32.5,2,1
2,0,49.0,0,0,1,2,1,171.23,34.4,3,1
3,1,81.0,0,0,1,2,1,186.21,29.0,1,1
4,0,69.0,0,0,0,2,1,94.39,22.8,2,1


In [10]:
X = smote_dataset.drop('stroke', axis=1)
y = smote_dataset['stroke']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(7739, 10)
(1935, 10)
(7739,)
(1935,)


In [12]:
print(f'Total # of sample in whole dataset: {len(X)}')
print(f'Total # of sample in train dataset: {len(X_train)}')
print(f'Total # of sample in train dataset: {len(X_test)}')

Total # of sample in whole dataset: 9674
Total # of sample in train dataset: 7739
Total # of sample in train dataset: 1935


In [13]:
#The data should be scaled due to the large variance amongst numerical features
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

In [14]:
#train the model on train set (linear)
svc_model_l = SVC(kernel='linear')
svc_model_l.fit(X_train_scaled, y_train)

#print prediction results
predictions = svc_model_l.predict(X_test_scaled)
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

0.8149870801033592
              precision    recall  f1-score   support

           0       0.82      0.81      0.82       978
           1       0.81      0.82      0.81       957

    accuracy                           0.81      1935
   macro avg       0.82      0.82      0.81      1935
weighted avg       0.82      0.81      0.81      1935



In [15]:
print(f'Training Score: {svc_model_l.score(X_train_scaled, y_train)}')
print(f'Testing Score: {svc_model_l.score(X_test_scaled, y_test)}')

Training Score: 0.81173278201318
Testing Score: 0.8149870801033592


In [16]:
#train the model on train set
svc_model = SVC()
svc_model.fit(X_train_scaled, y_train)

#print prediction results
predictions = svc_model.predict(X_test_scaled)
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

0.8697674418604651
              precision    recall  f1-score   support

           0       0.89      0.85      0.87       978
           1       0.85      0.89      0.87       957

    accuracy                           0.87      1935
   macro avg       0.87      0.87      0.87      1935
weighted avg       0.87      0.87      0.87      1935



In [17]:
print(f'Training Score: {svc_model.score(X_train_scaled, y_train)}')
print(f'Testing Score: {svc_model.score(X_test_scaled, y_test)}')

Training Score: 0.8744023775681613
Testing Score: 0.8697674418604651


## GridSearch Optimization

In [18]:
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
 
grid = GridSearchCV(svc_model, param_grid, refit = True, verbose = 3)
 
# fitting the model for grid search
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.854 total time=   5.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.851 total time=   5.2s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.852 total time=   5.2s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.865 total time=  12.6s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.861 total time=   7.8s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.841 total time=   4.2s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.852 total time=   4.2s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.837 total time=   3.4s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.844 total time=   3.2s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.849 total time=   4.3s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.820 total time=  12.1s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

[CV 2/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.930 total time=  11.8s
[CV 3/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.917 total time=  10.4s
[CV 4/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.910 total time=   8.6s
[CV 5/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.927 total time=   4.5s
[CV 1/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.908 total time=   9.8s
[CV 2/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.917 total time=  11.1s
[CV 3/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.913 total time=  10.0s
[CV 4/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.899 total time=   9.3s
[CV 5/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.910 total time=   8.8s
[CV 1/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.865 total time=  11.5s
[CV 2/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.878 total time=  18.7s
[CV 3/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.864 total time=  10.1s
[CV 4/5] END ....C=1000, gam

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)

In [19]:
# print best parameter after tuning
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 100, 'gamma': 1, 'kernel': 'rbf'}
SVC(C=100, gamma=1)


In [20]:
grid_predictions = grid.predict(X_test_scaled)
 
# print classification report
print(accuracy_score(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))

0.9286821705426357
              precision    recall  f1-score   support

           0       0.95      0.91      0.93       978
           1       0.91      0.95      0.93       957

    accuracy                           0.93      1935
   macro avg       0.93      0.93      0.93      1935
weighted avg       0.93      0.93      0.93      1935



In [21]:
print(f'Training Score: {grid.score(X_train_scaled, y_train)}')
print(f'Testing Score: {grid.score(X_test_scaled, y_test)}')

Training Score: 0.9859154929577465
Testing Score: 0.9286821705426357
