In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score

In [2]:
en_dataset = pd.read_csv('StrokeDataEncoded.csv')
en_dataset

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,61.0,0,0,1,3,0,202.21,29.035926,2,1
1,1,80.0,0,1,1,2,0,105.92,32.500000,2,1
2,0,49.0,0,0,1,2,1,171.23,34.400000,3,1
3,1,81.0,0,0,1,2,1,186.21,29.000000,1,1
4,0,69.0,0,0,0,2,1,94.39,22.800000,2,1
...,...,...,...,...,...,...,...,...,...,...,...
5028,0,80.0,1,0,1,2,1,83.75,29.035926,2,0
5029,0,81.0,0,0,1,3,1,125.20,40.000000,2,0
5030,0,35.0,0,0,1,3,0,82.99,30.600000,2,0
5031,1,51.0,0,0,1,2,0,166.29,25.600000,1,0


In [3]:
from imblearn.over_sampling import SMOTE
#Define indpendent and dependent variables - and remove the variable to be predicted 
X = en_dataset.drop('stroke', axis=1)
y = en_dataset['stroke']
smote = SMOTE()
X,y = smote.fit_resample(X,y)

In [4]:
print(X.shape)
print(y.shape)

(9674, 10)
(9674,)


In [5]:
y.value_counts()

1    4837
0    4837
Name: stroke, dtype: int64

In [6]:
X

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,0,61.000000,0,0,1,3,0,202.210000,29.035926,2
1,1,80.000000,0,1,1,2,0,105.920000,32.500000,2
2,0,49.000000,0,0,1,2,1,171.230000,34.400000,3
3,1,81.000000,0,0,1,2,1,186.210000,29.000000,1
4,0,69.000000,0,0,0,2,1,94.390000,22.800000,2
...,...,...,...,...,...,...,...,...,...,...
9669,0,2.533129,0,0,0,3,0,69.974639,29.363164,0
9670,1,57.456556,0,0,1,2,1,71.202717,28.581820,0
9671,0,80.175200,0,0,1,2,0,75.816293,22.410239,2
9672,0,51.273425,0,0,1,0,0,102.841224,27.464055,1


In [7]:
smote_dataset = pd.concat([X,y], axis=1)

In [8]:
smote_dataset.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,61.0,0,0,1,3,0,202.21,29.035926,2,1
1,1,80.0,0,1,1,2,0,105.92,32.5,2,1
2,0,49.0,0,0,1,2,1,171.23,34.4,3,1
3,1,81.0,0,0,1,2,1,186.21,29.0,1,1
4,0,69.0,0,0,0,2,1,94.39,22.8,2,1


In [9]:
X = smote_dataset.drop('stroke', axis=1)
y = smote_dataset['stroke']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(7739, 10)
(1935, 10)
(7739,)
(1935,)


In [11]:
print(f'Total # of sample in whole dataset: {len(X)}')
print(f'Total # of sample in train dataset: {len(X_train)}')
print(f'Total # of sample in train dataset: {len(X_test)}')

Total # of sample in whole dataset: 9674
Total # of sample in train dataset: 7739
Total # of sample in train dataset: 1935


In [12]:
#The data should be scaled due to the large variance amongst numerical features
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

In [13]:
# import libraries needed
from sklearn.metrics import confusion_matrix, classification_report, f1_score

from sklearn.tree import DecisionTreeClassifier

In [14]:
dt = DecisionTreeClassifier(random_state=101)
dt.fit(X_train_scaled, y_train)
predictions = dt.predict(X_test_scaled)

In [15]:
predictions

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [16]:
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print()
print('F1 score: ', f1_score(y_test, predictions))


[[881  97]
 [ 68 889]]

              precision    recall  f1-score   support

           0       0.93      0.90      0.91       978
           1       0.90      0.93      0.92       957

    accuracy                           0.91      1935
   macro avg       0.91      0.91      0.91      1935
weighted avg       0.92      0.91      0.91      1935


F1 score:  0.9150797735460628


In [17]:
dt = DecisionTreeClassifier(splitter='random', random_state=101)
dt.fit(X_train_scaled, y_train)
predictions = dt.predict(X_test_scaled)
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print('F1 score: ', f1_score(y_test, predictions))

[[882  96]
 [ 69 888]]

              precision    recall  f1-score   support

           0       0.93      0.90      0.91       978
           1       0.90      0.93      0.91       957

    accuracy                           0.91      1935
   macro avg       0.91      0.91      0.91      1935
weighted avg       0.92      0.91      0.91      1935

F1 score:  0.9149922720247295


In [18]:
# Maybe criterion will change something
dt = DecisionTreeClassifier(criterion='entropy', splitter='random',  max_depth=22, min_samples_split=2, 
                            min_samples_leaf=1, random_state=101)
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print('F1 score: ', f1_score(y_test, predictions))

[[872 106]
 [ 78 879]]

              precision    recall  f1-score   support

           0       0.92      0.89      0.90       978
           1       0.89      0.92      0.91       957

    accuracy                           0.90      1935
   macro avg       0.91      0.91      0.90      1935
weighted avg       0.91      0.90      0.90      1935

F1 score:  0.9052523171987642


In [19]:
dt = DecisionTreeClassifier(criterion='gini', splitter='random',  max_depth=30, min_samples_split=2, 
                            min_samples_leaf=1, random_state=101)
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print('F1 score: ', f1_score(y_test, predictions))

[[882  96]
 [ 69 888]]

              precision    recall  f1-score   support

           0       0.93      0.90      0.91       978
           1       0.90      0.93      0.91       957

    accuracy                           0.91      1935
   macro avg       0.91      0.91      0.91      1935
weighted avg       0.92      0.91      0.91      1935

F1 score:  0.9149922720247295


In [20]:
dt = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=22,
                       max_features=None, max_leaf_nodes=30,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=6,
                       min_weight_fraction_leaf=0.0,
                       random_state=98, splitter='best')

dt.fit(X_train_scaled, y_train)
predictions = dt.predict(X_test_scaled)
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print('F1 score: ', f1_score(y_test, predictions))

[[801 177]
 [120 837]]

              precision    recall  f1-score   support

           0       0.87      0.82      0.84       978
           1       0.83      0.87      0.85       957

    accuracy                           0.85      1935
   macro avg       0.85      0.85      0.85      1935
weighted avg       0.85      0.85      0.85      1935

F1 score:  0.8493150684931509


In [21]:
# Maybe min_samples_leaf will improve the algorithm
def min_samples_leaf_accuracy():
    min_samples_leaf_train = {}
    min_samples_leaf_test = {}
    for i in range(1, 30):
        dt = DecisionTreeClassifier(splitter='random', max_depth=22, min_samples_split=2, min_samples_leaf=i, random_state=101)
        dt.fit(X_train, y_train)
        
        predictions = dt.predict(X_train)
        f1 = f1_score(y_train, predictions)
        min_samples_leaf_train[i] = f1
        
        predictions = dt.predict(X_test)
        f1 = f1_score(y_test, predictions)
        min_samples_leaf_test[i] = f1
    
    return min_samples_leaf_train, min_samples_leaf_test

min_samples_leaf_train, min_samples_leaf_test = min_samples_leaf_accuracy()

In [22]:
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print('F1 score: ', f1_score(y_test, predictions))

[[801 177]
 [120 837]]

              precision    recall  f1-score   support

           0       0.87      0.82      0.84       978
           1       0.83      0.87      0.85       957

    accuracy                           0.85      1935
   macro avg       0.85      0.85      0.85      1935
weighted avg       0.85      0.85      0.85      1935

F1 score:  0.8493150684931509


In [25]:
#Will need this dependency in order to oversample our dataset
#! pip install imbalanced-learn 
from imblearn.over_sampling import SMOTE

In [27]:
#Check size of our target feature
smote_dataset['stroke'].value_counts()

1    4837
0    4837
Name: stroke, dtype: int64

In [29]:
#Define indpendent and dependent variables - and remove the variable to be predicted 
X= smote_dataset.drop('stroke',axis=1)
y= smote_dataset['stroke']

In [30]:
#Because our data is not balanced, we will oversample it in order to have equal rows for the minority output (1 stroke)
smote = SMOTE()
X,y = smote.fit_resample(X,y)   

In [31]:
print(X.shape)
print(y.shape)

(9674, 10)
(9674,)


In [32]:
y.value_counts()

1    4837
0    4837
Name: stroke, dtype: int64

In [33]:
X

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,0,61.000000,0,0,1,3,0,202.210000,29.035926,2
1,1,80.000000,0,1,1,2,0,105.920000,32.500000,2
2,0,49.000000,0,0,1,2,1,171.230000,34.400000,3
3,1,81.000000,0,0,1,2,1,186.210000,29.000000,1
4,0,69.000000,0,0,0,2,1,94.390000,22.800000,2
...,...,...,...,...,...,...,...,...,...,...
9669,0,2.533129,0,0,0,3,0,69.974639,29.363164,0
9670,1,57.456556,0,0,1,2,1,71.202717,28.581820,0
9671,0,80.175200,0,0,1,2,0,75.816293,22.410239,2
9672,0,51.273425,0,0,1,0,0,102.841224,27.464055,1


In [34]:
smote_dataset= pd.concat([X,y],axis =1)
smote_dataset.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,61.0,0,0,1,3,0,202.21,29.035926,2,1
1,1,80.0,0,1,1,2,0,105.92,32.5,2,1
2,0,49.0,0,0,1,2,1,171.23,34.4,3,1
3,1,81.0,0,0,1,2,1,186.21,29.0,1,1
4,0,69.0,0,0,0,2,1,94.39,22.8,2,1


In [35]:
y = smote_dataset['stroke'].values
X = smote_dataset.drop(['stroke'], axis=1)
print (X.shape)
print (y.shape)

(9674, 10)
(9674,)


In [36]:
# split into train test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [37]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (7739, 10)
Training Labels Shape: (7739,)
Testing Features Shape: (1935, 10)
Testing Labels Shape: (1935,)


In [38]:
# DECISION TREE - SMOTE

dt = DecisionTreeClassifier(random_state=101)
dt.fit(X_train_scaled, y_train)
predictions = dt.predict(X_test_scaled)


In [39]:
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print()
print('F1 score: ', f1_score(y_test, predictions))

[[465 503]
 [510 457]]

              precision    recall  f1-score   support

           0       0.48      0.48      0.48       968
           1       0.48      0.47      0.47       967

    accuracy                           0.48      1935
   macro avg       0.48      0.48      0.48      1935
weighted avg       0.48      0.48      0.48      1935


F1 score:  0.47431240269849506


In [40]:
dt = DecisionTreeClassifier(splitter='random', random_state=101)
dt.fit(X_train_scaled, y_train)
predictions = dt.predict(X_test_scaled)
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print('F1 score: ', f1_score(y_test, predictions))

[[478 490]
 [473 494]]

              precision    recall  f1-score   support

           0       0.50      0.49      0.50       968
           1       0.50      0.51      0.51       967

    accuracy                           0.50      1935
   macro avg       0.50      0.50      0.50      1935
weighted avg       0.50      0.50      0.50      1935

F1 score:  0.5064069707842132


In [41]:
# Maybe criterion will change something
dt = DecisionTreeClassifier(criterion='entropy', splitter='random',  max_depth=22, min_samples_split=2, 
                            min_samples_leaf=1, random_state=101)
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print('F1 score: ', f1_score(y_test, predictions))

[[853 115]
 [ 38 929]]

              precision    recall  f1-score   support

           0       0.96      0.88      0.92       968
           1       0.89      0.96      0.92       967

    accuracy                           0.92      1935
   macro avg       0.92      0.92      0.92      1935
weighted avg       0.92      0.92      0.92      1935

F1 score:  0.923918448533068


In [43]:
dt = DecisionTreeClassifier(criterion='gini', splitter='random',  max_depth=30, min_samples_split=2, 
                            min_samples_leaf=1, random_state=101)
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print('F1 score: ', f1_score(y_test, predictions))

[[873  95]
 [ 55 912]]

              precision    recall  f1-score   support

           0       0.94      0.90      0.92       968
           1       0.91      0.94      0.92       967

    accuracy                           0.92      1935
   macro avg       0.92      0.92      0.92      1935
weighted avg       0.92      0.92      0.92      1935

F1 score:  0.9240121580547112


In [44]:
dt = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=22,
                       max_features=None, max_leaf_nodes=30,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=6,
                       min_weight_fraction_leaf=0.0,
                       random_state=98, splitter='best')

dt.fit(X_train_scaled, y_train)
predictions = dt.predict(X_test_scaled)
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print('F1 score: ', f1_score(y_test, predictions))

[[609 359]
 [591 376]]

              precision    recall  f1-score   support

           0       0.51      0.63      0.56       968
           1       0.51      0.39      0.44       967

    accuracy                           0.51      1935
   macro avg       0.51      0.51      0.50      1935
weighted avg       0.51      0.51      0.50      1935

F1 score:  0.4418331374853114


In [45]:
# Maybe min_samples_leaf will improve the algorithm
def min_samples_leaf_accuracy():
    min_samples_leaf_train = {}
    min_samples_leaf_test = {}
    for i in range(1, 30):
        dt = DecisionTreeClassifier(splitter='random', max_depth=22, min_samples_split=2, min_samples_leaf=i, random_state=101)
        dt.fit(X_train, y_train)
        
        predictions = dt.predict(X_train)
        f1 = f1_score(y_train, predictions)
        min_samples_leaf_train[i] = f1
        
        predictions = dt.predict(X_test)
        f1 = f1_score(y_test, predictions)
        min_samples_leaf_test[i] = f1
    
    return min_samples_leaf_train, min_samples_leaf_test

min_samples_leaf_train, min_samples_leaf_test = min_samples_leaf_accuracy()

In [46]:
print(confusion_matrix(y_test, predictions))
print()
print(classification_report(y_test, predictions))
print('F1 score: ', f1_score(y_test, predictions))

[[609 359]
 [591 376]]

              precision    recall  f1-score   support

           0       0.51      0.63      0.56       968
           1       0.51      0.39      0.44       967

    accuracy                           0.51      1935
   macro avg       0.51      0.51      0.50      1935
weighted avg       0.51      0.51      0.50      1935

F1 score:  0.4418331374853114


In [49]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [52]:
X_train = X[:-20]
X_test = X[-20:]
y_train = y[:-20]
y_test = y[-20:]
linreg = LinearRegression()
linreg.fit(X_train, y_train)

LinearRegression()

In [53]:
y_predict = linreg.predict(X_test)
print(mean_squared_error(y_test, y_predict))

0.1844749095592499


In [57]:
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train),
                mean_squared_error(y_test, y_test)))

MSE train: 0.000, test: 0.000


In [47]:
from sklearn.model_selection import GridSearchCV

In [48]:
#params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
#grid_search_cv = GridSearchCV(dt(random_state=42), params, verbose=1, cv=3)
#grid_search_cv.fit(X_train_scaled, y_train)

TypeError: 'DecisionTreeClassifier' object is not callable

In [None]:
#dt = DecisionTreeClassifier(criterion='gini', splitter='random',  max_depth=22, min_samples_split=2, 
                                min_samples_leaf=1, random_state=101)
#dt.fit(X_train, y_train)
#predictions = dt.predict(X_test)

In [None]:
#print(confusion_matrix(y_test, predictions))
#print()
#print(classification_report(y_test, predictions))
#print('F1 Score: ', f1_score(y_test, predictions))