In [49]:
## importing the data

import pandas as pd
stroke = pd.read_csv('healthcare-dataset-stroke-data.csv')

## checking for missing values
stroke.isna().sum()
## decided to drop the missing values
stroke.dropna(inplace=True)
## rechecking for the missing values
stroke.isna().sum()
## looking at the new shape of my data
stroke.shape

(4909, 12)

In [13]:
## decided to rename one of the columns wrongly spelt
stroke.rename(columns=str.lower, inplace=True)

In [34]:
## prepping my data into features and labels
X = stroke.drop('stroke', axis=1)
y = stroke['stroke']

In [35]:
## importing columntransformer and OneHotEncoder for easy conversion of strings to number
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

## outlining the columns I would want transformed
ini_features = ['gender', 'ever_married', 'work_type', 'residence_type', 'smoking_status']
## instatiating my onehotencoder
OneHot = OneHotEncoder()
## instatiating my columntransformer, passing others unneeded varables as passthrough
transform = ColumnTransformer([('OneHot', OneHot, ini_features)], remainder='passthrough')
##finally fitting my features var (X) into a fully transformed var. Need this for the outlined var to know where to inherit from.
transformed_X = transform.fit_transform(X) 





In [37]:
## further splitting of the feature and label into train and test

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2) 

In [61]:
## finally fitting the model and checking out the score............
from sklearn.ensemble import RandomForestClassifier
import numpy as np
np.random.seed(0)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9582484725050916

In [50]:
## another way to look at this is compare the y_preds and y_test
clf.fit(X_train, y_train)
y_preds = clf.predict(X_test)
#y_preds

In [43]:
## further analysis
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print(classification_report(y_preds, y_test))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98       980
           1       0.00      0.00      0.00         2

    accuracy                           0.96       982
   macro avg       0.50      0.48      0.49       982
weighted avg       1.00      0.96      0.98       982



In [44]:
## further analysis
print(confusion_matrix(y_preds, y_test))

[[941  39]
 [  2   0]]


In [45]:
## further analysis
print(accuracy_score(y_preds, y_test))

0.9582484725050916


In [59]:
## evaluating and trying to what estimator works best
np.random.seed(0)
for i in range(10,100,10):
    print(f'When estimator is set at {i}')
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f'We get: {clf.score(X_test, y_test)*100:.2f}%')
    print('')

When estimator is set at 10
We get: 94.91%

When estimator is set at 20
We get: 95.82%

When estimator is set at 30
We get: 95.82%

When estimator is set at 40
We get: 95.82%

When estimator is set at 50
We get: 95.82%

When estimator is set at 60
We get: 95.62%

When estimator is set at 70
We get: 95.93%

When estimator is set at 80
We get: 96.03%

When estimator is set at 90
We get: 95.82%



In [65]:
## saving our model for future use
import pickle
pickle.dump(clf, open('stroke_detector_model.pkl', 'wb'))

In [67]:
## loading the model
loading_model = pickle.load(open('stroke_detector_model.pkl', 'rb'))
loading_model.score(X_test, y_test)

0.9582484725050916