# Stroke Prediction Dataset
### 11 clinical features for predicting stroke events

https://www.kaggle.com/fedesoriano/stroke-prediction-dataset

1) id: unique identifier    
2) gender: "Male", "Female" or "Other"    
3) age: age of the patient    
4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension    
5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease    
6) ever_married: "No" or "Yes"    
7) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"    
8) Residence_type: "Rural" or "Urban"     
9) avg_glucose_level: average glucose level in blood    
10) bmi: body mass index    
11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*    
12) stroke: 1 if the patient had a stroke or 0 if not    
*Note: "Unknown" in smoking_status means that the information is unavailable for this patient    

In [25]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation warnings are
# hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [26]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

%matplotlib inline

In [276]:
# Load the comma-separated stroke dataset into a DataFrame.
ds = pd.read_csv(
    'healthcare-dataset-stroke-data.csv',
    sep=',',
)

In [277]:
# Preview the first three rows of the raw data.
ds.head(3)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1


In [278]:
# Drop the `id` column (a unique row identifier, not a clinical feature).
# Rebinding instead of inplace=True, together with errors='ignore', keeps the
# cell safe to re-run after `id` has already been removed.
ds = ds.drop(columns=['id'], errors='ignore')

In [279]:
# Stroke label of every patient whose gender is recorded as 'Other'.
ds.loc[ds['gender'] == 'Other', 'stroke']

3116    0
Name: stroke, dtype: int64

In [280]:
# Remove the 'Other' gender row(s) by value rather than by the hard-coded
# index label 3116: the cell then works for any row order and can be re-run
# without raising KeyError. The original index labels are preserved.
ds = ds[ds['gender'] != 'Other'].copy()

In [281]:
# List the column names that remain after dropping `id`.
ds.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [282]:
# Continuous measurements.
columns_numeric = ['age', 'avg_glucose_level', 'bmi']

# Categorical predictors; hypertension and heart_disease are already 0/1 flags.
columns_categorical = [
    'gender',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]

# String-valued categoricals that are one-hot encoded later on.
columns_dummis = [
    name for name in columns_categorical
    if name not in ('hypertension', 'heart_disease')
]

# Prediction target.
columns_target = ['stroke']

In [283]:
def replace_nan(data, to_replace, replacement_data, random_state=None):
    """Fill missing values of one column with random integers drawn per class.

    For each class ``0`` and ``1`` of ``replacement_data``, the missing entries
    of ``to_replace`` are filled with integers sampled uniformly from
    ``[Q1 - 0.5 * IQR, Q3 + 0.5 * IQR)``, where the quartiles come from the
    observed (non-missing) values of that class.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is left untouched.
    to_replace : str
        Name of the numeric column whose NaN values are filled.
    replacement_data : str
        Name of the column holding the 0/1 class labels used for grouping.
    random_state : int, optional
        Seed for a private generator. With ``None`` (the default) the global
        ``np.random`` state is used, so ``np.random.seed`` still applies.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``data`` with the missing values filled.

    Raises
    ------
    ValueError
        If a class has missing values but no observed value from which the
        sampling range could be derived.
    """
    data_def = data.copy(deep=True)
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    for group in (0, 1):
        in_group = data_def[replacement_data] == group
        missing = in_group & data_def[to_replace].isnull()
        count = int(missing.sum())
        if count == 0:
            # Nothing to fill for this class (also covers an absent class).
            continue

        observed = data_def.loc[in_group, to_replace]
        q1 = observed.quantile(0.25)
        q3 = observed.quantile(0.75)
        if np.isnan(q1) or np.isnan(q3):
            raise ValueError(
                "cannot impute '%s' for %s == %s: no observed values"
                % (to_replace, replacement_data, group)
            )

        # Widen the inter-quartile range symmetrically. The spread is computed
        # once, from the untouched quartiles, so both bounds move by the same
        # amount (updating the lower bound first would inflate the upper one).
        spread = (q3 - q1) * 0.5
        low = int(q1 - spread)
        # Guarantee at least one candidate value when the range collapses.
        high = max(int(q3 + spread), low + 1)

        # Single .loc assignment: chained indexing may write to a temporary.
        data_def.loc[missing, to_replace] = rng.choice(np.arange(low, high), size=count)

    return data_def

In [284]:
# Seed NumPy's global generator so the randomly imputed BMI values are the
# same on every run (replace_nan samples through np.random).
np.random.seed(42)
ds = replace_nan(ds, 'bmi', 'stroke')

In [285]:
# Show dtypes and non-null counts per column to verify that bmi has no gaps left.
ds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5109 non-null   object 
 1   age                5109 non-null   float64
 2   hypertension       5109 non-null   int64  
 3   heart_disease      5109 non-null   int64  
 4   ever_married       5109 non-null   object 
 5   work_type          5109 non-null   object 
 6   Residence_type     5109 non-null   object 
 7   avg_glucose_level  5109 non-null   float64
 8   bmi                5109 non-null   float64
 9   smoking_status     5109 non-null   object 
 10  stroke             5109 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 639.0+ KB


In [287]:
# One-hot encode the string-valued columns, dropping the first level of each.
ds = pd.get_dummies(
    data=ds,
    columns=columns_dummis,
    prefix_sep='_',
    drop_first=True,
)

In [290]:
# Preview the first three rows after one-hot encoding.
ds.head(3)

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,1,1,0,1,0,0,1,1,0,0
1,61.0,0,0,202.21,29.0,1,0,1,0,0,1,0,0,0,1,0
2,80.0,0,1,105.92,32.5,1,1,1,0,1,0,0,0,0,1,0


In [289]:
from sklearn.model_selection import train_test_split

In [292]:
# 70/30 split, stratified on the target column with a fixed seed.
ds_train, ds_test = train_test_split(
    ds,
    test_size=0.3,
    random_state=42,
    stratify=ds['stroke'],
)

In [294]:
# Class shares in each split, shown side by side.
(
    ds_train['stroke'].value_counts(normalize=True),
    ds_test['stroke'].value_counts(normalize=True),
)

(0    0.951342
 1    0.048658
 Name: stroke, dtype: float64,
 0    0.951076
 1    0.048924
 Name: stroke, dtype: float64)

In [265]:
# Derive the label vectors from the stratified split so that this cell does not
# rely on variables left over from an earlier kernel session.
# NOTE(review): assumes y_train / y_test are simply the `stroke` column of
# ds_train / ds_test — the recorded counts below are consistent with that; confirm.
y_train = ds_train['stroke']
y_test = ds_test['stroke']
y_train.value_counts(), y_test.value_counts()

(0    3402
 1     174
 Name: stroke, dtype: int64,
 0    1458
 1      75
 Name: stroke, dtype: int64)

In [266]:
# Minority rows to add so that class 1 reaches 80% of the size of class 0.
class_counts = y_train.value_counts()
add_rows = int(class_counts[0] * .8 - class_counts[1])
add_rows

2547

In [267]:
# Whole number of extra minority copies needed, next to the current class counts.
(
    int(add_rows / y_train.value_counts()[1]),
    y_train.value_counts(),
)

(14,
 0    3402
 1     174
 Name: stroke, dtype: int64)

In [268]:
# Number of training labels.
y_train.shape

(3576,)

In [269]:
# Split the training labels by class (0 = no stroke, 1 = stroke).
y_train_zero = y_train.loc[y_train.eq(0)]
y_train_one = y_train.loc[y_train.eq(1)]

In [270]:
# Sanity check: each subset should contain exactly one class.
y_train_zero.value_counts(), y_train_one.value_counts()

(0    3402
 Name: stroke, dtype: int64,
 1    174
 Name: stroke, dtype: int64)

In [271]:
# Oversample the minority class by appending 14 extra copies of the stroke
# labels (14 is int(add_rows / minority count), as computed above).
# Series.append was removed in pandas 2.0; a single pd.concat gives the same
# result and avoids re-copying the growing Series on every iteration.
# NOTE(review): only the labels are duplicated here — the matching feature
# rows must be repeated the same way; confirm where that happens.
n_copies = 14
y_train = pd.concat([y_train] + [y_train_one] * n_copies)

In [272]:
# Class shares of the training labels after oversampling.
y_train.value_counts(normalize=True)

0    0.565868
1    0.434132
Name: stroke, dtype: float64

In [275]:
from sklearn.utils import shuffle

In [257]:
# Number of minority-class (stroke) labels currently in y_train.
# NOTE(review): the recorded output (2850816 = 174 * 2**14) does not match the
# class shares shown earlier; it presumably comes from an older run in which
# the minority class was doubled 14 times — re-run on a fresh kernel.
y_train[y_train == 1].shape

(2850816,)

In [124]:
# Independent (deep) copies of the features and labels for the tree-based models.
# NOTE(review): X and y are not defined in any visible cell — confirm their origin.
X_tree = X.copy()
y_tree = y.copy()

### ---------- Random Forest ------------------

In [126]:
from sklearn.ensemble import RandomForestClassifier

In [127]:
# Random forest with 1000 bootstrapped trees, using all CPU cores.
# max_leaf_nodes must be None or an integer >= 2; the previous value of 1 is
# rejected by scikit-learn when the model is fitted. None leaves the number of
# leaves per tree unlimited.
model_rf = RandomForestClassifier(n_estimators=1000, max_leaf_nodes=None, bootstrap=True, random_state=42, n_jobs=-1)

In [None]:
for _ in ['gini', 'entropy']:
    