In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import collections
import math
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import collections
import imblearn.under_sampling
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from keras.models import Sequential
from keras.layers import Activation, Dense, Conv1D
from keras import losses
from tensorflow.keras import optimizers
import tensorflow as tf

import matplotlib.pyplot as plt



In [24]:
# Load the cleaned stroke dataset and preview its first rows.
df = pd.read_csv('../data/stroke_data_cleanedJB.csv')
df.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,never smoked,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,never smoked,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,children,Rural,161.28,19.1,never smoked,0


In [25]:
# Start with an all-missing numeric column; rows whose smoking status matches
# none of the labels assigned afterwards stay NaN.
# `np.nan` is the canonical spelling: the `np.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
df["smoking_status_cardinal"] = np.nan

In [26]:
# Map each known smoking label to its numeric code; rows with any other
# (or missing) label are left untouched.
for status_label, status_code in (
    ("never smoked", 0),
    ("formerly smoked", 1),
    ("smokes", 2),
):
    df.loc[df["smoking_status"].eq(status_label), "smoking_status_cardinal"] = status_code

In [27]:
# Compare the text label with its numeric code for the first 20 rows.
df.loc[:, ['smoking_status', 'smoking_status_cardinal']].head(20)

Unnamed: 0,smoking_status,smoking_status_cardinal
0,never smoked,0.0
1,never smoked,0.0
2,never smoked,0.0
3,formerly smoked,1.0
4,never smoked,0.0
5,,
6,formerly smoked,1.0
7,never smoked,0.0
8,smokes,2.0
9,never smoked,0.0


In [31]:
# Binary-encode gender in place: Male -> 0, Female -> 1.
# Both masks are taken before writing; the result is the same because the
# written values are integers and can never match the string labels.
is_male = df["gender"].eq("Male")
is_female = df["gender"].eq("Female")
df.loc[is_male, "gender"] = 0
df.loc[is_female, "gender"] = 1

In [30]:
# Binary-encode marital history in place: No -> 0, Yes -> 1.
for answer, flag in (("No", 0), ("Yes", 1)):
    df.loc[df["ever_married"] == answer, "ever_married"] = flag

In [29]:
# Binary-encode the residence type in place: Rural -> 0, Urban -> 1.
for area, flag in (("Rural", 0), ("Urban", 1)):
    df.loc[df["Residence_type"] == area, "Residence_type"] = flag

In [33]:
# One indicator column per work type, each named "worktype_<category>".
df_onehot_worktype = pd.get_dummies(df['work_type']).add_prefix("worktype_")
df_onehot_worktype

Unnamed: 0,worktype_Govt_job,worktype_Never_worked,worktype_Private,worktype_Self-employed,worktype_children
0,0,0,0,0,1
1,0,0,1,0,0
2,0,0,1,0,0
3,0,0,1,0,0
4,0,0,0,0,1
...,...,...,...,...,...
43384,0,0,0,0,1
43385,1,0,0,0,0
43386,0,0,1,0,0
43387,0,0,1,0,0


In [35]:
# Append the work-type indicator columns to the main frame.
# NOTE: re-running this cell appends the same columns again.
df = pd.concat([df, df_onehot_worktype], axis="columns")

In [37]:
# Keep only numeric model inputs: drop the two text columns that now have
# numeric encodings, plus the row identifier.
df_imputation = df.drop(["smoking_status", "work_type", "id"], axis="columns")

In [38]:
# Display the all-numeric frame that is split and imputed in the following cells.
df_imputation

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,smoking_status_cardinal,worktype_Govt_job,worktype_Never_worked,worktype_Private,worktype_Self-employed,worktype_children
0,0,3.0,0,0,0,0,95.12,18.0,0,0.0,0,0,0,0,1
1,0,58.0,1,0,1,1,87.96,39.2,0,0.0,0,0,1,0,0
2,1,8.0,0,0,0,1,110.89,17.6,0,0.0,0,0,1,0,0
3,1,70.0,0,0,1,0,69.04,35.9,0,1.0,0,0,1,0,0
4,0,14.0,0,0,0,0,161.28,19.1,0,0.0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43384,1,10.0,0,0,0,1,58.64,20.4,0,0.0,0,0,0,0,1
43385,1,56.0,0,0,1,1,213.61,55.4,0,1.0,1,0,0,0,0
43386,1,82.0,1,0,1,1,91.94,28.9,0,1.0,0,0,1,0,0
43387,0,40.0,0,0,1,1,99.16,33.2,0,0.0,0,0,1,0,0


In [41]:
# Reserve one fifth of all stroke cases (rounded) for the test set.
num_stroke_test = round(df_imputation["stroke"].eq(1).sum() / 5)

Generate the test dataset before imputation, taking only rows without any NaN values, so that the test set contains no imputed data.

In [42]:
# Shuffle all rows with a fixed seed, then renumber them 0..n-1.
# NOTE: this overwrites its own input, so re-running the cell shuffles the
# already-shuffled frame again and produces a different order.
shuffled_rows = df_imputation.sample(frac=1, random_state=1)
df_imputation = shuffled_rows.reset_index(drop=True)

In [50]:
# Only rows without any missing value may enter the test set.
complete_rows = df_imputation.dropna()
is_stroke = complete_rows["stroke"] == 1
is_nonstroke = complete_rows["stroke"] == 0
# Take the first `num_stroke_test` rows of each class (the frame was shuffled
# beforehand), which gives a class-balanced test set.
index_test_stroke = complete_rows.loc[is_stroke, :].head(num_stroke_test).index
index_test_nonstroke = complete_rows.loc[is_nonstroke, :].head(num_stroke_test).index

In [52]:
# Assemble the test set: the selected stroke rows first, then the non-stroke rows.
test_stroke_rows = df_imputation.loc[index_test_stroke, :]
test_nonstroke_rows = df_imputation.loc[index_test_nonstroke, :]
test_dataset = pd.concat([test_stroke_rows, test_nonstroke_rows])
test_dataset

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,smoking_status_cardinal,worktype_Govt_job,worktype_Never_worked,worktype_Private,worktype_Self-employed,worktype_children
10,0,58.00,0,1,1,0,63.26,29.6,1,0.0,0,0,1,0,0
53,1,78.00,0,0,1,1,81.59,32.3,1,0.0,0,0,1,0,0
59,0,59.00,0,0,1,0,91.59,31.9,1,2.0,0,0,1,0,0
240,1,68.00,0,0,0,1,231.96,36.7,1,2.0,0,0,0,1,0
326,0,79.00,0,0,1,1,87.06,24.3,1,1.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1,24.00,0,0,0,0,68.13,20.2,0,0.0,0,0,1,0,0
196,1,76.00,0,0,1,1,106.84,27.0,0,1.0,0,0,0,1,0
197,1,0.24,0,0,0,0,89.24,13.8,0,0.0,0,0,0,0,1
198,1,33.00,0,0,0,0,114.27,25.3,0,0.0,1,0,0,0,0


In [55]:
# Every row that was not selected for the test set becomes training data.
train_dataset = (
    df_imputation
    .drop(index=index_test_stroke)
    .drop(index=index_test_nonstroke)
)
train_dataset

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,smoking_status_cardinal,worktype_Govt_job,worktype_Never_worked,worktype_Private,worktype_Self-employed,worktype_children
3,1,65.0,1,0,1,0,81.70,29.5,0,,0,0,0,1,0
17,0,17.0,0,0,0,1,82.57,28.1,0,,0,0,1,0,0
18,1,65.0,0,0,1,0,97.54,24.4,0,,1,0,0,0,0
27,1,57.0,0,0,1,1,139.73,,1,2.0,0,0,0,1,0
30,1,68.0,0,1,1,1,235.75,32.2,0,,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43384,0,3.0,0,0,0,1,58.50,21.0,0,0.0,0,0,0,0,1
43385,1,0.4,0,0,0,1,105.52,13.6,0,0.0,0,0,0,0,1
43386,0,60.0,0,0,1,1,243.65,19.4,0,2.0,0,0,1,0,0
43387,0,46.0,0,0,1,1,62.51,24.1,0,,0,0,0,1,0


In [56]:
# Fill the missing training values from the 5 nearest neighbouring rows.
# NOTE(review): `train_dataset` still contains the `stroke` label and the
# features are unscaled, so both influence the neighbour distances.
# Confirm this is intended.
imp = KNNImputer(n_neighbors=5)
train_dataset_imputed = imp.fit(train_dataset).transform(train_dataset)

In [58]:
# The imputer returns a bare array; restore the column names.
# The row index restarts at 0 because no index is passed.
df_train_dataset_imputed = pd.DataFrame(
    data=train_dataset_imputed,
    columns=train_dataset.columns,
)
df_train_dataset_imputed

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,smoking_status_cardinal,worktype_Govt_job,worktype_Never_worked,worktype_Private,worktype_Self-employed,worktype_children
0,1.0,65.0,1.0,0.0,1.0,0.0,81.70,29.50,0.0,0.6,0.0,0.0,0.0,1.0,0.0
1,0.0,17.0,0.0,0.0,0.0,1.0,82.57,28.10,0.0,0.6,0.0,0.0,1.0,0.0,0.0
2,1.0,65.0,0.0,0.0,1.0,0.0,97.54,24.40,0.0,0.8,1.0,0.0,0.0,0.0,0.0
3,1.0,57.0,0.0,0.0,1.0,1.0,139.73,36.26,1.0,2.0,0.0,0.0,0.0,1.0,0.0
4,1.0,68.0,0.0,1.0,1.0,1.0,235.75,32.20,0.0,0.2,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43070,0.0,3.0,0.0,0.0,0.0,1.0,58.50,21.00,0.0,0.0,0.0,0.0,0.0,0.0,1.0
43071,1.0,0.4,0.0,0.0,0.0,1.0,105.52,13.60,0.0,0.0,0.0,0.0,0.0,0.0,1.0
43072,0.0,60.0,0.0,0.0,1.0,1.0,243.65,19.40,0.0,2.0,0.0,0.0,1.0,0.0,0.0
43073,0.0,46.0,0.0,0.0,1.0,1.0,62.51,24.10,0.0,0.6,0.0,0.0,0.0,1.0,0.0


In [60]:
# Check the dtype of the gender column after imputation.
df_train_dataset_imputed.dtypes["gender"]

dtype('float64')

KNN imputation produces non-integer values for the cardinal smoking status (e.g. 0.6), so they need to be rounded to the nearest integer category:

In [61]:
# Snap the fractional imputed codes back to whole category values, keeping
# the column as float64.
rounded_status = df_train_dataset_imputed["smoking_status_cardinal"].round()
df_train_dataset_imputed.loc[:, "smoking_status_cardinal"] = rounded_status.astype('float64')
df_train_dataset_imputed

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,smoking_status_cardinal,worktype_Govt_job,worktype_Never_worked,worktype_Private,worktype_Self-employed,worktype_children
0,1.0,65.0,1.0,0.0,1.0,0.0,81.70,29.50,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,17.0,0.0,0.0,0.0,1.0,82.57,28.10,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,65.0,0.0,0.0,1.0,0.0,97.54,24.40,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3,1.0,57.0,0.0,0.0,1.0,1.0,139.73,36.26,1.0,2.0,0.0,0.0,0.0,1.0,0.0
4,1.0,68.0,0.0,1.0,1.0,1.0,235.75,32.20,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43070,0.0,3.0,0.0,0.0,0.0,1.0,58.50,21.00,0.0,0.0,0.0,0.0,0.0,0.0,1.0
43071,1.0,0.4,0.0,0.0,0.0,1.0,105.52,13.60,0.0,0.0,0.0,0.0,0.0,0.0,1.0
43072,0.0,60.0,0.0,0.0,1.0,1.0,243.65,19.40,0.0,2.0,0.0,0.0,1.0,0.0,0.0
43073,0.0,46.0,0.0,0.0,1.0,1.0,62.51,24.10,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [64]:
# Split the imputed training frame into the stroke label and its predictors.
y_train = df_train_dataset_imputed["stroke"]
X_train = df_train_dataset_imputed.drop("stroke", axis=1)

# Balance the classes by random over-sampling (fixed seed).
# Despite its name, `rus` holds a RandomOverSampler, not an under-sampler.
rus = RandomOverSampler(random_state=1)
X_train_oversampled, y_train_oversampled = rus.fit_resample(X_train, y_train)

stroke_ratio = y_train_oversampled.sum() / len(y_train_oversampled)
print('Ratio of strokes over the total number of train samples is:', stroke_ratio)

Ratio of strokes over the total number of train samples is: 0.5


In [65]:
# Rejoin the oversampled predictors and label, with the label as the last column.
train_imputed = pd.concat(
    [X_train_oversampled, y_train_oversampled],
    axis="columns",
)
train_imputed.shape

(84898, 15)

In [66]:
# Show the (rows, columns) of the held-out test set.
test_dataset.shape

(314, 15)

In [68]:
# Write both splits to CSV without the index column.
# File names are kept exactly as they were (including "allnumerica"),
# presumably because other notebooks read these paths.
for frame, csv_path in (
    (train_imputed, '../data/train_allnumeric_oversampled_imputed_JB.csv'),
    (test_dataset, '../data/test_allnumerica_JB.csv'),
):
    frame.to_csv(csv_path, index=False)
