In [None]:
# pathlib ships with the Python standard library (3.4+); no pip install is required
%matplotlib inline
from pathlib import Path
import math
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

In [None]:
# Directory that holds the data files.
# "." means the CSV is expected to sit next to this notebook; point DATA
# elsewhere if the files live in a different folder.
DATA = Path(".")

In [None]:
# Load the stroke dataset from the data directory, then print a summary of
# its columns, non-null counts and dtypes.
stroke_df = pd.read_csv(
    filepath_or_buffer=DATA / 'healthcare-dataset-stroke-data.csv',
)
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [None]:
# Keep only the rows whose smoking status is recorded (i.e. not 'Unknown').
stroke_df = stroke_df.loc[stroke_df['smoking_status'] != 'Unknown']

In [None]:
# Discard every row that still has a missing value in any column
# (in the info() summary above, bmi is the only column with nulls).
stroke_df = stroke_df.dropna(axis=0, how='any')

In [None]:
# Show the frame after the 'Unknown' smoking-status filter and the dropna step.
stroke_df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5100,68398,Male,82.0,1,0,Yes,Self-employed,Rural,71.97,28.3,never smoked,0
5102,45010,Female,57.0,0,0,Yes,Private,Rural,77.93,21.7,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0


In [None]:
# One-hot encode the categorical (object-typed) columns; numeric columns pass
# through unchanged.
# dtype=int pins the indicator columns to 0/1 integers. Without it, pandas 2.x
# returns booleans (True/False), which would no longer match the 0/1 encoding
# used for the hand-built patient row later in this notebook.
stroke_df = pd.get_dummies(stroke_df, dtype=int)
stroke_df

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,...,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,0,1,228.69,36.6,1,0,1,0,...,0,0,1,0,0,0,1,1,0,0
2,31112,80.0,0,1,105.92,32.5,1,0,1,0,...,0,0,1,0,0,1,0,0,1,0
3,60182,49.0,0,0,171.23,34.4,1,1,0,0,...,0,0,1,0,0,0,1,0,0,1
4,1665,79.0,1,0,174.12,24.0,1,1,0,0,...,0,0,0,1,0,1,0,0,1,0
5,56669,81.0,0,0,186.21,29.0,1,0,1,0,...,0,0,1,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5100,68398,82.0,1,0,71.97,28.3,0,0,1,0,...,0,0,0,1,0,1,0,0,1,0
5102,45010,57.0,0,0,77.93,21.7,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
5106,44873,81.0,0,0,125.20,40.0,0,1,0,0,...,0,0,0,1,0,0,1,0,1,0
5107,19723,35.0,0,0,82.99,30.6,0,1,0,0,...,0,0,0,1,0,1,0,0,1,0


In [None]:
# Remove the row identifier, both marital-status indicators, and the Rural
# indicator (Residence_type_Urban stays as the single residence column).
stroke_df = stroke_df.drop(
    ['id', 'ever_married_No', 'ever_married_Yes', 'Residence_type_Rural'],
    axis='columns',
)

In [None]:
# Column names that remain after encoding and dropping (outcome included).
stroke_df.columns.tolist()

['age',
 'hypertension',
 'heart_disease',
 'avg_glucose_level',
 'bmi',
 'stroke',
 'gender_Female',
 'gender_Male',
 'gender_Other',
 'work_type_Govt_job',
 'work_type_Never_worked',
 'work_type_Private',
 'work_type_Self-employed',
 'work_type_children',
 'Residence_type_Urban',
 'smoking_status_formerly smoked',
 'smoking_status_never smoked',
 'smoking_status_smokes']

In [None]:
# Hold out 40% of the rows for validation; the fixed random_state makes the
# partition repeatable across runs.
train_df, valid_df = train_test_split(
    stroke_df,
    test_size=0.4,
    random_state=1,
)
print('Training Set: ', train_df.shape, 'Validation Set: ', valid_df.shape)

Training Set:  (2055, 18) Validation Set:  (1371, 18)


In [None]:
# Hypothetical new patient to classify (described by the author as a likely
# stroke case). A single-row frame with one entry per predictor column; there
# is deliberately no 'stroke' entry because that is what gets predicted.
patient = pd.DataFrame(
    {
        'age': [80],
        'hypertension': [1],
        'heart_disease': [1],
        'avg_glucose_level': [200],
        'bmi': [40],
        # gender indicators
        'gender_Female': [0],
        'gender_Male': [1],
        'gender_Other': [0],
        # work-type indicators
        'work_type_Govt_job': [1],
        'work_type_Never_worked': [0],
        'work_type_Private': [0],
        'work_type_Self-employed': [0],
        'work_type_children': [0],
        # residence indicator
        'Residence_type_Urban': [1],
        # smoking-status indicators
        'smoking_status_formerly smoked': [0],
        'smoking_status_never smoked': [0],
        'smoking_status_smokes': [1],
    }
)
patient

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,80,1,1,200,40,0,1,0,1,0,0,0,0,1,0,0,1


In [None]:
# Standardise the predictors.
# The scaler is fitted on the training rows only, so neither the validation
# rows nor the new patient influence the centring/scaling statistics.
outcome = 'stroke'
predictors = list(stroke_df.columns)
predictors.remove(outcome)
scaler = preprocessing.StandardScaler()
scaler.fit(train_df[predictors])

# Apply the fitted transformation to both sets and to the new patient.
train_X = scaler.transform(train_df[predictors])
train_Y = train_df[outcome]
valid_X = scaler.transform(valid_df[predictors])
valid_Y = valid_df[outcome]
# Select the patient's columns by name so they are guaranteed to arrive in the
# same order the scaler was fitted on, rather than relying on the order in
# which the patient dict happened to be typed. A missing predictor raises a
# KeyError here instead of producing a silently misaligned row.
patientNorm = scaler.transform(patient[predictors])

In [None]:
# Nearest-neighbour classifier with a single neighbour (K=1), fitted on the
# standardised training predictors; fit() returns the estimator itself, so the
# fitted model is bound to `knn` and then displayed.
knn = KNeighborsClassifier(n_neighbors=1).fit(train_X, train_Y)
knn

KNeighborsClassifier(n_neighbors=1)

In [None]:
# Predicted class for the standardised patient row, using whichever model is
# currently bound to `knn`; the label is a value of the `stroke` outcome column.
knn.predict(patientNorm)

array([1])

In [None]:
# Estimated class probabilities for the patient: one column per class, in the
# order given by knn.classes_ (the `stroke` values seen during training).
knn.predict_proba(patientNorm)

array([[0., 1.]])

In [None]:
# Validation accuracy for odd neighbourhood sizes k = 1, 3, ..., 19.
# NOTE: each iteration rebinds `knn`, so after this cell `knn` is the k=19
# model rather than the K=1 model fitted earlier.
# NOTE(review): accuracy alone can look strong when one outcome class is rare;
# consider checking a confusion matrix as well -- TODO confirm class balance.
results = []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train_X, train_Y)
    results.append(
        {
            'k': k,
            'accuracy': accuracy_score(valid_Y, knn.predict(valid_X)),
        }
    )

# One row per k, with columns 'k' and 'accuracy'.
results = pd.DataFrame(results)
results

Unnamed: 0,k,accuracy
0,1,0.901532
1,3,0.942378
2,5,0.948213
3,7,0.949672
4,9,0.951131
5,11,0.951131
6,13,0.951131
7,15,0.951131
8,17,0.951131
9,19,0.950401
