# McKinsey Analytics Online Hackathon - Healthcare Analytics

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline

In [2]:
train = pd.read_csv("train.csv")

In [3]:
train.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [4]:
train['work_type'].describe()

count       43400
unique          5
top       Private
freq        24834
Name: work_type, dtype: object

In [5]:
train['smoking_status'].describe()

count            30108
unique               3
top       never smoked
freq             16053
Name: smoking_status, dtype: object

In [6]:
train.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,43400.0,43400.0,43400.0,43400.0,43400.0,41938.0,43400.0
mean,36326.14235,42.217894,0.093571,0.047512,104.48275,28.605038,0.018041
std,21072.134879,22.519649,0.291235,0.212733,43.111751,7.77002,0.133103
min,1.0,0.08,0.0,0.0,55.0,10.1,0.0
25%,18038.5,24.0,0.0,0.0,77.54,23.2,0.0
50%,36351.5,44.0,0.0,0.0,91.58,27.7,0.0
75%,54514.25,60.0,0.0,0.0,112.07,32.9,0.0
max,72943.0,82.0,1.0,1.0,291.05,97.6,1.0


In [7]:
test = pd.read_csv("test.csv")

In [8]:
test.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,36306,Male,80.0,0,0,Yes,Private,Urban,83.84,21.1,formerly smoked
1,61829,Female,74.0,0,1,Yes,Self-employed,Rural,179.5,26.0,formerly smoked
2,14152,Female,14.0,0,0,No,children,Rural,95.16,21.2,
3,12997,Male,28.0,0,0,No,Private,Urban,94.76,23.4,
4,40801,Female,63.0,0,0,Yes,Govt_job,Rural,83.57,27.6,never smoked


In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
sub_ids = test['id']
test = test.drop('id', axis=1)

In [11]:
y = train['stroke']
train = train.drop(['id', 'stroke'], axis=1)

In [12]:
train['bmi'].fillna(train['bmi'].median(),inplace=True)
test['bmi'].fillna(train['bmi'].median(),inplace=True)
train['smoking_status'].fillna(' ', inplace=True)
test['smoking_status'].fillna(' ', inplace=True)

In [13]:
test.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi
count,18601.0,18601.0,18601.0,18601.0,18601.0
mean,42.056504,0.093167,0.048062,104.386359,28.518467
std,22.528018,0.290674,0.213903,42.606714,7.634299
min,0.08,0.0,0.0,55.0,10.2
25%,24.0,0.0,0.0,77.55,23.4
50%,43.0,0.0,0.0,91.83,27.7
75%,60.0,0.0,0.0,112.31,32.6
max,82.0,1.0,1.0,275.72,88.3


In [15]:
## encode categorical fields
obj_cols = ['gender','work_type','Residence_type','ever_married','smoking_status']
encoder = LabelEncoder()
for x in obj_cols:
    encoder.fit(train[x])
    train[x] = encoder.transform(train[x])
    test[x] = encoder.transform(test[x])

In [16]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(train)
train_std = sc.transform(train)
test_std = sc.transform(test)

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

model = KNeighborsClassifier()
print(model)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')


In [26]:
n_neighbors = [70,80,90,100,110,120]
for i in n_neighbors:
    print("N Neighbors: ",i)
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(train,y)
    preds = model.predict_proba(train)
    print(preds[:,1])
    print("ROC AUC Score: ",roc_auc_score(y,preds[:,1]))
    preds = model.predict_proba(test)
    sub = pd.DataFrame({'id':sub_ids,'stroke':preds[:,1]})
    filename='submission_KNN_scaled_N='+str(i)+'.csv'
    sub.to_csv(filename,index=False)
    from IPython.display import FileLink
    FileLink(filename)

N Neighbors:  70
[0.         0.04285714 0.         ... 0.04285714 0.         0.11428571]
ROC AUC Score:  0.8798822809513864
N Neighbors:  80
[0.     0.0625 0.     ... 0.05   0.     0.1   ]
ROC AUC Score:  0.8769394545752208
N Neighbors:  90
[0.         0.05555556 0.         ... 0.04444444 0.         0.08888889]
ROC AUC Score:  0.8736859516575075
N Neighbors:  100
[0.   0.05 0.   ... 0.06 0.   0.1 ]
ROC AUC Score:  0.8712056638248469
N Neighbors:  110
[0.         0.04545455 0.         ... 0.05454545 0.00909091 0.10909091]
ROC AUC Score:  0.8701892897296544
N Neighbors:  120
[0.         0.04166667 0.         ... 0.05       0.00833333 0.1       ]
ROC AUC Score:  0.8703197696816076
