In [63]:
import numpy as np
import pandas as pd
import category_encoders as ce

from sklearn.model_selection import train_test_split, KFold, RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix

In [3]:
def handle_data(csv):
    df = pd.read_csv(csv)
    
    # Encode gender
    df = df[df.gender != "Other"]
    one_hot = ce.OneHotEncoder()
    gender_encoded = one_hot.fit_transform(df.gender)
    df = df.join(gender_encoded)
    del df["gender"]
    
    # Encode work_type, Residence_type, smoking_status features
    target_encoder = ce.TargetEncoder(cols=["work_type", "Residence_type", "smoking_status"])
    df = target_encoder.fit_transform(df, df.stroke)
    
    # fill None values with mean
    df.bmi = df.bmi.fillna(df.bmi.mean())
    
    # Encode ever_marries feature
    df.ever_married = df.ever_married.replace(["Yes", "No"], [1, 0])
    
    # Scaling
    cols_to_scale = ["age", "avg_glucose_level", "bmi"]
    scaler = StandardScaler()
    df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
    
    return df

In [4]:
def split_data(df):
    features = df.loc[:, df.columns != "stroke"]
    target = df.stroke
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=42)
    return x_train, x_test, y_train, y_test

In [5]:
def balance_splitting(df, rows_to_use):
    zero_stroke = df[df.stroke == 0].sample(rows_to_use)
    new_df = df[df.stroke == 1].append(zero_stroke, ignore_index=True)
    features = new_df.loc[:, new_df.columns != "stroke"]
    target = new_df.stroke
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=42)
    return x_train, x_test, y_train, y_test

In [6]:
path = "healthcare.csv"
data = handle_data(path)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [7]:
x_train, x_test, y_train, y_test = split_data(data)

In [8]:
x_train_balanced, x_test_balanced, y_train_balanced, y_test_balanced = balance_splitting(data, 400)

In [9]:
knn = KNeighborsClassifier()
knn

KNeighborsClassifier()

In [11]:
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [12]:
predict =knn.predict(x_test)

In [13]:
accuracy_score(y_test, predict)

0.9374021909233177

In [14]:
confusion_matrix(y_test, predict)

array([[1198,    0],
       [  80,    0]], dtype=int64)

In [16]:
tn, fp,fn, tp = confusion_matrix(y_test, predict).ravel()
(tn, fp, fn, tp)

(1198, 0, 80, 0)

In [17]:
y_test.value_counts()

0    1198
1      80
Name: stroke, dtype: int64

In [18]:
knn.fit(x_train_balanced, y_train_balanced)

KNeighborsClassifier()

In [19]:
predict= knn.predict(x_test_balanced)

In [20]:
accuracy_score(y_test_balanced, predict)

0.5337423312883436

In [21]:
confusion_matrix(y_test_balanced, predict)


array([[71, 26],
       [50, 16]], dtype=int64)

In [23]:
tn, fp, fn, tp = confusion_matrix(y_test_balanced, predict).ravel()
tn, fp, fn, tp

(71, 26, 50, 16)

In [24]:
y_test_balanced.value_counts()

0    97
1    66
Name: stroke, dtype: int64

In [25]:
predict_2 = knn.predict(x_test)

In [26]:
accuracy_score(y_test, predict_2)

0.6651017214397497

In [28]:
tn, fp, fn, tp = confusion_matrix(y_test, predict_2).ravel()
tn, fp, fn, tp

(818, 380, 48, 32)

In [29]:
y_test.value_counts()

0    1198
1      80
Name: stroke, dtype: int64

In [30]:
knn = KNeighborsClassifier(20)

In [31]:
knn.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=20)

In [33]:
predict = knn.predict(x_test)

In [34]:
accuracy_score(y_test, predict)

0.9374021909233177

In [36]:
tn, fp, fn, tp = confusion_matrix(y_test,predict).ravel()
(tn, fp, fn, tp)

(1198, 0, 80, 0)

In [43]:
knn = KNeighborsClassifier(10)
knn.fit(x_train_balanced, y_train_balanced)
predict = knn.predict(x_test_balanced)
print(accuracy_score(y_test_balanced, predict))
tn, fp, fn, tp = confusion_matrix(y_test_balanced, predict).ravel()
(tn, fp, fn, tp)

0.5460122699386503


(83, 14, 60, 6)

In [44]:
knn = KNeighborsClassifier()
knn.fit(x_train_balanced, y_train_balanced)
predict = knn.predict(x_test_balanced)
print(accuracy_score(y_test_balanced, predict))
tn, fp, fn, tp = confusion_matrix(y_test_balanced, predict).ravel()
(tn, fp, fn, tp)

0.5337423312883436


(71, 26, 50, 16)

In [45]:
knn = KNeighborsClassifier(8)
knn.fit(x_train_balanced, y_train_balanced)
predict = knn.predict(x_test_balanced)
print(accuracy_score(y_test_balanced, predict))
tn, fp, fn, tp = confusion_matrix(y_test_balanced, predict).ravel()
(tn, fp, fn, tp)

0.5521472392638037


(86, 11, 62, 4)

In [54]:
def knn_classifier(neighbors, distance_metric, weights = 'distance'):
    knn = KNeighborsClassifier(neighbors, metric = distance_metric, weights = 'distance')
    knn.fit(x_train_balanced, y_train_balanced)
    predict = knn.predict(x_test_balanced)
    print(accuracy_score(y_test_balanced, predict))
    tn, fp, fn, tp = confusion_matrix(y_test_balanced, predict).ravel()
    return accuracy_score(y_test_balanced, predict),(tn, fp, fn, tp)

In [55]:
accuracy, confusion = knn_classifier(10, 'manhattan')
print(accuracy)
print(confusion)

0.5030674846625767
0.5030674846625767
(65, 32, 49, 17)


In [56]:
accuracy, confusion = knn_classifier(8, 'manhattan')
print(accuracy)
print(confusion)

0.5214723926380368
0.5214723926380368
(67, 30, 48, 18)


In [57]:
accuracy, confusion = knn_classifier(20, 'manhattan')
print(accuracy)
print(confusion)

0.5276073619631901
0.5276073619631901
(70, 27, 50, 16)


In [58]:
accuracy, confusion = knn_classifier(3, 'manhattan')
print(accuracy)
print(confusion)

0.5398773006134969
0.5398773006134969
(66, 31, 44, 22)


In [59]:
y_test_balanced.value_counts()

0    97
1    66
Name: stroke, dtype: int64

In [60]:
kf = KFold()
kf

KFold(n_splits=5, random_state=None, shuffle=False)

In [68]:
features = np.array(data.loc[:, data.columns!= 'stroke'])
target= np.array(data.stroke)

for train_index, test_index in kf.split(features):
    x_train, x_test = features[train_index], features[test_index]
    y_train, y_test = target[train_index], target[test_index]
    knn = KNeighborsClassifier(3, metric='manhattan', weights='distance')
    knn.fit(x_train, y_train)
    predict = knn.predict(x_test)
    print(accuracy_score(y_test, predict))
    tn, fp, fn, tp = confusion_matrix(y_test, predict).ravel()
    print(tn, fp, fn, tp)

0.7563600782778865
773 0 249 0
0.974559686888454
996 26 0 0
0.9735812133072407
995 27 0 0
0.9794520547945206
1001 21 0 0
0.9666993143976493
987 34 0 0


In [None]:
kf.split

In [66]:
rkf =RepeatedKFold(n_repeats=2)
rkf

RepeatedKFold(n_repeats=2, n_splits=5, random_state=None)

In [69]:
features = np.array(data.loc[:, data.columns!= 'stroke'])
target= np.array(data.stroke)

for train_index, test_index in rkf.split(features):
    x_train, x_test = features[train_index], features[test_index]
    y_train, y_test = target[train_index], target[test_index]
    knn = KNeighborsClassifier(3, metric='manhattan', weights='distance')
    knn.fit(x_train, y_train)
    predict = knn.predict(x_test)
    print(accuracy_score(y_test, predict))
    tn, fp, fn, tp = confusion_matrix(y_test, predict).ravel()
    print(tn, fp, fn, tp)

0.9315068493150684
951 18 52 1
0.9275929549902152
946 24 50 2
0.9363992172211351
956 14 51 1
0.9403131115459883
961 18 43 0
0.9275220372184133
946 26 48 1
0.9403131115459883
959 24 37 2
0.9266144814090019
947 18 57 0
0.9334637964774951
953 15 53 1
0.9354207436399217
956 19 47 0
0.9333986287952988
950 19 49 3
