In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

In [2]:

# Load only the columns this notebook uses. `usecols` stops pandas from parsing
# the rest of the CSV; the trailing selection keeps the columns in the order
# listed here (usecols on its own keeps the file's column order).
SELECTED_COLUMNS = ['Age', 'Pclass', 'Fare', 'Survived']
df = pd.read_csv('titanic.csv', usecols=SELECTED_COLUMNS)[SELECTED_COLUMNS]

In [3]:
# Preview five random rows; the fixed seed keeps the preview identical on every run.
df.sample(5, random_state=2)

Unnamed: 0,Age,Pclass,Fare,Survived
135,23.0,2,15.0458,0
771,48.0,3,7.8542,0
697,,3,7.7333,1
864,24.0,2,13.0,0
448,5.0,3,19.2583,1


In [4]:
# Share of missing values in each column, expressed as a percentage.
df.isna().mean().mul(100)

Age         19.86532
Pclass       0.00000
Fare         0.00000
Survived     0.00000
dtype: float64

In [6]:
# Separate the target column from the remaining feature columns.
y = df['Survived']
X = df.drop('Survived', axis=1)

In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [8]:
# Show the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,Age,Pclass,Fare
30,40.0,1,27.7208
10,4.0,3,16.7
873,47.0,3,9.0
182,9.0,3,31.3875
876,20.0,3,9.8458


In [14]:
# Impute missing values from the 2 nearest rows, weighted by distance.
# The imputer is fitted on the training split only and then reused to
# transform the test split.
# NOTE(review): the features are imputed on their raw scales; the neighbour
# distances may be dominated by the widest-ranging column - consider scaling first.
knn = KNNImputer(weights='distance', n_neighbors=2)

X_train_trf, X_test_trf = knn.fit_transform(X_train), knn.transform(X_test)

In [18]:
# Train a logistic-regression classifier on the KNN-imputed training features,
# predict on the imputed test features and report the test accuracy.
lr = LogisticRegression().fit(X_train_trf, y_train)
y_pred = lr.predict(X_test_trf)

accuracy_score(y_true=y_test, y_pred=y_pred)  # 0.7150837988826816

0.7150837988826816

In [13]:
# Sweep the number of neighbours (k = 1..10) with distance-weighted KNN imputation.
# Everything created inside the loop has its own name and a fresh classifier is
# fitted for each k, so the sweep leaves `knn`, `lr`, `X_train_trf`, `X_test_trf`
# and `y_pred` from the earlier cells untouched.
# NOTE: every k is scored on the held-out test split, so the best score found here
# is an optimistic estimate; compare k with cross-validation on the training split
# before relying on it.

for k in range(1, 11):
    knn_k = KNNImputer(n_neighbors=k, weights='distance')
    X_train_k = knn_k.fit_transform(X_train)
    X_test_k = knn_k.transform(X_test)

    lr_k = LogisticRegression()
    lr_k.fit(X_train_k, y_train)

    accuracy = accuracy_score(y_test, lr_k.predict(X_test_k))

    print(f"Accuracy for k={k} is {accuracy}")  # k = 2 looks good here

Accuracy for k=1 is 0.6871508379888268
Accuracy for k=2 is 0.7150837988826816
Accuracy for k=3 is 0.7150837988826816
Accuracy for k=4 is 0.7094972067039106
Accuracy for k=5 is 0.7094972067039106
Accuracy for k=6 is 0.7039106145251397
Accuracy for k=7 is 0.7039106145251397
Accuracy for k=8 is 0.7039106145251397
Accuracy for k=9 is 0.7039106145251397
Accuracy for k=10 is 0.7039106145251397


In [19]:
# Sweep the number of neighbours (k = 1..10) with uniformly weighted KNN imputation.
# Everything created inside the loop has its own name and a fresh classifier is
# fitted for each k, so the sweep leaves `knn`, `lr`, `X_train_trf`, `X_test_trf`
# and `y_pred` from the earlier cells untouched.
# NOTE: every k is scored on the held-out test split, so these scores are an
# optimistic basis for choosing between weighting schemes.

for k in range(1, 11):
    knn_k = KNNImputer(n_neighbors=k, weights='uniform')
    X_train_k = knn_k.fit_transform(X_train)
    X_test_k = knn_k.transform(X_test)

    lr_k = LogisticRegression()
    lr_k.fit(X_train_k, y_train)

    accuracy = accuracy_score(y_test, lr_k.predict(X_test_k))

    print(f"Accuracy for k={k} is {accuracy}")  # distance is giving better performance than uniform

Accuracy for k=1 is 0.6871508379888268
Accuracy for k=2 is 0.7094972067039106
Accuracy for k=3 is 0.7039106145251397
Accuracy for k=4 is 0.7039106145251397
Accuracy for k=5 is 0.7039106145251397
Accuracy for k=6 is 0.7039106145251397
Accuracy for k=7 is 0.7039106145251397
Accuracy for k=8 is 0.6983240223463687
Accuracy for k=9 is 0.7039106145251397
Accuracy for k=10 is 0.7039106145251397


In [16]:
# Comparison baseline: SimpleImputer filling missing values with the column mean.
# As with the KNN imputer, it is fitted on the training split only.

si = SimpleImputer(strategy='mean').fit(X_train)

X_train_trf2 = si.transform(X_train)
X_test_trf2 = si.transform(X_test)

In [17]:
# Fit a fresh logistic-regression model on the mean-imputed training features
# and report its accuracy on the mean-imputed test features.
lr = LogisticRegression().fit(X_train_trf2, y_train)
y_pred2 = lr.predict(X_test_trf2)

accuracy_score(y_true=y_test, y_pred=y_pred2)

0.6927374301675978

As we can see, KNN imputation (k=2, distance weights) gave a test accuracy of about 71.5%, while mean imputation with SimpleImputer gave about 69.3%.