In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer,KNNImputer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [9]:
# Read only the target (Survived) and the three numeric features used below.
df = pd.read_csv(
    'train.csv',
    usecols=['Survived', 'Pclass', 'Age', 'Fare'],
)
df

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,22.0,7.2500
1,1,1,38.0,71.2833
2,1,3,26.0,7.9250
3,1,1,35.0,53.1000
4,0,3,35.0,8.0500
...,...,...,...,...
886,0,2,27.0,13.0000
887,1,1,19.0,30.0000
888,0,3,,23.4500
889,1,1,26.0,30.0000


In [10]:
# Share of missing values per column, expressed as a percentage.
df.isna().mean().mul(100)

Survived     0.00000
Pclass       0.00000
Age         19.86532
Fare         0.00000
dtype: float64

In [11]:
# Target column first, then every remaining column as the feature matrix.
y = df['Survived']
X = df.drop(columns='Survived')

In [12]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [15]:
# Display the training features; the missing Age values are still present here.
X_train

Unnamed: 0,Pclass,Age,Fare
30,1,40.0,27.7208
10,3,4.0,16.7000
873,3,47.0,9.0000
182,3,9.0,31.3875
876,3,20.0,9.8458
...,...,...,...
534,3,30.0,8.6625
584,3,,8.7125
493,1,71.0,49.5042
527,1,,221.7792


# Mean imputer

In [20]:
# Baseline: replace each missing value with the mean of its column.
mi = SimpleImputer(strategy='mean')

# Learn the column means from the training split only, then apply them to
# both splits so no information from the test rows enters the imputer.
mi.fit(X_train)
X_train_mi = mi.transform(X_train)
X_test_mi = mi.transform(X_test)

# Train a default logistic regression on the imputed data and score it.
lr = LogisticRegression()
y_pred_mi = lr.fit(X_train_mi, y_train).predict(X_test_mi)

print(accuracy_score(y_test, y_pred_mi))

0.6927374301675978


# KNN imputer

In [21]:
# Fill each missing value from the 3 nearest rows, weighting closer rows more.
knn = KNNImputer(n_neighbors=3, weights='distance')

# Fit on the training split only, then impute both splits with that fit.
knn.fit(X_train)
X_train_knn = knn.transform(X_train)
X_test_knn = knn.transform(X_test)

# Same classifier as the mean-imputer cell so the two accuracies are comparable.
lr = LogisticRegression()
y_pred_knn = lr.fit(X_train_knn, y_train).predict(X_test_knn)

print(accuracy_score(y_test, y_pred_knn))

0.7150837988826816


# Best KNN

In [30]:

def bestKNNImuter(n):
    """Find the KNNImputer neighbour count in 1..n with the best accuracy.

    For every candidate k a distance-weighted ``KNNImputer`` is fitted on the
    notebook-level ``X_train`` and used to impute ``X_train`` and ``X_test``.
    A default ``LogisticRegression`` is then trained on the imputed training
    data and scored with ``accuracy_score`` against ``y_test``.  Each
    accuracy/k pair is printed as it is computed.

    Note: k is selected by its score on the test split, so the returned
    accuracy is an optimistic estimate.  Use a separate validation split or
    cross-validation if the figure is going to be reported.

    Parameters
    ----------
    n : int
        Largest number of neighbours to try; must be at least 1.

    Returns
    -------
    tuple
        ``(best_k, best_accuracy)``.  When several k tie for the best
        accuracy, the smallest of them is returned.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1, because there is then no candidate to
        evaluate and no valid neighbour count to return.
    """
    if n < 1:
        raise ValueError("n must be at least 1, got {!r}".format(n))

    best_k = None
    best_acc = None
    for neighbour in range(1, n + 1):
        knn = KNNImputer(n_neighbors=neighbour, weights='distance')

        # Fit the imputer on the training split only, then impute both splits.
        X_train_trf = knn.fit_transform(X_train)
        X_test_trf = knn.transform(X_test)

        lr = LogisticRegression()
        lr.fit(X_train_trf, y_train)

        y_pred = lr.predict(X_test_trf)
        acc = accuracy_score(y_test, y_pred)
        print(acc," ",neighbour)

        # Strict '>' keeps the earliest (smallest) k on ties; the None check
        # makes sure the first candidate is always recorded, even at 0.0.
        if best_acc is None or acc > best_acc:
            best_acc = acc
            best_k = neighbour
    return best_k, best_acc


In [32]:
# Evaluate k = 1..50 and keep the neighbour count with the highest accuracy
# on the test split, together with that accuracy.
# NOTE(review): the saved output below stops at k = 42 and prints 0 as the
# best k, which does not match this call as written — presumably stale output
# from an earlier run; re-run on a fresh kernel to confirm.
bestForK, accuracy = bestKNNImuter(50)
print(bestForK)
# Last expression of the cell, so the notebook displays the best accuracy.
accuracy

0.6871508379888268   1
0.7150837988826816   2
0.7150837988826816   3
0.7094972067039106   4
0.7094972067039106   5
0.7039106145251397   6
0.7039106145251397   7
0.7039106145251397   8
0.7039106145251397   9
0.7039106145251397   10
0.7039106145251397   11
0.7039106145251397   12
0.7039106145251397   13
0.7039106145251397   14
0.7039106145251397   15
0.7039106145251397   16
0.7039106145251397   17
0.7039106145251397   18
0.7039106145251397   19
0.7039106145251397   20
0.7039106145251397   21
0.7039106145251397   22
0.7039106145251397   23
0.7039106145251397   24
0.7039106145251397   25
0.7039106145251397   26
0.7039106145251397   27
0.7039106145251397   28
0.7039106145251397   29
0.7039106145251397   30
0.7039106145251397   31
0.7039106145251397   32
0.7039106145251397   33
0.7039106145251397   34
0.7039106145251397   35
0.7039106145251397   36
0.7039106145251397   37
0.7039106145251397   38
0.7039106145251397   39
0.7039106145251397   40
0.7039106145251397   41
0.7039106145251397   42
0

0.7150837988826816