# KNN Imputer (multivariate)

In [1]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the Titanic sample data through seaborn and keep only the three
# predictors (age, pclass, fare) together with the label column (survived).
df = sns.load_dataset('titanic').loc[:, ['age', 'pclass', 'fare', 'survived']]

In [3]:
# preview the first five rows to sanity-check the selected columns
df.head(5)

Unnamed: 0,age,pclass,fare,survived
0,22.0,3,7.25,0
1,38.0,1,71.2833,1
2,26.0,3,7.925,1
3,35.0,1,53.1,1
4,35.0,3,8.05,0


In [4]:
# percentage of missing entries per column:
# the mean of the boolean "is missing" mask, scaled to 0-100
df.isna().mean().mul(100)

age         19.86532
pclass       0.00000
fare         0.00000
survived     0.00000
dtype: float64

In [5]:
# separate the label (y) from the predictors (X = every remaining column)
y = df['survived']
X = df.drop(columns='survived')

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the scores further down, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [72]:
# peek at the training features (rows keep the labels they had in `df`)
X_train.head(5)

Unnamed: 0,age,pclass,fare
30,40.0,1,27.7208
10,4.0,3,16.7
873,47.0,3,9.0
182,9.0,3,31.3875
876,20.0,3,9.8458


In [7]:
# KNN imputation: each missing value is filled from the 4 nearest rows,
# with closer neighbours weighted more heavily (weights='distance').
# The imputer is fitted on the training split only and then reused as-is on
# the test split, so no information from the test rows leaks into the fit.
# NOTE(review): the features are fed in unscaled, so `fare` (much wider range
# than `pclass`) presumably dominates the neighbour distance - consider
# scaling before imputing and confirm the effect on the results.
knn = KNNImputer(weights='distance', n_neighbors=4)
X_train_trf = knn.fit(X_train).transform(X_train)
X_test_trf = knn.transform(X_test)

In [8]:
# The imputer returns a plain array, so wrap it in a DataFrame again to get
# the column names back for inspection. Row labels restart at 0 here; pass
# index=X_train.index as well if the original labels are needed.
pd.DataFrame(
    data=X_train_trf,
    columns=X_train.columns,
)

Unnamed: 0,age,pclass,fare
0,40.000000,1.0,27.7208
1,4.000000,3.0,16.7000
2,47.000000,3.0,9.0000
3,9.000000,3.0,31.3875
4,20.000000,3.0,9.8458
...,...,...,...
707,30.000000,3.0,8.6625
708,24.845930,3.0,8.7125
709,71.000000,1.0,49.5042
710,31.776658,1.0,221.7792


In [9]:
# fit a default logistic regression on the KNN-imputed training features
lr = LogisticRegression().fit(X_train_trf, y_train)

# score it on the held-out rows (imputed with the same fitted KNN imputer)
y_pred = lr.predict(X_test_trf)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7039106145251397

In [11]:
# Baseline for comparison: univariate imputation that fills each column's
# gaps with that column's training-set mean (SimpleImputer's default strategy).
si = SimpleImputer(strategy='mean')
X_train_trf2 = si.fit(X_train).transform(X_train)
X_test_trf2 = si.transform(X_test)

In [13]:
# Fit the same default logistic regression on the mean-imputed features.
# Note: this rebinds `lr`, so the model trained on the KNN-imputed data is no
# longer reachable under that name after this cell runs.
lr = LogisticRegression().fit(X_train_trf2, y_train)

# held-out accuracy of the mean-imputation baseline
y_pred2 = lr.predict(X_test_trf2)
accuracy_score(y_true=y_test, y_pred=y_pred2)

0.6927374301675978