In [161]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer,MissingIndicator
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [162]:
# Read only the label and the two numeric features studied in this notebook.
df = pd.read_csv(
    "train.csv",
    usecols=["Age", "Fare", "Survived"],
)
df

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.2500
1,1,38.0,71.2833
2,1,26.0,7.9250
3,1,35.0,53.1000
4,0,35.0,8.0500
...,...,...,...
886,0,27.0,13.0000
887,1,19.0,30.0000
888,0,,23.4500
889,1,26.0,30.0000


In [163]:
# Fraction of missing values per column (mean of the boolean NaN mask).
df.isna().mean()

Survived    0.000000
Age         0.198653
Fare        0.000000
dtype: float64

In [164]:
# Separate the label column from the predictor columns.
y = df["Survived"]
X = df.drop("Survived", axis="columns")

In [165]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [166]:
# Preview the training features produced by the split (Age still contains NaN).
X_train

Unnamed: 0,Age,Fare
30,40.0,27.7208
10,4.0,16.7000
873,47.0,9.0000
182,9.0,31.3875
876,20.0,9.8458
...,...,...
534,30.0,8.6625
584,,8.7125
493,71.0,49.5042
527,,221.7792


# Without missing indicator

In [167]:
# Baseline imputation with no missing-value flag: learn the per-column fill
# values on the training rows only, then apply them to both splits.
# strategy="mean" is SimpleImputer's default, spelled out here for the reader.
si = SimpleImputer(strategy="mean")
si.fit(X_train)
X_train_trf = si.transform(X_train)
X_test_trf = si.transform(X_test)

In [168]:

# Train logistic regression on the imputed training features and report
# accuracy on the held-out rows (the cell's last expression is displayed).
clf = LogisticRegression().fit(X_train_trf, y_train)
y_pred = clf.predict(X_test_trf)
accuracy_score(y_test, y_pred)

0.6145251396648045

# Using missing indicator

In [169]:
# Transformer that emits a boolean mask of missing entries.
# features="missing-only" is the default: only columns that contained missing
# values when the indicator was fitted get an output column.
mi = MissingIndicator(features="missing-only")

In [170]:
# Add a boolean Age_NA column flagging the rows whose Age is missing.
#
# The indicator is fitted on the Age column alone and its 2-D output is
# flattened before being stored. Fitting on the whole frame only works while
# Age is the single feature with missing values: a second incomplete column
# makes MissingIndicator return two columns, which cannot be assigned to one
# DataFrame column.
#
# `assign` returns new frames rather than writing into the objects handed back
# by train_test_split, which avoids pandas' SettingWithCopyWarning.
# Assumes Age has at least one missing value in the training rows, otherwise
# the "missing-only" indicator produces no column at all.
X_train = X_train.assign(Age_NA=mi.fit_transform(X_train[['Age']]).ravel())
X_test = X_test.assign(Age_NA=mi.transform(X_test[['Age']]).ravel())

In [171]:
# Inspect the training features now that the Age_NA flag column is present.
X_train

Unnamed: 0,Age,Fare,Age_NA
30,40.0,27.7208,False
10,4.0,16.7000,False
873,47.0,9.0000,False
182,9.0,31.3875,False
876,20.0,9.8458,False
...,...,...,...
534,30.0,8.6625,False
584,,8.7125,True
493,71.0,49.5042,False
527,,221.7792,True


In [172]:
# Inspect the test features; they carry the same Age_NA flag column.
X_test

Unnamed: 0,Age,Fare,Age_NA
707,42.0,26.2875,False
37,21.0,8.0500,False
615,24.0,65.0000,False
169,28.0,56.4958,False
68,17.0,7.9250,False
...,...,...,...
89,24.0,8.0500,False
80,22.0,9.0000,False
846,,69.5500,True
870,26.0,7.8958,False


In [173]:
# Impute again, this time on the frames that include the Age_NA flag.
# NOTE(review): this rebinds `si`, and the test output is named X_test_tr2
# (not X_test_trf2). The name is kept because the model cell below reads it.
si = SimpleImputer(strategy="mean")
si.fit(X_train)
X_train_trf2 = si.transform(X_train)
X_test_tr2 = si.transform(X_test)

In [174]:
# Sanity check: after imputation every column should have a 0.0 missing fraction.
pd.DataFrame(X_train_trf2).isna().mean()

0    0.0
1    0.0
2    0.0
dtype: float64

In [175]:
# Same model as the baseline, now trained on the features that include the
# Age_NA flag; print the hold-out accuracy for comparison.
clf = LogisticRegression().fit(X_train_trf2, y_train)
y_pred_trf = clf.predict(X_test_tr2)
print(accuracy_score(y_test, y_pred_trf))

0.6312849162011173


# Using sklearn's SimpleImputer(add_indicator=True)

In [176]:
# Load the second dataset with the same three columns.
# NOTE: this rebinds `df`, replacing the frame loaded from train.csv.
df = pd.read_csv(
    "train2.csv",
    usecols=["Age", "Fare", "Survived"],
)
df

Unnamed: 0,Survived,Age,Fare
0,0,22.0,
1,1,38.0,71.2833
2,1,26.0,7.9250
3,1,35.0,53.1000
4,0,35.0,8.0500
...,...,...,...
886,0,27.0,13.0000
887,1,19.0,30.0000
888,0,,23.4500
889,1,26.0,30.0000


In [177]:
# Split the reloaded frame into the label and the predictor columns.
y = df["Survived"]
X = df.drop(labels=["Survived"], axis=1)

In [178]:
# Same 80/20 split and seed as before, applied to the new X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [179]:
# Preview the training features from the second dataset's split.
X_train

Unnamed: 0,Age,Fare
30,40.0,27.7208
10,4.0,16.7000
873,47.0,9.0000
182,9.0,31.3875
876,20.0,9.8458
...,...,...
534,30.0,8.6625
584,,8.7125
493,71.0,49.5042
527,,221.7792


In [180]:
# Count missing entries in each training column.
X_train.isna().sum()

Age     148
Fare      5
dtype: int64

In [181]:
# Impute columns 0 and 1 and let the imputer append its own missing-value
# indicator columns (add_indicator=True). Any column not listed is passed
# through unchanged.
trf = ColumnTransformer(
    transformers=[
        ("imp", SimpleImputer(add_indicator=True), [0, 1]),
    ],
    remainder="passthrough",
)

In [182]:
# Learn the imputation on the training rows only, then transform both splits.
# NOTE(review): X_train / X_test are rebound to the transformer's output, so
# the original DataFrames are no longer reachable under these names.
trf.fit(X_train)
X_train = trf.transform(X_train)
X_test = trf.transform(X_test)

In [183]:
# Fit logistic regression on the ColumnTransformer output and print the
# hold-out accuracy.
clf = LogisticRegression().fit(X_train, y_train)
y_pred_t = clf.predict(X_test)
print(accuracy_score(y_test, y_pred_t))

0.6368715083798883
