In [1]:
# import libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.impute import MissingIndicator,SimpleImputer

In [2]:
# Read the Titanic CSV, keeping only the two numeric features and the target
df = pd.read_csv(
    'data/titanic_temp.csv',
    usecols=['Age', 'Fare', 'Survived'],
)
df.head()

Unnamed: 0,Age,Fare,Survived
0,22.0,7.25,0
1,38.0,71.2833,1
2,26.0,7.925,1
3,35.0,53.1,1
4,35.0,8.05,0


In [3]:
# Percentage of missing values in each column
df.isna().mean().mul(100)

Age         19.865320
Fare         5.050505
Survived     0.000000
dtype: float64

In [4]:
# Drop the rows whose Fare is missing; rows with a missing Age are kept,
# so Age is the only column that still needs imputation afterwards.
df = df.dropna(subset=['Fare'])
# Show the remaining non-null count per column
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 846 entries, 0 to 890
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       675 non-null    float64
 1   Fare      846 non-null    float64
 2   Survived  846 non-null    int64  
dtypes: float64(2), int64(1)
memory usage: 26.4 KB


In [5]:
# Separate the target label from the predictor columns
y = df.loc[:, 'Survived']
X = df.drop('Survived', axis=1)

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)
X_train.head()

Unnamed: 0,Age,Fare
603,44.0,8.05
550,17.0,110.8833
344,36.0,13.0
654,18.0,6.75
132,47.0,14.5


In [7]:
# Mean imputation ('mean' is SimpleImputer's default strategy).
# The statistics are learned from the training split only and then
# applied unchanged to both splits.
si = SimpleImputer(strategy='mean')
si.fit(X_train)

X_train_trf = si.transform(X_train)
X_test_trf = si.transform(X_test)
X_train_trf

array([[ 44.        ,   8.05      ],
       [ 17.        , 110.8833    ],
       [ 36.        ,  13.        ],
       ...,
       [ 29.50031776,   7.2292    ],
       [ 39.        ,  79.65      ],
       [ 21.        ,   7.925     ]])

### without is_missing column

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline model: trained on the imputed features without any missing-value indicator
clf = LogisticRegression().fit(X_train_trf, y_train)
y_pred = clf.predict(X_test_trf)

# Fraction of test rows that were classified correctly
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6705882352941176

### with is_missing column

In [9]:
# MissingIndicator is fitted on the training split so that it records
# which feature columns contain missing values there.
mi = MissingIndicator()

mi.fit(X_train)

MissingIndicator()

In [10]:
# Positional indices of the columns the fitted indicator will flag;
# index 0 corresponds to the first column of X_train (Age).
mi.features_

array([0], dtype=int64)

In [11]:
# Boolean mask for the training rows: True where the flagged column (Age) is missing
train_is_missing = mi.transform(X_train)
train_is_missing

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [ True],
       [False],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [

In [12]:
# Same mask for the test rows, produced by the indicator fitted on the training split
test_is_missing = mi.transform(X_test)
test_is_missing

array([[False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [ True],
       [ True],
       [ True],
       [False],
       [False],
       [

In [13]:
# Attach the missing-Age mask to the training features as a new column
X_train = X_train.assign(is_missing_age=train_is_missing)
X_train.head()

Unnamed: 0,Age,Fare,is_missing_age
603,44.0,8.05,False
550,17.0,110.8833,False
344,36.0,13.0,False
654,18.0,6.75,False
132,47.0,14.5,False


In [14]:
# Attach the missing-Age mask to the test features as a new column
X_test = X_test.assign(is_missing_age=test_is_missing)
X_test.head()

Unnamed: 0,Age,Fare,is_missing_age
234,24.0,10.5,False
424,18.0,20.2125,False
358,,7.8792,True
391,21.0,7.7958,False
268,58.0,153.4625,False


In [15]:
# Mean-impute the frames that now include the 'is_missing_age' column
# ('mean' is SimpleImputer's default strategy); statistics come from the
# training split only.
si = SimpleImputer(strategy='mean')
si.fit(X_train)

X_train_trf2 = si.transform(X_train)
X_test_trf2 = si.transform(X_test)

In [16]:
from sklearn.linear_model import LogisticRegression

# Same model as the baseline, trained on the features that include the indicator column
clf = LogisticRegression().fit(X_train_trf2, y_train)
y_pred = clf.predict(X_test_trf2)

In [17]:
from sklearn.metrics import accuracy_score
# Test accuracy of the model trained with the manual indicator column
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6705882352941176

### Using scikit-learn's `SimpleImputer(add_indicator=True)`

In [18]:
si = SimpleImputer(add_indicator=True)

X_train = si.fit_transform(X_train)
X_test = si.transform(X_test)

In [19]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()

clf.fit(X_train_trf2,y_train)

y_pred = clf.predict(X_test_trf2)

In [20]:
from sklearn.metrics import accuracy_score
# Test accuracy for the most recently fitted classifier
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6705882352941176