In Missing Indicator we create a new column for every column with the missing values & this new column will have only two values either True or False. We will give False if the corresponding row has values else we will give True.

In [24]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.impute import MissingIndicator,SimpleImputer

In [25]:
df = pd.read_csv('titanic_toy.csv',usecols=['Age','Fare','Survived'])

In [26]:
df.head()

Unnamed: 0,Age,Fare,Survived
0,22.0,7.25,0
1,38.0,71.2833,1
2,26.0,7.925,1
3,35.0,53.1,1
4,35.0,8.05,0


In [27]:
df.isnull().sum()

Age         177
Fare         45
Survived      0
dtype: int64

In [28]:
df.isnull().mean() * 100

Age         19.865320
Fare         5.050505
Survived     0.000000
dtype: float64

In [29]:
X = df.drop('Survived', axis=1)
y = df['Survived']

In [30]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [31]:
X_train.head()

Unnamed: 0,Age,Fare
30,40.0,27.7208
10,4.0,16.7
873,47.0,9.0
182,9.0,31.3875
876,20.0,9.8458


In [32]:
X_train.isnull().sum()

Age     148
Fare     36
dtype: int64

In [33]:
si = SimpleImputer()
X_train_trf = si.fit_transform(X_train)  # By default missing will be replaced with mean
X_test_trf = si.transform(X_test)

In [34]:
X_train_trf

array([[ 40.        ,  27.7208    ],
       [  4.        ,  16.7       ],
       [ 47.        ,   9.        ],
       ...,
       [ 71.        ,  49.5042    ],
       [ 29.78590426, 221.7792    ],
       [ 29.78590426,  25.925     ]])

In [35]:
from sklearn.linear_model import LogisticRegression

LR_model = LogisticRegression()

LR_model.fit(X_train_trf, y_train)

y_pred = LR_model.predict(X_test_trf)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.6033519553072626

In [36]:
mi = MissingIndicator()

mi.fit(X_train)

mi.features_  # array([0, 1] --> Showing that only 1st column as missing values and in this case age here

array([0, 1], dtype=int64)

In [37]:
X_train_missing = mi.transform(X_train)

X_train_missing

array([[False, False],
       [False, False],
       [False, False],
       ...,
       [False, False],
       [ True, False],
       [ True, False]])

In [38]:
X_test_missing = mi.transform(X_test)

X_test_missing

array([[False, False],
       [False, False],
       [False, False],
       [False, False],
       [False, False],
       [False, False],
       [False, False],
       [False, False],
       [False, False],
       [False, False],
       [False, False],
       [False, False],
       [False, False],
       [ True, False],
       [ True, False],
       [False, False],
       [ True, False],
       [False, False],
       [False,  True],
       [False, False],
       [False, False],
       [ True, False],
       [False, False],
       [False, False],
       [False, False],
       [False, False],
       [False, False],
       [False, False],
       [False,  True],
       [False, False],
       [False, False],
       [False, False],
       [False, False],
       [ True, False],
       [False, False],
       [False, False],
       [False,  True],
       [False, False],
       [ True, False],
       [False, False],
       [False, False],
       [False, False],
       [False, False],
       [Fal

In [40]:
# X_train['Age_NA'] = X_train_missing

# X_test['Age_NA'] = X_test_missing

In [45]:
si = SimpleImputer()

X_train_trf2 = si.fit_transform(X_train)
X_test_trf2 = si.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()

clf.fit(X_train_trf2,y_train)

y_pred = clf.predict(X_test_trf2)

from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

In [41]:
si = SimpleImputer(add_indicator=True)  # Missing Indicator has been added to scikit-learn we can directly use from SK Learn

In [42]:

X_train = si.fit_transform(X_train)

In [49]:
pd.DataFrame(X_train, columns=['Age', 'Fare', 'Survived', 'Age_NA'])

Unnamed: 0,Age,Fare,Survived,Age_NA
0,40.000000,27.7208,0.0,0.0
1,4.000000,16.7000,0.0,0.0
2,47.000000,9.0000,0.0,0.0
3,9.000000,31.3875,0.0,0.0
4,20.000000,9.8458,0.0,0.0
...,...,...,...,...
707,30.000000,8.6625,0.0,0.0
708,29.785904,8.7125,1.0,0.0
709,71.000000,49.5042,0.0,0.0
710,29.785904,221.7792,1.0,0.0


In [43]:
X_test = si.transform(X_test)

In [46]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()

clf.fit(X_train_trf2,y_train)

y_pred = clf.predict(X_test_trf2)

from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6089385474860335