## Comparing Model Accuracy after Training with and without MissingIndicator while Imputing the Missing Values

### Imported the required libraries, modules and classes

In [39]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.impute import MissingIndicator,SimpleImputer

In [40]:
# Titanic dataset: load only the target and the two numeric features used below
df = pd.read_csv(
    'train.csv',
    usecols=['Age', 'Fare', 'Survived'],
)

In [41]:
df.head()  # preview the first rows to confirm the three selected columns loaded

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [42]:
# Share of missing values per column, expressed as a percentage
df.isna().mean().mul(100)

Survived     0.00000
Age         19.86532
Fare         0.00000
dtype: float64

In [43]:
# Separate the label from the predictors
y = df['Survived']                   # target: survival flag
X = df.drop(columns=['Survived'])    # inputs: every remaining column

In [44]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [47]:
X_train.head()  # peek at the training features; rows keep their original index labels from df

Unnamed: 0,Age,Fare
30,40.0,27.7208
10,4.0,16.7
873,47.0,9.0
182,9.0,31.3875
876,20.0,9.8458


## Logistic Regression - Accuracy without MissingIndicator

### Defined SimpleImputer Object with Default Strategy Mean and Transformed Train and Test Data

In [48]:
# Learn the imputation statistics (default strategy: mean) from the training split only,
# then apply the same fitted imputer to both splits.
si = SimpleImputer().fit(X_train)
X_train_trf = si.transform(X_train)
X_test_trf = si.transform(X_test)

In [49]:
X_train_trf   # Imputed training array (columns: Age, Fare); rows that had no Age now hold the training-set mean Age

array([[ 40.        ,  27.7208    ],
       [  4.        ,  16.7       ],
       [ 47.        ,   9.        ],
       ...,
       [ 71.        ,  49.5042    ],
       [ 29.78590426, 221.7792    ],
       [ 29.78590426,  25.925     ]])

In [51]:
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

In [52]:
# Baseline model: logistic regression trained on the mean-imputed features
# (no missing-value indicator column yet).
clf = LogisticRegression().fit(X_train_trf, y_train)

# Predictions for the held-out split, imputed with the same fitted imputer
y_pred = clf.predict(X_test_trf)

In [53]:
# Test-set accuracy of the baseline (imputation only, no MissingIndicator)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6145251396648045

### Logistic Regression - Accuracy with MissingIndicator

In [54]:
mi = MissingIndicator()    # Transformer that emits boolean "was missing" flags for features found to contain missing values

In [55]:
mi.fit(X_train)   # Fit on the training split only: records which features contain missing values there.

In [57]:
mi.features_  # positional indices of the features flagged during fit (0 is the first column of X_train)

array([0], dtype=int64)

In [58]:
X_train.head(1)  # column-order check: position 0 is Age, so the indicator above refers to Age

Unnamed: 0,Age,Fare
30,40.0,27.7208


In [59]:
X_train_missing = mi.transform(X_train)  # Boolean mask for X_train: one column per flagged feature (only Age here)

In [60]:
X_train_missing  # one row per training sample; True where Age was missing

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [

In [61]:
X_test_missing = mi.transform(X_test)   # Boolean mask for X_test, using the indicator fitted on X_train

In [62]:
X_test_missing  # one row per test sample; True where Age was missing

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [

In [64]:
X_train['Age_NA'] = X_train_missing    # Append the missing-Age flag to the training features (modifies X_train in place)

In [65]:
X_train.sample(10)  # random spot-check of the new column; no random_state is passed, so the rows differ on each run

Unnamed: 0,Age,Fare,Age_NA
532,17.0,7.2292,False
277,,0.0,True
588,22.0,8.05,False
861,21.0,11.5,False
138,16.0,9.2167,False
513,54.0,59.4,False
579,32.0,7.925,False
451,,19.9667,True
149,42.0,13.0,False
318,31.0,164.8667,False


In [66]:
X_test['Age_NA'] = X_test_missing      # Added MissingIndicator in the test data

In [67]:
X_test  # test features now carry the Age_NA flag alongside Age and Fare

Unnamed: 0,Age,Fare,Age_NA
707,42.0,26.2875,False
37,21.0,8.0500,False
615,24.0,65.0000,False
169,28.0,56.4958,False
68,17.0,7.9250,False
...,...,...,...
89,24.0,8.0500,False
80,22.0,9.0000,False
846,,69.5500,True
870,26.0,7.8958,False


In [68]:
# Fresh imputer (default strategy "mean"), fitted on the training frame that now
# includes the Age_NA flag column.
si = SimpleImputer()
si.fit(X_train)

X_train_trf2 = si.transform(X_train)    # imputed training features, Age_NA included
X_test_trf2 = si.transform(X_test)      # test split imputed with the training statistics

### Accuracy Improved by About 1.7 Percentage Points (0.6145 → 0.6313)

In [69]:
# Same model as the baseline, now trained on the features that include Age_NA
clf = LogisticRegression().fit(X_train_trf2, y_train)

y_pred = clf.predict(X_test_trf2)

# Test-set accuracy with the manual missing indicator
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6312849162011173

## After setting add_indicator=True in SimpleImputer

In [70]:
si = SimpleImputer(add_indicator=True)  # Default add_indicator = False

In [71]:
X_train = si.fit_transform(X_train)

In [72]:
X_test = si.transform(X_test)

In [73]:
clf = LogisticRegression()

clf.fit(X_train_trf2,y_train)

y_pred = clf.predict(X_test_trf2)

accuracy_score(y_test,y_pred)

0.6312849162011173

#### Here we can see that the accuracy increased by about 1.7 percentage points (from 0.6145 to 0.6313) after using MissingIndicator while imputing the missing values.