#### Missing Indicator
- Can be done using either the dedicated MissingIndicator class or the add_indicator parameter of SimpleImputer.
- The MissingIndicator class in Python's scikit-learn library is a transformer used to create a binary matrix indicating the presence of missing values in a dataset.
- More details below.

In [29]:
import pandas as pd

# Load the Titanic training data from a local CSV file and preview the first rows.
# NOTE: the path below is specific to the author's machine.
titanic_csv_path = r"C:\Users\ACER\Desktop\Kranthi\DataScience_Desktop\MachineLearningFiles\train.csv"
Titanic_DF = pd.read_csv(titanic_csv_path)
Titanic_DF.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [31]:
# Keep only the three predictor columns and the target column used in this example.
selected_columns = ['Age', 'Fare', 'Pclass', 'Survived']
Titanic_DF = Titanic_DF[selected_columns]
Titanic_DF.head(3)

Unnamed: 0,Age,Fare,Pclass,Survived
0,22.0,7.25,3,0
1,38.0,71.2833,1,1
2,26.0,7.925,3,1


In [33]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target, then split both into train and test parts.
# No random_state is passed, so the rows that land in each part change from run to run.
feature_frame = Titanic_DF[['Age','Fare','Pclass']]
survived_target = Titanic_DF['Survived']
x_train, x_test, y_train, y_test = train_test_split(feature_frame, survived_target)

In [35]:
# Preview the first three rows of the training features.
x_train.head(3)

Unnamed: 0,Age,Fare,Pclass
569,32.0,7.8542,3
431,,16.1,3
73,26.0,14.4542,3


In [171]:
# Count the null values in each training feature column (Age, Fare and Pclass).

x_train.isnull().sum()

Age       131
Fare        0
Pclass      0
dtype: int64

In [173]:
from sklearn.impute import MissingIndicator
from sklearn.impute import SimpleImputer

# Every transformer below is fitted on the training features only and then
# applied unchanged to the test features.

# --- MissingIndicator ---------------------------------------------------
# Produces a boolean mask (True = the value was missing); it does not replace
# nulls with anything. With the default settings the mask has one column per
# input feature that contained nulls during fit, so here (only Age has nulls)
# the output has a single column.
MissingIndicator_Obj = MissingIndicator()
x_train_trasf_MI = MissingIndicator_Obj.fit_transform(x_train)
x_test_transf_MI = MissingIndicator_Obj.transform(x_test)

# --- SimpleImputer without an indicator ---------------------------------
# SimpleImputer fills nulls with the mean, median, most frequent value or a
# constant; the default strategy is the mean. add_indicator defaults to False,
# so this is equivalent to SimpleImputer() and no extra column is created.
SimpleImputer_ObjAddIndF = SimpleImputer(add_indicator=False)
x_train_trasf_SI_F = SimpleImputer_ObjAddIndF.fit_transform(x_train)
x_test_transf_SI_F = SimpleImputer_ObjAddIndF.transform(x_test)

# --- SimpleImputer with an indicator ------------------------------------
# Nulls are replaced by the column median. Because add_indicator=True, the
# output holds all imputed input columns followed by indicator columns:
# 0 = the value was present, 1 = the value was null and has been imputed.
# One indicator column is appended per feature that had nulls during fit,
# which is a single column here because only Age has nulls.
SimpleImputer_ObjAddIndT = SimpleImputer(strategy='median', add_indicator=True)
x_train_trasf_SI_T = SimpleImputer_ObjAddIndT.fit_transform(x_train)
x_test_transf_SI_T = SimpleImputer_ObjAddIndT.transform(x_test)

In [175]:
# Display the fitted MissingIndicator transformer.
MissingIndicator_Obj

In [177]:
x_train_trasf_MI[0:5]
# Boolean array: True where the value was missing, False otherwise. Nothing is imputed.
# It has a single column here because Age is the only feature with nulls.

array([[False],
       [ True],
       [False],
       [False],
       [False]])

In [179]:
# Number of input features seen during fit, i.e. the three columns 'Age', 'Fare' and 'Pclass'.

MissingIndicator_Obj.n_features_in_

3

In [187]:
# Attach a row-level missing-value flag to a copy of the training features.
#
# MissingIndicator returns a 2-D boolean matrix with one column per feature that
# had nulls during fit. Here that is only Age, so the matrix has one column, but
# assigning a 2-D array to a single DataFrame column stops working as soon as the
# matrix has any other number of columns. Collapsing it with any(axis=1) yields
# "True if any tracked feature is missing in this row": identical to the original
# result in the single-column case, and valid for any number of indicator columns.

x_train_indicator = x_train.copy()  # work on a copy so x_train itself is left untouched
x_train_indicator['Missing_Indicator'] = x_train_trasf_MI.any(axis=1)
x_train_indicator.head(5)
# To inspect only the rows where Age is missing:
#x_train_indicator[x_train_indicator.Age.isnull() == True]
# The same filter on Fare returns no records, as there are no nulls in Fare and Pclass:
#x_train_indicator[x_train_indicator.Fare.isnull() == True]

Unnamed: 0,Age,Fare,Pclass,Missing_Indicator
569,32.0,7.8542,3,False
431,,16.1,3,True
73,26.0,14.4542,3,False
227,20.5,7.25,3,False
555,62.0,26.55,1,False


In [189]:
# SimpleImputer created with add_indicator=False: it imputes the column mean and does not add an indicator column.
SimpleImputer_ObjAddIndF

In [199]:
SimpleImputer_ObjAddIndF.statistics_
# Fill value learned for each input column, in column order: mean of Age, mean of Fare, mean of Pclass.

array([28.82899441, 32.55183249,  2.31736527])

In [193]:
# First five rows after mean imputation; the missing Age in the second row now holds the Age mean.
x_train_trasf_SI_F[0:5]

array([[32.        ,  7.8542    ,  3.        ],
       [28.82899441, 16.1       ,  3.        ],
       [26.        , 14.4542    ,  3.        ],
       [20.5       ,  7.25      ,  3.        ],
       [62.        , 26.55      ,  1.        ]])

In [216]:
# Wrap the mean-imputed array in a DataFrame, reusing the training feature names as column labels.
# NOTE: the new frame gets a fresh 0..n-1 index; the row labels of x_train are not carried over.
x_train_trasf_SI_F_DF = pd.DataFrame(
    data=x_train_trasf_SI_F,
    columns=x_train.columns,
)
x_train_trasf_SI_F_DF.head(3)

Unnamed: 0,Age,Fare,Pclass
0,32.0,7.8542,3.0
1,28.828994,16.1,3.0
2,26.0,14.4542,3.0


In [247]:
# Same DataFrame as before, but each column label gets an ' Imputed' suffix
# so the imputed columns can be told apart from the raw ones.
imputed_column_labels = [f"{column_name} Imputed" for column_name in x_train.columns]
x_train_trasf_SI_F_DF = pd.DataFrame(x_train_trasf_SI_F, columns=imputed_column_labels)
x_train_trasf_SI_F_DF.head(3)

Unnamed: 0,Age Imputed,Fare Imputed,Pclass Imputed
0,32.0,7.8542,3.0
1,28.828994,16.1,3.0
2,26.0,14.4542,3.0


In [249]:
# SimpleImputer created with strategy='median' and add_indicator=True.
# Nulls are replaced by the column median, and an indicator column is appended: 0 when the value was present, 1 when it was null and has been imputed.

SimpleImputer_ObjAddIndT

In [251]:
SimpleImputer_ObjAddIndT.statistics_
# Fill value learned for each input column, in column order: median of Age, median of Fare, median of Pclass.

array([27.    , 14.4583,  3.    ])

In [253]:
# Median-imputed Age, Fare and Pclass followed by the appended 0/1 missing-value indicator column.
x_train_trasf_SI_T

array([[32.    ,  7.8542,  3.    ,  0.    ],
       [27.    , 16.1   ,  3.    ,  1.    ],
       [26.    , 14.4542,  3.    ,  0.    ],
       ...,
       [16.    , 20.25  ,  3.    ,  0.    ],
       [34.5   ,  6.4375,  3.    ,  0.    ],
       [ 2.    , 29.125 ,  3.    ,  0.    ]])

In [255]:
# Label the median-imputed array: the three imputed features first,
# then the missing-value indicator that SimpleImputer appended as the last column.
imputed_with_indicator_labels = [
    'Age Imputed',
    'Fare Imputed',
    'Pclass Imputed',
    'Indicator Imputed',
]
x_train_trasf_SI_T_DF = pd.DataFrame(x_train_trasf_SI_T, columns=imputed_with_indicator_labels)
x_train_trasf_SI_T_DF.head(3)

Unnamed: 0,Age Imputed,Fare Imputed,Pclass Imputed,Indicator Imputed
0,32.0,7.8542,3.0,0.0
1,27.0,16.1,3.0,1.0
2,26.0,14.4542,3.0,0.0
