# Missing Value

## Simple Imputer
1. Mean
2. Median
3. Mode
4. Constant

In [1]:
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Toy frame for the SimpleImputer demos:
#   x1, x2 -> numeric, one gap each
#   x3     -> numeric, complete
#   x4, x5 -> categorical, one gap each
#   x6     -> categorical, two gaps
df = pd.DataFrame(dict(
    x1=[4, 5, np.nan, 6, 7, 9],
    x2=[3, 5, 6, 5, np.nan, 5],
    x3=[10, 11, 12, 9, 8, 11],
    x4=['A', 'A', 'C', 'C', 'D', np.nan],
    x5=['X', 'Y', 'X', 'X', np.nan, 'Y'],
    x6=['M', 'M', np.nan, 'M', 'N', np.nan],
))

df

Unnamed: 0,x1,x2,x3,x4,x5,x6
0,4.0,3.0,10,A,X,M
1,5.0,5.0,11,A,Y,M
2,,6.0,12,C,X,
3,6.0,5.0,9,C,X,M
4,7.0,,8,D,,N
5,9.0,5.0,11,,Y,


In [3]:
# Number of missing entries in each column (row-wise sum of the NaN mask).
df.isna().sum(axis=0)

x1    1
x2    1
x3    0
x4    1
x5    1
x6    2
dtype: int64

In [5]:
# Mean strategy: each NaN is replaced by the mean of the observed values in
# its own column. x3 has no gaps, so its values are unchanged, but it comes
# back as float because the imputer returns a single float array.
num_cols = ['x1', 'x2', 'x3']
imp_num = SimpleImputer(strategy='mean')
df[num_cols] = imp_num.fit_transform(df[num_cols])
df

Unnamed: 0,x1,x2,x3,x4,x5,x6
0,4.0,3.0,10.0,A,X,M
1,5.0,5.0,11.0,A,Y,M
2,6.2,6.0,12.0,C,X,
3,6.0,5.0,9.0,C,X,M
4,7.0,4.8,8.0,D,,N
5,9.0,5.0,11.0,,Y,


In [6]:
# Most-frequent (mode) strategy: works on categorical columns, filling each
# gap with the value that occurs most often in that column.
cat_cols = ['x4', 'x5']
imp_mode = SimpleImputer(strategy='most_frequent')
df[cat_cols] = imp_mode.fit_transform(df[cat_cols])
df

Unnamed: 0,x1,x2,x3,x4,x5,x6
0,4.0,3.0,10.0,A,X,M
1,5.0,5.0,11.0,A,Y,M
2,6.2,6.0,12.0,C,X,
3,6.0,5.0,9.0,C,X,M
4,7.0,4.8,8.0,D,X,N
5,9.0,5.0,11.0,A,Y,


In [7]:
# Constant strategy: every gap in x6 receives the same placeholder label 'P'.
const_cols = ['x6']
imp_const = SimpleImputer(strategy='constant', fill_value='P')
df[const_cols] = imp_const.fit_transform(df[const_cols])
df

Unnamed: 0,x1,x2,x3,x4,x5,x6
0,4.0,3.0,10.0,A,X,M
1,5.0,5.0,11.0,A,Y,M
2,6.2,6.0,12.0,C,X,P
3,6.0,5.0,9.0,C,X,M
4,7.0,4.8,8.0,D,X,N
5,9.0,5.0,11.0,A,Y,P


## Iterative Imputer

Note: Iterative Imputer only works for numerical variables.

In [8]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [20]:
# Fresh toy frame for the IterativeImputer demo: three numeric columns with
# gaps (x1, x2, x3) and one complete categorical column (x4), which is not
# passed to the imputer.
df = pd.DataFrame(dict(
    x1=[4.3, 5.1, np.nan, 6.3, 7.4, 9.1],
    x2=[2.9, 5.1, 6.3, 4.9, np.nan, 5.4],
    x3=[9, 11.1, np.nan, 8.9, 9.1, 11.0],
    x4=['A', 'A', 'C', 'C', 'D', 'D'],
))

df

Unnamed: 0,x1,x2,x3,x4
0,4.3,2.9,9.0,A
1,5.1,5.1,11.1,A
2,,6.3,,C
3,6.3,4.9,8.9,C
4,7.4,,9.1,D
5,9.1,5.4,11.0,D


In [13]:
# IterativeImputer estimates each column's missing values from the other
# numeric columns; random_state pins any randomness for reproducibility.
#
# The result goes into a copy rather than back into `df`. Overwriting `df`
# here would hand the following cell a frame with no NaNs left, so on
# Restart & Run All that cell would have nothing to impute.
num_cols = ['x1', 'x2', 'x3']
imp_iter = IterativeImputer(max_iter=10, random_state=0)
df_iter = df.copy()
df_iter[num_cols] = imp_iter.fit_transform(df[num_cols])
df_iter

Unnamed: 0,x1,x2,x3,x4
0,4.3,2.9,9.0,A
1,5.1,5.1,11.1,A
2,7.18363,6.3,9.823389,C
3,6.3,4.9,8.9,C
4,7.4,5.073866,9.1,D
5,9.1,5.4,11.0,D


In [21]:
# Same imputation with every IterativeImputer setting left at its default.
# The displayed values are identical to the max_iter=10 / random_state=0 run.
iter_cols = ['x1', 'x2', 'x3']
imp_iter = IterativeImputer()
df[iter_cols] = imp_iter.fit_transform(df[iter_cols])
df

Unnamed: 0,x1,x2,x3,x4
0,4.3,2.9,9.0,A
1,5.1,5.1,11.1,A
2,7.18363,6.3,9.823389,C
3,6.3,4.9,8.9,C
4,7.4,5.073866,9.1,D
5,9.1,5.4,11.0,D


## Nearest Neighbour Imputer

In [23]:
from sklearn.impute import KNNImputer

In [30]:
# Rebuild the raw toy frame so the KNN demo starts from un-imputed data:
# gaps sit in x1 (row 2), x2 (row 4) and x3 (row 2); x4 is a complete
# categorical column that is not passed to the imputer.
df = pd.DataFrame(
    data={
        'x1': [4.3, 5.1, np.nan, 6.3, 7.4, 9.1],
        'x2': [2.9, 5.1, 6.3, 4.9, np.nan, 5.4],
        'x3': [9, 11.1, np.nan, 8.9, 9.1, 11.0],
        'x4': ['A', 'A', 'C', 'C', 'D', 'D'],
    }
)

df

Unnamed: 0,x1,x2,x3,x4
0,4.3,2.9,9.0,A
1,5.1,5.1,11.1,A
2,,6.3,,C
3,6.3,4.9,8.9,C
4,7.4,,9.1,D
5,9.1,5.4,11.0,D


In [31]:
# KNN imputation: each gap is filled with the average of that column over the
# nearest rows that have the value observed.
# NOTE(review): n_neighbors=10 is larger than this 6-row frame, so every row
# with an observed value ends up as a donor and the fill equals the plain
# column mean (x1 -> 6.44, x2 -> 4.92, x3 -> 9.82). A smaller k (e.g. 2)
# would be needed to see neighbour-specific values - TODO confirm intent.
knn_cols = ['x1', 'x2', 'x3']
imp_knn = KNNImputer(n_neighbors=10)
df[knn_cols] = imp_knn.fit_transform(df[knn_cols])
df

Unnamed: 0,x1,x2,x3,x4
0,4.3,2.9,9.0,A
1,5.1,5.1,11.1,A
2,6.44,6.3,9.82,C
3,6.3,4.9,8.9,C
4,7.4,4.92,9.1,D
5,9.1,5.4,11.0,D
