In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# <center> Missing Value </center>

## Simple Imputer

In [2]:
# Toy dataset for the SimpleImputer experiments:
#   x1, x2 - numeric columns with one missing value each
#   x3     - numeric column with no missing values
#   x4, x5 - categorical columns with one and two missing values respectively
df = pd.DataFrame(
    dict(
        x1=[4, 5, np.nan, 6, 7, 9],
        x2=[3, 5, 6, 5, np.nan, 5],
        x3=[10, 11, 12, 9, 8, 11],
        x4=['A', 'A', 'C', 'C', 'D', np.nan],
        x5=['M', 'M', np.nan, 'M', 'N', np.nan],
    )
)
df

Unnamed: 0,x1,x2,x3,x4,x5
0,4.0,3.0,10,A,M
1,5.0,5.0,11,A,M
2,,6.0,12,C,
3,6.0,5.0,9,C,M
4,7.0,,8,D,N
5,9.0,5.0,11,,


In [3]:
# Experiment 1: fill the NaN values in x1, x2 and x3 in a single pass, each with the mean of its own column

In [4]:
# Reference values: the per-column means of the numeric features, to compare
# against what the mean imputer fills in.
df.loc[:, ['x1', 'x2', 'x3']].mean()

x1     6.200000
x2     4.800000
x3    10.166667
dtype: float64

In [5]:
# Mean imputation: learn the mean of each numeric column, then write that value
# into the column's missing cells.
imp_num = SimpleImputer(strategy='mean')
imp_num.fit(df[['x1', 'x2', 'x3']])

# The imputer hands back a float array, so x3 is displayed as float afterwards
# (10 -> 10.0) even though it had nothing to fill.
df[['x1', 'x2', 'x3']] = imp_num.transform(df[['x1', 'x2', 'x3']])
df

Unnamed: 0,x1,x2,x3,x4,x5
0,4.0,3.0,10.0,A,M
1,5.0,5.0,11.0,A,M
2,6.2,6.0,12.0,C,
3,6.0,5.0,9.0,C,M
4,7.0,4.8,8.0,D,N
5,9.0,5.0,11.0,,


In [6]:
# Experiment 2: fill the missing value in column x4 with the mode (most frequent value)

In [7]:
# Reference value: the mode(s) of x4. Ties are all listed, one per row.
df.loc[:, ['x4']].mode()

Unnamed: 0,x4
0,A
1,C


In [8]:
# Mode imputation for the categorical column x4.
# x4 has two equally frequent values ('A' and 'C'); the result shows 'A' is
# the one used to fill the gap.
imp_mode = SimpleImputer(strategy='most_frequent')
x4_filled = imp_mode.fit_transform(df[['x4']])
df[['x4']] = x4_filled
df

Unnamed: 0,x1,x2,x3,x4,x5
0,4.0,3.0,10.0,A,M
1,5.0,5.0,11.0,A,M
2,6.2,6.0,12.0,C,
3,6.0,5.0,9.0,C,M
4,7.0,4.8,8.0,D,N
5,9.0,5.0,11.0,A,


In [9]:
# Experiment 3: fill the missing values in column x5 with the constant 'P'

In [10]:
# Constant imputation: every missing cell in x5 receives the placeholder 'P'.
imp_cons = SimpleImputer(
    strategy='constant',
    fill_value='P',
)
x5_filled = imp_cons.fit_transform(df[['x5']])
df[['x5']] = x5_filled
df

Unnamed: 0,x1,x2,x3,x4,x5
0,4.0,3.0,10.0,A,M
1,5.0,5.0,11.0,A,M
2,6.2,6.0,12.0,C,P
3,6.0,5.0,9.0,C,M
4,7.0,4.8,8.0,D,N
5,9.0,5.0,11.0,A,P


## Iterative Imputer

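IterativeImputer memodelkan setiap kolom yang memiliki missing value sebagai fungsi dari kolom-kolom lainnya (menggunakan baris yang nilainya lengkap), kemudian mengisi nilai yang hilang dengan hasil prediksi regresi. Proses ini diulang secara bergiliran untuk tiap kolom hingga paling banyak `max_iter` putaran.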

In [11]:
# Fresh toy dataset for the IterativeImputer experiment:
#   x1, x3 - numeric, both missing in row 2
#   x2     - numeric, missing in row 4
#   x4     - categorical label with no missing values
df = pd.DataFrame(
    dict(
        x1=[4.3, 5.1, np.nan, 6.3, 7.4, 9.1],
        x2=[2.9, 5.1, 6.3, 4.9, np.nan, 5.4],
        x3=[9, 11.1, np.nan, 8.9, 9.1, 11],
        x4=['A', 'A', 'C', 'C', 'D', 'D'],
    )
)
df

Unnamed: 0,x1,x2,x3,x4
0,4.3,2.9,9.0,A
1,5.1,5.1,11.1,A
2,,6.3,,C
3,6.3,4.9,8.9,C
4,7.4,,9.1,D
5,9.1,5.4,11.0,D


In [12]:
# Column means of the numeric features, for comparison with the values the
# iterative imputer produces.
# numeric_only=True is needed because x4 holds strings: pandas >= 2.0 raises a
# TypeError when asked to average a non-numeric column, whereas older versions
# silently dropped it. The displayed result (x1, x2, x3 only) is unchanged.
df.mean(numeric_only=True)

x1    6.44
x2    4.92
x3    9.82
dtype: float64

In [13]:
# Iterative imputation on the numeric columns only (x4 is a string label and is
# left out of the call). random_state is fixed so the run is repeatable.
imp_iter = IterativeImputer(
    max_iter=10,
    random_state=0,
)
numeric_filled = imp_iter.fit_transform(df[['x1', 'x2', 'x3']])
df[['x1', 'x2', 'x3']] = numeric_filled
df

Unnamed: 0,x1,x2,x3,x4
0,4.3,2.9,9.0,A
1,5.1,5.1,11.1,A
2,7.18363,6.3,9.823389,C
3,6.3,4.9,8.9,C
4,7.4,5.073866,9.1,D
5,9.1,5.4,11.0,D


## KNN Imputer

In [14]:
# Rebuild the same toy dataset for the KNNImputer experiment; the previous
# section overwrote its missing values, so a fresh copy with NaNs is needed.
df = pd.DataFrame(
    dict(
        x1=[4.3, 5.1, np.nan, 6.3, 7.4, 9.1],
        x2=[2.9, 5.1, 6.3, 4.9, np.nan, 5.4],
        x3=[9, 11.1, np.nan, 8.9, 9.1, 11],
        x4=['A', 'A', 'C', 'C', 'D', 'D'],
    )
)
df

Unnamed: 0,x1,x2,x3,x4
0,4.3,2.9,9.0,A
1,5.1,5.1,11.1,A
2,,6.3,,C
3,6.3,4.9,8.9,C
4,7.4,,9.1,D
5,9.1,5.4,11.0,D


`weights` : {'uniform', 'distance'} or callable, default='uniform'

Weight function used in prediction. Possible values:
* ‘uniform’ : uniform weights. All points in each neighborhood are weighted equally.
* ‘distance’ : weight points by the inverse of their distance. In this case, closer neighbors of a query point will have a greater influence than neighbors which are further away.
* callable : a user-defined function which accepts an array of distances, and returns an array of the same shape containing the weights.

In [15]:
# KNN imputation on the numeric columns, using 2 neighbours with equal
# ('uniform') weighting.
# Author's rule of thumb: the more data you have, the more neighbours you can use.
imp_KNN = KNNImputer(
    n_neighbors=2,
    weights='uniform',
)
knn_filled = imp_KNN.fit_transform(df[['x1', 'x2', 'x3']])
df[['x1', 'x2', 'x3']] = knn_filled
df

Unnamed: 0,x1,x2,x3,x4
0,4.3,2.9,9.0,A
1,5.1,5.1,11.1,A
2,7.1,6.3,11.05,C
3,6.3,4.9,8.9,C
4,7.4,5.15,9.1,D
5,9.1,5.4,11.0,D
