# Imputation

* In real life it is common to find datasets with missing information
* Missing information is usually represented as NaN or None
* Most scikit-learn estimators raise an error if the dataset has missing values, so the gaps must be filled (imputed) first

In [1]:
import pandas as pd

# Toy dataset with gaps in every column, used by the cells below to
# demonstrate imputation. Every row spells out all four fields explicitly
# (None marks a missing entry) instead of relying on pandas to pad short rows.
data = pd.DataFrame(
    [
        ("red", 1, 1.0, -1),
        ("blue", 2, None, -3),
        (None, 3, 3.0, -5),
        ("red", 4, 4.0, -2),
        ("red", None, 5.0, -5),
        ("blue", 6, 6.0, -1),
        ("red", 7, None, None),
        ("blue", 8, 8.0, None),
        ("green", 9, 9.0, None),
        ("red", 10, 10.0, None),
    ],
    columns=["color", "number", "value", "other"],
)

data

Unnamed: 0,color,number,value,other
0,red,1.0,1.0,-1.0
1,blue,2.0,,-3.0
2,,3.0,3.0,-5.0
3,red,4.0,4.0,-2.0
4,red,,5.0,-5.0
5,blue,6.0,6.0,-1.0
6,red,7.0,,
7,blue,8.0,8.0,
8,green,9.0,9.0,
9,red,10.0,10.0,


In [2]:
from sklearn.impute import SimpleImputer

In [4]:
# Build a mean imputer (mean is SimpleImputer's default strategy, spelled out
# here) and let it learn the statistic from the `value` column.
imputer = SimpleImputer(strategy='mean')
imputer.fit(data[['value']])

In [7]:
# Fill the gaps in `value` with the statistic learned during fit
# (the column average, which is what SimpleImputer uses by default).
value_filled = imputer.transform(data[['value']])
data['value_imputer'] = value_filled
data

Unnamed: 0,color,number,value,other,value_imputer
0,red,1.0,1.0,-1.0,1.0
1,blue,2.0,,-3.0,5.75
2,,3.0,3.0,-5.0,3.0
3,red,4.0,4.0,-2.0,4.0
4,red,,5.0,-5.0,5.0
5,blue,6.0,6.0,-1.0,6.0
6,red,7.0,,,5.75
7,blue,8.0,8.0,,8.0
8,green,9.0,9.0,,9.0
9,red,10.0,10.0,,10.0


In [8]:
# Median imputation for `number`: learn the median and fill the gaps in one step.
median_imputer = SimpleImputer(strategy='median')
data['number_imputed'] = median_imputer.fit_transform(data[['number']])
data

Unnamed: 0,color,number,value,other,value_imputer,number_imputed
0,red,1.0,1.0,-1.0,1.0,1.0
1,blue,2.0,,-3.0,5.75,2.0
2,,3.0,3.0,-5.0,3.0,3.0
3,red,4.0,4.0,-2.0,4.0,4.0
4,red,,5.0,-5.0,5.0,6.0
5,blue,6.0,6.0,-1.0,6.0,6.0
6,red,7.0,,,5.75,7.0
7,blue,8.0,8.0,,8.0,8.0
8,green,9.0,9.0,,9.0,9.0
9,red,10.0,10.0,,10.0,10.0


### Categorical variables

In [9]:
# First attempt at imputing a categorical column with the most frequent value.
# In the recorded run the column assignment raised a ValueError; the error is
# caught and printed so the notebook still runs top to bottom while keeping
# the failure visible as a demonstration.
color_imputer = SimpleImputer(strategy='most_frequent')
color_imputer.fit(data[['color']])
color_transformed = color_imputer.transform(data[['color']])

try:
    data['color_imputed'] = color_transformed
except ValueError as error:
    # Report the failure in the same "ValueError: <message>" form a traceback ends with.
    print(f"{type(error).__name__}: {error}")
else:
    # NOTE(review): newer pandas versions may accept this assignment — confirm
    # against the environment used to run the notebook.
    display(data)

ValueError: 2

In [10]:
# Inspect the raw transform output for the `color` column.
color_frame = data[['color']]
color_imputer.transform(color_frame)

array([['red'],
       ['blue'],
       [None],
       ['red'],
       ['red'],
       ['blue'],
       ['red'],
       ['blue'],
       ['green'],
       ['red']], dtype=object)

In [11]:
# squeeze() drops the length-1 axis, so the array with one single-element
# row per sample becomes a flat 1-D array.
color_2d = color_imputer.transform(data[['color']])
color_2d.squeeze()

array(['red', 'blue', None, 'red', 'red', 'blue', 'red', 'blue', 'green',
       'red'], dtype=object)

In [13]:
# Categorical imputation: fill missing colors with the most frequent one.
# `missing_values=pd.NA` selects pandas-style missing entries as the values to
# replace, and squeeze() flattens the result before it is stored as a column.
color_imputer = SimpleImputer(missing_values=pd.NA, strategy='most_frequent').fit(data[['color']])
color_filled = color_imputer.transform(data[['color']])
data['color_imputed'] = color_filled.squeeze()
data

Unnamed: 0,color,number,value,other,value_imputer,number_imputed,color_imputed
0,red,1.0,1.0,-1.0,1.0,1.0,red
1,blue,2.0,,-3.0,5.75,2.0,blue
2,,3.0,3.0,-5.0,3.0,3.0,red
3,red,4.0,4.0,-2.0,4.0,4.0,red
4,red,,5.0,-5.0,5.0,6.0,red
5,blue,6.0,6.0,-1.0,6.0,6.0,blue
6,red,7.0,,,5.75,7.0,red
7,blue,8.0,8.0,,8.0,8.0,blue
8,green,9.0,9.0,,9.0,9.0,green
9,red,10.0,10.0,,10.0,10.0,red


### Define a constant value

In [14]:
# Constant imputation: every missing entry of `other` is replaced by the
# fixed `fill_value` instead of a statistic learned from the data.
constant_imputer = SimpleImputer(fill_value=10, strategy='constant')
data['constant_imputer'] = constant_imputer.fit_transform(data[['other']])
data

Unnamed: 0,color,number,value,other,value_imputer,number_imputed,color_imputed,constant_imputer
0,red,1.0,1.0,-1.0,1.0,1.0,red,-1.0
1,blue,2.0,,-3.0,5.75,2.0,blue,-3.0
2,,3.0,3.0,-5.0,3.0,3.0,red,-5.0
3,red,4.0,4.0,-2.0,4.0,4.0,red,-2.0
4,red,,5.0,-5.0,5.0,6.0,red,-5.0
5,blue,6.0,6.0,-1.0,6.0,6.0,blue,-1.0
6,red,7.0,,,5.75,7.0,red,10.0
7,blue,8.0,8.0,,8.0,8.0,blue,10.0
8,green,9.0,9.0,,9.0,9.0,green,10.0
9,red,10.0,10.0,,10.0,10.0,red,10.0
