# Missing data imputation

In [2]:
import random
import pandas as pd
import numpy as np

## Removing observations with missing data

- Complete Case Analysis (CCA)
- can be applied to categorical and numerical variables
- preserves the distribution of the variables when only a few values are missing and they are missing at random
- suitable only when there are not too many missing values

In [3]:
# Load the raw UCI Credit Approval file (it has no header row) and name the
# 16 columns A1..A16.
data = pd.read_csv('data/crx.data', header=None)
varnames = ['A' + str(s) for s in range(1, 17)]
data.columns = varnames

# The raw file marks missing entries with '?'; turn them into NaN, then cast
# the two columns that should be numeric and encode the '+'/'-' target as 1/0.
data = data.replace('?', np.nan)
data['A2'] = data['A2'].astype('float')
data['A14'] = data['A14'].astype('float')
data['A16'] = data['A16'].map({'+': 1, '-': 0})

# Add some missing values at random positions in four variables.
# random.randint() includes BOTH endpoints, so the upper bound has to be
# len(data) - 1: len(data) itself is not an existing row label and must not be
# handed to data.loc below.
random.seed(9001)
values = list({random.randint(0, len(data) - 1) for _ in range(100)})
for var in ['A3', 'A8', 'A9', 'A10']:
    data.loc[values, var] = np.nan

# Save the prepared data, then reload it so the rest of the notebook works
# with exactly what is stored on disk.
data.to_csv('data/creditApprovalUCI.csv', index=False)

data = pd.read_csv('data/creditApprovalUCI.csv')

In [4]:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [5]:
data.isnull().mean().sort_values(ascending=True) * 100

A11     0.000000
A12     0.000000
A13     0.000000
A15     0.000000
A16     0.000000
A4      0.869565
A5      0.869565
A6      1.304348
A7      1.304348
A1      1.739130
A2      1.739130
A14     1.884058
A3     13.333333
A8     13.333333
A9     13.333333
A10    13.333333
dtype: float64

In [6]:
data_cca = data.dropna()

In [7]:
print(f'Number of total observations: {len(data)}')
print(f'Number of observations with complete cases: {len(data_cca)}')

Number of total observations: 690
Number of observations with complete cases: 564


## Performing mean or median imputation

**Mean or median imputation consists of replacing missing values with the variable mean or
median**. 


- can only be performed in numerical variables
- mean or the median is calculated using a train set
- Use mean imputation if variables are normally distributed
- Use median imputation if variables are **not** normally distributed

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

Pandas example:

In [11]:
data = pd.read_csv('data/creditApprovalUCI.csv')

In [12]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [13]:
X_train.isnull().mean() * 100

A1      0.828157
A2      2.277433
A3     14.078675
A4      0.828157
A5      0.828157
A6      0.828157
A7      0.828157
A8     14.078675
A9     14.078675
A10    14.078675
A11     0.000000
A12     0.000000
A13     0.000000
A14     1.449275
A15     0.000000
dtype: float64

In [14]:
# Learn one median per variable from the train set only.
median_vars = ['A2', 'A3', 'A8', 'A11', 'A15']
train_medians = {var: X_train[var].median() for var in median_vars}

# DataFrame.fillna() with a dict fills only the listed columns, each with its
# own value, and returns a new frame. This avoids assigning column by column
# into the frames returned by train_test_split, which is a common trigger for
# pandas' SettingWithCopyWarning.
# The train-set medians are reused for the test set on purpose.
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

scikit-learn example:

In [17]:
X_train, X_test, y_train, y_test = train_test_split(data[['A2', 'A3', 'A8', 'A11', 'A15']], data['A16'], test_size=0.3, random_state=0)

> The NumPy array must not contain categorical values

In [18]:
imputer = SimpleImputer(strategy='median')
# imputer = SimpleImputer(strategy = 'mean')

In [19]:
imputer.fit(X_train)

In [20]:
# Let's inspect the learned median values:
imputer.statistics_

array([28.835,  2.75 ,  1.   ,  0.   ,  6.   ])

In [21]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [22]:
X_train

array([[4.608e+01, 3.000e+00, 2.375e+00, 8.000e+00, 4.159e+03],
       [1.592e+01, 2.875e+00, 8.500e-02, 0.000e+00, 0.000e+00],
       [3.633e+01, 2.125e+00, 8.500e-02, 1.000e+00, 1.187e+03],
       ...,
       [1.958e+01, 6.650e-01, 1.665e+00, 0.000e+00, 5.000e+00],
       [2.283e+01, 2.290e+00, 2.290e+00, 7.000e+00, 2.384e+03],
       [4.058e+01, 3.290e+00, 3.500e+00, 0.000e+00, 0.000e+00]])

In [24]:
#pd.DataFrame(X_train, columns = ['A2', 'A3', 'A8', 'A11', 'A15'])

SimpleImputer() returns NumPy arrays

---

feature_engine example:

In [26]:
from feature_engine.imputation import MeanMedianImputer

X_train, X_test, y_train, y_test = train_test_split(data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

# To perform mean imputation, change the imputation method, as follows: MeanMedianImputer(imputation_method='mean').
median_imputer = MeanMedianImputer(imputation_method='median', variables=['A2', 'A3', 'A8', 'A11', 'A15'])

In [27]:
median_imputer.fit(X_train)

In [28]:
median_imputer.imputer_dict_

{'A2': 28.835, 'A3': 2.75, 'A8': 1.0, 'A11': 0.0, 'A15': 6.0}

In [29]:
X_train = median_imputer.transform(X_train)
X_test = median_imputer.transform(X_test)

In [30]:
X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,a,46.08,3.0,u,g,c,v,2.375,t,t,8,t,g,396.0,4159
303,a,15.92,2.875,u,g,q,v,0.085,f,f,0,f,g,120.0,0
204,b,36.33,2.125,y,p,w,v,0.085,t,t,1,f,g,50.0,1187
351,b,22.17,0.585,y,p,ff,ff,0.0,f,f,0,f,g,100.0,0
118,b,57.83,7.04,u,g,m,v,14.0,t,t,6,t,g,360.0,1332


MeanMedianImputer() returns a dataframe

In [31]:
X_train[['A2','A3', 'A8', 'A11', 'A15']].isnull().mean()

A2     0.0
A3     0.0
A8     0.0
A11    0.0
A15    0.0
dtype: float64

### Mean / median imputation with Sklearn selecting features to impute

In [33]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [34]:
# Reload the prepared dataset so this section starts from un-imputed data.
data = pd.read_csv('data/creditApprovalUCI.csv')

# Hold out 30% of the rows for testing; A16 is passed as the target.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

# Numerical variables that receive mean imputation.
numeric_features_mean = ['A2', 'A3', 'A8', 'A11', 'A15']

# Single-step pipeline that wraps the mean imputer.
numeric_mean_imputer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))])

# Send only the listed columns through the imputer; remainder='passthrough'
# keeps all other columns in the output.
preprocessor = ColumnTransformer(
    transformers=[('mean_imputer', numeric_mean_imputer, numeric_features_mean)],
    remainder='passthrough',
)

# Fit on the train set only.
preprocessor.fit(X_train)

In [35]:
# and now we impute the data
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [36]:
# Note that Scikit-Learn transformers return NumPy arrays!!
X_train

array([[46.08, 3.0, 2.375, ..., 't', 'g', 396.0],
       [15.92, 2.875, 0.085, ..., 'f', 'g', 120.0],
       [36.33, 2.125, 0.085, ..., 'f', 'g', 50.0],
       ...,
       [19.58, 0.665, 1.665, ..., 'f', 'g', 220.0],
       [22.83, 2.29, 2.29, ..., 't', 'g', 140.0],
       [40.58, 3.29, 3.5, ..., 't', 's', 400.0]], dtype=object)

## Implementing mode or frequent category imputation

**Mode imputation consists of replacing missing values with the mode.** 


- categorical variables
- the mode is learned from the train set and then used to impute missing values in both the train and test sets
- if the number of missing values is large, this imputation can distort the distribution of the dataset

In [37]:
data = pd.read_csv('data/creditApprovalUCI.csv')

In [38]:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


pandas:

In [39]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [40]:
# Learn the most frequent category per variable from the train set only.
# Series.mode() can return several values on ties; [0] takes the first one.
mode_vars = ['A4', 'A5', 'A6', 'A7']
train_modes = {var: X_train[var].mode()[0] for var in mode_vars}

# DataFrame.fillna() with a dict fills only the listed columns and returns a
# new frame. This avoids assigning column by column into the frames returned
# by train_test_split, which is a common trigger for pandas'
# SettingWithCopyWarning.
# The train-set modes are reused for the test set on purpose.
X_train = X_train.fillna(train_modes)
X_test = X_test.fillna(train_modes)

scikit-learn:

In [49]:
X_train, X_test, y_train, y_test = train_test_split(data[['A4', 'A5', 'A6', 'A7']], data['A16'], test_size=0.3, random_state=0)

In [50]:
imputer = SimpleImputer(strategy='most_frequent')

In [51]:
imputer.fit(X_train)

In [52]:
imputer.statistics_

array(['u', 'g', 'c', 'v'], dtype=object)

In [53]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

Feature-engine:

In [54]:
from feature_engine.imputation import CategoricalImputer

X_train, X_test, y_train, y_test = train_test_split(data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

mode_imputer = CategoricalImputer(imputation_method="frequent", variables=['A4', 'A5', 'A6','A7'])

In [55]:
mode_imputer.fit(X_train)

In [56]:
mode_imputer.imputer_dict_

{'A4': 'u', 'A5': 'g', 'A6': 'c', 'A7': 'v'}

In [57]:
X_train = mode_imputer.transform(X_train)
X_test = mode_imputer.transform(X_test)

In [58]:
X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,a,46.08,3.0,u,g,c,v,2.375,t,t,8,t,g,396.0,4159
303,a,15.92,2.875,u,g,q,v,0.085,f,f,0,f,g,120.0,0
204,b,36.33,2.125,y,p,w,v,0.085,t,t,1,f,g,50.0,1187
351,b,22.17,0.585,y,p,ff,ff,0.0,f,f,0,f,g,100.0,0
118,b,57.83,7.04,u,g,m,v,14.0,t,t,6,t,g,360.0,1332


### Pipeline example

In [60]:
# Reload the prepared dataset so this section starts from un-imputed data.
data = pd.read_csv('data/creditApprovalUCI.csv')

# Hold out 30% of the rows for testing; A16 is passed as the target.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

# Variable groups: numerical -> mean, categorical -> most frequent category.
numeric_features_mean = ['A2', 'A3', 'A8', 'A11', 'A15']
category_features_mode = ['A4', 'A5', 'A6', 'A7']

# One single-step pipeline per imputation strategy.
numeric_mean_imputer = Pipeline(
    steps=[('imputer_n_mean', SimpleImputer(strategy='mean'))]
)
category_features_mode_imputer = Pipeline(
    steps=[('imputer_c_mode', SimpleImputer(strategy='most_frequent'))]
)

# Pair each pipeline with its variable group; remainder='passthrough' keeps
# all other columns in the output.
preprocessor = ColumnTransformer(
    transformers=[
        ('mean_imputer', numeric_mean_imputer, numeric_features_mean),
        ('mode_c_imputer', category_features_mode_imputer, category_features_mode),
    ],
    remainder='passthrough',
)

# Fit on the train set only.
preprocessor.fit(X_train)

## Replacing missing values with an arbitrary number

- the arbitrary value is usually 999, 9999, or -1
- numerical variables
- do not choose a value close to the mean, the median, or any other common value of the distribution
- data is not missing at random
- building non-linear models
- percentage of missing data is high
- distorts the original variable distribution

In [71]:
# Start again from the un-imputed data, keeping only numerical predictors.
data = pd.read_csv('data/creditApprovalUCI.csv')

X_train, X_test, y_train, y_test = train_test_split(
    data[['A2', 'A3', 'A8', 'A11']],
    data['A16'],
    test_size=0.3,
    random_state=0,
)

# strategy='constant' replaces every missing value with fill_value (here 99).
imputer = SimpleImputer(strategy='constant', fill_value=99)

imputer.fit(X_train)

In [72]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

https://feature-engine.trainindata.com/en/latest/api_doc/imputation/ArbitraryNumberImputer.html

## Capturing missing values in a bespoke category

- common to replace missing values with the Missing string
- categorical variables

In [73]:
# Start again from the un-imputed data, keeping only categorical predictors.
data = pd.read_csv('data/creditApprovalUCI.csv')

X_train, X_test, y_train, y_test = train_test_split(
    data[['A4', 'A5', 'A6', 'A7']],
    data['A16'],
    test_size=0.3,
    random_state=0,
)

# Missing entries become their own category, labelled 'Missing'.
imputer = SimpleImputer(strategy='constant', fill_value='Missing')
imputer.fit(X_train)

In [74]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [75]:
X_train[35:45]

array([['u', 'g', 'c', 'bb'],
       ['y', 'p', 'ff', 'ff'],
       ['y', 'p', 'ff', 'ff'],
       ['u', 'g', 'q', 'v'],
       ['Missing', 'Missing', 'Missing', 'Missing'],
       ['y', 'p', 'c', 'h'],
       ['u', 'g', 'd', 'v'],
       ['y', 'p', 'aa', 'v'],
       ['y', 'p', 'j', 'v'],
       ['u', 'g', 'k', 'v']], dtype=object)

> https://feature-engine.trainindata.com/en/latest/api_doc/imputation/CategoricalImputer.html: The CategoricalImputer() replaces missing data in categorical variables by an arbitrary value or by the most frequent category.

## Replacing missing values with a value at the end of the distribution

- may distort the distribution of the original variables, so it may not be suitable for linear models

In [76]:
from feature_engine.imputation import EndTailImputer


data = pd.read_csv('data/creditApprovalUCI.csv')


X_train, X_test, y_train, y_test = train_test_split(data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [77]:
imputer = EndTailImputer(imputation_method='iqr', tail='right', variables=['A2', 'A3', 'A8', 'A11', 'A15'])

In [78]:
imputer.fit(X_train)

In [79]:
imputer.imputer_dict_

{'A2': 88.18,
 'A3': 27.31,
 'A8': 11.504999999999999,
 'A11': 12.0,
 'A15': 1800.0}

In [80]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

## Implementing random sample imputation

- preserves the original distribution
- suitable for numerical and categorical variables alike

In [81]:
from feature_engine.imputation import RandomSampleImputer

In [82]:
data = pd.read_csv('data/creditApprovalUCI.csv')

X_train, X_test, y_train, y_test = train_test_split(data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [83]:
# RandomSampleImputer draws the replacement values at random, so seed it:
# without random_state the imputed values change on every run, unlike the
# other stochastic steps in this notebook, which are all seeded.
imputer = RandomSampleImputer(random_state=0)
imputer.fit(X_train)

In [84]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

## Performing multivariate imputation by chained equations

- Multivariate imputation by chained equations (MICE)

In [85]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import train_test_split

In [86]:
variables = ['A2','A3','A8', 'A11', 'A14', 'A15', 'A16']
data = pd.read_csv('data/creditApprovalUCI.csv', usecols=variables)

In [88]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('A16', axis=1),data['A16' ], test_size=0.3, random_state=0)

In [89]:
imputer = IterativeImputer(estimator = BayesianRidge(), max_iter=10, random_state=0)

In [90]:
imputer.fit(X_train)

In [91]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

## Assembling an imputation pipeline with scikit-learn

In [92]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [94]:
data = pd.read_csv('data/creditApprovalUCI.csv')

In [95]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [96]:
# Column groups, one per imputation strategy.
features_num_arbitrary = [  # numerical, filled with an arbitrary number
    'A3',
    'A8',
]
features_num_median = [  # numerical, filled with the median
    'A2',
    'A14',
]
features_cat_frequent = [  # categorical, filled with the most frequent category
    'A4',
    'A5',
    'A6',
    'A7',
]
features_cat_missing = [  # categorical, filled with a 'Missing' label
    'A1',
    'A9',
    'A10',
]

In [97]:
# One single-step pipeline per imputation strategy; every step is named
# 'imputer'.

# Numerical: replace missing values with the constant 99.
imputer_num_arbitrary = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value=99))]
)

# Numerical: replace missing values with the median.
imputer_num_median = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='median'))]
)

# Categorical: replace missing values with the most frequent category.
imputer_cat_frequent = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='most_frequent'))]
)

# Categorical: replace missing values with the string 'Missing'.
imputer_cat_missing = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value='Missing'))]
)

In [98]:
# Pair every imputer pipeline with the columns it handles, as
# (name, transformer, columns) triples.
imputation_steps = [
    ('imp_num_arbitrary', imputer_num_arbitrary, features_num_arbitrary),
    ('imp_num_median', imputer_num_median, features_num_median),
    ('imp_cat_frequent', imputer_cat_frequent, features_cat_frequent),
    ('imp_cat_missing', imputer_cat_missing, features_cat_missing),
]

# remainder='passthrough' keeps the columns that are in none of the groups.
preprocessor = ColumnTransformer(
    transformers=imputation_steps,
    remainder='passthrough',
)

In [99]:
preprocessor.fit(X_train)

In [100]:
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [101]:
X_train

array([[3.0, 2.375, 46.08, ..., 't', 'g', 4159],
       [2.875, 0.085, 15.92, ..., 'f', 'g', 0],
       [2.125, 0.085, 36.33, ..., 'f', 'g', 1187],
       ...,
       [0.665, 1.665, 19.58, ..., 'f', 'g', 5],
       [2.29, 2.29, 22.83, ..., 't', 'g', 2384],
       [3.29, 3.5, 40.58, ..., 't', 's', 0]], dtype=object)