In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [18]:
# Check sklearn version
import sklearn ## iterative imputer is in experimental stage only till sklearn 0.23, for version <0.24 use from sklearn.experimental import enable_iterative_imputer and then import IterativeImputer
sklearn.__version__

'0.24.2'

In [19]:
from sklearn.impute import IterativeImputer

In [21]:
# data creation
df = pd.DataFrame({
    'age': [25,27,29,31,33,np.nan],
    'experience': [np.nan, 3,5,7,9,11],
    'salary': [50, np.nan, 110,140,170,200],
    'purchased' : [0,1,1,0,1,0]
})
df

Unnamed: 0,age,experience,salary,purchased
0,25.0,,50.0,0
1,27.0,3.0,,1
2,29.0,5.0,110.0,1
3,31.0,7.0,140.0,0
4,33.0,9.0,170.0,1
5,,11.0,200.0,0


In [23]:
# Drop unwanted columns
X = df.drop('purchased', 1)
y = df['purchased']

  X = df.drop('purchased', 1)


In [24]:
# Check for correlations amongest the features
X.corr()

Unnamed: 0,age,experience,salary
age,1.0,1.0,1.0
experience,1.0,1.0,1.0
salary,1.0,1.0,1.0


In [25]:
lr = LinearRegression()
# I'm using Linear Regression because all the features are highly correlated with each other
# In most real life datasets, they will be not, and the need will arise to use other regressors

imp = IterativeImputer(estimator=lr, verbose=2, max_iter=30, tol=1e-10, imputation_order='roman')
# play around with the max_iter and tol parameters to get a better feel of how it is working

In [27]:
# to find "scaled tolerance", multiply the tolerance with the max of the absolute values
# in the current dataset the tolerance is 1e-10 and max abosolute value from the dataset is 200, so scaled tolerance will be below value
1e-10 * 200

2e-08

In [28]:
# Transform the dataset to fill the missing values
imp.fit_transform(X)

[IterativeImputer] Completing matrix with shape (6, 3)
[IterativeImputer] Ending imputation round 1/30, elapsed time 0.02
[IterativeImputer] Change: 61.22518987714511, scaled tolerance: 2e-08 
[IterativeImputer] Ending imputation round 2/30, elapsed time 0.02
[IterativeImputer] Change: 7.963767891095614, scaled tolerance: 2e-08 
[IterativeImputer] Ending imputation round 3/30, elapsed time 0.03
[IterativeImputer] Change: 0.7509179143103637, scaled tolerance: 2e-08 
[IterativeImputer] Ending imputation round 4/30, elapsed time 0.03
[IterativeImputer] Change: 0.01311646776997577, scaled tolerance: 2e-08 
[IterativeImputer] Ending imputation round 5/30, elapsed time 0.04
[IterativeImputer] Change: 0.0008142526488228441, scaled tolerance: 2e-08 
[IterativeImputer] Ending imputation round 6/30, elapsed time 0.04
[IterativeImputer] Change: 3.9720598635994975e-05, scaled tolerance: 2e-08 
[IterativeImputer] Ending imputation round 7/30, elapsed time 0.05
[IterativeImputer] Change: 2.141289925

array([[ 25.,   1.,  50.],
       [ 27.,   3.,  80.],
       [ 29.,   5., 110.],
       [ 31.,   7., 140.],
       [ 33.,   9., 170.],
       [ 35.,  11., 200.]])

# Using n_nearest_features parameter in the iterative Imputater
use n_nearest_features : int, default=None


In [32]:
# imagine we have a dataset with 6 features, one of which is age.
# Now we want to predict the NaNs in age
# so we find the absolute correlation coefficient between age and all other features, as given below
corr_values = [0.9, 0.5, 0.8, 0.4, 0.1]
print("corr_values: ",corr_values)
print("sum of corr_values",np.sum(corr_values))
# So corelation coeeficient for feature will be
print("correlation coeefcient for feature1: ", 0.9/2.7)

corr_values:  [0.9, 0.5, 0.8, 0.4, 0.1]
sum of corr_values 2.7
correlation coeefcient for feature1:  0.3333333333333333


In [34]:
# we'll normalize the absolute correlation coefficients to have a sum of one
from sklearn.preprocessing import normalize
probs = normalize([corr_values], norm='l1')
probs = probs.ravel()
print("Correlation coeeficinets for the features: ",probs)
print("Sum of Correlation coeeficinets for the features: ",probs.sum())
# now we'll pick the number of neighbors we want (2 in this case) and set
# the weight/probabilites parameter as the 'probs' we calculated above
# so that numpy assigns the proportional weight to each feature according
# to the correlation of that feature with the target feature (age in this case)
np.random.choice([1,2,3,4,5], 2, replace=False, p=probs)

Correlation coeeficinets for the features:  [0.33333333 0.18518519 0.2962963  0.14814815 0.03703704]
Sum of Correlation coeeficinets for the features:  1.0


array([3, 1])

# Working of  Iterative Imputer in the case of training and testing sets


In [35]:
# Data, feature and target creation
df = pd.DataFrame({
    'age': [25,27,29,31,33,np.nan,37,39,41,np.nan,45],
    'experience': [np.nan, 3,5,7,9,11,13,16,np.nan,19,21],
    'salary': [50, np.nan, 110,140,170,200,230,260,np.nan,320,350],
    'purchased' : [0,1,1,0,1,0,0,1,1,0,0]
})
X = df.drop('purchased', 1)
y = df['purchased']

  X = df.drop('purchased', 1)


In [37]:
# Spliting of train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
X_train

Unnamed: 0,age,experience,salary
7,39.0,16.0,260.0
1,27.0,3.0,
9,,19.0,320.0
3,31.0,7.0,140.0
8,41.0,,
4,33.0,9.0,170.0
2,29.0,5.0,110.0
0,25.0,,50.0


In [44]:
# fiting iterative imputer for recreation of missing values
lr = LinearRegression()
imp = IterativeImputer(estimator=lr, tol=1e-10, max_iter=1, verbose=2, imputation_order='roman') #by default is mean
print("Transformed training feature matrix: \n",imp.fit_transform(X_train))
# cross check filled values with the mean values
print("X_train.mean: \n",X_train.mean())

[IterativeImputer] Completing matrix with shape (8, 3)
[IterativeImputer] Ending imputation round 1/1, elapsed time 0.00
[IterativeImputer] Change: 96.64863952063612, scaled tolerance: 3.2e-08 
Transformed training feature matrix: 
 [[ 39.          16.         260.        ]
 [ 27.           3.          78.35136048]
 [ 46.3489256   19.         320.        ]
 [ 31.           7.         140.        ]
 [ 41.          15.52952456 264.85940454]
 [ 33.           9.         170.        ]
 [ 29.           5.         110.        ]
 [ 25.           1.7842604   50.        ]]
X_train.mean: 
 age            32.142857
experience      9.833333
salary        175.000000
dtype: float64




In [50]:
# Check any missing values in the X test
print("X_test: \n",X_test)
# Check for the imputation_sequence_ steps
imp.imputation_sequence_
# Trasform the test data from the object created with training data
imp.transform(X_test)

X_test: 
      age  experience  salary
6   37.0        13.0   230.0
5    NaN        11.0   200.0
10  45.0        21.0   350.0
[IterativeImputer] Completing matrix with shape (3, 3)
[IterativeImputer] Ending imputation round 1/1, elapsed time 0.00


array([[ 37.       ,  13.       , 230.       ],
       [ 35.8400104,  11.       , 200.       ],
       [ 45.       ,  21.       , 350.       ]])