In [1]:
# Based on: https://www.machinelearningplus.com/machine-learning/mice-imputation/#google_vignette
# IterativeImputer must be enabled explicitly because it is still experimental

# For understanding imputation please read: https://scikit-learn.org/stable/modules/impute.html

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
import pandas as pd
# Load the churn dataset straight from GitHub (no local copy needed).
# As the preview below shows, some cells (e.g. Gender, Age, Balance) are missing.
file_path = "https://raw.githubusercontent.com/armandoordonez/eda_couse/main/data/Churn_Modelling_m.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619.0,France,Female,42.0,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608.0,Spain,Female,41.0,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502.0,France,,,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699.0,France,,39.0,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850.0,Spain,Female,43.0,2,,1,1,1,79084.1,0


In [3]:
# Configure the scikit-learn iterative imputer: capped at 10 imputation
# rounds, with a fixed seed so repeated runs give the same fills.
imputer = IterativeImputer(
    max_iter=10,
    random_state=100,
)

In [4]:
# Use Numeric Features
# Only these three columns are passed to the imputers below. The selection is a
# separate frame: df_train still shows its NaNs later on, even after df has been
# overwritten with imputed values.
df_train = df.loc[:, ["Balance", "Age", "Exited"]]
df_train.head()

Unnamed: 0,Balance,Age,Exited
0,0.0,42.0,1
1,83807.86,41.0,0
2,159660.8,,1
3,0.0,39.0,0
4,,43.0,0


In [5]:
# Fit the imputer on the three selected columns (Balance, Age, Exited).
# Nothing is filled in yet; that happens in the transform step.
imputer.fit(df_train)

In [6]:
# Fill the missing entries. Per the output below, the result is a plain NumPy
# array (column order: Balance, Age, Exited), not a DataFrame.
# NOTE(review): the name df_imputed is rebound to the miceforest result
# further down, so it only refers to this array until that cell runs.
df_imputed = imputer.transform(df_train)
df_imputed[:10]

array([[0.00000000e+00, 4.20000000e+01, 1.00000000e+00],
       [8.38078600e+04, 4.10000000e+01, 0.00000000e+00],
       [1.59660800e+05, 4.47681408e+01, 1.00000000e+00],
       [0.00000000e+00, 3.90000000e+01, 0.00000000e+00],
       [7.25930035e+04, 4.30000000e+01, 0.00000000e+00],
       [1.13755780e+05, 4.40000000e+01, 1.00000000e+00],
       [0.00000000e+00, 5.00000000e+01, 0.00000000e+00],
       [1.15046740e+05, 2.90000000e+01, 1.00000000e+00],
       [1.42051070e+05, 4.40000000e+01, 0.00000000e+00],
       [1.34603880e+05, 2.70000000e+01, 0.00000000e+00]])

In [7]:
# Replace with imputed values
# Writes the imputed array back into df, overwriting the three columns in place.
# NOTE(review): after this cell df no longer holds the original NaNs for these
# columns, and re-running it after the miceforest cell would write that
# cell's df_imputed instead of the scikit-learn result.
df.loc[:, ["Balance", "Age", "Exited"]] = df_imputed
df.head(10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619.0,France,Female,42.0,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608.0,Spain,Female,41.0,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502.0,France,,44.768141,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699.0,France,,39.0,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850.0,Spain,Female,43.0,2,72593.003473,1,1,1,79084.1,0
5,6,15574012,Chu,645.0,Spain,Male,44.0,8,113755.78,2,1,0,,1
6,7,15592531,Bartlett,822.0,France,Male,50.0,7,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376.0,Germany,Female,29.0,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501.0,France,Male,44.0,4,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,,France,Male,27.0,2,134603.88,1,1,1,71725.73,0


In [8]:
!pip install miceforest --no-cache-dir

Collecting miceforest
  Downloading miceforest-6.0.3-py3-none-any.whl.metadata (35 kB)
Downloading miceforest-6.0.3-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: miceforest
Successfully installed miceforest-6.0.3


In [9]:
import miceforest as mf

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [10]:
# Build a seeded miceforest imputation kernel over the three-column frame.
kds = mf.ImputationKernel(df_train, random_state=100)

# Start with two rounds of the MICE procedure.
kds.mice(iterations=2)

# Extract the dataset with the imputed values filled in.
df_imputed = kds.complete_data()

In [11]:
# Input frame for reference: the NaNs are still present, i.e. running the kernel
# did not modify df_train.
df_train.head()

Unnamed: 0,Balance,Age,Exited
0,0.0,42.0,1
1,83807.86,41.0,0
2,159660.8,,1
3,0.0,39.0,0
4,,43.0,0


In [12]:
df_imputed.head()

# After 2 iterations the missing Age in row 2 is imputed as 50
# (and the missing Balance in row 4 as 0.0).
# Let's run for 10 more iterations and predict again.

Unnamed: 0,Balance,Age,Exited
0,0.0,42.0,1
1,83807.86,41.0,0
2,159660.8,50.0,1
3,0.0,39.0,0
4,0.0,43.0,0


In [13]:
# Continue the same kernel for 10 more iterations (on top of the 2 already run).
# n_estimators=50 is an extra keyword passed through mice() — presumably to the
# underlying tree model; confirm against the miceforest docs.
kds.mice(iterations=10, n_estimators=50)

In [14]:
# Completed data after the additional iterations, kept under a separate name so
# it can be compared with the earlier df_imputed.
df_imputed2 = kds.complete_data()
df_imputed2.head()

Unnamed: 0,Balance,Age,Exited
0,0.0,42.0,1
1,83807.86,41.0,0
2,159660.8,55.0,1
3,0.0,39.0,0
4,128760.32,43.0,0


In [None]:
# The prediction for the missing Age in row 2 has now changed from 50 to 55.

In [15]:
# Let's compare the actual value by loading the original data that does not contain the missing values
# In the preview below, the true Age for row 2 is 42 and the true Balance for
# row 4 is 125510.82.
# NOTE(review): pandas was already imported when the first dataset was loaded, so
# the import here is redundant; file_path is also rebound to the new URL.

import pandas as pd
file_path = "https://raw.githubusercontent.com/armandoordonez/eda_couse/main/data/Churn_Modelling_orig.csv"
dfs = pd.read_csv(file_path)
dfs[['Balance', 'Age', 'Exited']].head()

Unnamed: 0,Balance,Age,Exited
0,0.0,42,1
1,83807.86,41,0
2,159660.8,42,1
3,0.0,39,0
4,125510.82,43,0
