In [10]:
import warnings

# turn off warnings
warnings.filterwarnings("ignore")

Let's load a sample dataset that contains some missing values.

In [7]:
import pandas as pd

# Remote CSV of employee records that ships with gaps in several columns.
url = 'https://raw.githubusercontent.com/gakudo-ai/open-datasets/refs/heads/main/employees_dataset_with_missing.csv'
df = pd.read_csv(url)

# Report the frame's dimensions, then the NaN count of every column.
print("Loaded dataset shape: {}".format(df.shape))
print("Missing values per column:\n{}".format(df.isna().sum()))

Loaded dataset shape: (1000, 5)
Missing values per column:
age                116
income             149
education_years     91
experience         127
credit_score         0
dtype: int64


We can see that four of the five columns contain missing values; only `credit_score` is complete.

In [8]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

# Iterative imputer with its default estimator, limited to 10 rounds.
iterative_imputer = IterativeImputer(max_iter=10, random_state=42)

# Impute, then wrap the result in a DataFrame carrying df's row/column labels.
imputed_values = iterative_imputer.fit_transform(df)
df_iterative = pd.DataFrame(imputed_values, index=df.index, columns=df.columns)

# Sanity check: same shape as the input and no NaNs left anywhere.
print("\n1. Iterative Imputation (MICE):")
print("Full dataset shape: {}".format(df_iterative.shape))
print("Number of missing values: {}".format(df_iterative.isna().sum().sum()))


1. Iterative Imputation (MICE):
Full dataset shape: (1000, 5)
Number of missing values: 0


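The iterative method is based on Multiple Imputation by Chained Equations (MICE): each feature with missing values is modelled as a function of the other features, and various estimators can be used for that model. The default estimator is `BayesianRidge`; below we use a Random Forest instead.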

In [11]:
# Iterative imputer whose per-feature model is a small random forest
# (10 trees), limited to 5 rounds.
rf_estimator = RandomForestRegressor(n_estimators=10, random_state=42)
rf_iterative_imputer = IterativeImputer(
    estimator=rf_estimator,
    max_iter=5,
    random_state=42,
)

# Impute, then wrap the result in a DataFrame carrying df's row/column labels.
rf_imputed_values = rf_iterative_imputer.fit_transform(df)
df_rf_iterative = pd.DataFrame(rf_imputed_values, index=df.index, columns=df.columns)

# Preview the first rows of the imputed frame.
df_rf_iterative.head()

Unnamed: 0,age,income,education_years,experience,credit_score
0,36.805366,70990.331549,11.974465,0.460962,563.65064
1,33.617357,63869.505244,13.566444,5.698075,646.879651
2,41.476885,50894.455549,11.62274,7.931972,651.801687
3,50.230299,40295.948334,13.076115,19.438438,697.263035
4,32.658466,60473.349704,8.319156,12.782766,513.314164


We can also use KNN imputation, which fills each missing value from the most similar rows; the number of neighboring rows is set with `n_neighbors`.

In [12]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# KNNImputer chooses neighbours by distance over the feature values, so on
# the raw data a large-magnitude column such as income (tens of thousands)
# would drown out columns such as education_years (around ten).
# Standardising first lets every feature contribute comparably.
# StandardScaler ignores NaNs when fitting and leaves them in place on transform.
knn_scaler = StandardScaler()
df_scaled = knn_scaler.fit_transform(df)

# Fill each gap from the 5 nearest rows, weighting closer rows more heavily.
knn_imputer = KNNImputer(n_neighbors=5, weights='distance')
knn_imputed_scaled = knn_imputer.fit_transform(df_scaled)

# Map the imputed values back to the original units and restore df's labels.
df_knn = pd.DataFrame(
    knn_scaler.inverse_transform(knn_imputed_scaled),
    columns=df.columns,
    index=df.index
)

print("\n2. KNN Imputation:")
print(f"Using {knn_imputer.n_neighbors} nearest neighbors")
print(f"Remaining missing values: {df_knn.isnull().sum().sum()}")


2. KNN Imputation:
Using 5 nearest neighbors
Remaining missing values: 0


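Now let's compare different estimators for the iterative imputer. As a first step, we build one imputed dataset per estimator and compare the mean income of each; these datasets can then be used to see how the choice of imputation method affects a downstream model's performance.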

In [13]:
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

# Candidate per-feature regressors; each one is wrapped in its own IterativeImputer.
estimators = {
    'bayesian_ridge': BayesianRidge(),
    'extra_trees': ExtraTreesRegressor(n_estimators=10, random_state=42),
    'rf_regressor': RandomForestRegressor(n_estimators=10, random_state=42),
}
imputers = {
    key: IterativeImputer(estimator=estimator, random_state=42)
    for key, estimator in estimators.items()
}

# Fit every imputer on the same raw frame, keeping df's row and column labels.
imputed_datasets = {
    key: pd.DataFrame(fitted.fit_transform(df), index=df.index, columns=df.columns)
    for key, fitted in imputers.items()
}

# Summarise each imputed version by the mean of its income column.
print("\n3. Imputed Dataset Versions based on Different Estimators:")
for name, dataset in imputed_datasets.items():
    print(f"{name}: Mean income = ${dataset['income'].mean():.2f}")


3. Imputed Dataset Versions based on Different Estimators:
bayesian_ridge: Mean income = $51056.16
extra_trees: Mean income = $50992.66
rf_regressor: Mean income = $50957.85
