In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression

In [29]:
input_file = "../simple_gen_missing_data.csv"

df = pd.read_csv(input_file)
vars = df.columns[1:]

# Mode Imputation

In [17]:
# Imputation by most common value per variable (aka popularity)
output_file = "../simple_gen_popularity_imputation.csv"
df_popular_imp = df.copy()

for var in vars:
    mode_value = df_popular_imp[var].mode()[0]
    print(f'Variable {var}: mode value {mode_value}')
    df_popular_imp[var].fillna(mode_value, inplace=True)

df_popular_imp.to_csv(output_file, index=False, na_rep='')

Variable X1: mode value 0.0
Variable X2: mode value 0.0
Variable X3: mode value 0.0
Variable X4: mode value 0.0
Variable X5: mode value 0.0
Variable X6: mode value 0.0
Variable X7: mode value 0.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_popular_imp[var].fillna(mode_value, inplace=True)


# Imputation by Prevalence

In [None]:
# Imputation by probability distribution of existing values
output_file = "../simple_gen_probability_imputation.csv"
df_prob_imp = df.copy()

for var in vars:
    # Get the distribution of non-missing values
    freq_table = df_prob_imp[var].value_counts(dropna=True)
    distinct_vals = freq_table.index.to_list()
    probabilities = (freq_table / freq_table.sum()).to_list()

    print(f'Variable {var}: values {distinct_vals} probability {probabilities}')
    
    # Identify which rows are missing
    missing_mask = df_prob_imp[var].isna()
    n_missing = missing_mask.sum()
    
    # Randomly sample from the distinct values, using the same distribution
    random_draws = np.random.choice(distinct_vals, size=n_missing, p=probabilities)
    
    # Fill in the missing values
    df_prob_imp.loc[missing_mask, var] = random_draws

df_prob_imp.to_csv(output_file, index=False)

Variable X1: values [0.0, 1.0] probability [0.8012985772939941, 0.1987014227060059]
Variable X2: values [0.0, 1.0] probability [0.8993218709953011, 0.10067812900469884]
Variable X3: values [0.0, 1.0] probability [0.8254610462928115, 0.17453895370718855]
Variable X4: values [0.0, 1.0] probability [0.8927264744621689, 0.10727352553783111]
Variable X5: values [0.0, 1.0] probability [0.9153321081648726, 0.08466789183512732]
Variable X6: values [0.0, 1.0] probability [0.9014970916372652, 0.09850290836273481]
Variable X7: values [0.0, 1.0] probability [0.8876616379310345, 0.11233836206896551]


# Logistic Regression Imputation

In [None]:
output_file = "../simple_gen_logreg_imputation.csv"

imputer = IterativeImputer(
    estimator=LogisticRegression(),
    max_iter=10,
    random_state=0
)

df_imputed_array = imputer.fit_transform(df)

df_logreg_imputed = pd.DataFrame(df_imputed_array, columns=df.columns)
df_logreg_imputed.to_csv(output_file, index=False)

# kNN Imputation

In [None]:
output_file = "../simple_gen_kNN_imputation.csv"

# Create and fit the KNNImputer
imputer = KNNImputer(n_neighbors=4)
imputed_data = imputer.fit_transform(df)

df_kNN_imputed = pd.DataFrame(imputed_data, columns=df.columns)
df_kNN_imputed.to_csv(output_file, index=False)