In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.impute import IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

In [33]:
input_file = "Simulated Data/Normal (Non-Categorical)/non_categorical_gen_missing_data.csv"

df = pd.read_csv(input_file)
vars = df.columns[1:]

# Mode Imputation

In [15]:
# Imputation by most common value per variable (aka popularity)
output_file = "Simulated Data/Normal (Non-Categorical)/non_categorical_gen_popularity_imputation.csv"
df_popular_imp = df.copy()

for var in vars:
    mode_value = df_popular_imp[var].mode()[0]
    print(f'Variable {var}: mode value {mode_value}')
    df_popular_imp[var].fillna(mode_value, inplace=True)

df_popular_imp.to_csv(output_file, index=False, na_rep='')

Variable X1: mode value -3.496535301279392
Variable X2: mode value -0.9999892647101024
Variable X3: mode value -7.329989629035797
Variable X4: mode value -16.24273362709853
Variable X5: mode value -14.880972504241582
Variable X6: mode value -6.873857236561371
Variable X7: mode value -11.75161442543215


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_popular_imp[var].fillna(mode_value, inplace=True)


# Imputation by Prevalence

In [16]:
# Imputation by probability distribution of existing values
output_file = "Simulated Data/Normal (Non-Categorical)/non_categorical_gen_probability_imputation.csv"
df_prob_imp = df.copy()

for var in vars:
    # Get the distribution of non-missing values
    freq_table = df_prob_imp[var].value_counts(dropna=True)
    distinct_vals = freq_table.index.to_list()
    probabilities = (freq_table / freq_table.sum()).to_list()

    print(f'Variable {var}: values {distinct_vals} probability {probabilities}')
    
    # Identify which rows are missing
    missing_mask = df_prob_imp[var].isna()
    n_missing = missing_mask.sum()
    
    # Randomly sample from the distinct values, using the same distribution
    random_draws = np.random.choice(distinct_vals, size=n_missing, p=probabilities)
    
    # Fill in the missing values
    df_prob_imp.loc[missing_mask, var] = random_draws

df_prob_imp.to_csv(output_file, index=False)

Variable X1: values [0.265978614986373, -1.843208311699574, -0.0324912859077737, -2.23665737973322, 0.4404733707861265, -0.8375916713489665, -0.9484912012150364, 0.0372652107755352, -1.341261288276722, -0.4639503240346658, -1.5707434577257964, -0.695416048685149, 1.926742081760706, 0.5838528955075006, 1.8839419843787195, 1.213911092262495, 0.5817640716708179, -2.56631492355811, 0.3272863516786388, 0.9297102203981537, -1.567466239448508, -1.8984064847011335, 0.1451847882617602, 0.5105098698602727, -0.6515843085878045, 0.9086893015462982, -0.7691523264275874, -0.405435100872383, -1.3722455121415869, 0.4084279672018622, -0.97956206577035, -0.9914040343732246, -0.7906254493372935, -0.6469538437936735, -0.1957433528715078, 0.3734331387017414, 0.4611743760176721, 0.1753693330712442, 0.4853258491002012, 0.6159615544677999, -0.6476536445168745, -1.464237107190103, 0.1789545169673645, 0.304386621829455, 1.619616857274999, 0.5321424582973115, -0.019143310018485, 0.2640306104420108, -1.1268685089

# Logistic Regression Imputation (binary only)

In [19]:
output_file = "Simulated Data/Normal (Non-Categorical)/non_categorical_gen_logreg_imputation.csv"

imputer = IterativeImputer(
    estimator=LogisticRegression(),
    max_iter=10,
    random_state=0
)

df_imputed_array = imputer.fit_transform(df)

df_logreg_imputed = pd.DataFrame(df_imputed_array, columns=df.columns)
df_logreg_imputed.to_csv(output_file, index=False)

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

# Linear Regression Imputation (continuous only)

In [30]:
output_file = "Simulated Data/Normal (Non-Categorical)/non_categorical_gen_linreg_imputation.csv"

imputer = IterativeImputer(
    estimator=LinearRegression(),
    max_iter=10,
    random_state=42
)

df_imputed_array = imputer.fit_transform(df)

df_linreg_imputed = pd.DataFrame(df_imputed_array, columns=df.columns)
df_linreg_imputed.to_csv(output_file, index=False)



# kNN Imputation

In [None]:
output_file = "Simulated Data/Normal (Non-Categorical)/non_categorical_gen_kNN_imputation.csv"

# Create and fit the KNNImputer
imputer = KNNImputer(n_neighbors=4)
imputed_data = imputer.fit_transform(df)

df_kNN_imputed = pd.DataFrame(imputed_data, columns=df.columns)

In [None]:
# Force results to be binary (only for one hot encoded data)
binary_cols = ["X1","X2","X3","X4","X5","X6","X7"]

col_prevalences = {}
for col in binary_cols:
    valid_values = df[col].dropna()  # original data, ignoring missing
    if len(valid_values) == 0:
        # If an entire column was missing, we default its prevalence to 0.5
        col_prevalences[col] = 0.5
    else:
        # Mean of binary values = fraction of ones
        col_prevalences[col] = valid_values.mean()

# Threshold only the binary columns
for col in binary_cols:
    is_half = df_kNN_imputed[col] == 0.5
    
    # Values < 0.5 -> 0
    df_kNN_imputed.loc[df_kNN_imputed[col] < 0.5, col] = 0
    
    # Values > 0.5 -> 1
    df_kNN_imputed.loc[df_kNN_imputed[col] > 0.5, col] = 1
    
    # Values == 0.5 -> break tie based on prevalence in original data
    if col_prevalences[col] >= 0.5:
        df_kNN_imputed.loc[is_half, col] = 1
    else:
        df_kNN_imputed.loc[is_half, col] = 0
    
    # Convert to integer
    df_kNN_imputed[col] = df_kNN_imputed[col].astype(int)

In [None]:
df_kNN_imputed.to_csv(output_file, index=False)