In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from scipy.stats import mode
import faiss
from tqdm.notebook import tqdm
from sklearn.impute import SimpleImputer
from treeple import UnsupervisedRandomForest

In [10]:
input_file = "Data/Simulated Data/New Simulated Data t=21/simple_gen_missing_data_new.csv"

df = pd.read_csv(input_file)
vars = df.columns[1:]

# Mode Imputation

In [5]:
# Imputation by most common value per variable (aka popularity)
output_file = "Simulated Data/simple_gen_mode_impute_new.csv"
df_popular_imp = df.copy()

for var in vars:
    mode_value = df_popular_imp[var].mode()[0]
    print(f'Variable {var}: mode value {mode_value}')
    df_popular_imp[var].fillna(mode_value, inplace=True)

df_popular_imp.to_csv(output_file, index=False, na_rep='')

Variable X1: mode value 0.0
Variable X2: mode value 0.0
Variable X3: mode value 0.0
Variable X4: mode value 0.0
Variable X5: mode value 0.0
Variable X6: mode value 0.0
Variable X7: mode value 0.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_popular_imp[var].fillna(mode_value, inplace=True)


# Imputation by Prevalence

In [6]:
# Imputation by probability distribution of existing values
output_file = "Simulated Data/simple_gen_prevalence_impute_new.csv"
df_prob_imp = df.copy()

for var in vars:
    # Get the distribution of non-missing values
    freq_table = df_prob_imp[var].value_counts(dropna=True)
    distinct_vals = freq_table.index.to_list()
    probabilities = (freq_table / freq_table.sum()).to_list()

    print(f'Variable {var}: values {distinct_vals} probability {probabilities}')
    
    # Identify which rows are missing
    missing_mask = df_prob_imp[var].isna()
    n_missing = missing_mask.sum()
    
    # Randomly sample from the distinct values, using the same distribution
    random_draws = np.random.choice(distinct_vals, size=n_missing, p=probabilities)
    
    # Fill in the missing values
    df_prob_imp.loc[missing_mask, var] = random_draws

df_prob_imp.to_csv(output_file, index=False)

Variable X1: values [0.0, 1.0] probability [0.7999652674407065, 0.20003473255929344]
Variable X2: values [0.0, 1.0] probability [0.8984598659177387, 0.10154013408226129]
Variable X3: values [0.0, 1.0] probability [0.8230522473360127, 0.17694775266398732]
Variable X4: values [0.0, 1.0] probability [0.8911992869180095, 0.1088007130819905]
Variable X5: values [0.0, 1.0] probability [0.9105684441777443, 0.08943155582225568]
Variable X6: values [0.0, 1.0] probability [0.9068353343745902, 0.09316466562540984]
Variable X7: values [0.0, 1.0] probability [0.8930360977605072, 0.10696390223949279]


# Logistic Regression Imputation (binary only)

In [7]:
output_file = "Simulated Data/simple_gen_logreg_impute_new.csv"

imputer = IterativeImputer(
    estimator=LogisticRegression(),
    max_iter=10,
    random_state=0
)

df_imputed_array = imputer.fit_transform(df)

df_logreg_imputed = pd.DataFrame(df_imputed_array, columns=df.columns)
df_logreg_imputed.to_csv(output_file, index=False)

# Linear Regression Imputation (continuous only)

In [30]:
output_file = "Simulated Data/Normal (Non-Categorical)/non_categorical_gen_linreg_imputation.csv"

imputer = IterativeImputer(
    estimator=LinearRegression(),
    max_iter=10,
    random_state=42
)

df_imputed_array = imputer.fit_transform(df)

df_linreg_imputed = pd.DataFrame(df_imputed_array, columns=df.columns)
df_linreg_imputed.to_csv(output_file, index=False)



# kNN Imputation

In [11]:
# ------------- USER PARAMETERS -------------
n_subsamples = 0
sample_with_replacement = False
k_values = [3, 5, 7]
binary_cols = ["X1", "X2", "X3", "X4", "X5", "X6", "X7"]

# Note how the templates include both {n_subsamples} and {sub_i}
partial_output_template = (
    "Data/Simulated Data/New Simulated Data t=21/kNN/{n_subsamples}k3,5/simple_gen_kNNavg{sub_i}k3,5,7_impute_partial.csv"
)
final_output_template = (
    "Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavgk3,5,7_impute_binary.csv"
)

# We must format the final_output_template with n_subsamples
final_output_file = final_output_template.format(n_subsamples=n_subsamples)

# ------------- PRECOMPUTE COLUMN MODES FOR TIE-BREAKING -------------
col_modes = {}
for col in binary_cols:
    if df[col].notna().sum() == 0:
        col_modes[col] = 0
    else:
        col_modes[col] = df[col].mode().iloc[0]

# ------------- HELPER: FORCE TO BINARY -------------
def force_to_binary(imputed_array, df_reference, binary_cols, col_modes):
    for col in binary_cols:
        col_idx = df_reference.columns.get_loc(col)
        col_values = imputed_array[:, col_idx]

        mask_lt = col_values < 0.5
        mask_gt = col_values > 0.5
        mask_eq = np.isclose(col_values, 0.5)

        col_values[mask_lt] = 0
        col_values[mask_gt] = 1
        col_values[mask_eq] = col_modes[col]

        imputed_array[:, col_idx] = col_values

    return imputed_array

# ------------- MAIN LOGIC -------------
all_imputations = []  # Collect all imputed arrays for final averaging

if n_subsamples == 0:
    # NO SUBSAMPLING
    imputed_list = []
    for k in tqdm(k_values, desc="Imputing on full dataset (no subsampling)"):
        imputer = KNNImputer(n_neighbors=k)
        imputer.fit(df)
        imputed_data = imputer.transform(df)
        imputed_list.append(imputed_data)

    avg_imputed_data = np.mean(np.stack(imputed_list, axis=0), axis=0)
    avg_imputed_data = force_to_binary(avg_imputed_data, df, binary_cols, col_modes)
    df_imputed_final = pd.DataFrame(avg_imputed_data, columns=df.columns)

    df_imputed_final.to_csv(final_output_file, index=False)
    print(f"Saved final (no subsampling) imputed CSV to {final_output_file}")

else:
    # WITH SUBSAMPLING
    for sub_i in tqdm(range(1, n_subsamples + 1), desc="Subsamples"):
        subsample = df.groupby('t', group_keys=False).apply(
            lambda x: x.sample(frac=1, replace=sample_with_replacement)
        )

        sub_imputed_list = []

        for k in tqdm(k_values, desc=f"k-values for Subsample {sub_i}", leave=False):
            imputer = KNNImputer(n_neighbors=k)
            imputer.fit(subsample)
            imputed_data = imputer.transform(df)

            sub_imputed_list.append(imputed_data)
            all_imputations.append(imputed_data)

        # Average across all k-values for this subsample
        sub_avg_imputed_data = np.mean(np.stack(sub_imputed_list, axis=0), axis=0)
        sub_avg_imputed_data = force_to_binary(sub_avg_imputed_data, df, binary_cols, col_modes)

        df_sub_imputed = pd.DataFrame(sub_avg_imputed_data, columns=df.columns)

        # Now format the partial_output_template with BOTH sub_i and n_subsamples
        partial_output_file = partial_output_template.format(
            n_subsamples=n_subsamples,
            sub_i=sub_i
        )
        df_sub_imputed.to_csv(partial_output_file, index=False)

        print(f"Saved partial imputed dataset for subsample {sub_i} -> {partial_output_file}")

    # FINAL AVERAGE ACROSS ALL SUBSAMPLES & K-VALUES
    final_avg_imputed_data = np.mean(np.stack(all_imputations, axis=0), axis=0)
    final_avg_imputed_data = force_to_binary(final_avg_imputed_data, df, binary_cols, col_modes)

    df_imputed_final = pd.DataFrame(final_avg_imputed_data, columns=df.columns)
    df_imputed_final.to_csv(final_output_file, index=False)
    print(f"Saved final averaged imputed CSV to {final_output_file}")

Imputing on full dataset (no subsampling):   0%|          | 0/3 [00:00<?, ?it/s]

Saved final (no subsampling) imputed CSV to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavgk3,5,7_impute_binary.csv


## k=4

In [None]:
output_file = "Simulated Data/simple_gen_kNN4_impute.csv"

# Create and fit the KNNImputer
imputer = KNNImputer(n_neighbors=4)
imputed_data = imputer.fit_transform(df)

df_kNN_imputed = pd.DataFrame(imputed_data, columns=df.columns)

## Multiple k

In [3]:
output_file = "Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNN3,5_impute_binary.csv"

# Define the range of k values
k_values = [3, 5]
imputed_list = []

# Loop over each k value, impute the dataset, and store the results
for k in tqdm(k_values, desc="Imputing data"):
    imputer = KNNImputer(n_neighbors=k)
    imputed_data = imputer.fit_transform(df)
    imputed_list.append(imputed_data)

# Stack all imputed arrays along a new axis and compute the cell-wise average
avg_imputed_data = np.mean(np.stack(imputed_list, axis=0), axis=0)

binary_cols = ["X1", "X2", "X3", "X4", "X5", "X6", "X7"]

# Force values to be binary for specified columns
for col in binary_cols:
    # Compute the majority value (mode) for the current column in the original dataframe
    majority_val = df[col].mode().iloc[0]  # In case of ties, mode() returns the first encountered value
    col_idx = df.columns.get_loc(col)      # Get the index of the column in the dataframe
    col_values = avg_imputed_data[:, col_idx]
    
    # Create masks for values: less than 0.5, greater than 0.5, and those equal to 0.5 (with tolerance)
    mask_lt = col_values < 0.5
    mask_gt = col_values > 0.5
    mask_eq = np.isclose(col_values, 0.5)
    
    # Apply threshold rules: <0.5 becomes 0, >0.5 becomes 1, exactly 0.5 becomes the majority value
    col_values[mask_lt] = 0
    col_values[mask_gt] = 1
    col_values[mask_eq] = majority_val
    
    # Replace the column in the averaged array with the updated values
    avg_imputed_data[:, col_idx] = col_values

# Create a DataFrame from the averaged imputed data
df_kNN_imputed = pd.DataFrame(avg_imputed_data, columns=df.columns)

# Save the imputed DataFrame to the output file path
df_kNN_imputed.to_csv(output_file, index=False)

Imputing data:   0%|          | 0/2 [00:00<?, ?it/s]

## Test kNN n subsamples, only one k

In [10]:
# Number of subsamples and range of k-values
n_subsamples = 7
k_values = range(2, 3)

# Loop over each k value
for k in tqdm(k_values):
    imputed_list = []  # List to hold the imputed arrays for the current k

    # Loop over subsamples (bootstrap replications)
    for i in range(n_subsamples):
        # Create a stratified bootstrap sample by grouping over 't'
        subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
        
        # Create and fit a KNN imputer with the current k value on the subsample
        imputer = KNNImputer(n_neighbors=k)
        imputer.fit(subsample)
        
        # Impute the entire dataset using the fitted imputer
        imputed_data = imputer.transform(df)
        
        # Append the imputed result to the list
        imputed_list.append(imputed_data)

    # Average the imputed arrays across the 10 subsamples for the current k value
    avg_imputed_data = np.mean(np.stack(imputed_list, axis=0), axis=0)

    # Force the averaged imputed values to be binary for the specified columns
    # (Assumes binary_cols is a list of column names with binary data)
    for col in binary_cols:
        # Get the majority (mode) value for the column in the original data
        majority_val = df[col].mode().iloc[0]
        col_idx = df.columns.get_loc(col)
        col_values = avg_imputed_data[:, col_idx]

        # Create boolean masks for values less than, greater than, and essentially equal to 0.5
        mask_lt = col_values < 0.5
        mask_gt = col_values > 0.5
        mask_eq = np.isclose(col_values, 0.5)

        # Apply threshold rules: <0.5 becomes 0, >0.5 becomes 1,
        # and values equal to 0.5 are set to the majority value from the original column
        col_values[mask_lt] = 0
        col_values[mask_gt] = 1
        col_values[mask_eq] = majority_val

        # Replace the column in the averaged array with the binary values
        avg_imputed_data[:, col_idx] = col_values

    # Convert the averaged imputed array into a DataFrame (with original column names)
    df_avg_imputed = pd.DataFrame(avg_imputed_data, columns=df.columns)

    # Build the output filename indicating the current k value
    output_file = f"Data/Simulated Data/New Simulated Data t=21/kNN/{n_subsamples}k 2 to 8 series/simple_gen_kNNavg{n_subsamples}k{k}_impute_binary.csv"
    df_avg_imputed.to_csv(output_file, index=False)

    # Print a message to indicate progress
    print(f"Saved averaged imputed dataset for k={k} to {output_file}")

  0%|          | 0/1 [00:00<?, ?it/s]

  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


Saved averaged imputed dataset for k=2 to Data/Simulated Data/New Simulated Data t=21/kNN/7k 2 to 8 series/simple_gen_kNNavg7k2_impute_binary.csv


In [None]:
output_file = "Data/Simulated Data/New Simulated Data t=21/simple_gen_kNNavg10k4_impute.csv"

# Number of subsamples to generate
n_subsamples = 10
imputed_list = []

# Loop to generate n_subsamples bootstrapped (stratified by 't') samples
for i in range(n_subsamples):
    # For each unique time stamp in 't', sample with replacement
    subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
    
    # Create and fit the KNN imputer on the subsample
    imputer = KNNImputer(n_neighbors=4)
    imputer.fit(subsample)
    
    # Use the fitted imputer to impute the entire dataset
    imputed_data = imputer.transform(df)
    imputed_list.append(imputed_data)

# Stack the five imputed arrays along a new axis and compute the cell-wise average
avg_imputed_data = np.mean(np.stack(imputed_list, axis=0), axis=0)

# Create a DataFrame from the averaged imputed data
df_kNN_imputed = pd.DataFrame(avg_imputed_data, columns=df.columns)

  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


## Avg kNN across subsamples + multiple k

NEED TO REWRITE: Have it impute for each subsample across multiple k, then save each of those imputed datasets (individually, not stacked with previous). Then edit code afterwards to apply averaging or majority vote method 

In [None]:
output_file = "Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg10k26_impute_new.csv"

# Number of subsamples and range of k-values
n_subsamples = 3
k_values = range(2, 7)

imputed_list = []

# Outer loop: iterate over subsamples (stratified bootstrap by 't')
for i in range(n_subsamples):
    # For each unique time stamp in 't', sample with replacement
    subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
    
    # Inner loop: iterate over each k-value
    for k in k_values:
        imputer = KNNImputer(n_neighbors=k)
        # Fit the imputer on the subsample
        imputer.fit(subsample)
        # Use the fitted imputer to impute the entire dataset
        imputed_data = imputer.transform(df)
        
        # Append this imputed array to our growing list
        imputed_list.append(imputed_data)
    
    # After finishing all k values in a subsample, compute the average of all imputations so far
    partial_avg_imputed_data = np.mean(np.stack(imputed_list, axis=0), axis=0)
    df_partial = pd.DataFrame(partial_avg_imputed_data, columns=df.columns)
    
    # Save this partial dataset to a file
    partial_output_file = f"Data/Simulated Data/New Simulated Data t=21/kNN/Nonbinary/simple_gen_kNNavg{i+1}k26_impute_new.csv"
    df_partial.to_csv(partial_output_file, index=False)
    print(f"Saved partial dataset after subsample {i+1} to {partial_output_file}")

# Average across all subsamples
avg_imputed_data = np.mean(np.stack(imputed_list, axis=0), axis=0)
df_kNN_imputed = pd.DataFrame(avg_imputed_data, columns=df.columns)

# Write out the final result
df_kNN_imputed.to_csv(output_file, index=False)

  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


Saved partial dataset after subsample 1 to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg1k26_impute_new.csv


  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


Saved partial dataset after subsample 2 to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg2k26_impute_new.csv


  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


Saved partial dataset after subsample 3 to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg3k26_impute_new.csv


  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


Saved partial dataset after subsample 4 to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg4k26_impute_new.csv


  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


Saved partial dataset after subsample 5 to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg5k26_impute_new.csv


  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


Saved partial dataset after subsample 6 to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg6k26_impute_new.csv


  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


Saved partial dataset after subsample 7 to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg7k26_impute_new.csv


  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


Saved partial dataset after subsample 8 to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg8k26_impute_new.csv


  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


Saved partial dataset after subsample 9 to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg9k26_impute_new.csv


  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


Saved partial dataset after subsample 10 to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg10k26_impute_new.csv


### Generate binary results (loop across subsamples)

In [8]:
# Method 1: Average imputed values, then force to binary based on 0.5 threshold

binary_cols = ["X1","X2","X3","X4","X5","X6","X7"]

# Compute the prevalence (fraction of ones) for each binary column in the original data
col_prevalences = {}
for col in binary_cols:
    valid_values = df[col].dropna()
    if len(valid_values) == 0:
        col_prevalences[col] = 0.5  # Default prevalence if the column is completely missing
    else:
        col_prevalences[col] = valid_values.mean()

# Loop over subsample indices 1 to 10 and process each CSV file
for i in range(1, 11):
    # Construct the input file path for the nonbinary imputed CSV
    input_file = f"Data/Simulated Data/New Simulated Data t=21/kNN/Nonbinary/simple_gen_kNNavg{i}k26_impute_new.csv"
    # Read the imputed CSV (nonbinary version)
    df_kNN_imputed = pd.read_csv(input_file)
    
    # Apply thresholding to force binary values on the binary columns
    for col in binary_cols:
        # Identify rows where the imputed value is exactly 0.5 (tie)
        is_half = df_kNN_imputed[col] == 0.5
        
        # Force values: below 0.5 become 0, above 0.5 become 1
        df_kNN_imputed.loc[df_kNN_imputed[col] < 0.5, col] = 0
        df_kNN_imputed.loc[df_kNN_imputed[col] > 0.5, col] = 1
        
        # For values equal to 0.5, break tie based on the original column prevalence
        if col_prevalences[col] >= 0.5:
            df_kNN_imputed.loc[is_half, col] = 1
        else:
            df_kNN_imputed.loc[is_half, col] = 0
        
        # Ensure the column is of integer type
        df_kNN_imputed[col] = df_kNN_imputed[col].astype(int)
    
    # Define the output file path for the binary (thresholded) dataset.
    output_file = f"Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg{i}k26_impute_binary.csv"
    # Save the processed DataFrame to CSV
    df_kNN_imputed.to_csv(output_file, index=False)
    print(f"Saved binary threshold imputed dataset for subsample {i} to {output_file}")

Saved binary threshold imputed dataset for subsample 1 to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg1k26_impute_binary.csv
Saved binary threshold imputed dataset for subsample 2 to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg2k26_impute_binary.csv
Saved binary threshold imputed dataset for subsample 3 to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg3k26_impute_binary.csv
Saved binary threshold imputed dataset for subsample 4 to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg4k26_impute_binary.csv
Saved binary threshold imputed dataset for subsample 5 to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg5k26_impute_binary.csv
Saved binary threshold imputed dataset for subsample 6 to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg6k26_impute_binary.csv
Saved binary threshold imputed dataset for subsample 7 to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNavg7

In [None]:
# Method 2: Force all imputed subsamples to binary, then do majority vote per cell

binary_cols = ["X1","X2","X3","X4","X5","X6","X7"]

# Compute the prevalence (fraction of ones) for each binary column in the original data
col_prevalences = {}
for col in binary_cols:
    valid_values = df[col].dropna()
    if len(valid_values) == 0:
        col_prevalences[col] = 0.5  # Default prevalence if the column is completely missing
    else:
        col_prevalences[col] = valid_values.mean()

# Each subsample produces 5 imputations (since len(k_values)==5)
k_per_subsample = len(k_values)

# Loop over the number of subsamples (1 to n_subsamples)
for i in range(1, n_subsamples + 1):
    # For i subsamples, take the first i*5 imputations from imputed_list
    num_imputations = i * k_per_subsample
    subset_imputed_list = imputed_list[:num_imputations]
    
    # Stack the selected imputations into a 3D array:
    # Shape: (num_imputations, n_rows, n_columns)
    imputed_array_subset = np.stack(subset_imputed_list, axis=0)
    
    # Start with the average imputed data for non-binary columns
    df_majority_partial = pd.DataFrame(np.mean(imputed_array_subset, axis=0), columns=df.columns)
    
    # Apply majority vote for each binary column:
    for col in binary_cols:
        # Get the column index in the dataframe
        col_idx = df.columns.get_loc(col)
        # Extract imputed values for this column across all selected imputations
        col_imputations = imputed_array_subset[:, :, col_idx]
        
        # Convert continuous imputed values to binary votes:
        # Values < 0.5 become 0, values > 0.5 become 1,
        # and values exactly 0.5 are set to np.nan (to mark ties).
        binary_imputations = np.where(col_imputations < 0.5, 0,
                                      np.where(col_imputations > 0.5, 1, np.nan))
        
        # For tie cases (np.nan), assign based on the original prevalence:
        tie_mask = np.isnan(binary_imputations)
        binary_imputations[tie_mask] = 1 if col_prevalences[col] >= 0.5 else 0
        
        # Compute the mode (majority vote) across the imputations axis (axis=0) for each row.
        maj_vote, _ = mode(binary_imputations, axis=0, keepdims=False)
        # Update the binary column in the DataFrame with the majority vote, cast to int.
        df_majority_partial[col] = maj_vote.astype(int)
    
    # Save the resulting majority voted imputed dataset for i subsamples.
    output_file_majority = f"Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNmaj{i}k26_impute_binary.csv"
    df_majority_partial.to_csv(output_file_majority, index=False)
    print(f"Saved majority vote imputed dataset using {i} subsample(s) to {output_file_majority}")

Saved majority vote imputed dataset using 1 subsample(s) to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNmaj1k26_impute_majority.csv
Saved majority vote imputed dataset using 2 subsample(s) to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNmaj2k26_impute_majority.csv
Saved majority vote imputed dataset using 3 subsample(s) to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNmaj3k26_impute_majority.csv
Saved majority vote imputed dataset using 4 subsample(s) to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNmaj4k26_impute_majority.csv
Saved majority vote imputed dataset using 5 subsample(s) to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNmaj5k26_impute_majority.csv
Saved majority vote imputed dataset using 6 subsample(s) to Data/Simulated Data/New Simulated Data t=21/kNN/simple_gen_kNNmaj6k26_impute_majority.csv
Saved majority vote imputed dataset using 7 subsample(s) to Data/Simulated Data/New Simulated Data t

# FAISS (faster kNN-like algorithm)

In [10]:
def faiss_knn_impute_ref(query, reference, k):
    n_query, n_features = query.shape
    n_ref = reference.shape[0]
    
    # Compute the column modes from the reference (ignoring NaNs)
    col_modes = np.empty(n_features)
    for j in range(n_features):
        valid_vals = reference[:, j][~np.isnan(reference[:, j])]
        if len(valid_vals) == 0:
            col_modes[j] = 0  # default if entirely missing
        else:
            # Ensure the result is always an array before indexing:
            col_modes[j] = np.atleast_1d(mode(valid_vals).mode)[0]
    
    # Fill missing values in reference with column modes for indexing
    reference_filled = np.where(np.isnan(reference), np.tile(col_modes, (n_ref, 1)), reference)
    
    # Build FAISS index (using L2 distance)
    d = n_features
    index = faiss.IndexFlatL2(d)
    reference_filled = np.ascontiguousarray(reference_filled.astype(np.float32))
    index.add(reference_filled)
    
    # Create a filled version of query for initial imputation (using column modes)
    query_filled = np.where(np.isnan(query), np.tile(col_modes, (n_query, 1)), query)
    X_imputed = query_filled.copy()
    
    # For each query row with missing values, impute missing features using weighted average
    for i in range(n_query):
        missing = np.isnan(query[i, :])
        if not np.any(missing):
            continue  # Skip rows with no missing values
        q_row = np.ascontiguousarray(query_filled[i:i+1].astype(np.float32))
        D, I = index.search(q_row, k + 1)
        if D[0, 0] == 0:
            neighbors = I[0][1:k+1]
            distances = D[0][1:k+1]
        else:
            neighbors = I[0][:k]
            distances = D[0][:k]
        for j in np.where(missing)[0]:
            neighbor_vals = []
            neighbor_weights = []
            epsilon = 1e-6
            for n, d in zip(neighbors, distances):
                if not np.isnan(reference[n, j]):
                    neighbor_vals.append(reference[n, j])
                    neighbor_weights.append(1.0 / (d + epsilon))
            if len(neighbor_vals) > 0:
                X_imputed[i, j] = np.average(neighbor_vals, weights=neighbor_weights)
            else:
                X_imputed[i, j] = col_modes[j]
    return X_imputed

In [11]:
# ----- MAIN IMPUTATION CODE -----
output_file = "Data/Simulated Data/New Simulated Data t=21/kNN/FAISS 7k26/simple_gen_FAISSavg7k26_impute_v3.csv"

# Number of subsamples and range of k-values
n_subsamples = 7
k_values = range(2, 7)

imputed_list = []

# Outer loop: iterate over subsamples (stratified bootstrap by 't') with a progress bar
for i in tqdm(range(n_subsamples), desc="Processing subsamples"):
    # For each unique time stamp in 't', sample with replacement
    subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))
    
    # List to hold imputed data for each k within the current subsample
    subsample_imputations = []
    
    # Inner loop: iterate over each k-value with a progress bar
    for k in tqdm(k_values, desc="Processing k-values", leave=False):
        # Use FAISS-based kNN imputation (using the subsample as reference to impute df)
        imputed_data = faiss_knn_impute_ref(df.to_numpy(), subsample.to_numpy(), k)
        subsample_imputations.append(imputed_data)
    
    # Compute the average imputation for the current subsample (averaging over k-values)
    sample_avg_imputed_data = np.mean(np.stack(subsample_imputations, axis=0), axis=0)
    
    # Save this sample's imputed data as a CSV file (without combining with previous samples)
    sample_output_file = f"Data/Simulated Data/New Simulated Data t=21/kNN/FAISS 7k26/simple_gen_FAISSavg7k26_impute_sample{i+1}_v3.csv"
    df_sample = pd.DataFrame(sample_avg_imputed_data, columns=df.columns)
    df_sample.to_csv(sample_output_file, index=False)
    print(f"Saved sample {i+1} imputed dataset to {sample_output_file}")
    
    # Append this sample's imputed data to our list
    imputed_list.append(sample_avg_imputed_data)

# After processing all subsamples, average across all samples (cell-wise average)
avg_imputed_data = np.mean(np.stack(imputed_list, axis=0), axis=0)
df_kNN_imputed = pd.DataFrame(avg_imputed_data, columns=df.columns)

# Write out the final averaged imputed result
df_kNN_imputed.to_csv(output_file, index=False)
print(f"Saved overall averaged imputed dataset to {output_file}")

Processing subsamples:   0%|          | 0/7 [00:00<?, ?it/s]

  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


Processing k-values:   0%|          | 0/5 [00:00<?, ?it/s]

Saved sample 1 imputed dataset to Data/Simulated Data/New Simulated Data t=21/kNN/FAISS 7k26/simple_gen_FAISSavg7k26_impute_sample1_v3.csv


  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


Processing k-values:   0%|          | 0/5 [00:00<?, ?it/s]

Saved sample 2 imputed dataset to Data/Simulated Data/New Simulated Data t=21/kNN/FAISS 7k26/simple_gen_FAISSavg7k26_impute_sample2_v3.csv


  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


Processing k-values:   0%|          | 0/5 [00:00<?, ?it/s]

Saved sample 3 imputed dataset to Data/Simulated Data/New Simulated Data t=21/kNN/FAISS 7k26/simple_gen_FAISSavg7k26_impute_sample3_v3.csv


  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


Processing k-values:   0%|          | 0/5 [00:00<?, ?it/s]

Saved sample 4 imputed dataset to Data/Simulated Data/New Simulated Data t=21/kNN/FAISS 7k26/simple_gen_FAISSavg7k26_impute_sample4_v3.csv


  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


Processing k-values:   0%|          | 0/5 [00:00<?, ?it/s]

Saved sample 5 imputed dataset to Data/Simulated Data/New Simulated Data t=21/kNN/FAISS 7k26/simple_gen_FAISSavg7k26_impute_sample5_v3.csv


  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


Processing k-values:   0%|          | 0/5 [00:00<?, ?it/s]

Saved sample 6 imputed dataset to Data/Simulated Data/New Simulated Data t=21/kNN/FAISS 7k26/simple_gen_FAISSavg7k26_impute_sample6_v3.csv


  subsample = df.groupby('t', group_keys=False).apply(lambda x: x.sample(frac=1, replace=True))


Processing k-values:   0%|          | 0/5 [00:00<?, ?it/s]

Saved sample 7 imputed dataset to Data/Simulated Data/New Simulated Data t=21/kNN/FAISS 7k26/simple_gen_FAISSavg7k26_impute_sample7_v3.csv
Saved overall averaged imputed dataset to Data/Simulated Data/New Simulated Data t=21/kNN/FAISS 7k26/simple_gen_FAISSavg7k26_impute_v3.csv


In [12]:
# ----- BINARY THRESHOLDING -----
final_input_file = "Data/Simulated Data/New Simulated Data t=21/kNN/FAISS 7k26/simple_gen_FAISSavg7k26_impute_v3.csv"
final_output_file = "Data/Simulated Data/New Simulated Data t=21/kNN/FAISS 7k26/simple_gen_FAISSavg7k26_impute_v3_binary.csv"

# Read the final averaged imputed dataset
df_kNN_imputed = pd.read_csv(final_input_file)

# Define the binary columns (adjust these as needed)
binary_cols = ["X1", "X2", "X3", "X4", "X5", "X6", "X7"]

# Compute the prevalence (fraction of ones) for each binary column in the original data
col_prevalences = {}
for col in binary_cols:
    valid_values = df[col].dropna()  # using original df for prevalence
    if len(valid_values) == 0:
        col_prevalences[col] = 0.5  # Default if completely missing
    else:
        col_prevalences[col] = valid_values.mean()

# Apply thresholding: for each binary column, set values <0.5 to 0, >0.5 to 1, and for ties (==0.5) break based on prevalence
for col in binary_cols:
    is_half = df_kNN_imputed[col] == 0.5
    df_kNN_imputed.loc[df_kNN_imputed[col] < 0.5, col] = 0
    df_kNN_imputed.loc[df_kNN_imputed[col] > 0.5, col] = 1
    df_kNN_imputed.loc[is_half, col] = 1 if col_prevalences[col] >= 0.5 else 0
    df_kNN_imputed[col] = df_kNN_imputed[col].astype(int)

# Save the binary thresholded dataset
df_kNN_imputed.to_csv(final_output_file, index=False)
print(f"Saved binary threshold imputed dataset to {final_output_file}")

Saved binary threshold imputed dataset to Data/Simulated Data/New Simulated Data t=21/kNN/FAISS 7k26/simple_gen_FAISSavg7k26_impute_v3_binary.csv


## Non-looping k

In [None]:
# Average imputed values, then force to binary based on 0.5 threshold
df_kNN_imputed = pd.read_csv('')

binary_cols = ["X1","X2","X3","X4","X5","X6","X7"]

col_prevalences = {}
for col in binary_cols:
    valid_values = df[col].dropna()  # original data, ignoring missing
    if len(valid_values) == 0:
        # If an entire column was missing, we default its prevalence to 0.5
        col_prevalences[col] = 0.5
    else:
        # Mean of binary values = fraction of ones
        col_prevalences[col] = valid_values.mean()

# Threshold only the binary columns
for col in binary_cols:
    is_half = df_kNN_imputed[col] == 0.5
    
    # Values < 0.5 -> 0
    df_kNN_imputed.loc[df_kNN_imputed[col] < 0.5, col] = 0
    
    # Values > 0.5 -> 1
    df_kNN_imputed.loc[df_kNN_imputed[col] > 0.5, col] = 1
    
    # Values == 0.5 -> break tie based on prevalence in original data
    if col_prevalences[col] >= 0.5:
        df_kNN_imputed.loc[is_half, col] = 1
    else:
        df_kNN_imputed.loc[is_half, col] = 0
    
    # Convert to integer
    df_kNN_imputed[col] = df_kNN_imputed[col].astype(int)

df_kNN_imputed.to_csv("", index=False)

# Treeple Unsupervised Oblique Random Forest

In [None]:
output_file = "Data/Simulated Data/New Simulated Data t=21/simple_gen_geodesic_v1.csv"

X = df[vars].values

# Pre-impute missing values using the "mode" strategy
pre_imputer = SimpleImputer(strategy="most_frequent")
X_pre_imputed = pre_imputer.fit_transform(X)

# Create the imputer using treeple's implementation on the pre-imputed data
imputer = UnsupervisedRandomForest(n_estimators=10, max_depth=3, 
                                          min_samples_leaf=1, random_state=42)
X_imputed = imputer.fit_transform(X_pre_imputed)

# Convert back to a DataFrame
df_imputed = pd.DataFrame(X_imputed, columns=vars)

# Save the imputed dataset
df_imputed.to_csv(output_file, index=False)