In [6]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import IterativeImputer
import miceforest as mf
from missforest import MissForest
import MIDASpy as md
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [7]:
df = pd.read_excel(r"F:\Work stuff\Opthalmology\berlin\raw.xlsx", 'raw')

def prep(df: pd.DataFrame):
    """
    Preprocess the DataFrame by:
      - Dropping rows with missing values and resetting the index.
      - Converting object columns to categorical via LabelEncoder.
      - Converting other columns to float (and then to int if >50% of values are integer-like).
      - If any numeric column (not already marked as categorical) has only 2 unique values,
        it is considered categorical and encoded.
    
    Returns:
      categorical_cols (list): List of columns encoded as categorical.
      discrete_cols (list): List of columns that are numeric and integer-like.
      cont_cols (list): List of remaining continuous numeric columns.
      df_clean (DataFrame): The preprocessed DataFrame.
      encoders (dict): Mapping from categorical column name to its LabelEncoder.
    """
    # Drop rows with missing values and reset the index.
    df_clean = df.dropna().reset_index(drop=True)

    categorical_cols = []
    discrete_cols = []
    encoders = {}

    # Process each column.
    for col in df_clean.columns:
        if df_clean[col].dtype == 'object':
            # Mark as categorical and encode using LabelEncoder.
            categorical_cols.append(col)
            le = LabelEncoder()
            df_clean[col] = le.fit_transform(df_clean[col])
            encoders[col] = le
        else:
            try:
                # Convert column to float.
                df_clean[col] = df_clean[col].astype(float)
                # If >50% of values are integer-like, cast column to int.
                if (np.isclose(df_clean[col] % 1, 0).mean() > 0.5):
                    df_clean[col] = df_clean[col].astype(int)
                    discrete_cols.append(col)
            except (ValueError, TypeError):
                # If conversion fails, treat the column as categorical.
                categorical_cols.append(col)
                le = LabelEncoder()
                df_clean[col] = le.fit_transform(df_clean[col])
                encoders[col] = le

    # Additionally, if any numeric column (not already marked as categorical) has only 2 unique values,
    # treat it as categorical and encode it.
    for col in df_clean.columns:
        if col not in categorical_cols and df_clean[col].nunique() == 2:
            categorical_cols.append(col)
            le = LabelEncoder()
            df_clean[col] = le.fit_transform(df_clean[col])
            encoders[col] = le

    # Continuous columns are those not marked as categorical or discrete.
    continuous_cols = [col for col in df_clean.columns if col not in categorical_cols + discrete_cols]

    return continuous_cols, discrete_cols, categorical_cols, df_clean, encoders

def reverse_encoding(df: pd.DataFrame, encoders: dict):
    """
    Reverse the LabelEncoder transformation on categorical columns.

    Parameters:
      df (pd.DataFrame): DataFrame with encoded categorical columns.
      encoders (dict): Dictionary mapping column names to their LabelEncoder.

    Returns:
      pd.DataFrame: A new DataFrame with the categorical columns decoded to their original labels.
    """
    df_decoded = df.copy()
    for col, le in encoders.items():
        # Ensure that the column is integer type before inverse transforming.
        df_decoded[col] = le.inverse_transform(df_decoded[col].astype(int))
    return df_decoded

df.drop('نامونامخانوادگی', axis=1,inplace=True)
cont, dis, cat, df2, enc = prep(df)


In [8]:
def create_missings(df:pd.DataFrame, missingness:float, random_seed:float=96):
    # Create random missingness.
    np.random.seed(random_seed)
    mask = np.random.rand(*df.shape) < (missingness / 100)
    mask_df = pd.DataFrame(mask, columns=df.columns)
    df_missing = df.mask(mask)
    return df_missing, mask_df


def calculate_metrics(df_clean, mask_df, imputed_df, method_name, continuous_cols, discrete_cols, categorical_cols):    
    mae_list = []
    for col in continuous_cols + discrete_cols:
        col_mask = mask_df[col]
        if col_mask.sum() > 0:
            error = np.abs(imputed_df.loc[col_mask, col] - df_clean.loc[col_mask, col])
            mae_list.append(error.mean())
    overall_mae = np.mean(mae_list) if mae_list else np.nan
    
    acc_list = []
    for col in categorical_cols:
        col_mask = mask_df[col]
        if col_mask.sum() > 0:
            acc = (imputed_df.loc[col_mask, col] == df_clean.loc[col_mask, col]).mean()
            acc_list.append(acc)
    overall_acc = np.mean(acc_list) if acc_list else np.nan

    return {'Method': method_name, 'MAE': overall_mae, 'Accuracy': overall_acc}


df3, df3_mask = create_missings(df2,20)
df3

Unnamed: 0,side,gccaverarage1,supgcc1,infgcc1,intraeye1,flv1,glv1,wholethick1,thicksuphemi1,thickinfhem1,...,deepvdinf1_injured_D,SSI1_injured,FAZ1_injured,PERIM1_injured,FD1_injured,N1000_injured,N500_injured,CENTR_injured,T500_injured,T1000_injured
0,1.0,98.0,96.0,100.0,-4.0,,,278.0,280.0,,...,41.7,,0.361,,48.28,282.0,266.0,256.0,253.0,
1,,105.0,104.0,106.0,-2.0,0.04,0.07,314.0,314.0,315.0,...,43.2,0.6,0.26,1.911,54.63,389.0,392.0,416.0,395.0,386.0
2,1.0,101.0,102.0,,2.0,0.27,0.45,284.0,286.0,,...,28.6,0.7,0.262,2.045,54.68,185.0,,194.0,179.0,170.0
3,,90.0,88.0,92.0,-4.0,,,274.0,,274.0,...,39.9,0.3,1.044,4.173,29.61,247.0,272.0,285.0,277.0,
4,0.0,105.0,105.0,,1.0,,0.84,,288.0,295.0,...,2.9,0.1,0.304,,3.86,285.0,315.0,341.0,307.0,294.0
5,1.0,106.0,103.0,,-5.0,0.01,0.1,,310.0,,...,33.6,,0.244,1.92,49.63,204.0,210.0,225.0,208.0,193.0
6,0.0,108.0,107.0,109.0,-2.0,0.23,,285.0,288.0,283.0,...,49.5,,,2.029,52.78,176.0,222.0,241.0,205.0,167.0
7,0.0,92.0,92.0,,0.0,0.26,3.35,281.0,284.0,279.0,...,35.5,,0.148,1.478,27.84,185.0,217.0,233.0,193.0,159.0
8,1.0,103.0,100.0,106.0,-6.0,0.05,0.29,294.0,294.0,,...,50.1,0.4,0.151,1.468,58.54,182.0,198.0,207.0,173.0,153.0
9,1.0,95.0,,,-2.0,,,282.0,,277.0,...,57.6,0.8,0.374,2.312,59.15,336.0,352.0,386.0,,339.0


In [9]:
# Convert categorical columns and get categorical metadata.
md_cat_data, md_cats = md.cat_conv(df3[cat])

# Separate non-categorical (numeric) columns.
num_cols = dis + cont  # these are the numeric columns

# Drop the categorical columns from df3 and then combine with the converted categorical data.
df3.drop(cat, axis=1, inplace=True)
constructor_list = [df3]
constructor_list.append(md_cat_data)
data_in = pd.concat(constructor_list, axis=1)

# Scale non-categorical columns BEFORE imputation.
scaler = StandardScaler()
data_in[num_cols] = scaler.fit_transform(data_in[num_cols])

# Handle missing values (if needed).
na_loc = data_in.isnull()
data_in[na_loc] = np.nan

# Build and train the imputer using the scaled data.
imputer = md.Midas(layer_structure=[256, 256], vae_layer=False, seed=89, input_drop=0.75)
# Note: Using the scaled data_in (you might want to assign it to a new variable like data_in2 if needed)
imputer.build_model(data_in, softmax_columns=md_cats)
imputer.train_model(training_epochs=20)

# Generate imputations.
imputations = imputer.generate_samples(m=10).output_list 

# Reverse transform the numerical columns in each imputed DataFrame.
for i in range(len(imputations)):
    imputations[i][num_cols] = scaler.inverse_transform(imputations[i][num_cols])
    
    # Process categorical columns:
    tmp_cat = [imputations[i][x].idxmax(axis=1) for x in md_cats]
    cat_df = pd.DataFrame({cat[j]: tmp_cat[j] for j in range(len(cat))})
    # Construct final imputed dataset by reattaching categorical data.
    flat_cats = [col for variable in md_cats for col in variable]
    imputations[i] = pd.concat([imputations[i], cat_df], axis=1).drop(flat_cats, axis=1)


Size index: [138, 2]

Computation graph constructed

Model initialised

Epoch: 0 , loss: 102.03511810302734
Epoch: 1 , loss: 119.28441619873047
Epoch: 2 , loss: 125.48267364501953
Epoch: 3 , loss: 116.40065002441406
Epoch: 4 , loss: 96.99713897705078
Epoch: 5 , loss: 103.977783203125
Epoch: 6 , loss: 96.29216766357422
Epoch: 7 , loss: 98.9549331665039
Epoch: 8 , loss: 93.18431091308594
Epoch: 9 , loss: 114.44400024414062
Epoch: 10 , loss: 98.850830078125
Epoch: 11 , loss: 123.07731628417969
Epoch: 12 , loss: 126.85834503173828
Epoch: 13 , loss: 115.41191101074219
Epoch: 14 , loss: 110.39813232421875
Epoch: 15 , loss: 95.76268005371094
Epoch: 16 , loss: 102.50088500976562
Epoch: 17 , loss: 109.4853286743164
Epoch: 18 , loss: 114.66378021240234
Epoch: 19 , loss: 94.5094223022461
Training complete. Saving file...
Model saved in file: tmp/MIDAS
INFO:tensorflow:Restoring parameters from tmp/MIDAS
Model restored.


In [66]:
md_cats

[['side_0.0', 'side_1.0']]

In [None]:
soft_cols = [x for sublist in md_cats for x in sublist] + dis
soft_cols

['side_0.0',
 'side_1.0',
 'gccaverarage1',
 'supgcc1',
 'infgcc1',
 'intraeye1',
 'wholethick1',
 'thicksuphemi1',
 'thickinfhem1',
 'thickfovea1',
 'thickparafove1',
 'thicksuphem1',
 'thickinfhemi1',
 'thicktempo1',
 'thicksup1',
 'thicknasal1',
 'thickinf1',
 'thickperi1',
 'thicksupe1',
 'thickinf1_A',
 'thicktempo1_A',
 'thicksup1_A',
 'thicknasal1_A',
 'thickinf1_B',
 'N1000',
 'N500',
 'CENTR',
 'T500',
 'T1000',
 'gccaverarage1_injured',
 'supgcc1_injured',
 'infgcc1_injured',
 'intraeye1_injured',
 'wholethick1_injured',
 'thicksuphemi1_injured',
 'thickinfhem1_injured',
 'thickfovea1_injured',
 'thickparafove1_injured',
 'thicksuphem1_injured',
 'thickinfhemi1_injured',
 'thicktempo1_injured',
 'thicksup1_injured',
 'thicknasal1_injured',
 'thickinf1_injured',
 'thickperi1_injured',
 'thicksupe1_injured',
 'thickinf1_injured_A',
 'thicktempo1_injured_A',
 'thicksup1_injured_A',
 'thicknasal1_injured_A',
 'thickinf1_injured_B',
 'N1000_injured',
 'N500_injured',
 'CENTR_injur

In [10]:
for i in imputations:
    print(calculate_metrics(df2, df3_mask, i, f'midaspy ', cont, dis, cat))

{'Method': 'midaspy ', 'MAE': 13.96424326714278, 'Accuracy': 0.0}
{'Method': 'midaspy ', 'MAE': 14.074331934768734, 'Accuracy': 0.0}
{'Method': 'midaspy ', 'MAE': 14.046515908752474, 'Accuracy': 0.0}
{'Method': 'midaspy ', 'MAE': 14.008804380955356, 'Accuracy': 0.0}
{'Method': 'midaspy ', 'MAE': 14.059403170007359, 'Accuracy': 0.0}
{'Method': 'midaspy ', 'MAE': 14.061973342315499, 'Accuracy': 0.0}
{'Method': 'midaspy ', 'MAE': 14.043477044241351, 'Accuracy': 0.0}
{'Method': 'midaspy ', 'MAE': 13.989785999476876, 'Accuracy': 0.0}
{'Method': 'midaspy ', 'MAE': 14.039249647264045, 'Accuracy': 0.0}
{'Method': 'midaspy ', 'MAE': 14.03222816855248, 'Accuracy': 0.0}


In [11]:
imputations[1]

Unnamed: 0,gccaverarage1,supgcc1,infgcc1,intraeye1,flv1,glv1,wholethick1,thicksuphemi1,thickinfhem1,thickfovea1,...,SSI1_injured,FAZ1_injured,PERIM1_injured,FD1_injured,N1000_injured,N500_injured,CENTR_injured,T500_injured,T1000_injured,side
0,98.0,96.0,100.0,-4.0,0.087689,1.809129,278.0,280.0,279.407633,240.0,...,0.565862,0.361,2.565016,48.28,282.0,266.0,256.0,253.0,254.8694,side_1.0
1,105.0,104.0,106.0,-2.0,0.04,0.07,314.0,314.0,315.0,252.0,...,0.6,0.26,1.911,54.63,389.0,392.0,416.0,395.0,386.0,side_0.0
2,101.0,102.0,101.876612,2.0,0.27,0.45,284.0,286.0,279.554361,241.0,...,0.7,0.262,2.045,54.68,185.0,273.925072,194.0,179.0,170.0,side_1.0
3,90.0,88.0,92.0,-4.0,0.12612,2.116542,274.0,284.443302,274.0,235.0,...,0.3,1.044,4.173,29.61,247.0,272.0,285.0,277.0,254.770205,side_0.0
4,105.0,105.0,102.184425,1.0,0.070173,0.84,281.26554,288.0,295.0,323.0,...,0.1,0.304,2.618244,3.86,285.0,315.0,341.0,307.0,294.0,side_0.0
5,106.0,103.0,101.799055,-5.0,0.01,0.1,281.08196,310.0,280.48138,286.0,...,0.567203,0.244,1.92,49.63,204.0,210.0,225.0,208.0,193.0,side_1.0
6,108.0,107.0,109.0,-2.0,0.23,1.760781,285.0,288.0,283.0,243.0,...,0.572085,0.492184,2.029,52.78,176.0,222.0,241.0,205.0,167.0,side_0.0
7,92.0,92.0,101.955636,2.220446e-16,0.26,3.35,281.0,284.0,279.0,258.0,...,0.571178,0.148,1.478,27.84,185.0,217.0,233.0,193.0,159.0,side_0.0
8,103.0,100.0,106.0,-6.0,0.05,0.29,294.0,294.0,281.151862,273.0,...,0.4,0.151,1.468,58.54,182.0,198.0,207.0,173.0,153.0,side_1.0
9,95.0,99.905187,101.774283,-2.0,0.054704,1.80758,282.0,282.463267,277.0,245.0,...,0.8,0.374,2.312,59.15,336.0,352.0,386.0,267.204811,339.0,side_1.0


In [None]:
# def do_midas(df, continuous_cols=None, discrete_cols=None, categorical_cols=None):

#     # Convert categorical columns and get categorical metadata.
#     md_cat_data, md_cats = md.cat_conv(df[categorical_cols])

#     # Separate non-categorical (numeric) columns.
#     num_cols = discrete_cols + continuous_cols  # these are the numeric columns

#     # Drop the categorical columns from df3 and then combine with the converted categorical data.
    
#     df_copy = df.drop(cat, axis=1)
#     constructor_list = [df_copy]
#     constructor_list.append(md_cat_data)
#     data_in = pd.concat(constructor_list, axis=1)

#     # Scale non-categorical columns BEFORE imputation.
#     scaler = StandardScaler()
#     data_in[num_cols] = scaler.fit_transform(data_in[num_cols])

#     # Handle missing values (if needed).
#     na_loc = data_in.isnull()
#     data_in[na_loc] = np.nan

#     # Build and train the imputer using the scaled data.
#     imputer = md.Midas(layer_structure=[256, 256], vae_layer=True, seed=89, input_drop=0.75)
    
#     # Note: Using the scaled data_in (you might want to assign it to a new variable like data_in2 if needed)
#     # soft_cols = [x for sublist in md_cats for x in sublist] + dis

#     # imputer.build_model(data_in, softmax_columns=[soft_cols])
#     imputer.build_model(data_in, softmax_columns=md_cats)

#     imputer.train_model(training_epochs=20)

#     # Generate imputations.
#     imps = imputer.generate_samples(m=10).output_list 

#     # Reverse transform the numerical columns in each imputed DataFrame.
#     for i in imps:
#         i[num_cols] = scaler.inverse_transform(i[num_cols])
        
#         # Process categorical columns:
#         tmp_cat = [i[x].idxmax(axis=1) for x in md_cats]
#         cat_df = pd.DataFrame({cat[j]: tmp_cat[j] for j in range(len(cat))})
#         # Construct final imputed dataset by reattaching categorical data.
#         flat_cats = [col for variable in md_cats for col in variable]
#         i = pd.concat([i, cat_df], axis=1).drop(flat_cats, axis=1)
#         # i[discrete_cols] = i[discrete_cols].round()
        
#     return imps 
        

In [72]:
def do_midas(df, continuous_cols=None, discrete_cols=None, categorical_cols=None):
    """
    Imputes missing values using the MIDAS model.
    
    Parameters:
      df (pd.DataFrame): Input dataframe.
      continuous_cols (list): List of continuous column names.
      discrete_cols (list): List of discrete (numeric but non-continuous) column names.
      categorical_cols (list): List of categorical column names.
      
    Returns:
      imps (list): A list of imputed dataframes.
    """
    # 1. Convert categorical columns and get categorical metadata.
    md_cat_data, md_cats = md.cat_conv(df[categorical_cols])
    
    # 2. Define the numeric columns.
    num_cols = discrete_cols + continuous_cols  # these are the numeric columns

    # 3. Drop original categorical columns and combine with the converted categorical data.
    df_copy = df.drop(columns=categorical_cols)
    constructor_list = [df_copy, md_cat_data]
    data_in = pd.concat(constructor_list, axis=1)
    
    # 4. Scale non-categorical columns BEFORE imputation.
    scaler = StandardScaler()
    data_in[num_cols] = scaler.fit_transform(data_in[num_cols])
    
    # (Optional) Handle missing values if needed.
    na_loc = data_in.isnull()
    data_in[na_loc] = np.nan
    
    # 5. Build and train the imputer using the scaled data.
    imputer = md.Midas(layer_structure=[256, 256], vae_layer=True, seed=89, input_drop=0.75)
    # Use md_cats as softmax columns for categorical outputs.
    imputer.build_model(data_in, softmax_columns=md_cats)
    imputer.train_model(training_epochs=20)
    
    # 6. Generate imputations.
    imps = imputer.generate_samples(m=10).output_list
    
    # 7. Post-process each imputed DataFrame.
    for idx, imp_df in enumerate(imps):
        # Reverse transform the numeric columns.
        imp_df[num_cols] = scaler.inverse_transform(imp_df[num_cols])
        
        # Process categorical columns.
        # For each softmax group in md_cats, choose the column with the highest probability.
        tmp_cat = []
        for group in md_cats:
            # idxmax returns the column name with maximum value per row for this group.
            tmp_cat.append(imp_df[group].idxmax(axis=1))
        # Assume the order of md_cats corresponds to categorical_cols.
        cat_df = pd.DataFrame({categorical_cols[j]: tmp_cat[j] for j in range(len(categorical_cols))})
        
        # Drop the softmax columns.
        flat_cats = [col for group in md_cats for col in group]
        imp_df = pd.concat([imp_df, cat_df], axis=1).drop(columns=flat_cats)
        
        # Handle discrete data by rounding the values.
        imp_df[discrete_cols] = imp_df[discrete_cols].round()
        
        # Replace the processed DataFrame in the list.
        imps[idx] = imp_df
        
    return imps

In [48]:
def simulate_missingness(df, show_missingness=False):
    """
    Takes a DataFrame, calculates missingness for each column, drops all rows with any missing values (df2),
    then reintroduces missing values to df2 to match the original missingness proportions, resulting in df3.
    Also returns a mask of artificial missing values.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        show_missingness (bool): If True, prints the missingness percentage for each column 
                                 in the original DataFrame and in the simulated DataFrame.
    
    Returns:
        tuple: A tuple (df3, artificial_mask) where:
            - df3 (pd.DataFrame): A new DataFrame with simulated missingness.
            - artificial_mask (pd.DataFrame): A boolean mask indicating the positions where missing values were artificially inserted.
    """
    # 1. Calculate original missingness fraction for each column.
    missing_original = df.isna().mean()
    
    # 2. Drop all rows with missing values to create df2.
    df2 = df.dropna().reset_index(drop=True)
    
    # 3. Create df3 by copying df2.
    df3 = df2.copy()
    
    # Create a mask DataFrame with the same shape as df3 to mark artificial missing values.
    missing_mask = pd.DataFrame(False, index=df3.index, columns=df3.columns)
    
    # 4. Reintroduce missing values in df3 based on the original missingness proportions.
    for col in df3.columns:
        # Calculate the number of entries to set as missing in this column.
        n_missing = int(round(missing_original[col] * len(df3)))
        if n_missing > 0:
            # Randomly select indices to set as missing.
            missing_indices = df3.sample(n=n_missing, random_state=42).index
            df3.loc[missing_indices, col] = np.nan
            missing_mask.loc[missing_indices, col] = True

    # 5. Optionally print missingness for each column.
    if show_missingness:
        missing_df3 = df3.isna().mean()
        print("Missingness Comparison:")
        for col in df.columns:
            print(f"Column '{col}': Original: {missing_original[col]*100:.2f}%  \t -> \t df3: {missing_df3[col]*100:.2f}%")
    
    # Return the simulated DataFrame and the mask.
    return df2, df3, missing_mask

full_df, df4, df4_mask = simulate_missingness(df, show_missingness=True)


Missingness Comparison:
Column 'side': Original: 0.00%  	 -> 	 df3: 0.00%
Column 'gccaverarage1': Original: 0.00%  	 -> 	 df3: 0.00%
Column 'supgcc1': Original: 0.00%  	 -> 	 df3: 0.00%
Column 'infgcc1': Original: 0.00%  	 -> 	 df3: 0.00%
Column 'intraeye1': Original: 0.00%  	 -> 	 df3: 0.00%
Column 'flv1': Original: 11.11%  	 -> 	 df3: 9.68%
Column 'glv1': Original: 11.11%  	 -> 	 df3: 9.68%
Column 'wholethick1': Original: 4.44%  	 -> 	 df3: 3.23%
Column 'thicksuphemi1': Original: 4.44%  	 -> 	 df3: 3.23%
Column 'thickinfhem1': Original: 4.44%  	 -> 	 df3: 3.23%
Column 'thickfovea1': Original: 4.44%  	 -> 	 df3: 3.23%
Column 'thickparafove1': Original: 4.44%  	 -> 	 df3: 3.23%
Column 'thicksuphem1': Original: 4.44%  	 -> 	 df3: 3.23%
Column 'thickinfhemi1': Original: 4.44%  	 -> 	 df3: 3.23%
Column 'thicktempo1': Original: 4.44%  	 -> 	 df3: 3.23%
Column 'thicksup1': Original: 4.44%  	 -> 	 df3: 3.23%
Column 'thicknasal1': Original: 4.44%  	 -> 	 df3: 3.23%
Column 'thickinf1': Origina

In [73]:
imps = do_midas(df4, cont, dis, cat)

Size index: [138, 2]

Computation graph constructed

Model initialised

Epoch: 0 , loss: 121.42747497558594
Epoch: 1 , loss: 140.63641357421875
Epoch: 2 , loss: 144.98468017578125
Epoch: 3 , loss: 146.0791015625
Epoch: 4 , loss: 122.92593383789062
Epoch: 5 , loss: 121.12701416015625
Epoch: 6 , loss: 121.52317810058594
Epoch: 7 , loss: 125.67375183105469
Epoch: 8 , loss: 118.1444091796875
Epoch: 9 , loss: 146.92860412597656
Epoch: 10 , loss: 127.14505004882812
Epoch: 11 , loss: 147.95372009277344
Epoch: 12 , loss: 152.55935668945312
Epoch: 13 , loss: 143.7330780029297
Epoch: 14 , loss: 139.24996948242188
Epoch: 15 , loss: 115.20101165771484
Epoch: 16 , loss: 129.68626403808594
Epoch: 17 , loss: 136.6159210205078
Epoch: 18 , loss: 139.1815185546875
Epoch: 19 , loss: 117.31334686279297
Training complete. Saving file...
Model saved in file: tmp/MIDAS
INFO:tensorflow:Restoring parameters from tmp/MIDAS
Model restored.


In [74]:
imps[3]

Unnamed: 0,gccaverarage1,supgcc1,infgcc1,intraeye1,flv1,glv1,wholethick1,thicksuphemi1,thickinfhem1,thickfovea1,...,SSI1_injured,FAZ1_injured,PERIM1_injured,FD1_injured,N1000_injured,N500_injured,CENTR_injured,T500_injured,T1000_injured,side
0,98.0,96.0,100.0,-4.0,0.71,0.96,278.0,280.0,275.0,240.0,...,0.6,0.361,2.529,48.28,282.0,266.0,256.0,253.0,244.0,side_OS
1,105.0,104.0,106.0,-2.0,0.04,0.07,314.0,314.0,315.0,252.0,...,0.6,0.26,1.911,54.63,389.0,392.0,416.0,395.0,386.0,side_OS
2,101.0,102.0,100.0,2.0,0.27,0.45,284.0,286.0,283.0,241.0,...,0.7,0.262,2.045,54.68,185.0,182.0,194.0,179.0,170.0,side_OS
3,90.0,88.0,92.0,-4.0,0.0,4.98,274.0,274.0,274.0,235.0,...,0.3,1.044,4.173,29.61,247.0,272.0,285.0,277.0,261.0,side_OD
4,105.0,105.0,104.0,1.0,0.83,0.84,291.0,288.0,295.0,323.0,...,0.1,0.304,2.271,3.86,285.0,315.0,341.0,307.0,294.0,side_OD
5,106.0,103.0,108.0,-5.0,0.01,0.1,309.0,310.0,308.0,286.0,...,0.7,0.244,1.92,49.63,204.0,210.0,225.0,208.0,193.0,side_OS
6,108.0,107.0,109.0,-2.0,0.23,0.24,285.0,288.0,283.0,243.0,...,0.9,0.286,2.029,52.78,176.0,222.0,241.0,205.0,167.0,side_OD
7,92.0,92.0,92.0,0.0,0.26,3.35,281.0,284.0,279.0,258.0,...,0.5,0.148,1.478,27.84,185.0,217.0,233.0,193.0,159.0,side_OD
8,103.0,100.0,106.0,-6.0,0.05,0.29,294.0,294.0,293.0,273.0,...,0.4,0.151,1.468,58.54,182.0,198.0,207.0,173.0,153.0,side_OS
9,95.0,94.0,96.0,-2.0,0.12,2.82,282.0,287.0,277.0,245.0,...,0.8,0.374,2.312,59.15,336.0,352.0,386.0,345.0,339.0,side_OS


In [None]:

def select_best_imputations(imputed_dfs, original_df, mask_df, continuous_cols, discrete_cols, categorical_cols, method_names=None):
    """
    Select the best imputation for each column based on column-specific metrics.

    For each column that was artificially masked:
      - For continuous/discrete columns, computes the mean absolute error (MAE) between
        the imputed and original values.
      - For categorical columns, computes the accuracy (fraction of correct imputed values).
    Then, for each column, chooses the method (from the list of imputed dataframes) that has the best performance 
    (lowest MAE for continuous/discrete, highest accuracy for categorical).

    The function returns:
      1. A new dataframe built by taking the original values and replacing only the masked entries with 
         the best imputed values for that column.
      2. A summary table (DataFrame) with columns: 'Column', 'Data Type', 'Best Method', 'Metric'
         where 'Metric' is the MAE (for continuous/discrete) or Accuracy (for categorical).

    Parameters:
        imputed_dfs (list of pd.DataFrame): List of dataframes with imputed values.
        original_df (pd.DataFrame): The original (complete) dataframe used for comparison.
        mask_df (pd.DataFrame): Boolean mask indicating the positions where missing values were artificially inserted.
        continuous_cols (list): List of continuous column names.
        discrete_cols (list): List of discrete (numeric but non-continuous) column names.
        categorical_cols (list): List of categorical column names.
        method_names (list, optional): List of names for the imputation methods. If not provided,
                                       default names ("Method 1", "Method 2", …) will be used.

    Returns:
        tuple: (best_imputed_df, summary_table)
            - best_imputed_df (pd.DataFrame): DataFrame containing the best imputed values for each column.
            - summary_table (pd.DataFrame): DataFrame summarizing for each column the best method and corresponding metric.
    """
    n_methods = len(imputed_dfs)
    if method_names is None:
        method_names = [f"Method {i+1}" for i in range(n_methods)]
    
    # Dictionary to store best method index per column.
    best_method_per_col = {}
    summary_list = []
    
    # Iterate through each column.
    for col in original_df.columns:
        # Determine the data type based on provided lists.
        if col in continuous_cols:
            col_data_type = "Continuous"
        elif col in discrete_cols:
            col_data_type = "Discrete"
        elif col in categorical_cols:
            col_data_type = "Categorical"
        else:
            col_data_type = str(original_df[col].dtype)
        
        # Only evaluate columns that had artificial missing values.
        if mask_df[col].sum() == 0:
            best_method_per_col[col] = None
            summary_list.append({
                'Column': col,
                'Data Type': col_data_type,
                'Best Method': None,
                'Metric': np.nan
            })
            continue
        
        # List to store metrics for each method for this column.
        metrics = []
        
        if col in continuous_cols or col in discrete_cols:
            for df_imp in imputed_dfs:
                # Using bracket notation for boolean indexing.
                error = np.abs(df_imp[col][mask_df[col]] - original_df[col][mask_df[col]])
                mae = error.mean() if not error.empty else np.nan
                metrics.append(mae)
            best_idx = np.nanargmin(metrics)
            best_metric = metrics[best_idx]
        elif col in categorical_cols:
            for df_imp in imputed_dfs:
                correct = (df_imp[col][mask_df[col]] == original_df[col][mask_df[col]])
                acc = correct.mean() if not correct.empty else np.nan
                metrics.append(acc)
            best_idx = np.nanargmax(metrics)
            best_metric = metrics[best_idx]
        else:
            best_idx = None
            best_metric = np.nan
        
        best_method = method_names[best_idx] if best_idx is not None else None
        best_method_per_col[col] = best_idx
        
        summary_list.append({
            'Column': col,
            'Data Type': col_data_type,
            'Best Method': best_method,
            'Metric': best_metric
        })
    
    summary_table = pd.DataFrame(summary_list)
    
    # Build best-imputed dataframe by replacing only the masked entries with the best imputed values.
    best_imputed_df = original_df.copy()
    for col in original_df.columns:
        if mask_df[col].sum() > 0 and best_method_per_col[col] is not None:
            method_idx = best_method_per_col[col]
            best_imputed_df.loc[mask_df[col], col] = imputed_dfs[method_idx].loc[mask_df[col], col]
    
    return best_imputed_df, summary_table


In [49]:
best, summary = select_best_imputations(imps, full_df, df4_mask,cont, dis, cat)

In [50]:
summary

Unnamed: 0,Column,Data Type,Best Method,Metric
0,side,Categorical,,
1,gccaverarage1,Discrete,,
2,supgcc1,Discrete,,
3,infgcc1,Discrete,,
4,intraeye1,Discrete,,
...,...,...,...,...
134,N1000_injured,Discrete,Method 7,13.727562
135,N500_injured,Discrete,Method 10,12.126349
136,CENTR_injured,Discrete,Method 8,8.322490
137,T500_injured,Discrete,Method 8,0.174727


In [54]:
best[dis] = best[dis].round()
best

Unnamed: 0,side,gccaverarage1,supgcc1,infgcc1,intraeye1,flv1,glv1,wholethick1,thicksuphemi1,thickinfhem1,...,deepvdinf1_injured_D,SSI1_injured,FAZ1_injured,PERIM1_injured,FD1_injured,N1000_injured,N500_injured,CENTR_injured,T500_injured,T1000_injured
0,OS,98,96,100,-4,0.71,0.96,278.0,280.0,275.0,...,41.7,0.6,0.361,2.529,48.28,282.0,266.0,256.0,253.0,244.0
1,OS,105,104,106,-2,0.04,0.07,314.0,314.0,315.0,...,43.2,0.6,0.26,1.911,54.63,389.0,392.0,416.0,395.0,386.0
2,OS,101,102,100,2,0.27,0.45,284.0,286.0,283.0,...,28.6,0.7,0.262,2.045,54.68,185.0,182.0,194.0,179.0,170.0
3,OD,90,88,92,-4,0.0,4.98,274.0,274.0,274.0,...,39.9,0.3,1.044,4.173,29.61,247.0,272.0,285.0,277.0,261.0
4,OD,105,105,104,1,0.83,0.84,291.0,288.0,295.0,...,2.9,0.1,0.304,2.271,3.86,285.0,315.0,341.0,307.0,294.0
5,OS,106,103,108,-5,0.01,0.1,309.0,310.0,308.0,...,33.6,0.7,0.244,1.92,49.63,204.0,210.0,225.0,208.0,193.0
6,OD,108,107,109,-2,0.23,0.24,285.0,288.0,283.0,...,49.5,0.9,0.286,2.029,52.78,176.0,222.0,241.0,205.0,167.0
7,OD,92,92,92,0,0.26,3.35,281.0,284.0,279.0,...,35.5,0.5,0.148,1.478,27.84,185.0,217.0,233.0,193.0,159.0
8,OS,103,100,106,-6,0.05,0.29,294.0,294.0,293.0,...,50.1,0.4,0.151,1.468,58.54,182.0,198.0,207.0,173.0,153.0
9,OS,95,94,96,-2,0.12,2.82,282.0,287.0,277.0,...,57.6,0.8,0.374,2.312,59.15,336.0,352.0,386.0,345.0,339.0
