In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import IterativeImputer
import miceforest as mf
from missforest import MissForest
import MIDASpy as md
from sklearn.preprocessing import MinMaxScaler

In [None]:
df = pd.read_excel(r"F:\Work stuff\Opthalmology\berlin\raw.xlsx", 'raw')

def prep(df: pd.DataFrame):
    """
    Preprocess the DataFrame by:
      - Dropping rows with missing values and resetting the index.
      - Converting object columns to categorical via LabelEncoder.
      - Converting other columns to float (and then to int if >50% of values are integer-like).
      - If any numeric column (not already marked as categorical) has only 2 unique values,
        it is considered categorical and encoded.
    
    Returns:
      categorical_cols (list): List of columns encoded as categorical.
      discrete_cols (list): List of columns that are numeric and integer-like.
      cont_cols (list): List of remaining continuous numeric columns.
      df_clean (DataFrame): The preprocessed DataFrame.
      encoders (dict): Mapping from categorical column name to its LabelEncoder.
    """
    # Drop rows with missing values and reset the index.
    df_clean = df.dropna().reset_index(drop=True)

    categorical_cols = []
    discrete_cols = []
    encoders = {}

    # Process each column.
    for col in df_clean.columns:
        if df_clean[col].dtype == 'object':
            # Mark as categorical and encode using LabelEncoder.
            categorical_cols.append(col)
            le = LabelEncoder()
            df_clean[col] = le.fit_transform(df_clean[col])
            encoders[col] = le
        else:
            try:
                # Convert column to float.
                df_clean[col] = df_clean[col].astype(float)
                # If >50% of values are integer-like, cast column to int.
                if (np.isclose(df_clean[col] % 1, 0).mean() > 0.5):
                    df_clean[col] = df_clean[col].astype(int)
                    discrete_cols.append(col)
            except (ValueError, TypeError):
                # If conversion fails, treat the column as categorical.
                categorical_cols.append(col)
                le = LabelEncoder()
                df_clean[col] = le.fit_transform(df_clean[col])
                encoders[col] = le

    # Additionally, if any numeric column (not already marked as categorical) has only 2 unique values,
    # treat it as categorical and encode it.
    for col in df_clean.columns:
        if col not in categorical_cols and df_clean[col].nunique() == 2:
            categorical_cols.append(col)
            le = LabelEncoder()
            df_clean[col] = le.fit_transform(df_clean[col])
            encoders[col] = le

    # Continuous columns are those not marked as categorical or discrete.
    continuous_cols = [col for col in df_clean.columns if col not in categorical_cols + discrete_cols]

    return continuous_cols, discrete_cols, categorical_cols, df_clean, encoders

def reverse_encoding(df: pd.DataFrame, encoders: dict):
    """
    Reverse the LabelEncoder transformation on categorical columns.

    Parameters:
      df (pd.DataFrame): DataFrame with encoded categorical columns.
      encoders (dict): Dictionary mapping column names to their LabelEncoder.

    Returns:
      pd.DataFrame: A new DataFrame with the categorical columns decoded to their original labels.
    """
    df_decoded = df.copy()
    for col, le in encoders.items():
        # Ensure that the column is integer type before inverse transforming.
        df_decoded[col] = le.inverse_transform(df_decoded[col].astype(int))
    return df_decoded

df.drop('نامونامخانوادگی', axis=1,inplace=True)
cont, dis, cat, df2, enc = prep(df)


In [None]:
df2

Unnamed: 0,side,gccaverarage1,supgcc1,infgcc1,intraeye1,flv1,glv1,wholethick1,thicksuphemi1,thickinfhem1,...,deepvdinf1_injured_D,SSI1_injured,FAZ1_injured,PERIM1_injured,FD1_injured,N1000_injured,N500_injured,CENTR_injured,T500_injured,T1000_injured
0,1,98,96,100,-4,0.71,0.96,278,280,275,...,41.7,0.6,0.361,2.529,48.28,282,266,256,253,244
1,1,105,104,106,-2,0.04,0.07,314,314,315,...,43.2,0.6,0.26,1.911,54.63,389,392,416,395,386
2,1,101,102,100,2,0.27,0.45,284,286,283,...,28.6,0.7,0.262,2.045,54.68,185,182,194,179,170
3,0,90,88,92,-4,0.0,4.98,274,274,274,...,39.9,0.3,1.044,4.173,29.61,247,272,285,277,261
4,0,105,105,104,1,0.83,0.84,291,288,295,...,2.9,0.1,0.304,2.271,3.86,285,315,341,307,294
5,1,106,103,108,-5,0.01,0.1,309,310,308,...,33.6,0.7,0.244,1.92,49.63,204,210,225,208,193
6,0,108,107,109,-2,0.23,0.24,285,288,283,...,49.5,0.9,0.286,2.029,52.78,176,222,241,205,167
7,0,92,92,92,0,0.26,3.35,281,284,279,...,35.5,0.5,0.148,1.478,27.84,185,217,233,193,159
8,1,103,100,106,-6,0.05,0.29,294,294,293,...,50.1,0.4,0.151,1.468,58.54,182,198,207,173,153
9,1,95,94,96,-2,0.12,2.82,282,287,277,...,57.6,0.8,0.374,2.312,59.15,336,352,386,345,339


In [None]:
cat

['side']

In [None]:
df[['side','flv1',
 'glv1',
 'supwholevd1',
 'supvdsup1',
 'supvdinf1',
 'supvdfo1',
 'supvdpa1',
 'supvdsup1_A',
 'supvdinf1_A',
 'supvdtem1',
 'supvdsup1_B',
 'supvdnasal1',
 'supvdinfe1']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   side         45 non-null     object 
 1   flv1         40 non-null     float64
 2   glv1         40 non-null     float64
 3   supwholevd1  43 non-null     float64
 4   supvdsup1    43 non-null     float64
 5   supvdinf1    43 non-null     float64
 6   supvdfo1     43 non-null     float64
 7   supvdpa1     43 non-null     float64
 8   supvdsup1_A  43 non-null     float64
 9   supvdinf1_A  43 non-null     float64
 10  supvdtem1    43 non-null     float64
 11  supvdsup1_B  43 non-null     float64
 12  supvdnasal1  43 non-null     float64
 13  supvdinfe1   43 non-null     float64
dtypes: float64(13), object(1)
memory usage: 5.0+ KB


In [None]:
cont

['flv1',
 'glv1',
 'supwholevd1',
 'supvdsup1',
 'supvdinf1',
 'supvdfo1',
 'supvdpa1',
 'supvdsup1_A',
 'supvdinf1_A',
 'supvdtem1',
 'supvdsup1_B',
 'supvdnasal1',
 'supvdinfe1',
 'supvdperi1',
 'supvdsup1_C',
 'supvdinf1_B',
 'supvdtempo1',
 'supvdsup1_D',
 'supvdnasal1_A',
 'supvdinf1_C',
 'deepvdwh1',
 'deepvdsup1',
 'deepvdinf1',
 'deepvdfo1',
 'deepvdpara1',
 'deepvdsup1_A',
 'deepvdinf1_A',
 'deepvdtempo1',
 'deepvdsupO1',
 'deepvdnasal1',
 'deepvdinf1_B',
 'deepvdperi1',
 'deepvdsup1_B',
 'deepvdinf1_C',
 'deepvdt1',
 'deepvdsup1_C',
 'deepvdnasal1_A',
 'deepvdinf1_D',
 'SSI1',
 'FAZ1',
 'PERIM1',
 'FD1',
 'flv1_injured',
 'glv1_injured',
 'supwholevd1_injured',
 'supvdsup1_injured',
 'supvdinf1_injured',
 'supvdfo1_injured',
 'supvdpa1_injured',
 'supvdsup1_injured_A',
 'supvdinf1_injured_A',
 'supvdtem1_injured',
 'supvdsup1_injured_B',
 'supvdnasal1_injured',
 'supvdinfe1_injured',
 'supvdperi1_injured',
 'supvdsup1_injured_C',
 'supvdinf1_injured_B',
 'supvdtempo1_injured'

In [None]:
def create_missings(df:pd.DataFrame, missingness:float, random_seed:float=96):
    # Create random missingness.
    np.random.seed(random_seed)
    mask = np.random.rand(*df.shape) < (missingness / 100)
    mask_df = pd.DataFrame(mask, columns=df.columns)
    df_missing = df.mask(mask)
    return df, df_missing, mask_df


In [None]:
def calculate_metrics(df_clean, mask_df, imputed_df, method_name, continuous_cols, discrete_cols, categorical_cols):    
    mae_list = []
    for col in continuous_cols + discrete_cols:
        col_mask = mask_df[col]
        if col_mask.sum() > 0:
            error = np.abs(imputed_df.loc[col_mask, col] - df_clean.loc[col_mask, col])
            mae_list.append(error.mean())
    overall_mae = np.mean(mae_list) if mae_list else np.nan
    
    acc_list = []
    for col in categorical_cols:
        col_mask = mask_df[col]
        if col_mask.sum() > 0:
            acc = (imputed_df.loc[col_mask, col] == df_clean.loc[col_mask, col]).mean()
            acc_list.append(acc)
    overall_acc = np.mean(acc_list) if acc_list else np.nan

    return {'Method': method_name, 'MAE': overall_mae, 'Accuracy': overall_acc}

In [None]:
df3, df3_mask = create_missings(df2,20)
df3

Unnamed: 0,side,gccaverarage1,supgcc1,infgcc1,intraeye1,flv1,glv1,wholethick1,thicksuphemi1,thickinfhem1,...,deepvdinf1_injured_D,SSI1_injured,FAZ1_injured,PERIM1_injured,FD1_injured,N1000_injured,N500_injured,CENTR_injured,T500_injured,T1000_injured
0,1.0,98.0,96.0,100.0,-4.0,,,278.0,280.0,,...,41.7,,0.361,,48.28,282.0,266.0,256.0,253.0,
1,,105.0,104.0,106.0,-2.0,0.04,0.07,314.0,314.0,315.0,...,43.2,0.6,0.26,1.911,54.63,389.0,392.0,416.0,395.0,386.0
2,1.0,101.0,102.0,,2.0,0.27,0.45,284.0,286.0,,...,28.6,0.7,0.262,2.045,54.68,185.0,,194.0,179.0,170.0
3,,90.0,88.0,92.0,-4.0,,,274.0,,274.0,...,39.9,0.3,1.044,4.173,29.61,247.0,272.0,285.0,277.0,
4,0.0,105.0,105.0,,1.0,,0.84,,288.0,295.0,...,2.9,0.1,0.304,,3.86,285.0,315.0,341.0,307.0,294.0
5,1.0,106.0,103.0,,-5.0,0.01,0.1,,310.0,,...,33.6,,0.244,1.92,49.63,204.0,210.0,225.0,208.0,193.0
6,0.0,108.0,107.0,109.0,-2.0,0.23,,285.0,288.0,283.0,...,49.5,,,2.029,52.78,176.0,222.0,241.0,205.0,167.0
7,0.0,92.0,92.0,,0.0,0.26,3.35,281.0,284.0,279.0,...,35.5,,0.148,1.478,27.84,185.0,217.0,233.0,193.0,159.0
8,1.0,103.0,100.0,106.0,-6.0,0.05,0.29,294.0,294.0,,...,50.1,0.4,0.151,1.468,58.54,182.0,198.0,207.0,173.0,153.0
9,1.0,95.0,,,-2.0,,,282.0,,277.0,...,57.6,0.8,0.374,2.312,59.15,336.0,352.0,386.0,,339.0


In [None]:
df3_mask

Unnamed: 0,side,gccaverarage1,supgcc1,infgcc1,intraeye1,flv1,glv1,wholethick1,thicksuphemi1,thickinfhem1,...,deepvdinf1_injured_D,SSI1_injured,FAZ1_injured,PERIM1_injured,FD1_injured,N1000_injured,N500_injured,CENTR_injured,T500_injured,T1000_injured
0,False,False,False,False,False,True,True,False,False,True,...,False,True,False,True,False,False,False,False,False,True
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,True,False,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
3,True,False,False,False,False,True,True,False,True,False,...,False,False,False,False,False,False,False,False,False,True
4,False,False,False,True,False,True,False,True,False,False,...,False,False,False,True,False,False,False,False,False,False
5,False,False,False,True,False,False,False,True,False,True,...,False,True,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,True,False,False,False,...,False,True,True,False,False,False,False,False,False,False
7,False,False,False,True,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
9,False,False,True,True,False,True,True,False,True,False,...,False,False,False,False,False,False,False,False,True,False


In [None]:
def knn_impute_df(df, continuous_cols=None, discrete_cols=None, categorical_cols=None, n_neighbors=5):
    """
    Impute missing values in a DataFrame using KNN imputation.
    
    Assumes:
      - Continuous columns are numeric.
      - Discrete columns are numeric and integer-like.
      - Categorical columns have been label encoded using sklearn's LabelEncoder 
        (with missing values represented as np.nan).
    
    Parameters:
      df (pd.DataFrame): Input DataFrame with missing values.
      continuous_cols (list of str): Names of continuous numeric columns.
      discrete_cols (list of str): Names of discrete numeric columns.
      categorical_cols (list of str): Names of categorical columns (label encoded).
      n_neighbors (int): Number of neighbors for KNN imputation.
    
    Returns:
      pd.DataFrame: New DataFrame with imputed values.
    """
    df_imputed = df.copy()
    
    # Impute continuous columns
    if continuous_cols:
        imputer_cont = KNNImputer(n_neighbors=n_neighbors)
        df_imputed[continuous_cols] = imputer_cont.fit_transform(df_imputed[continuous_cols])
    
    # Impute discrete columns and round to integer
    if discrete_cols:
        imputer_disc = KNNImputer(n_neighbors=n_neighbors)
        imputed_disc = imputer_disc.fit_transform(df_imputed[discrete_cols])
        df_imputed[discrete_cols] = np.round(imputed_disc).astype(int)
    
    # Impute categorical columns (assumed to be label encoded)
    if categorical_cols:
        imputer_cat = KNNImputer(n_neighbors=n_neighbors)
        imputed_cat = imputer_cat.fit_transform(df_imputed[categorical_cols])
        df_imputed[categorical_cols] = np.round(imputed_cat).astype(int)
    
    return df_imputed

knn_imputed = knn_impute_df(df3, continuous_cols=cont, discrete_cols=dis, categorical_cols=cat)
knn_imputed

Unnamed: 0,side,gccaverarage1,supgcc1,infgcc1,intraeye1,flv1,glv1,wholethick1,thicksuphemi1,thickinfhem1,...,deepvdinf1_injured_D,SSI1_injured,FAZ1_injured,PERIM1_injured,FD1_injured,N1000_injured,N500_injured,CENTR_injured,T500_injured,T1000_injured
0,1,98,96,100,-4,0.26,1.29,278,280,286,...,41.7,0.68,0.361,2.1812,48.28,282,266,256,253,241
1,0,105,104,106,-2,0.04,0.07,314,314,315,...,43.2,0.6,0.26,1.911,54.63,389,392,416,395,386
2,1,101,102,106,2,0.27,0.45,284,286,284,...,28.6,0.7,0.262,2.045,54.68,185,214,194,179,170
3,0,90,88,92,-4,0.232,4.484,274,283,274,...,39.9,0.3,1.044,4.173,29.61,247,272,285,277,283
4,0,105,105,99,1,0.378,0.84,288,288,295,...,2.9,0.1,0.304,4.3108,3.86,285,315,341,307,294
5,1,106,103,109,-5,0.01,0.1,291,310,294,...,33.6,0.7,0.244,1.92,49.63,204,210,225,208,193
6,0,108,107,109,-2,0.23,0.34,285,288,283,...,49.5,0.68,0.2652,2.029,52.78,176,222,241,205,167
7,0,92,92,106,0,0.26,3.35,281,284,279,...,35.5,0.72,0.148,1.478,27.84,185,217,233,193,159
8,1,103,100,106,-6,0.05,0.29,294,294,288,...,50.1,0.4,0.151,1.468,58.54,182,198,207,173,153
9,1,95,98,99,-2,0.268,1.228,282,288,277,...,57.6,0.8,0.374,2.312,59.15,336,352,386,324,339


In [None]:
calculate_metrics(df2, df3_mask, knn_imputed,'knn_5n',cont, dis, cat)

{'Method': 'knn_5n', 'MAE': 10.462393108622457, 'Accuracy': 0.375}

In [None]:
def mice_forest_impute_df(df, continuous_cols=None, discrete_cols=None, categorical_cols=None, iters=10, strat='normal'):
    """
    Impute missing values in a DataFrame using the MICE forest method.
    
    Assumes:
      - Continuous columns are numeric.
      - Discrete columns are numeric and integer-like.
      - Categorical columns have been label encoded using sklearn's LabelEncoder 
        (with missing values represented as np.nan).
      
    Note:
      - The parameter n_neighbors is repurposed as the number of MICE iterations.
    
    Parameters:
      df (pd.DataFrame): Input DataFrame with missing values.
      continuous_cols (list of str): Names of continuous numeric columns.
      discrete_cols (list of str): Names of discrete numeric columns.
      categorical_cols (list of str): Names of categorical columns (label encoded).
      iters (int): Number of MICE iterations.
      strat: ['normal', 'shap', 'fast'] or a dictionary 
    
    Returns:
      pd.DataFrame: New DataFrame with imputed values. Discrete and categorical 
                  columns are rounded to integers.
    """
    # Create a copy of the DataFrame to avoid modifying the original data.
    df_imputed = df.copy()
    
    # Create an imputation kernel using miceforest.
    # The entire DataFrame is used here since all columns are numeric after preprocessing.
    kernel = mf.ImputationKernel(
        df_imputed,
        random_state=0, 
        mean_match_strategy=strat
    )
    
    # Run the MICE algorithm for the specified number of iterations.
    kernel.mice(iterations=iters)
    
    # Retrieve the completed data (imputed dataset).
    df_completed = kernel.complete_data(dataset=0)
    
    # For discrete and categorical columns, round the imputed values to integers.
    if discrete_cols:
        df_completed[discrete_cols] = df_completed[discrete_cols].round().astype(int)
    if categorical_cols:
        df_completed[categorical_cols] = df_completed[categorical_cols].round().astype(int)
    
    return df_completed

mice_imputed = mice_forest_impute_df(df3, cont, dis, cat, 2, strat='shap')
mice_imputed

  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][a

Unnamed: 0,side,gccaverarage1,supgcc1,infgcc1,intraeye1,flv1,glv1,wholethick1,thicksuphemi1,thickinfhem1,...,deepvdinf1_injured_D,SSI1_injured,FAZ1_injured,PERIM1_injured,FD1_injured,N1000_injured,N500_injured,CENTR_injured,T500_injured,T1000_injured
0,1,98,96,100,-4,1.12,1.26,278,280,279,...,41.7,0.3,0.361,2.132,48.28,282,266,256,253,262
1,1,105,104,106,-2,0.04,0.07,314,314,315,...,43.2,0.6,0.26,1.911,54.63,389,392,416,395,386
2,1,101,102,104,2,0.27,0.45,284,286,277,...,28.6,0.7,0.262,2.045,54.68,185,225,194,179,170
3,0,90,88,92,-4,0.37,5.92,274,281,274,...,39.9,0.3,1.044,4.173,29.61,247,272,285,277,271
4,0,105,105,102,1,0.03,0.84,296,288,295,...,2.9,0.1,0.304,2.271,3.86,285,315,341,307,294
5,1,106,103,104,-5,0.01,0.1,294,310,295,...,33.6,0.6,0.244,1.92,49.63,204,210,225,208,193
6,0,108,107,109,-2,0.23,0.1,285,288,283,...,49.5,0.8,0.347,2.029,52.78,176,222,241,205,167
7,0,92,92,94,0,0.26,3.35,281,284,279,...,35.5,0.4,0.148,1.478,27.84,185,217,233,193,159
8,1,103,100,106,-6,0.05,0.29,294,294,295,...,50.1,0.4,0.151,1.468,58.54,182,198,207,173,153
9,1,95,91,92,-2,0.23,1.08,282,288,277,...,57.6,0.8,0.374,2.312,59.15,336,352,386,307,339


In [None]:
calculate_metrics(df2, df3_mask, mice_imputed, 'mice_1_2', cont, dis, cat)

{'Method': 'mice_1_2', 'MAE': 9.217934048037309, 'Accuracy': 0.625}

In [None]:
calculate_metrics(df2, df3_mask, mice_imputed, 'mice_shap_1_2', cont, dis, cat)

{'Method': 'mice_shap_1_2', 'MAE': 10.00640548079135, 'Accuracy': 0.5}

In [None]:
def missforest_impute_df(df, continuous_cols=None, discrete_cols=None, categorical_cols=None, iters=5):
    df_imputed = df.copy()
    
    imputer = MissForest(max_iter=iters, categorical=categorical_cols )
    
    df_imputed_result = imputer.fit_transform(df_imputed)
    
    # # For discrete and categorical columns, round the imputed values to integers.
    if discrete_cols:
        df_imputed_result[discrete_cols] = df_imputed_result[discrete_cols].round().astype(int)

    return df_imputed_result

mf_imputed = missforest_impute_df(df3, cont, dis, cat )
mf_imputed

100%|██████████| 5/5 [00:30<00:00,  6.20s/it]
100%|██████████| 5/5 [00:12<00:00,  2.45s/it]


Unnamed: 0,intraeye1,deepvdinf1_injured_C,N1000,deepvdwh1_injured,supvdinf1_injured_C,N1000_injured,supvdsup1_C,deepvdsup1_injured_C,supvdsup1,supvdinf1_injured_A,...,infgcc1_injured,wholethick1_injured,T1000,supwholevd1,deepvdsupO1,supvdpa1_injured,flv1,thicktempo1_injured_A,deepvdwh1,supvdinfe1_injured
0,-4,44.8,163,46.3,44.1,282,51.2,47.7,50.7,43.4,...,104,276,185,74.199975,55.2,45.99727,0.069765,255,49.3,43.0
1,-2,43.5,305,43.9,52.7,389,52.8,43.1,53.6,53.5,...,112,314,223,54.1,57.6,52.5,0.04,273,52.1,52.4
2,2,31.7,230,42.255177,48.1,185,51.8,33.3,50.3,48.5,...,107,284,219,50.3,48.0,45.99727,0.27,273,43.4,48.1
3,-4,34.7,284,30.2,41.3,247,49.851726,30.1,38.7,34.0,...,104,276,262,74.199975,43.6,32.1,0.069765,273,47.527481,45.012155
4,1,5.7,256,5.6,48.651724,285,57.3,6.5,56.5,11.0,...,120,293,263,55.8,52.737706,45.99727,0.069765,273,43.0,8.3
5,-5,36.4,194,36.8,41.5,204,50.0,36.1,58.162061,48.3,...,104,276,185,74.199975,52.737706,47.7,0.01,291,47.9,49.4
6,-2,50.3,179,49.5,54.0,176,53.5,49.9,53.2,55.3,...,111,276,167,74.199975,54.6,55.6,0.23,273,47.527481,56.3
7,0,37.3,213,35.6,40.8,185,47.1,25.2,58.162061,39.5,...,104,284,196,74.199975,57.5,45.99727,0.26,259,49.1,37.3
8,-6,51.6,173,48.1,56.5,182,49.851726,51.5,54.4,58.7,...,104,296,223,54.6,56.2,58.5,0.05,277,55.1,45.012155
9,-2,56.1,253,53.1,53.8,336,47.7,53.1,51.0,56.3,...,104,276,247,51.1,56.0,54.5,0.069765,266,47.8,54.6


In [None]:
calculate_metrics(df2, df3_mask, mf_imputed, 'missforest', cont, dis, cat)

{'Method': 'missforest', 'MAE': 14.086886039323858, 'Accuracy': 0.375}

In [None]:
def midas_impute_df(df, continuous_cols=None, discrete_cols=None, categorical_cols=None,
                    hidden_dim=64, num_layers=2, batch_size=32, epochs=100, learning_rate=0.001, **kwargs):
    """
    Impute missing values in a DataFrame using the MIDAS imputation method (a deep learning approach).

    Assumptions:
      - The DataFrame has been preprocessed so that all columns are numeric.
      - Categorical columns have been label encoded (with missing values as np.nan).
      - Continuous columns are numeric.
      - Discrete columns are numeric and integer-like.

    After imputation, discrete and categorical columns are rounded to integers.

    Parameters:
      df (pd.DataFrame): Input DataFrame with missing values.
      continuous_cols (list of str): Names of continuous numeric columns.
      discrete_cols (list of str): Names of discrete numeric columns.
      categorical_cols (list of str): Names of categorical columns (label encoded).
      hidden_dim (int): Hidden layer dimension size for the MIDAS model.
      num_layers (int): Number of layers in the MIDAS model.
      batch_size (int): Batch size for training the model.
      epochs (int): Number of training epochs.
      learning_rate (float): Learning rate for the optimizer.
      **kwargs: Additional keyword arguments to pass to the MIDASImputer.

    Returns:
      pd.DataFrame: A new DataFrame with imputed values. Discrete and categorical columns
                    are rounded to integers.
    """
    # Create a copy of the DataFrame to avoid modifying the original.
    df_input = df.copy()
    # Instantiate the MIDAS imputer with provided hyperparameters.
    imputer = MIDASpy.MIDASImputer(hidden_dim=hidden_dim,
                           num_layers=num_layers,
                           batch_size=batch_size,
                           epochs=epochs,
                           learning_rate=learning_rate,
                           **kwargs)
    
    # Fit the model and transform the data.
    imputed_array = imputer.fit_transform(df_input)
    
    # Convert the imputed numpy array back to a DataFrame.
    imputed_df = pd.DataFrame(imputed_array, columns=df_input.columns, index=df_input.index)
    
    # For discrete and categorical columns, round the imputed values to ensure integer types.
    if discrete_cols:
        imputed_df[discrete_cols] = imputed_df[discrete_cols].round().astype(int)
    if categorical_cols:
        imputed_df[categorical_cols] = imputed_df[categorical_cols].round().astype(int)
    
    return imputed_df

midas_imputed = midas_impute_df(df3, cont, dis, cat)

AttributeError: module 'MIDASpy' has no attribute 'MIDASImputer'

In [None]:
df3

Unnamed: 0,side,gccaverarage1,supgcc1,infgcc1,intraeye1,flv1,glv1,wholethick1,thicksuphemi1,thickinfhem1,...,deepvdinf1_injured_D,SSI1_injured,FAZ1_injured,PERIM1_injured,FD1_injured,N1000_injured,N500_injured,CENTR_injured,T500_injured,T1000_injured
0,1.0,98.0,96.0,100.0,-4.0,,,278.0,280.0,,...,41.7,,0.361,,48.28,282.0,266.0,256.0,253.0,
1,,105.0,104.0,106.0,-2.0,0.04,0.07,314.0,314.0,315.0,...,43.2,0.6,0.26,1.911,54.63,389.0,392.0,416.0,395.0,386.0
2,1.0,101.0,102.0,,2.0,0.27,0.45,284.0,286.0,,...,28.6,0.7,0.262,2.045,54.68,185.0,,194.0,179.0,170.0
3,,90.0,88.0,92.0,-4.0,,,274.0,,274.0,...,39.9,0.3,1.044,4.173,29.61,247.0,272.0,285.0,277.0,
4,0.0,105.0,105.0,,1.0,,0.84,,288.0,295.0,...,2.9,0.1,0.304,,3.86,285.0,315.0,341.0,307.0,294.0
5,1.0,106.0,103.0,,-5.0,0.01,0.1,,310.0,,...,33.6,,0.244,1.92,49.63,204.0,210.0,225.0,208.0,193.0
6,0.0,108.0,107.0,109.0,-2.0,0.23,,285.0,288.0,283.0,...,49.5,,,2.029,52.78,176.0,222.0,241.0,205.0,167.0
7,0.0,92.0,92.0,,0.0,0.26,3.35,281.0,284.0,279.0,...,35.5,,0.148,1.478,27.84,185.0,217.0,233.0,193.0,159.0
8,1.0,103.0,100.0,106.0,-6.0,0.05,0.29,294.0,294.0,,...,50.1,0.4,0.151,1.468,58.54,182.0,198.0,207.0,173.0,153.0
9,1.0,95.0,,,-2.0,,,282.0,,277.0,...,57.6,0.8,0.374,2.312,59.15,336.0,352.0,386.0,,339.0


In [None]:
md_cat_data, md_cats = md.cat_conv(df3[cat])
md_cats

[['side_0.0', 'side_1.0']]

In [None]:
df3.drop(cat, axis = 1, inplace = True)
constructor_list = [df3]
constructor_list.append(md_cat_data)
data_in = pd.concat(constructor_list, axis=1)

na_loc = data_in.isnull()
data_in[na_loc] = np.nan

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
pre_data_in = scaler.fit_transform (data_in)
data_in2 = pd.DataFrame(data=pre_data_in, columns=data_in.columns)


In [None]:
imputer = md.Midas(layer_structure = [256,256], vae_layer = False, seed = 89, input_drop = 0.75)
imputer.build_model(data_in2, softmax_columns = md_cats)
imputer.train_model(training_epochs = 20)

Size index: [138, 2]

Computation graph constructed

Model initialised

Epoch: 0 , loss: 101.48571014404297
Epoch: 1 , loss: 118.83506774902344
Epoch: 2 , loss: 125.0703125
Epoch: 3 , loss: 115.87602996826172
Epoch: 4 , loss: 96.5224380493164
Epoch: 5 , loss: 103.46436309814453
Epoch: 6 , loss: 95.80301666259766
Epoch: 7 , loss: 98.55606842041016
Epoch: 8 , loss: 92.68733978271484
Epoch: 9 , loss: 113.94678497314453
Epoch: 10 , loss: 98.35855102539062
Epoch: 11 , loss: 122.53985595703125
Epoch: 12 , loss: 126.39364624023438
Epoch: 13 , loss: 114.88636016845703
Epoch: 14 , loss: 109.83509063720703
Epoch: 15 , loss: 95.22294616699219
Epoch: 16 , loss: 102.01099395751953
Epoch: 17 , loss: 108.92523193359375
Epoch: 18 , loss: 114.14856719970703
Epoch: 19 , loss: 94.11422729492188
Training complete. Saving file...
Model saved in file: tmp/MIDAS


<MIDASpy.midas_base.Midas at 0x2b4ff292740>

In [None]:
imputations = imputer.generate_samples(m=10).output_list 


INFO:tensorflow:Restoring parameters from tmp/MIDAS
Model restored.


In [None]:
flat_cats = [cat for variable in md_cats for cat in variable]

for i in range(len(imputations)):
    tmp_cat = [imputations[i][x].idxmax(axis=1) for x in md_cats]
    cat_df = pd.DataFrame({cat[i]:tmp_cat[i] for i in range(len(cat))})
    imputations[i] = pd.concat([imputations[i], cat_df], axis = 1).drop(flat_cats, axis = 1)

KeyError: "None of [Index(['side_0.0', 'side_1.0'], dtype='object')] are in the [columns]"

In [None]:
for i in imputations:
    print(calculate_metrics(df2, df3_mask, i, f'midaspy ', cont, dis, cat))

{'Method': 'midaspy ', 'MAE': 125.06993263123597, 'Accuracy': 0.0}
{'Method': 'midaspy ', 'MAE': 125.1145308951835, 'Accuracy': 0.0}
{'Method': 'midaspy ', 'MAE': 125.24579896035773, 'Accuracy': 0.0}
{'Method': 'midaspy ', 'MAE': 125.02126109952223, 'Accuracy': 0.0}
{'Method': 'midaspy ', 'MAE': 125.23089769452008, 'Accuracy': 0.0}
{'Method': 'midaspy ', 'MAE': 125.27815853842971, 'Accuracy': 0.0}
{'Method': 'midaspy ', 'MAE': 125.21989122117049, 'Accuracy': 0.0}
{'Method': 'midaspy ', 'MAE': 125.16082598197602, 'Accuracy': 0.0}
{'Method': 'midaspy ', 'MAE': 125.33422701712858, 'Accuracy': 0.0}
{'Method': 'midaspy ', 'MAE': 125.22098670842804, 'Accuracy': 0.0}


In [None]:
imputations[1]

Unnamed: 0,gccaverarage1,supgcc1,infgcc1,intraeye1,flv1,glv1,wholethick1,thicksuphemi1,thickinfhem1,thickfovea1,...,SSI1_injured,FAZ1_injured,PERIM1_injured,FD1_injured,N1000_injured,N500_injured,CENTR_injured,T500_injured,T1000_injured,side
0,98.0,96.0,100.0,-4.0,-2.63098,1.209904,278.0,280.0,3.287605,240.0,...,1.165502,0.361,-1.257486,48.28,282.0,266.0,256.0,253.0,3.355184,side_1.0
1,105.0,104.0,106.0,-2.0,0.04,0.07,314.0,314.0,315.0,252.0,...,0.6,0.26,1.911,54.63,389.0,392.0,416.0,395.0,386.0,side_0.0
2,101.0,102.0,2.389088,2.0,0.27,0.45,284.0,286.0,-0.045044,241.0,...,0.7,0.262,2.045,54.68,185.0,3.105951,194.0,179.0,170.0,side_1.0
3,90.0,88.0,92.0,-4.0,-0.579971,1.325912,274.0,3.030002,274.0,235.0,...,0.3,1.044,4.173,29.61,247.0,272.0,285.0,277.0,-2.263868,side_0.0
4,105.0,105.0,-2.275623,1.0,-1.808518,0.84,1.560069,288.0,295.0,323.0,...,0.1,0.304,-0.656858,3.86,285.0,315.0,341.0,307.0,294.0,side_0.0
5,106.0,103.0,-1.655531,-5.0,0.01,0.1,2.162387,310.0,5.306025,286.0,...,1.258237,0.244,1.92,49.63,204.0,210.0,225.0,208.0,193.0,side_1.0
6,108.0,107.0,109.0,-2.0,0.23,-1.148433,285.0,288.0,283.0,243.0,...,1.492795,0.300137,2.029,52.78,176.0,222.0,241.0,205.0,167.0,side_0.0
7,92.0,92.0,1.630069,0.0,0.26,3.35,281.0,284.0,279.0,258.0,...,-0.169823,0.148,1.478,27.84,185.0,217.0,233.0,193.0,159.0,side_0.0
8,103.0,100.0,106.0,-6.0,0.05,0.29,294.0,294.0,3.493186,273.0,...,0.4,0.151,1.468,58.54,182.0,198.0,207.0,173.0,153.0,side_1.0
9,95.0,1.00793,0.322953,-2.0,-1.382211,3.227551,282.0,0.657689,277.0,245.0,...,0.8,0.374,2.312,59.15,336.0,352.0,386.0,-2.295553,339.0,side_1.0


In [None]:
# Convert categorical columns and get categorical metadata.
md_cat_data, md_cats = md.cat_conv(df3[cat])

# Separate non-categorical (numeric) columns.
num_cols = dis + cont  # these are the numeric columns

# Drop the categorical columns from df3 and then combine with the converted categorical data.
df3.drop(cat, axis=1, inplace=True)
constructor_list = [df3]
constructor_list.append(md_cat_data)
data_in = pd.concat(constructor_list, axis=1)

# Scale non-categorical columns BEFORE imputation.
scaler = StandardScaler()
data_in[num_cols] = scaler.fit_transform(data_in[num_cols])

# Handle missing values (if needed).
na_loc = data_in.isnull()
data_in[na_loc] = np.nan

# Build and train the imputer using the scaled data.
imputer = md.Midas(layer_structure=[256, 256], vae_layer=False, seed=89, input_drop=0.75)
# Note: Using the scaled data_in (you might want to assign it to a new variable like data_in2 if needed)
imputer.build_model(data_in, softmax_columns=md_cats)
imputer.train_model(training_epochs=20)

# Generate imputations.
imputations = imputer.generate_samples(m=10).output_list 

# Reverse transform the numerical columns in each imputed DataFrame.
for i in range(len(imputations)):
    imputations[i][num_cols] = scaler.inverse_transform(imputations[i][num_cols])
    
    # Process categorical columns:
    tmp_cat = [imputations[i][x].idxmax(axis=1) for x in md_cats]
    cat_df = pd.DataFrame({cat[j]: tmp_cat[j] for j in range(len(cat))})
    # Construct final imputed dataset by reattaching categorical data.
    flat_cats = [col for variable in md_cats for col in variable]
    imputations[i] = pd.concat([imputations[i], cat_df], axis=1).drop(flat_cats, axis=1)


KeyError: "None of [Index(['side'], dtype='object')] are in the [columns]"

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import IterativeImputer
import miceforest as mf
from missforest import MissForest
import MIDASpy as md
from sklearn.preprocessing import MinMaxScaler
from impute import *

df = pd.read_excel(r"C:\Users\Matin\Downloads\Data for Dr.Matin.xlsx", 's1')

In [2]:
imp_test, imp_res, miss, mask = aio_custom_missingness(df, 10)

[WinError 2] The system cannot find the file specified
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 1456, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
100%|██████████| 5/5 [03:58<00:00, 47.80s/it]
100%|██████████| 5/5 [00:02<00:00,  1.94it/s]


Size index: [42, 3, 1710, 2]
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.

Computation graph constructed

Model initialised

Epoch: 0 , loss: 23.660918203450866
Epoch: 1 , loss: 16.172495728832182
Epoch: 2 , loss: 14.82738814919682
Epoch: 3 , loss: 14.342451475434384
Epoch: 4 , loss: 14.000748989945752
Epoch: 5 , loss: 13.794479968184131
Epoch: 6 , loss: 13.638197082584187
Epoch: 7 , loss: 13.564650438599667
Epoch: 8 , loss: 13.478244926969884
Epoch: 9 , loss: 13.450543484445346
Epoch: 10 , loss: 13.401780839693748
Epoch: 11 , loss: 13.375366849414373
Epoch: 12 , los

In [None]:
imp_res

Unnamed: 0,Column,Data Type,Best Method,Metric,Error_SD,Max_Error,Min_Error,Within_10pct
0,n,Discrete,"MIDAS, params: samples=1 ,layer=[256, 256], va...",491.1658,254.6468,939.0,5.0,0.050251
1,Gen.code,Categorical,"MICE Forest, params: iters=10, strat=normal",0.9004975,,,,
2,ID,Categorical,"KNN, params: n_neighbors=5",0.0,,,,
3,Dm2,Discrete,"MICE Forest, params: iters=10, strat=normal",0.0964467,0.295955,1.0,0.0,0.903553
4,Dm4,Discrete,"MICE Forest, params: iters=10, strat=normal",7.369792,5.558233,33.0,0.0,0.369792
5,E11,Discrete,"MICE Forest, params: iters=10, strat=normal",5.021978,5.878095,35.0,0.0,0.791209
6,E12,Discrete,"MICE Forest, params: iters=10, strat=normal",4.835897,5.245886,34.0,0.0,0.841026
7,E21,Discrete,"MICE Forest, params: iters=10, strat=normal",0.9292929,1.764009,15.0,0.0,0.868687
8,E22,Discrete,"MICE Forest, params: iters=10, strat=normal",1.391534,6.104161,82.0,0.0,0.867725
9,E31,Discrete,"MICE Forest, params: iters=10, strat=normal",3.099526,4.455553,32.0,0.0,0.957346


In [None]:
imp_res.to_excel('summary.xlsx')

In [None]:
imp_test

Unnamed: 0,n,Gen.code,ID,Dm2,Dm4,E11,E12,E21,E22,E31,...,Neutrophils,Lymphocyte,Mixed,Platelets,DBP,SBP,gdi,work_activity,transport,lesiretime
0,1,B,711041127B,2,36,80,80,30,20,109,...,35.8,52.6,11.600000,249,67,110,6,34560,2391444,1440
1,2,A,121170301A,1,59,65,62,16,16,132,...,52.5,36.4,11.100000,226,76,128,5,7174332,360,7174332
2,3,A,112020351A,1,58,83,85,17,17,120,...,58.8,32.1,9.100000,288,70,120,5,4783128,480,7174332
3,4,B,111020061B,1,61,69,70,29,29,140,...,52.4,37.8,9.800000,240,70,135,7,4785408,2520,2520
4,5,B,721120376B,2,28,76,75,18,18,131,...,40.5,47.7,11.800000,214,83,126,7,4799688,1680,7174332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1886,1887,A,721020197B,2,62,81,86,18,18,109,...,51.1,37.0,11.900000,217,74,109,3,4790448,840,4783608
1887,1888,A,121120137A,1,58,81,78,17,18,128,...,52.3,39.7,8.000000,197,74,121,5,7174332,840,7174332
1888,1889,C,112040003C,1,30,87,87,17,18,100,...,57.9,34.9,8.260671,223,60,100,5,960,240,4782888
1889,1890,A,712071514A,2,56,84,80,16,16,130,...,54.4,32.8,12.800000,229,80,125,6,4783608,1680,17280


In [None]:
mask

Unnamed: 0,n,Gen.code,ID,Dm2,Dm4,E11,E12,E21,E22,E31,...,Neutrophils,Lymphocyte,Mixed,Platelets,DBP,SBP,gdi,work_activity,transport,lesiretime
0,False,False,False,False,False,False,True,False,False,True,...,False,False,False,False,False,True,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,True,False,True,False,False,...,False,False,False,True,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1886,False,False,True,True,False,False,False,False,False,False,...,False,False,False,True,False,False,False,True,False,False
1887,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
1888,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,True,False,False,False,False,False
1889,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
import time
import numpy as np
import pandas as pd
import optuna

def hpo_imputation_optimization(df, missingness_percent=20, timeout=60):
    """
    Performs hyperparameter optimization (HPO) for imputation methods on a DataFrame.
    
    Steps:
      1. Preprocess the DataFrame using prep().
      2. Create artificial missingness using create_missings().
      3. For each column with artificial missings, run HPO (using Optuna) for each imputation method:
           - KNN: Optimize n_neighbors (3 to 15).
           - MICE: Optimize iters (5 to 20) and strat (choice of 'normal', 'shap', or 'fast').
           - MissForest: Optimize iters (3 to 10).
           - MIDAS: Optimize layer (choose between [256,256] and [512,256]) and vae (True/False).
         For numeric columns (continuous or discrete) the objective is MAE.
         For categorical (or string) columns the objective is (1 – accuracy).
      4. Enforce a global timeout (in seconds) so that if HPO runs too long, the function stops and uses the best results found.
      5. Return:
            - best_by_method: a dictionary mapping each column to a dict of method->(metric, hyperparameters)
            - best_method_for_col: a dictionary mapping each column to the best (method, hyperparameters)
            - summary_table: a DataFrame summarizing, per column, the best method and its hyperparameters.
    
    Note: This function assumes that the following helper functions are defined and available:
          prep, create_missings, do_knn, do_mice, do_mf, do_midas.
    
    Parameters:
      df (pd.DataFrame): Input DataFrame.
      missingness_percent (float): Percentage of missingness to introduce.
      timeout (float): Global timeout in seconds for the HPO process.
    
    Returns:
      best_by_method (dict), best_method_for_col (dict), summary_table (pd.DataFrame)
    """
    # Preprocess the DataFrame.
    continuous_cols, discrete_cols, categorical_cols, df_clean, encoders = prep(df)
    # Create artificial missingness.
    df_complete, df_missing, mask_df = create_missings(df_clean, missingness=missingness_percent)
    
    # Define objective functions for each method.
    def objective_knn(trial, col):
        n_neighbors = trial.suggest_int("n_neighbors", 3, 15)
        imps, _ = do_knn(df_missing, continuous_cols, discrete_cols, categorical_cols,
                         n_neighbors=n_neighbors, samples=1)
        imp_df = imps[0]
        if col in continuous_cols or col in discrete_cols:
            imp_vals = pd.to_numeric(imp_df[col][mask_df[col]], errors='coerce')
            orig_vals = pd.to_numeric(df_complete[col][mask_df[col]], errors='coerce')
            mae = np.abs(imp_vals - orig_vals).mean()
            return mae
        else:
            # For categorical, objective = 1 - accuracy.
            acc = (imp_df[col][mask_df[col]] == df_complete[col][mask_df[col]]).mean()
            return 1 - acc

    def objective_mice(trial, col):
        iters = trial.suggest_int("iters", 5, 20)
        strat = trial.suggest_categorical("strat", ['normal', 'shap', 'fast'])
        imps, _ = do_mice(df_missing, continuous_cols, discrete_cols, categorical_cols,
                          iters=iters, strat=strat, samples=1)
        imp_df = imps[0]
        if col in continuous_cols or col in discrete_cols:
            imp_vals = pd.to_numeric(imp_df[col][mask_df[col]], errors='coerce')
            orig_vals = pd.to_numeric(df_complete[col][mask_df[col]], errors='coerce')
            mae = np.abs(imp_vals - orig_vals).mean()
            return mae
        else:
            acc = (imp_df[col][mask_df[col]] == df_complete[col][mask_df[col]]).mean()
            return 1 - acc

    def objective_mf(trial, col):
        iters = trial.suggest_int("iters", 3, 10)
        imps, _ = do_mf(df_missing, continuous_cols, discrete_cols, categorical_cols,
                        iters=iters, samples=1)
        imp_df = imps[0]
        if col in continuous_cols or col in discrete_cols:
            imp_vals = pd.to_numeric(imp_df[col][mask_df[col]], errors='coerce')
            orig_vals = pd.to_numeric(df_complete[col][mask_df[col]], errors='coerce')
            mae = np.abs(imp_vals - orig_vals).mean()
            return mae
        else:
            acc = (imp_df[col][mask_df[col]] == df_complete[col][mask_df[col]]).mean()
            return 1 - acc

    def objective_midas(trial, col):
        layer_option = trial.suggest_categorical("layer", [[256,256], [512,256]])
        vae = trial.suggest_categorical("vae", [True, False])
        imps, _ = do_midas(df_missing, continuous_cols, discrete_cols, categorical_cols,
                           layer=layer_option, vae=vae, samples=1)
        imp_df = imps[0]
        if col in continuous_cols or col in discrete_cols:
            imp_vals = pd.to_numeric(imp_df[col][mask_df[col]], errors='coerce')
            orig_vals = pd.to_numeric(df_complete[col][mask_df[col]], errors='coerce')
            mae = np.abs(imp_vals - orig_vals).mean()
            return mae
        else:
            acc = (imp_df[col][mask_df[col]] == df_complete[col][mask_df[col]]).mean()
            return 1 - acc

    methods = {
        "KNN": objective_knn,
        "MICE": objective_mice,
        "MissForest": objective_mf,
        "MIDAS": objective_midas
    }
    
    best_by_method = {}  # {col: {method: (best_value, best_params)}}
    best_method_for_col = {}  # {col: (best_method, best_params)}
    
    start_time = time.time()
    n_methods_total = len(methods)
    # For each column with missing entries (only consider those with any missingness)
    for col in df_complete.columns:
        if mask_df[col].sum() == 0:
            continue
        best_by_method[col] = {}
        for method_name, obj_func in methods.items():
            study = optuna.create_study(direction="minimize")
            # Use a per-study timeout as a fraction of the total timeout.
            per_study_timeout = timeout / (df_complete.shape[1] * n_methods_total)
            try:
                study.optimize(lambda trial: obj_func(trial, col), timeout=per_study_timeout)
            except Exception as e:
                pass  # In case of error, skip this method.
            if study.best_trial is not None:
                best_by_method[col][method_name] = (study.best_value, study.best_params)
            else:
                best_by_method[col][method_name] = (float('inf'), None)
        # Choose the best method for this column (lowest objective value).
        best_meth = None
        best_val = float('inf')
        best_params = None
        for meth, (val, params) in best_by_method[col].items():
            if val < best_val:
                best_val = val
                best_meth = meth
                best_params = params
        best_method_for_col[col] = (best_meth, best_params)
        # Check for global timeout.
        if time.time() - start_time > timeout:
            break

    # Build a summary table.
    summary_rows = []
    for col, meth_info in best_method_for_col.items():
        row = {
            "Column": col,
            "Best Method": meth_info[0],
            "Best Hyperparameters": meth_info[1]
        }
        # Also include metrics for each method.
        for method, (val, params) in best_by_method[col].items():
            row[f"{method}_Metric"] = val
        summary_rows.append(row)
    summary_table = pd.DataFrame(summary_rows)
    
    return best_by_method, best_method_for_col, summary_table


In [None]:
imp, dic, summary = hpo_imputation_optimization(df, 10, timeout=500)

[I 2025-03-17 16:24:39,017] A new study created in memory with name: no-name-0ddcb4a0-a1cd-42dc-ac0e-a556b554a4fe
[I 2025-03-17 16:24:40,013] Trial 0 finished with value: 517.9447236180905 and parameters: {'n_neighbors': 10}. Best is trial 0 with value: 517.9447236180905.
[I 2025-03-17 16:24:41,051] Trial 1 finished with value: 502.07035175879395 and parameters: {'n_neighbors': 15}. Best is trial 1 with value: 502.07035175879395.
[I 2025-03-17 16:24:41,993] Trial 2 finished with value: 515.0452261306533 and parameters: {'n_neighbors': 11}. Best is trial 1 with value: 502.07035175879395.
[I 2025-03-17 16:24:41,995] A new study created in memory with name: no-name-246a9cff-0f16-4b3b-8ebc-514a2553b9b4
[I 2025-03-17 16:26:55,492] Trial 0 finished with value: 611.1859296482412 and parameters: {'iters': 7, 'strat': 'normal'}. Best is trial 0 with value: 611.1859296482412.
[I 2025-03-17 16:26:55,495] A new study created in memory with name: no-name-58988cf7-1e19-4dde-83d8-caa42fc8a7a8
 14%|█▍

: 

In [None]:
imp, dic, summary = hpo_imputation_optimization(df, 10, timeout=500)

[I 2025-03-17 16:39:34,420] A new study created in memory with name: no-name-08c49731-8653-4650-b482-b91aeadd3e35
[I 2025-03-17 16:39:35,302] Trial 0 finished with value: 511.46231155778895 and parameters: {'n_neighbors': 12}. Best is trial 0 with value: 511.46231155778895.
[I 2025-03-17 16:39:36,191] Trial 1 finished with value: 502.07035175879395 and parameters: {'n_neighbors': 15}. Best is trial 1 with value: 502.07035175879395.
[I 2025-03-17 16:39:37,044] Trial 2 finished with value: 502.07035175879395 and parameters: {'n_neighbors': 15}. Best is trial 1 with value: 502.07035175879395.
[I 2025-03-17 16:39:37,832] Trial 3 finished with value: 543.4773869346734 and parameters: {'n_neighbors': 5}. Best is trial 1 with value: 502.07035175879395.
[I 2025-03-17 16:39:37,834] A new study created in memory with name: no-name-d2b0e645-bbbe-4058-a698-c2f1994b32ae
[I 2025-03-17 16:44:42,108] Trial 0 finished with value: 606.7587939698492 and parameters: {'iters': 20, 'strat': 'normal'}. Best 

Size index: [42, 3, 1710, 2]

Computation graph constructed

Model initialised

Epoch: 0 , loss: 43.547988374354475
Epoch: 1 , loss: 40.061202421026714
Epoch: 2 , loss: 37.64509346525548
Epoch: 3 , loss: 36.15832947876494
Epoch: 4 , loss: 35.18856748483949
Epoch: 5 , loss: 34.70665724802826
Epoch: 6 , loss: 34.22729516433457
Epoch: 7 , loss: 33.85847726918883
Epoch: 8 , loss: 33.49949010752015
Epoch: 9 , loss: 33.24330195734056
Epoch: 10 , loss: 33.03874832088665
Epoch: 11 , loss: 32.99106030544992
Epoch: 12 , loss: 32.881724858688095
Epoch: 13 , loss: 32.901935884508035
Epoch: 14 , loss: 32.71100991459216
Epoch: 15 , loss: 32.47873306274414
Epoch: 16 , loss: 32.45791322093899
Epoch: 17 , loss: 32.35649286690405
Epoch: 18 , loss: 32.284110651177876
Epoch: 19 , loss: 32.24648853883905
Training complete. Saving file...
Model saved in file: tmp/MIDAS
INFO:tensorflow:Restoring parameters from tmp/MIDAS
Model restored.


[I 2025-03-17 16:56:00,142] Trial 0 finished with value: 486.5577889447236 and parameters: {'layer': [256, 256], 'vae': False}. Best is trial 0 with value: 486.5577889447236.


In [None]:
summary

Unnamed: 0,Column,Best Method,Best Hyperparameters,KNN_Metric,MICE_Metric,MissForest_Metric,MIDAS_Metric
0,n,MIDAS,"{'layer': [256, 256], 'vae': False}",502.070352,606.758794,509.522613,486.557789


In [None]:
imp

{'n': {'KNN': (502.07035175879395, {'n_neighbors': 15}),
  'MICE': (606.7587939698492, {'iters': 20, 'strat': 'normal'}),
  'MissForest': (509.52261306532665, {'iters': 10}),
  'MIDAS': (486.5577889447236, {'layer': [256, 256], 'vae': False})}}

In [None]:
dic

{'n': ('MIDAS', {'layer': [256, 256], 'vae': False})}

: 