In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import miceforest as mf
from missforest import MissForest
import optuna
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import time
from sklearn.metrics import mean_squared_error
import MIDASpy as md

### Data Preparation Function

# def prep(df: pd.DataFrame):
#     """
#     Preprocess the DataFrame by:
#     - Dropping rows with missing values and resetting the index.
#     - Converting object columns to categorical via LabelEncoder.
#     - Converting other columns to float (and then to int if >50% of values are integer-like).
#     - If any numeric column (not already marked as categorical) has only 2 unique values,
#       it is considered categorical and encoded.

#     Returns:
#         categorical_cols (list): List of columns encoded as categorical.
#         discrete_cols (list): List of columns that are numeric and integer-like.
#         cont_cols (list): List of remaining continuous numeric columns.
#         df_clean (DataFrame): The preprocessed DataFrame.
#         encoders (dict): Mapping from categorical column name to its LabelEncoder.
#     """
#     df_clean = df.dropna().reset_index(drop=True)
#     categorical_cols = []
#     discrete_cols = []
#     encoders = {}

#     for col in df_clean.columns:
#         if df_clean[col].dtype == 'object':
#             categorical_cols.append(col)
#             le = LabelEncoder()
#             df_clean[col] = le.fit_transform(df_clean[col])
#             encoders[col] = le
#         else:
#             try:
#                 df_clean[col] = df_clean[col].astype(float)
#                 if (np.isclose(df_clean[col] % 1, 0).mean() > 0.5):
#                     df_clean[col] = df_clean[col].astype(int)
#                     discrete_cols.append(col)
#             except (ValueError, TypeError):
#                 categorical_cols.append(col)
#                 le = LabelEncoder()
#                 df_clean[col] = le.fit_transform(df_clean[col])
#                 encoders[col] = le

#     for col in df_clean.columns:
#         if col not in categorical_cols and df_clean[col].nunique() == 2:
#             categorical_cols.append(col)
#             le = LabelEncoder()
#             df_clean[col] = le.fit_transform(df_clean[col])
#             encoders[col] = le

#     continuous_cols = [col for col in df_clean.columns if col not in categorical_cols + discrete_cols]

#     return continuous_cols, discrete_cols, categorical_cols, df_clean, encoders
def prep(df: pd.DataFrame):
    """
    Preprocess the DataFrame by:
    - Dropping rows with missing values and resetting the index.
    - Converting object columns to categorical via LabelEncoder.
    - Converting other columns to float (and then to int if >50% of values are integer-like).
    - If any numeric column (not already marked as categorical) has only 2 unique values,
      it is considered categorical and encoded.

    Returns:
        continuous_cols (list): List of remaining continuous numeric columns.
        discrete_cols (list): List of columns that are numeric and integer-like.
        categorical_cols (list): List of columns encoded as categorical.
        df_clean (DataFrame): The preprocessed DataFrame.
        encoders (dict): Mapping from categorical column name to its LabelEncoder.
    """
    # Drop rows with missing values.
    df_clean = df.dropna().reset_index(drop=True)
    categorical_cols = []
    discrete_cols = []
    encoders = {}

    # Loop over each column to check its type and convert accordingly.
    for col in df_clean.columns:
        # If the column type is object, encode it as a categorical variable.
        if df_clean[col].dtype == 'object' or df_clean[col].nunique() == 2:
            categorical_cols.append(col)
            le = LabelEncoder()
            df_clean[col] = le.fit_transform(df_clean[col])
            encoders[col] = le
        else:
            try:
                # Convert column to float first.
                df_clean[col] = df_clean[col].astype(float)
                # Check if most of the values are integer-like using np.isclose.
                # This computes the proportion of values where the modulus with 1 is nearly 0.
                if (np.isclose(df_clean[col] % 1, 0)).mean() > 0.5:
                    df_clean[col] = df_clean[col].astype(int)
                    discrete_cols.append(col)
            except (ValueError, TypeError):
                # If conversion to float fails, treat the column as categorical.
                categorical_cols.append(col)
                le = LabelEncoder()
                df_clean[col] = le.fit_transform(df_clean[col])
                encoders[col] = le
def prep(df: pd.DataFrame):
    """
    Preprocess the DataFrame by:
    - Dropping rows with missing values and resetting the index.
    - Converting object columns to categorical via LabelEncoder.
    - Converting other columns to float (and then to int if >50% of values are integer-like).
    - If any numeric column (not already marked as categorical) has only 2 unique values,
      it is considered categorical and encoded.

    Returns:
        continuous_cols (list): List of remaining continuous numeric columns.
        discrete_cols (list): List of columns that are numeric and integer-like.
        categorical_cols (list): List of columns encoded as categorical.
        df_clean (DataFrame): The preprocessed DataFrame.
        encoders (dict): Mapping from categorical column name to its LabelEncoder.
    """
    # Drop rows with missing values.
    df_clean = df.dropna().reset_index(drop=True)
    categorical_cols = []
    discrete_cols = []

    # Loop over each column to check its type and convert accordingly.
    for col in df_clean.columns:
        # If the column type is object, encode it as a categorical variable.
        if df_clean[col].dtype == 'object' or df_clean[col].nunique() == 2:
            categorical_cols.append(col)
        else:
            try:
                # Convert column to float first.
                df_clean[col] = df_clean[col].astype(float)
                # Check if most of the values are integer-like using np.isclose.
                # This computes the proportion of values where the modulus with 1 is nearly 0.
                if (np.isclose(df_clean[col] % 1, 0)).mean() > 0.5:
                    df_clean[col] = df_clean[col].astype(int)
                    discrete_cols.append(col)
            except (ValueError, TypeError):
                # If conversion to float fails, treat the column as categorical.
                categorical_cols.append(col)
                

    # Determine continuous columns as those not flagged as categorical or discrete.
    continuous_cols = [col for col in df_clean.columns if col not in categorical_cols + discrete_cols]

    return continuous_cols, discrete_cols, categorical_cols

def reverse_encoding(df: pd.DataFrame, encoders: dict):
    """
    Reverse the LabelEncoder transformation on categorical columns.
    
    Parameters:
        df (pd.DataFrame): DataFrame with encoded categorical columns.
        encoders (dict): Dictionary mapping column names to their LabelEncoder.
    
    Returns:
        pd.DataFrame: A new DataFrame with the categorical columns decoded to their original labels.
    """
    df_decoded = df.copy()
    for col, le in encoders.items():
        df_decoded[col] = le.inverse_transform(df_decoded[col].astype(int))
    return df_decoded

def create_missings(df: pd.DataFrame, missingness: float, random_seed: float = 96):
    """
    Create random missingness in a DataFrame.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame.
        missingness (float): Percentage of missing values to introduce.
        random_seed (float): Seed for reproducibility.
    
    Returns:
        tuple: Original DataFrame, DataFrame with missing values, and a mask DataFrame.
    """
    np.random.seed(random_seed)
    mask = np.random.rand(*df.shape) < (missingness / 100)
    mask_df = pd.DataFrame(mask, columns=df.columns)
    df_missing = df.mask(mask)
    return df, df_missing, mask_df

def simulate_missingness(df, show_missingness=False):
    """
    Simulate missingness by dropping rows with missing values and reintroducing them.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame.
        show_missingness (bool): If True, prints missingness percentages.
    
    Returns:
        tuple: Original DataFrame without missing values, simulated DataFrame with missingness, and a mask.
    """
    missing_original = df.isna().mean()
    df2 = df.dropna().reset_index(drop=True)
    df3 = df2.copy()
    missing_mask = pd.DataFrame(False, index=df3.index, columns=df3.columns)

    for col in df3.columns:
        n_missing = int(round(missing_original[col] * len(df3)))
        if n_missing > 0:
            missing_indices = df3.sample(n=n_missing, random_state=42).index
            df3.loc[missing_indices, col] = np.nan
            missing_mask.loc[missing_indices, col] = True

    if show_missingness:
        missing_df3 = df3.isna().mean()
        print("Missingness Comparison:")
        for col in df.columns:
            print(f"Column '{col}': Original: {missing_original[col]*100:.2f}% \t -> \t df3: {missing_df3[col]*100:.2f}%")

    return df2, df3, missing_mask

In [2]:
# def do_knn(df, continuous_cols=None, discrete_cols=None, categorical_cols=None, n_neighbors=5, scale=False):
#     """
#     Impute missing values using KNN imputation over all columns.

#     Parameters:
#         df (pd.DataFrame): DataFrame with missing values.
#         continuous_cols (list): Names of continuous numeric columns.
#         discrete_cols (list): Names of discrete numeric columns.
#         categorical_cols (list): Names of categorical columns.
#         n_neighbors (int): Number of neighbors for KNN.
#         scale (bool): Whether to apply MinMaxScaler before imputation.

#     Returns:
#         pd.DataFrame: Imputed DataFrame.
#     """
#     df_imputed = df.copy()

#     # Optionally scale all numeric columns
#     if scale:
#         scaler = MinMaxScaler()
#         df_imputed[df.columns] = scaler.fit_transform(df_imputed)

#     # Apply KNN imputation to the entire dataframe
#     imputer = KNNImputer(n_neighbors=n_neighbors)
#     df_imputed[df.columns] = imputer.fit_transform(df_imputed)

#     # Reverse scale if needed
#     if scale:
#         df_imputed[df.columns] = scaler.inverse_transform(df_imputed)

#     # Post-process: round discrete and categorical values
#     if discrete_cols:
#         df_imputed[discrete_cols] = np.round(df_imputed[discrete_cols]).astype(int)
#     if categorical_cols:
#         df_imputed[categorical_cols] = np.round(df_imputed[categorical_cols]).astype(int)

#     return df_imputed

def do_knn(df, continuous_cols=None, discrete_cols=None, categorical_cols=None, n_neighbors=5, scale=False):
    """
    Impute missing values using KNN imputation over all columns.

    Parameters:
        df (pd.DataFrame): DataFrame with missing values.
        continuous_cols (list): Names of continuous numeric columns.
        discrete_cols (list): Names of discrete numeric columns.
        categorical_cols (list): Names of categorical columns.
        n_neighbors (int): Number of neighbors for KNN.
        scale (bool): Whether to apply MinMaxScaler before imputation.

    Returns:
        pd.DataFrame: Imputed DataFrame.
    """
    df_imputed = df.copy()
    encoders = {}

    # Encode categorical columns
    if categorical_cols:
        for col in categorical_cols:
            le = LabelEncoder()
            not_null = df_imputed[col].dropna()
            if not not_null.empty:
                le.fit(not_null)
                df_imputed[col] = df_imputed[col].map(lambda x: le.transform([x])[0] if pd.notnull(x) else np.nan)
                encoders[col] = le
            else:
                # All values missing in this column
                encoders[col] = None

    # Optionally scale numeric columns
    if scale:
        scaler = MinMaxScaler()
        df_imputed[df.columns] = scaler.fit_transform(df_imputed)

    # Apply KNN imputation
    imputer = KNNImputer(n_neighbors=n_neighbors)
    df_imputed[df.columns] = imputer.fit_transform(df_imputed)

    # Reverse scale
    if scale:
        df_imputed[df.columns] = scaler.inverse_transform(df_imputed)

    # Round discrete and categorical values
    if discrete_cols:
        df_imputed[discrete_cols] = np.round(df_imputed[discrete_cols]).astype(int)
    if categorical_cols:
        for col in categorical_cols:
            df_imputed[col] = np.round(df_imputed[col]).astype(int)
            if encoders[col] is not None:
                inv_map = dict(enumerate(encoders[col].classes_))
                df_imputed[col] = df_imputed[col].map(inv_map)

    return df_imputed

In [3]:
# knn_imputed = do_knn(df2, continuous_cols=None, discrete_cols=None, categorical_cols=None, n_neighbors=5, scale=False)

In [4]:
# def do_mice(df, continuous_cols=None, discrete_cols=None, categorical_cols=None,
#             iters=10, strat='normal', scale=False):
#     """
#     Impute missing values in a DataFrame using the MICE forest method.

#     Parameters:
#         df (pd.DataFrame): Input DataFrame with missing values.
#         continuous_cols (list of str): Names of continuous numeric columns.
#         discrete_cols (list of str): Names of discrete numeric columns.
#         categorical_cols (list of str): Names of categorical columns.
#         iters (int): Number of MICE iterations.
#         strat: ['normal', 'shap', 'fast'] or a dictionary specifying the mean matching strategy.
#         scale (bool): Whether to apply MinMaxScaler before imputation.

#     Returns:
#         pd.DataFrame: Imputed DataFrame.
#     """
#     df_imputed = df.copy()

#     if scale:
#         scaler = MinMaxScaler()
#         df_imputed[continuous_cols] = scaler.fit_transform(df_imputed[continuous_cols])

#     kernel = mf.ImputationKernel(
#         df_imputed,
#         random_state=0,
#         mean_match_strategy=strat,
#         variable_schema=None,  # Explicitly set variable_schema to None 
#         )

#     kernel.mice(iterations=iters, verbose=False)  # Disable verbose output
#     df_completed = kernel.complete_data(dataset=0)

#     if discrete_cols:
#         df_completed[discrete_cols] = df_completed[discrete_cols].round().astype(int)
#     if categorical_cols:
#         df_completed[categorical_cols] = df_completed[categorical_cols].round().astype(int)

#     if scale:
#         scaler = MinMaxScaler()
#         df_completed[continuous_cols] = scaler.inverse_transform(df_completed[continuous_cols])

#     return df_completed

def do_mice(df, continuous_cols=None, discrete_cols=None, categorical_cols=None,
            iters=10, strat='normal', scale=False):
    """
    Impute missing values in a DataFrame using the MICE forest method.

    Parameters:
        df (pd.DataFrame): Input DataFrame with missing values.
        continuous_cols (list of str): Names of continuous numeric columns.
        discrete_cols (list of str): Names of discrete numeric columns.
        categorical_cols (list of str): Names of categorical columns.
        iters (int): Number of MICE iterations.
        strat: ['normal', 'shap', 'fast'] or a dictionary specifying the mean matching strategy.
        scale (bool): Whether to apply MinMaxScaler before imputation.

    Returns:
        pd.DataFrame: Imputed DataFrame.
    """
    df_imputed = df.copy()
    encoders = {}

    # Encode categorical columns
    if categorical_cols:
        for col in categorical_cols:
            le = LabelEncoder()
            not_null = df_imputed[col].dropna()
            if not not_null.empty:
                le.fit(not_null)
                df_imputed[col] = df_imputed[col].map(lambda x: le.transform([x])[0] if pd.notnull(x) else np.nan)
                encoders[col] = le
            else:
                encoders[col] = None

    # Scale continuous columns if requested
    if scale:
        scaler = MinMaxScaler()
        df_imputed[continuous_cols] = scaler.fit_transform(df_imputed[continuous_cols])

    # Run MICE imputation
    kernel = mf.ImputationKernel(
        df_imputed,
        random_state=0,
        mean_match_strategy=strat,
        variable_schema=None
    )

    kernel.mice(iterations=iters, verbose=False)
    df_completed = kernel.complete_data(dataset=0)

    # Post-process discrete and categorical columns
    if discrete_cols:
        df_completed[discrete_cols] = df_completed[discrete_cols].round().astype(int)

    if categorical_cols:
        for col in categorical_cols:
            df_completed[col] = np.round(df_completed[col]).astype(int)
            if encoders[col] is not None:
                inv_map = dict(enumerate(encoders[col].classes_))
                df_completed[col] = df_completed[col].map(inv_map)

    # Reverse scaling
    if scale:
        df_completed[continuous_cols] = scaler.inverse_transform(df_completed[continuous_cols])

    return df_completed

In [5]:
# mice_imputed = do_mice(df2, continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols,
                    #    iters=10, strat='normal', scale=False)

In [6]:

# def do_mf(df, continuous_cols=None, discrete_cols=None, categorical_cols=None, iters=5, scale=False):
#     """
#     Impute missing values using MissForest.
    
#     Parameters:
#         df (pd.DataFrame): DataFrame with missing values.
#         continuous_cols (list): Names of continuous numeric columns.
#         discrete_cols (list): Names of discrete numeric columns.
#         categorical_cols (list): Names of categorical columns.
#         iters (int): Maximum number of iterations.
#         scale (bool): Whether to apply MinMaxScaler before imputation.
    
#     Returns:
#         pd.DataFrame: Imputed DataFrame.
#     """
#     df_imputed = df.copy()
    
#     if scale:
#         scaler = MinMaxScaler()
#         df_imputed[continuous_cols] = scaler.fit_transform(df_imputed[continuous_cols])
    
#     imputer = MissForest(max_iter=iters, categorical=categorical_cols)
#     df_imputed_result = imputer.fit_transform(df_imputed)
    
#     if discrete_cols:
#         df_imputed_result[discrete_cols] = df_imputed_result[discrete_cols].round().astype(int)
    
#     if categorical_cols:
#         df_imputed_result[categorical_cols] = df_imputed_result[categorical_cols].round().astype(int)
    
#     if scale:
#         # Reverse scaling for continuous columns
#         df_imputed_result[continuous_cols] = scaler.inverse_transform(df_imputed_result[continuous_cols])
    
#     return df_imputed_result

# # mf_imputed = do_mf(df2, continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols, iters=5, scale=False)

def do_mf(df, continuous_cols=None, discrete_cols=None, categorical_cols=None, iters=5, scale=False):
    """
    Impute missing values using MissForest.
    
    Parameters:
        df (pd.DataFrame): DataFrame with missing values.
        continuous_cols (list): Names of continuous numeric columns.
        discrete_cols (list): Names of discrete numeric columns.
        categorical_cols (list): Names of categorical columns.
        iters (int): Maximum number of iterations.
        scale (bool): Whether to apply MinMaxScaler before imputation.
    
    Returns:
        pd.DataFrame: Imputed DataFrame.
    """
    df_imputed = df.copy()
    encoders = {}

    # Encode categorical columns
    if categorical_cols:
        for col in categorical_cols:
            le = LabelEncoder()
            not_null = df_imputed[col].dropna()
            if not not_null.empty:
                le.fit(not_null)
                df_imputed[col] = df_imputed[col].map(lambda x: le.transform([x])[0] if pd.notnull(x) else np.nan)
                encoders[col] = le
            else:
                encoders[col] = None

    # Scale continuous columns
    if scale:
        scaler = MinMaxScaler()
        df_imputed[continuous_cols] = scaler.fit_transform(df_imputed[continuous_cols])

    # Impute with MissForest
    imputer = MissForest(max_iter=iters)
    df_imputed_result = pd.DataFrame(imputer.fit_transform(df_imputed), columns=df.columns)

    # Post-process discrete columns
    if discrete_cols:
        df_imputed_result[discrete_cols] = df_imputed_result[discrete_cols].round().astype(int)

    # Post-process categorical columns
    if categorical_cols:
        for col in categorical_cols:
            df_imputed_result[col] = df_imputed_result[col].round().astype(int)
            if encoders[col] is not None:
                inv_map = dict(enumerate(encoders[col].classes_))
                df_imputed_result[col] = df_imputed_result[col].map(inv_map)

    # Reverse scaling
    if scale:
        df_imputed_result[continuous_cols] = scaler.inverse_transform(df_imputed_result[continuous_cols])

    return df_imputed_result

In [7]:
def do_midas(df, continuous_cols=None, discrete_cols=None, categorical_cols=None,
              layer:list=[256,256], vae:bool=True, samples:int=10, random_seed:float=96 ):
    """
    Imputes missing values using the MIDAS model.
    
    Parameters:
      df (pd.DataFrame): Input dataframe.
      continuous_cols (list): List of continuous column names.
      discrete_cols (list): List of discrete (numeric but non-continuous) column names.
      categorical_cols (list): List of categorical column names.
      
    Returns:
      imps (list): A list of imputed dataframes.
    """
    # 1. Convert categorical columns and get categorical metadata.
    md_cat_data, md_cats = md.cat_conv(df[categorical_cols])
    
    # 2. Define the numeric columns.
    num_cols = discrete_cols + continuous_cols  # these are the numeric columns

    # 3. Drop original categorical columns and combine with the converted categorical data.
    df_copy = df.drop(columns=categorical_cols,axis=1)
    constructor_list = [df_copy, md_cat_data]
    data_in = pd.concat(constructor_list, axis=1)
    
    # 4. Scale non-categorical columns BEFORE imputation.
    scaler = MinMaxScaler()
    data_in[num_cols] = scaler.fit_transform(data_in[num_cols])
    
    # 5. Build and train the imputer using the scaled data.
    imputer = md.Midas(layer_structure=layer, vae_layer=vae, seed=random_seed, input_drop=0.75)
    # Use md_cats as softmax columns for categorical outputs.
    imputer.build_model(data_in, softmax_columns=md_cats)
    imputer.train_model(training_epochs=20)
    
    # 6. Generate imputations.
    imps = imputer.generate_samples(m=samples).output_list
    
    # 7. Post-process each imputed DataFrame.
    for idx, imp_df in enumerate(imps):
        # Reverse transform the numeric columns.
        imp_df[num_cols] = scaler.inverse_transform(imp_df[num_cols])
        
        # Process categorical columns.
        # For each softmax group in md_cats, choose the column with the highest probability.
        tmp_cat = []
        for group in md_cats:
            # idxmax returns the column name with maximum value per row for this group.
            tmp_cat.append(imp_df[group].idxmax(axis=1))
        # Assume the order of md_cats corresponds to categorical_cols.
        cat_df = pd.DataFrame({categorical_cols[j]: tmp_cat[j] for j in range(len(categorical_cols))})
        
        # Drop the softmax columns.
        flat_cats = [col for group in md_cats for col in group]
        tmp_cat = [imp_df[x].idxmax(axis=1) for x in md_cats]
        cat_df = pd.DataFrame({categorical_cols[j]: tmp_cat[j] for j in range(len(categorical_cols))})
        imp_df = pd.concat([imp_df, cat_df], axis=1).drop(columns=flat_cats, axis=1)
        
        # Handle discrete data by rounding the values.
        imp_df[discrete_cols] = imp_df[discrete_cols].round()
        
        # Replace the processed DataFrame in the list.
        imps[idx] = imp_df

        ### make method info
        method_info = f'MIDAS, params: samples={samples} ,layer={layer}, vae={vae}'
    return imps, method_info

In [8]:
# midas_imputed = do_midas(df2, continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

In [9]:
import optuna

In [10]:
# ------------------------------------------------------------------------------
# Missingness Creation Function
# ------------------------------------------------------------------------------
def create_missings(df: pd.DataFrame, missingness: float, random_seed: float = 96):
    """
    Create random missingness in a DataFrame.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame.
        missingness (float): Percentage of missing values to introduce.
        random_seed (float): Seed for reproducibility.
    
    Returns:
        tuple: (original DataFrame, DataFrame with missing values, mask DataFrame)
    """
    np.random.seed(random_seed)
    mask = np.random.rand(*df.shape) < (missingness / 100)
    mask_df = pd.DataFrame(mask, columns=df.columns)
    df_missing = df.mask(mask)
    return df, df_missing, mask_df

# ------------------------------------------------------------------------------
# Improved Evaluation Function
# ------------------------------------------------------------------------------
def select_best_imputations(imputed_dfs, original_df, mask_df, continuous_cols, discrete_cols, categorical_cols, method_info=None, method_names=None):
    """
    Evaluate one or several imputed DataFrames and determine an aggregated error.

    For each column with simulated missing data (per mask_df), numeric columns
    are scored using Mean Absolute Error (MAE) while categorical columns are scored
    by misclassification rate (1 - accuracy). An overall aggregated error is returned,
    which is the mean error over all evaluated columns.

    Parameters:
      imputed_dfs (list of pd.DataFrame): A list of imputed DataFrames.
      original_df (pd.DataFrame): The original (complete) DataFrame.
      mask_df (pd.DataFrame): Boolean DataFrame with True at positions where values are masked.
      continuous_cols (list): List of continuous numeric column names.
      discrete_cols (list): List of discrete numeric column names.
      categorical_cols (list): List of categorical column names.
      method_info (str, optional): Text description of the method and its hyperparameters.
      method_names (list, optional): List of names for each imputation method candidate.

    Returns:
      best_imputed_df (pd.DataFrame): A DataFrame where, for each column with missing values,
                                     the candidate with the lowest error is chosen.
      summary_table (pd.DataFrame): A summary table with metrics for each column.
      aggregated_error (float): The average error across columns (lower is better).
    """
    n_methods = len(imputed_dfs)
    
    if method_info is not None:
        parts = method_info.split(',')
        base_name = parts[0].strip()
        params = ','.join(parts[1:]).strip() if len(parts) > 1 else ""
        method_names = [f"{base_name} ({params})"] * n_methods
    elif method_names is None:
        method_names = [f"Method {i+1}" for i in range(n_methods)]
    
    summary_list = []
    best_method_per_col = {}

    for col in original_df.columns:
        if col in continuous_cols:
            col_type = "Continuous"
        elif col in discrete_cols:
            col_type = "Discrete"
        elif col in categorical_cols:
            col_type = "Categorical"
        else:
            col_type = str(original_df[col].dtype)

        if mask_df[col].sum() == 0:
            best_method_per_col[col] = None
            summary_list.append({
                'Column': col,
                'Data Type': col_type,
                'Best Method': None,
                'Metric': np.nan,  
            })
            continue
        
        print([type(df_imp) for df_imp in imputed_dfs])

        col_errors = []
        for df_imp in imputed_dfs:
            if col_type in ["Continuous", "Discrete"]:
                try:
                    imp_vals = pd.to_numeric(df_imp[col][mask_df[col]], errors='coerce')
                    orig_vals = pd.to_numeric(original_df[col][mask_df[col]], errors='coerce')
                except Exception as e:
                    imp_vals = df_imp[col][mask_df[col]]
                    orig_vals = original_df[col][mask_df[col]]
                errors = np.abs(imp_vals - orig_vals)
                mae = errors.mean()
                col_errors.append(mae)
            else:
                correct = (df_imp[col][mask_df[col]] == original_df[col][mask_df[col]])
                accuracy = correct.mean()
                col_errors.append(1 - accuracy)

        if col_type in ["Continuous", "Discrete"]:
            best_idx = int(np.nanargmin(col_errors))
        else:
            best_idx = int(np.nanargmin(col_errors))
        best_method = method_names[best_idx]
        best_metric = col_errors[best_idx]

        best_method_per_col[col] = best_idx
        summary_list.append({
            'Column': col,
            'Data Type': col_type,
            'Best Method': best_method,
            'Metric': best_metric,
        })

    summary_table = pd.DataFrame(summary_list)
    
    best_imputed_df = original_df.copy()
    for col in original_df.columns:
        if mask_df[col].sum() > 0 and best_method_per_col[col] is not None:
            method_idx = best_method_per_col[col]
            best_imputed_df.loc[mask_df[col], col] = imputed_dfs[method_idx].loc[mask_df[col], col]

    errors = summary_table['Metric'].dropna().values
    aggregated_error = np.mean(errors) if len(errors) > 0 else np.nan

    return best_imputed_df, summary_table, aggregated_error

# ------------------------------------------------------------------------------
# Hyperparameter Optimization Function using Optuna
# ------------------------------------------------------------------------------
def optimize_imputation_hyperparams(imputation_func, 
                                    original_df, 
                                    missing_percent, 
                                    continuous_cols, 
                                    discrete_cols, 
                                    categorical_cols, 
                                    timelimit=600,    # in seconds
                                    min_trials=20,
                                    random_seed=96):
    """
    Optimize hyperparameters for an imputation function using Optuna.

    This function takes the complete (original) DataFrame and a missing percentage.
    It uses `create_missings` to generate a DataFrame with simulated missing values and
    a corresponding mask. Then it runs the candidate imputation method on the incomplete
    DataFrame, evaluates the imputed results against the original DataFrame using the mask,
    and guides the hyperparameter search based on an aggregated error (lower is better).

    Parameters:
        imputation_func (callable): An imputation function (do_knn, do_mice, do_mf, or do_midas).
        original_df (pd.DataFrame): The complete ground-truth DataFrame.
        missing_percent (float): Percentage of missing values to simulate.
        continuous_cols (list): List of continuous numeric column names.
        discrete_cols (list): List of discrete numeric column names.
        categorical_cols (list): List of categorical column names.
        timelimit (int): Maximum time in seconds to run the optimization.
        min_trials (int): Minimum number of Optuna trials to run.
        random_seed (int): Seed for generating missingness (passed to create_missings).

    Returns:
        best_trial: The best trial object from the study.
        best_value: The best (lowest) aggregated objective value.
    """
    # Generate missing values and mask using the provided function.
    _, df_missing, mask_df = create_missings(original_df, missingness=missing_percent, random_seed=random_seed)

    def objective(trial):
        func_name = imputation_func.__name__
        params = {}

        if func_name == "do_knn":
            params['n_neighbors'] = trial.suggest_int("n_neighbors", 3, 15)
            params['scale'] = trial.suggest_categorical("scale", [True, False])
            # Run imputation on df_missing, not the original complete data.
            imputed_df = imputation_func(df_missing, 
                                         continuous_cols=continuous_cols, 
                                         discrete_cols=discrete_cols, 
                                         categorical_cols=categorical_cols, 
                                         **params)
            imputed_dfs = [imputed_df]
            method_info = f"KNN, n_neighbors={params['n_neighbors']}, scale={params['scale']}"
        elif func_name == "do_mice":
            params['iters'] = trial.suggest_int("iters", 5, 20)
            params['strat'] = trial.suggest_categorical("strat", ['normal', 'shap', 'fast'])
            params['scale'] = trial.suggest_categorical("scale", [True, False])
            imputed_df = imputation_func(df_missing,
                                         continuous_cols=continuous_cols, 
                                         discrete_cols=discrete_cols, 
                                         categorical_cols=categorical_cols,
                                         **params)
            imputed_dfs = [imputed_df]
            method_info = f"MICE, iters={params['iters']}, strat={params['strat']}, scale={params['scale']}"
        elif func_name == "do_mf":
            params['iters'] = trial.suggest_int("iters", 3, 15)
            params['scale'] = trial.suggest_categorical("scale", [True, False])
            imputed_df = imputation_func(df_missing,
                                         continuous_cols=continuous_cols, 
                                         discrete_cols=discrete_cols, 
                                         categorical_cols=categorical_cols,
                                         **params)
            imputed_dfs = [imputed_df]
            method_info = f"MissForest, iters={params['iters']}, scale={params['scale']}"
        elif func_name == "do_midas":
            params['layer'] = trial.suggest_categorical("layer", [[256,256], [128,128], [512,256]])
            params['vae'] = trial.suggest_categorical("vae", [True, False])
            params['samples'] = trial.suggest_int("samples", 5, 20)
            imputed_dfs, method_info = imputation_func(df_missing,
                                                       continuous_cols=continuous_cols, 
                                                       discrete_cols=discrete_cols, 
                                                       categorical_cols=categorical_cols,
                                                       **params)
            imputed_dfs = [imputed_dfs[0]]
        else:
            raise ValueError(f"Unsupported imputation function: {func_name}")

        # Evaluate the imputed result by comparing against the original complete DataFrame.
        _, summary_table, aggregated_error = select_best_imputations(
            imputed_dfs, original_df, mask_df, continuous_cols, discrete_cols, categorical_cols,
            method_info=method_info
        )

        if np.isnan(aggregated_error):
            aggregated_error = 1e6

        return aggregated_error

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, timeout=timelimit, n_trials=min_trials)

    best_trial = study.best_trial
    best_value = best_trial.value

    print("Optimization completed!")
    print("Best Trial Hyperparameters:")
    for key, value in best_trial.params.items():
        print(f"  {key}: {value}")
    print(f"Best Objective Value (aggregated error): {best_value}")

    return best_trial, best_value

In [11]:
# best_method, best_val = optimize_imputation_hyperparams(imputation_func=do_knn,original_df=df,missing_percent=20,
#                                                         continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

In [12]:
# best_method, best_val = optimize_imputation_hyperparams(imputation_func=do_mf,original_df=df,missing_percent=20,
#                                                         continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

In [13]:
# best_method, best_val = optimize_imputation_hyperparams(imputation_func=do_midas,original_df=df,missing_percent=20,timelimit=300,
#                                                         continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

In [14]:
# best_method, best_val = optimize_imputation_hyperparams(imputation_func=do_mice,original_df=df,missing_percent=20,timelimit=300,
#                                                         continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

In [15]:
new_df = pd.read_excel(r"C:\Users\Matin\Downloads\Data for Dr.Matin.xlsx", 's1')
new_df.drop(['n', 'ID','Gen.code'],axis=1,inplace=True)
new_df = new_df[:300]
# continuous_cols, discrete_cols, categorical_cols, df2, encoders = prep(new_df)


In [16]:
# best_method, best_val = optimize_imputation_hyperparams(imputation_func=do_knn,original_df=df2,missing_percent=30,
#                                                         continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

In [17]:
# best_method, best_val = optimize_imputation_hyperparams(imputation_func=do_mf,original_df=new_df,missing_percent=30, timelimit=300,
#                                                         continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

In [18]:
def simulate_missingness(df, show_missingness=False, random_state=42):
    """
    Simulate missingness by dropping rows with missing values and reintroducing them.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame.
        show_missingness (bool): If True, prints missingness percentages.
    
    Returns:
        tuple: Original DataFrame without missing values, simulated DataFrame with missingness, and a mask.
    """
    missing_original = df.isna().mean()
    df2 = df.dropna().reset_index(drop=True)
    df3 = df2.copy()
    missing_mask = pd.DataFrame(False, index=df3.index, columns=df3.columns)

    for col in df3.columns:
        n_missing = int(round(missing_original[col] * len(df3)))
        if n_missing > 0:
            missing_indices = df3.sample(n=n_missing, random_state=random_state).index
            df3.loc[missing_indices, col] = np.nan
            missing_mask.loc[missing_indices, col] = True

    if show_missingness:
        missing_df3 = df3.isna().mean()
        print("Missingness Comparison:")
        for col in df.columns:
            print(f"Column '{col}': Original: {missing_original[col]*100:.2f}% \t -> \t df3: {missing_df3[col]*100:.2f}%")

    return df2, df3, missing_mask

def create_missings(df: pd.DataFrame, missingness: float, random_seed: float = 96):
    """
    Create random missingness in a DataFrame.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame.
        missingness (float): Percentage of missing values to introduce.
        random_seed (float): Seed for reproducibility.
    
    Returns:
        tuple: (original DataFrame, DataFrame with missing values, mask DataFrame)
    """
    np.random.seed(random_seed)
    mask = np.random.rand(*df.shape) < (missingness / 100)
    mask_df = pd.DataFrame(mask, columns=df.columns)
    df_missing = df.mask(mask)
    return df, df_missing, mask_df


def select_best_imputations(imputed_dfs, original_df, mask_df, continuous_cols, discrete_cols, categorical_cols, method_info=None, method_names=None):
    """
    Evaluate one or several imputed DataFrames and determine an aggregated error.

    For each column with simulated missing data (per mask_df), numeric columns
    are scored using Mean Absolute Error (MAE) while categorical columns are scored
    by misclassification rate (1 - accuracy). An overall aggregated error is returned,
    which is the mean error over all evaluated columns.

    Parameters:
      imputed_dfs (list of pd.DataFrame): A list of imputed DataFrames.
      original_df (pd.DataFrame): The original (complete) DataFrame.
      mask_df (pd.DataFrame): Boolean DataFrame with True at positions where values are masked.
      continuous_cols (list): List of continuous numeric column names.
      discrete_cols (list): List of discrete numeric column names.
      categorical_cols (list): List of categorical column names.
      method_info (str, optional): Text description of the method and its hyperparameters.
      method_names (list, optional): List of names for each imputation method candidate.

    Returns:
      best_imputed_df (pd.DataFrame): A DataFrame where, for each column with missing values,
                                     the candidate with the lowest error is chosen.
      summary_table (pd.DataFrame): A summary table with metrics for each column.
      aggregated_error (float): The average error across columns (lower is better).
    """
    n_methods = len(imputed_dfs)
    
    if method_info is not None:
        parts = method_info.split(',')
        base_name = parts[0].strip()
        params = ','.join(parts[1:]).strip() if len(parts) > 1 else ""
        method_names = [f"{base_name} ({params})"] * n_methods
    elif method_names is None:
        method_names = [f"Method {i+1}" for i in range(n_methods)]
    
    summary_list = []
    best_method_per_col = {}

    for col in original_df.columns:
        if col in continuous_cols:
            col_type = "Continuous"
        elif col in discrete_cols:
            col_type = "Discrete"
        elif col in categorical_cols:
            col_type = "Categorical"
        else:
            col_type = str(original_df[col].dtype)

        if mask_df[col].sum() == 0:
            best_method_per_col[col] = None
            summary_list.append({
                'Column': col,
                'Data Type': col_type,
                'Best Method': None,
                'Metric': np.nan,  
            })
            continue

        col_errors = []
        for df_imp in imputed_dfs:
            if col_type in ["Continuous", "Discrete"]:
                try:
                    imp_vals = pd.to_numeric(df_imp[col][mask_df[col]], errors='coerce')
                    orig_vals = pd.to_numeric(original_df[col][mask_df[col]], errors='coerce')
                except Exception as e:
                    imp_vals = df_imp[col][mask_df[col]]
                    orig_vals = original_df[col][mask_df[col]]
                errors = np.abs(imp_vals - orig_vals)
                mae = errors.mean()
                col_errors.append(mae)
            else:
                correct = (df_imp[col][mask_df[col]] == original_df[col][mask_df[col]])
                accuracy = correct.mean()
                col_errors.append(1 - accuracy)

        if col_type in ["Continuous", "Discrete"]:
            best_idx = int(np.nanargmin(col_errors))
        else:
            best_idx = int(np.nanargmin(col_errors))
        best_method = method_names[best_idx]
        best_metric = col_errors[best_idx]

        best_method_per_col[col] = best_idx
        summary_list.append({
            'Column': col,
            'Data Type': col_type,
            'Best Method': best_method,
            'Metric': best_metric,
        })

    summary_table = pd.DataFrame(summary_list)
    
    best_imputed_df = original_df.copy()
    for cat in categorical_cols:
        if cat in best_imputed_df:
            best_imputed_df[cat] = best_imputed_df[cat].astype(object)

    for col in original_df.columns:
        if mask_df[col].sum() > 0 and best_method_per_col[col] is not None:
            method_idx = best_method_per_col[col]
            best_imputed_df.loc[mask_df[col], col] = \
                imputed_dfs[method_idx].loc[mask_df[col], col]

    errors = summary_table['Metric'].dropna().values
    aggregated_error = np.mean(errors) if len(errors) > 0 else np.nan

    return best_imputed_df, summary_table, aggregated_error


def optimize_imputation_hyperparams(imputation_func, 
                                    original_df, 
                                    df_missing, 
                                    mask_df, 
                                    continuous_cols, 
                                    discrete_cols, 
                                    categorical_cols, 
                                    timelimit=600,    # in seconds
                                    min_trials=20,
                                    random_seed=96):
    """
    Optimize hyperparameters for an imputation function using Optuna.

    This function takes the complete (original) DataFrame and a missing percentage.
    It uses `create_missings` to generate a DataFrame with simulated missing values and
    a corresponding mask. Then it runs the candidate imputation method on the incomplete
    DataFrame, evaluates the imputed results against the original DataFrame using the mask,
    and guides the hyperparameter search based on an aggregated error (lower is better).

    Parameters:
        imputation_func (callable): An imputation function (do_knn, do_mice, do_mf, or do_midas).
        original_df (pd.DataFrame): The complete ground-truth DataFrame.
        missing_percent (float): Percentage of missing values to simulate.
        continuous_cols (list): List of continuous numeric column names.
        discrete_cols (list): List of discrete numeric column names.
        categorical_cols (list): List of categorical column names.
        timelimit (int): Maximum time in seconds to run the optimization.
        min_trials (int): Minimum number of Optuna trials to run.
        random_seed (int): Seed for generating missingness (passed to create_missings).

    Returns:
        best_trial: The best trial object from the study.
        best_value: The best (lowest) aggregated objective value.
    """
    # Generate missing values and mask using the provided function.
    # _, df_missing, mask_df = create_missings(original_df, missingness=missing_percent, random_seed=random_seed)

    def objective(trial):
        func_name = imputation_func.__name__
        params = {}

        if func_name == "do_knn":
            params['n_neighbors'] = trial.suggest_int("n_neighbors", 3, 15)
            params['scale'] = trial.suggest_categorical("scale", [True, False])
            # Run imputation on df_missing, not the original complete data.
            imputed_df = imputation_func(df_missing, 
                                         continuous_cols=continuous_cols, 
                                         discrete_cols=discrete_cols, 
                                         categorical_cols=categorical_cols, 
                                         **params)
            imputed_dfs = [imputed_df]
            method_info = f"KNN, n_neighbors={params['n_neighbors']}, scale={params['scale']}"
        elif func_name == "do_mice":
            params['iters'] = trial.suggest_int("iters", 5, 20)
            params['strat'] = trial.suggest_categorical("strat", ['normal', 'shap', 'fast'])
            params['scale'] = trial.suggest_categorical("scale", [True, False])
            imputed_df = imputation_func(df_missing,
                                         continuous_cols=continuous_cols, 
                                         discrete_cols=discrete_cols, 
                                         categorical_cols=categorical_cols,
                                         **params)
            imputed_dfs = [imputed_df]
            method_info = f"MICE, iters={params['iters']}, strat={params['strat']}, scale={params['scale']}"
        elif func_name == "do_mf":
            params['iters'] = trial.suggest_int("iters", 3, 15)
            params['scale'] = trial.suggest_categorical("scale", [True, False])
            imputed_df = imputation_func(df_missing,
                                         continuous_cols=continuous_cols, 
                                         discrete_cols=discrete_cols, 
                                         categorical_cols=categorical_cols,
                                         **params)
            imputed_dfs = [imputed_df]
            method_info = f"MissForest, iters={params['iters']}, scale={params['scale']}"
        elif func_name == "do_midas":
            params['layer'] = trial.suggest_categorical("layer", [[256,256], [128,128], [512,256]])
            params['vae'] = trial.suggest_categorical("vae", [True, False])
            params['samples'] = trial.suggest_int("samples", 5, 20)
            imputed_dfs, method_info = imputation_func(df_missing,
                                                       continuous_cols=continuous_cols, 
                                                       discrete_cols=discrete_cols, 
                                                       categorical_cols=categorical_cols,
                                                       **params)
            imputed_dfs = [imputed_dfs[0]]
        else:
            raise ValueError(f"Unsupported imputation function: {func_name}")

        # Evaluate the imputed result by comparing against the original complete DataFrame.
        _, _, aggregated_error = select_best_imputations(
            imputed_dfs, original_df, mask_df, continuous_cols, discrete_cols, categorical_cols,
            method_info=method_info
        )

        if np.isnan(aggregated_error):
            aggregated_error = 1e6

        return aggregated_error

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, timeout=timelimit, n_trials=min_trials)

    best_trial = study.best_trial
    best_value = best_trial.value

    print("Optimization completed!")
    print("Best Trial Hyperparameters:")
    for key, value in best_trial.params.items():
        print(f"  {key}: {value}")
    print(f"Best Objective Value (aggregated error): {best_value}")

    return best_trial, best_value

In [19]:
def do_midas(df,
             continuous_cols=None,
             discrete_cols=None,
             categorical_cols=None,
             layer: list = [256, 256],
             vae: bool = True,
             samples: int = 10,
             random_seed: float = 96):
    """
    Imputes missing values using the MIDAS model.

    Parameters:
      df (pd.DataFrame): Input dataframe with NaNs in both numeric & categorical.
      continuous_cols (list): List of continuous column names.
      discrete_cols (list): List of discrete (numeric but not continuous) column names.
      categorical_cols (list): List of categorical column names.

    Returns:
      imps (list): A list of imputed dataframes, with original dtypes restored.
      method_info (str): Summary of MIDAS params used.
    """
    # 1. One‑hot encode the categoricals
    md_cat_data, md_cats = md.cat_conv(df[categorical_cols])

    # 2. Build the “wide” DF: drop raw cats, append one‑hots
    df_num = df.drop(columns=categorical_cols)
    data_in = pd.concat([df_num, md_cat_data], axis=1)

    # 3. Record & re‑insert the NaN locations so MIDAS sees them as missing
    na_mask = data_in.isnull()
    data_in[na_mask] = np.nan

    # 4. Scale only the numeric columns in place
    num_cols = discrete_cols + continuous_cols
    scaler = MinMaxScaler()
    data_in[num_cols] = scaler.fit_transform(data_in[num_cols])

    # 5. Build & train the MIDAS model
    imputer = md.Midas(
        layer_structure=layer,
        vae_layer=vae,
        seed=random_seed,
        input_drop=0.75
    )
    imputer.build_model(data_in, softmax_columns=md_cats)
    imputer.train_model(training_epochs=20)

    # 6. Generate multiple imputations
    raw_imps = imputer.generate_samples(m=samples).output_list

    # 7. Decode each imputed DF back to original structure
    flat_cats = [c for grp in md_cats for c in grp]
    imps = []

    for imp_df in raw_imps:
        # 7a. inverse‑scale numeric cols
        imp_df[num_cols] = scaler.inverse_transform(imp_df[num_cols])

        # 7b. decode one‑hots (before dropping them!)
        decoded = {}
        for i, grp in enumerate(md_cats):
            # just in case, only keep those actually present
            present = [c for c in grp if c in imp_df.columns]
            # idxmax → gives the dummy column name with highest prob
            decoded[categorical_cols[i]] = imp_df[present].idxmax(axis=1)

        cat_df = pd.DataFrame(decoded, index=imp_df.index)

        # 7c. now drop the dummy cols
        base = imp_df.drop(columns=flat_cats, errors='ignore')

        # 7d. concat in your decoded cat columns
        merged = pd.concat([base, cat_df], axis=1)

        # 7e. round discrete cols
        merged[discrete_cols] = merged[discrete_cols].round().astype(int)

        imps.append(merged)

    method_info = f"MIDAS, params: samples={samples}, layer={layer}, vae={vae}"
    return imps, method_info


def run_full_pipeline(df: pd.DataFrame, 
                      simulate:bool=False,               # True for simulated missingness, False for random missingness
                      missingness_value: float = 10.0,   # used only for random missingness (percent)
                      show_missingness: bool = False,
                      timelimit: int = 600, 
                      min_trials: int = 20, 
                      random_seed: int = 96):
    """
    Run the full pipeline to find the best hyperparameters for each imputation method.

    The pipeline performs these steps:
    
      1. Preprocesses the DataFrame using `prep`, which cleans the data,
         encodes categorical variables, and splits features into continuous,
         discrete, and categorical lists.
      2. Introduces missingness using either simulated missingness (reintroducing missingness 
         based on the original NaN proportions) or random missingness (dropping values randomly
         given a specified missing percentage).
      3. Runs hyperparameter optimization (via `optimize_imputation_hyperparams`) for each candidate 
         imputation method (e.g., do_knn, do_mice, do_mf, do_midas).
         
    Parameters:
        df (pd.DataFrame): The input DataFrame.
        missing_type (str): "simulate" to simulate missingness using original missing proportions,
                            "random" to drop values randomly.
        missingness_value (float): Percentage of missingness (only used if missing_type == "random").
        show_missingness (bool): If True, prints missingness comparison when using simulate missingness.
        timelimit (int): Time limit (in seconds) for each hyperparameter optimization study.
        min_trials (int): Minimum number of trials for each study.
        random_seed (int): Random seed for reproducibility.

    Returns:
        dict: A dictionary where the keys are method names (strings) and the values are the best 
              hyperparameter dictionaries (from the best Optuna trial) for that method.
    """

    # # Step 1: Preprocess Data
    # Note: For simulation, the missing proportions are taken from the original df.
    if simulate: 
        # simulate_missingness returns: (complete_df, df_with_missing, missing_mask)
        df_complete, df_missing, mask_df = simulate_missingness(df, 
                                                                      show_missingness=show_missingness,
                                                                      random_state=random_seed)
    else:   
        df_complete, df_missing, mask_df = create_missings(df, 
                                                                 missingness=missingness_value, 
                                                                 random_seed=random_seed)
        
    # Step 2: Preprocess Data, convert categorical cols to encoded values and find the data types.
    continuous_cols, discrete_cols, categorical_cols = prep(df)
  
    candidate_methods = {
        "KNN": do_knn,
        "MICE": do_mice,
        "MissForest": do_mf,
        "MIDAS": do_midas
    }
    

    best_hyperparams = {}
    
    # Optimize hyperparameters for each imputation method candidate.
    for method_name, imputation_func in candidate_methods.items():
        print(f"\nOptimizing hyperparameters for {method_name}...")
        try:
            best_trial, best_value = optimize_imputation_hyperparams(
                imputation_func=imputation_func,
                original_df=df_complete,
                df_missing=df_missing,
                mask_df=mask_df,
                continuous_cols=continuous_cols,
                discrete_cols=discrete_cols,
                categorical_cols=categorical_cols,
                timelimit=timelimit,
                min_trials=min_trials,
                random_seed=random_seed
            )
            best_hyperparams[method_name] = best_trial.params
            print(f'Best hyperparameters for {method_name}: {best_hyperparams[method_name]} with best agg error of {best_value}')
        except Exception as e:
            print(f"An error occurred while optimizing {method_name}: {e}")
            best_hyperparams[method_name] = None

        # Optimize hyperparameters for each imputation method candidate.
    
    for key, val in best_hyperparams.items():
        if key == 'KNN':
            df_knn = do_knn(df_missing, continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols, n_neighbors=val['n_neighbors'], scale=val['scale'])

        elif key == 'MICE':
            df_mice = do_mice(df_missing, continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols, iters=val['iters'], strat=val['strat'], scale=val['scale'])

        elif key == 'MissForest':
            df_mf = do_mf(df_missing, continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols, iters=val['iters'], scale=val['scale']) 

        elif key == 'MIDAS':
            df_midas, _ = do_midas(df_missing, continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols, layer=val['layer'], vae=val['vae'], samples=val['samples'])

    # Create a list of imputed DataFrames.  
    imputed_dfs = [df_knn, df_mice, df_mf, df_midas] 
    # decoded_imputed_dfs = []
    # for i in imputed_dfs:
    #     decoded_df = reverse_encoding(i, encoders)
    #     decoded_imputed_dfs.append(decoded_df)
          
    # Create a list of method names.    
    method_names = ['KNN', 'MICE', 'MissForest', 'MIDAS']
    
    best_method_per_col = {}
    summary_list = []

    for col in df_missing.columns:
        # Determine the data type label.
        if col in continuous_cols:
            col_data_type = "Continuous"
        elif col in discrete_cols:
            col_data_type = "Discrete"
        elif col in categorical_cols:
            col_data_type = "Categorical"
        else:
            col_data_type = str(df_missing[col].dtype)

        # Only evaluate columns that had artificial missing values.
        if mask_df[col].sum() == 0:
            best_method_per_col[col] = None
            summary_list.append({
                'Column': col,
                'Data Type': col_data_type,
                'Best Method': None,
                'Metric': np.nan,
                'Error_SD': np.nan,
                'Max_Error': np.nan,
                'Min_Error': np.nan,
                'Within_10pct': np.nan
            })
            continue
        metrics = []
        error_sd = np.nan
        max_error = np.nan
        min_error = np.nan
        within_10pct = np.nan
        
        if col in continuous_cols or col in discrete_cols:
            # Ensure the original column is numeric.
            if not pd.api.types.is_numeric_dtype(df[col]):
                raise ValueError(f"Column '{col}' is marked as numeric but contains non-numeric values.")
            for df_imp in imputed_dfs:
                # Convert values to numeric.
                imp_vals = pd.to_numeric(df_imp[col][mask_df[col]], errors='coerce')
                orig_vals = pd.to_numeric(df_complete[col][mask_df[col]], errors='coerce')
                errors = np.abs(imp_vals - orig_vals)
                mae = errors.mean() if not errors.empty else np.nan
                metrics.append(mae)
            best_idx = np.nanargmin(metrics)
            best_metric = metrics[best_idx]
            
            # Compute additional metrics for the best method.
            best_imp_vals = pd.to_numeric(imputed_dfs[best_idx][col][mask_df[col]], errors='coerce')
            best_orig_vals = pd.to_numeric(df_complete[col][mask_df[col]], errors='coerce')
            errors = np.abs(best_imp_vals - best_orig_vals)
            error_sd = errors.std() if not errors.empty else np.nan
            max_error = errors.max() if not errors.empty else np.nan
            min_error = errors.min() if not errors.empty else np.nan
            
            # Compute fraction within ±10%.
            # For nonzero original values, check error <= 0.1 * |original|.
            # For zeros, require the imputed value to be exactly 0.
            condition = ((best_orig_vals != 0) & (errors <= 0.1 * best_orig_vals.abs())) | \
                        ((best_orig_vals == 0) & (errors == 0))
            within_10pct = condition.mean() if not condition.empty else np.nan
            
        elif col in categorical_cols or pd.api.types.is_string_dtype(df_complete[col]):
            # For categorical columns, compute accuracy.
            for df_imp in imputed_dfs:
                correct = (df_imp[col][mask_df[col]] == df_complete[col][mask_df[col]])
                acc = correct.mean() if not correct.empty else np.nan
                metrics.append(acc)
            best_idx = np.nanargmax(metrics)
            best_metric = metrics[best_idx]
            # Extra metrics are not applicable for categoricals.
            error_sd = np.nan
            max_error = np.nan
            min_error = np.nan
            within_10pct = np.nan
        else:
            best_idx = None
            best_metric = np.nan
        
        best_method = method_names[best_idx] if best_idx is not None else None
        best_method_per_col[col] = best_idx
        
        summary_list.append({
            'Column': col,
            'Data Type': col_data_type,
            'Best Method': best_method,
            'Metric': best_metric,
            'Error_SD': error_sd,
            'Max_Error': max_error,
            'Min_Error': min_error,
            'Within_10pct': within_10pct
        })
    
    summary_table = pd.DataFrame(summary_list)
    
    # Build best-imputed DataFrame by replacing masked entries with values from the best method.
    best_imputed_df = df_complete.copy()
    for col in df_complete.columns:
        if mask_df[col].sum() > 0 and best_method_per_col[col] is not None:
            method_idx = best_method_per_col[col]
            best_imputed_df.loc[mask_df[col], col] = imputed_dfs[method_idx].loc[mask_df[col], col]

    return best_imputed_df, summary_table




In [25]:
def run_full_pipeline2(df: pd.DataFrame, 
                      simulate: bool = False,               
                      missingness_value: float = 10.0,   
                      show_missingness: bool = False,
                      timelimit: int = 600, 
                      min_trials: int = 20, 
                      random_seed: int = 96):
    """
    Run the full pipeline to find the best hyperparameters for each imputation method.
    """
    # Step 1: Create missingness (simulated or random)
    if simulate: 
        df_complete, df_missing, mask_df = simulate_missingness(
            df, show_missingness=show_missingness, random_state=random_seed)
    else:   
        df_complete, df_missing, mask_df = create_missings(
            df, missingness=missingness_value, random_seed=random_seed)

    # Step 2: Preprocess for column types
    continuous_cols, discrete_cols, categorical_cols = prep(df)
  
    candidate_methods = {
        "KNN": do_knn,
        "MICE": do_mice,
        "MissForest": do_mf,
        "MIDAS": do_midas
    }

    best_hyperparams = {}
    imputed_dfs = []
    method_names = []

    # Step 3: Optimize hyperparameters per method
    for method_name, imputation_func in candidate_methods.items():
        print(f"\nOptimizing hyperparameters for {method_name}...")
        try:
            best_trial, best_value = optimize_imputation_hyperparams(
                imputation_func=imputation_func,
                original_df=df_complete,
                df_missing=df_missing,
                mask_df=mask_df,
                continuous_cols=continuous_cols,
                discrete_cols=discrete_cols,
                categorical_cols=categorical_cols,
                timelimit=timelimit,
                min_trials=min_trials,
                random_seed=random_seed
            )
            best_hyperparams[method_name] = best_trial.params
            print(f'Best hyperparameters for {method_name}: {best_trial.params} with best agg error of {best_value}')
        except Exception as e:
            print(f"An error occurred while optimizing {method_name}: {e}")
            best_hyperparams[method_name] = None

    # Step 4: Run best imputation for each method and collect valid results
    if best_hyperparams.get('KNN'):
        try:
            df_knn = do_knn(df_missing, continuous_cols=continuous_cols,
                            discrete_cols=discrete_cols,
                            categorical_cols=categorical_cols,
                            n_neighbors=best_hyperparams['KNN']['n_neighbors'],
                            scale=best_hyperparams['KNN']['scale'])
            imputed_dfs.append(df_knn)
            method_names.append('KNN')
        except Exception as e:
            print(f"Failed to impute with KNN: {e}")

    if best_hyperparams.get('MICE'):
        try:
            df_mice = do_mice(df_missing, continuous_cols=continuous_cols,
                              discrete_cols=discrete_cols,
                              categorical_cols=categorical_cols,
                              iters=best_hyperparams['MICE']['iters'],
                              strat=best_hyperparams['MICE']['strat'],
                              scale=best_hyperparams['MICE']['scale'])
            imputed_dfs.append(df_mice)
            method_names.append('MICE')
        except Exception as e:
            print(f"Failed to impute with MICE: {e}")

    if best_hyperparams.get('MissForest'):
        try:
            df_mf = do_mf(df_missing, continuous_cols=continuous_cols,
                          discrete_cols=discrete_cols,
                          categorical_cols=categorical_cols,
                          iters=best_hyperparams['MissForest']['iters'],
                          scale=best_hyperparams['MissForest']['scale'])
            imputed_dfs.append(df_mf)
            method_names.append('MissForest')
        except Exception as e:
            print(f"Failed to impute with MissForest: {e}")

    if best_hyperparams.get('MIDAS'):
        try:
            df_midas, _ = do_midas(df_missing, continuous_cols=continuous_cols,
                                   discrete_cols=discrete_cols,
                                   categorical_cols=categorical_cols,
                                   layer=best_hyperparams['MIDAS']['layer'],
                                   vae=best_hyperparams['MIDAS']['vae'],
                                   samples=best_hyperparams['MIDAS']['samples'])
            imputed_dfs.append(df_midas)
            method_names.append('MIDAS')
        except Exception as e:
            print(f"Failed to impute with MIDAS: {e}")

    best_method_per_col = {}
    summary_list = []

    # Step 5: Evaluate and select best method per column
    for col in df_missing.columns:
        if col in continuous_cols:
            col_data_type = "Continuous"
        elif col in discrete_cols:
            col_data_type = "Discrete"
        elif col in categorical_cols:
            col_data_type = "Categorical"
        else:
            col_data_type = str(df_missing[col].dtype)

        if mask_df[col].sum() == 0:
            summary_list.append({
                'Column': col, 'Data Type': col_data_type, 'Best Method': None,
                'Metric': np.nan, 'Error_SD': np.nan, 'Max_Error': np.nan,
                'Min_Error': np.nan, 'Within_10pct': np.nan
            })
            best_method_per_col[col] = None
            continue

        metrics = []
        error_sd = max_error = min_error = within_10pct = np.nan

        if col in continuous_cols or col in discrete_cols:
            for df_imp in imputed_dfs:
                imp_vals = pd.to_numeric(df_imp[col][mask_df[col]], errors='coerce')
                orig_vals = pd.to_numeric(df_complete[col][mask_df[col]], errors='coerce')
                errors = np.abs(imp_vals - orig_vals)
                mae = errors.mean() if not errors.empty else np.nan
                metrics.append(mae)
            best_idx = np.nanargmin(metrics)
            best_metric = metrics[best_idx]

            best_imp_vals = pd.to_numeric(imputed_dfs[best_idx][col][mask_df[col]], errors='coerce')
            best_orig_vals = pd.to_numeric(df_complete[col][mask_df[col]], errors='coerce')
            errors = np.abs(best_imp_vals - best_orig_vals)
            error_sd = errors.std() if not errors.empty else np.nan
            max_error = errors.max() if not errors.empty else np.nan
            min_error = errors.min() if not errors.empty else np.nan
            condition = ((best_orig_vals != 0) & (errors <= 0.1 * best_orig_vals.abs())) | \
                        ((best_orig_vals == 0) & (errors == 0))
            within_10pct = condition.mean() if not condition.empty else np.nan

        elif col in categorical_cols or pd.api.types.is_string_dtype(df_complete[col]):
            print([type(df_imp) for df_imp in imputed_dfs])

            for df_imp in imputed_dfs:
                correct = (df_imp[col][mask_df[col]] == df_complete[col][mask_df[col]])
                acc = correct.mean() if not correct.empty else np.nan
                metrics.append(acc)
            best_idx = np.nanargmax(metrics)
            best_metric = metrics[best_idx]

        else:
            best_idx = None
            best_metric = np.nan

        best_method = method_names[best_idx] if best_idx is not None else None
        best_method_per_col[col] = best_idx

        summary_list.append({
            'Column': col,
            'Data Type': col_data_type,
            'Best Method': best_method,
            'Metric': best_metric,
            'Error_SD': error_sd,
            'Max_Error': max_error,
            'Min_Error': min_error,
            'Within_10pct': within_10pct
        })

    summary_table = pd.DataFrame(summary_list)

    # Step 6: Final best-imputed DataFrame
    best_imputed_df = df_complete.copy()
    for col in df_complete.columns:
        if mask_df[col].sum() > 0 and best_method_per_col[col] is not None:
            method_idx = best_method_per_col[col]
            best_imputed_df.loc[mask_df[col], col] = imputed_dfs[method_idx].loc[mask_df[col], col]

    return best_imputed_df, summary_table

In [27]:
def run_full_pipeline3(df: pd.DataFrame, 
                      simulate: bool = False,
                      missingness_value: float = 10.0,
                      show_missingness: bool = False,
                      timelimit: int = 600,
                      min_trials: int = 20,
                      random_seed: int = 96):
    if simulate:
        df_complete, df_missing, mask_df = simulate_missingness(
            df, show_missingness=show_missingness, random_state=random_seed
        )
    else:
        df_complete, df_missing, mask_df = create_missings(
            df, missingness=missingness_value, random_seed=random_seed
        )

    continuous_cols, discrete_cols, categorical_cols = prep(df)

    candidate_methods = {
        "KNN": do_knn,
        "MICE": do_mice,
        "MissForest": do_mf,
        "MIDAS": do_midas
    }

    best_hyperparams = {}

    for method_name, imputation_func in candidate_methods.items():
        print(f"\nOptimizing hyperparameters for {method_name}...")
        try:
            best_trial, best_value = optimize_imputation_hyperparams(
                imputation_func=imputation_func,
                original_df=df_complete,
                df_missing=df_missing,
                mask_df=mask_df,
                continuous_cols=continuous_cols,
                discrete_cols=discrete_cols,
                categorical_cols=categorical_cols,
                timelimit=timelimit,
                min_trials=min_trials,
                random_seed=random_seed
            )
            best_hyperparams[method_name] = best_trial.params
            print(f'Best hyperparameters for {method_name}: {best_hyperparams[method_name]} with best agg error of {best_value}')
        except Exception as e:
            print(f"An error occurred while optimizing {method_name}: {e}")
            best_hyperparams[method_name] = None

    imputed_dfs = []
    method_names = []

    for method in ['KNN', 'MICE', 'MissForest', 'MIDAS']:
        val = best_hyperparams.get(method)
        if not val:
            continue
        try:
            if method == 'KNN':
                df_knn = do_knn(df_missing, continuous_cols=continuous_cols, 
                                discrete_cols=discrete_cols, categorical_cols=categorical_cols, 
                                n_neighbors=val['n_neighbors'], scale=val['scale'])
                imputed_dfs.append(df_knn)
                method_names.append('KNN')

            elif method == 'MICE':
                df_mice = do_mice(df_missing, continuous_cols=continuous_cols, 
                                  discrete_cols=discrete_cols, categorical_cols=categorical_cols, 
                                  iters=val['iters'], strat=val['strat'], scale=val['scale'])
                imputed_dfs.append(df_mice)
                method_names.append('MICE')

            elif method == 'MissForest':
                df_mf = do_mf(df_missing, continuous_cols=continuous_cols, 
                              discrete_cols=discrete_cols, categorical_cols=categorical_cols, 
                              iters=val['iters'], scale=val['scale'])
                imputed_dfs.append(df_mf)
                method_names.append('MissForest')

            elif method == 'MIDAS':
                df_midas_list, _ = do_midas(df_missing, continuous_cols=continuous_cols,
                                            discrete_cols=discrete_cols,
                                            categorical_cols=categorical_cols,
                                            layer=val['layer'], vae=val['vae'], 
                                            samples=val['samples'])
                imputed_dfs.extend(df_midas_list)
                method_names.extend([f'MIDAS_{i+1}' for i in range(len(df_midas_list))])

        except Exception as e:
            print(f"Failed to impute with {method}: {e}")

    best_method_per_col = {}
    summary_list = []

    for col in df_missing.columns:
        if col in continuous_cols:
            col_data_type = "Continuous"
        elif col in discrete_cols:
            col_data_type = "Discrete"
        elif col in categorical_cols:
            col_data_type = "Categorical"
        else:
            col_data_type = str(df_missing[col].dtype)

        if mask_df[col].sum() == 0:
            best_method_per_col[col] = None
            summary_list.append({
                'Column': col,
                'Data Type': col_data_type,
                'Best Method': None,
                'Metric': np.nan,
                'Error_SD': np.nan,
                'Max_Error': np.nan,
                'Min_Error': np.nan,
                'Within_10pct': np.nan
            })
            continue

        metrics = []
        error_sd = np.nan
        max_error = np.nan
        min_error = np.nan
        within_10pct = np.nan

        if col in continuous_cols or col in discrete_cols:
            for df_imp in imputed_dfs:
                imp_vals = pd.to_numeric(df_imp[col][mask_df[col]], errors='coerce')
                orig_vals = pd.to_numeric(df_complete[col][mask_df[col]], errors='coerce')
                errors = np.abs(imp_vals - orig_vals)
                mae = errors.mean() if not errors.empty else np.nan
                metrics.append(mae)
            best_idx = np.nanargmin(metrics)
            best_metric = metrics[best_idx]

            best_imp_vals = pd.to_numeric(imputed_dfs[best_idx][col][mask_df[col]], errors='coerce')
            best_orig_vals = pd.to_numeric(df_complete[col][mask_df[col]], errors='coerce')
            errors = np.abs(best_imp_vals - best_orig_vals)
            error_sd = errors.std() if not errors.empty else np.nan
            max_error = errors.max() if not errors.empty else np.nan
            min_error = errors.min() if not errors.empty else np.nan
            condition = ((best_orig_vals != 0) & (errors <= 0.1 * best_orig_vals.abs())) | \
                        ((best_orig_vals == 0) & (errors == 0))
            within_10pct = condition.mean() if not condition.empty else np.nan

        elif col in categorical_cols or pd.api.types.is_string_dtype(df_complete[col]):
            for df_imp in imputed_dfs:
                correct = (df_imp[col][mask_df[col]] == df_complete[col][mask_df[col]])
                acc = correct.mean() if not correct.empty else np.nan
                metrics.append(acc)
            best_idx = np.nanargmax(metrics)
            best_metric = metrics[best_idx]

        else:
            best_idx = None
            best_metric = np.nan

        best_method = method_names[best_idx] if best_idx is not None else None
        best_method_per_col[col] = best_idx

        summary_list.append({
            'Column': col,
            'Data Type': col_data_type,
            'Best Method': best_method,
            'Metric': best_metric,
            'Error_SD': error_sd,
            'Max_Error': max_error,
            'Min_Error': min_error,
            'Within_10pct': within_10pct
        })

    summary_table = pd.DataFrame(summary_list)

    best_imputed_df = df_complete.copy()
    for col in df_complete.columns:
        if mask_df[col].sum() > 0 and best_method_per_col[col] is not None:
            method_idx = best_method_per_col[col]
            best_imputed_df.loc[mask_df[col], col] = imputed_dfs[method_idx].loc[mask_df[col], col]

    return best_imputed_df, summary_table


In [21]:
# best_params = run_full_pipeline(new_df,timelimit=60,random_seed=96)

In [28]:
impsss, table = run_full_pipeline3(new_df,timelimit=60,random_seed=96)

[I 2025-04-17 03:59:43,460] A new study created in memory with name: no-name-b2804d32-bc31-45f3-8978-dcb86a13e862



Optimizing hyperparameters for KNN...


[I 2025-04-17 03:59:43,728] Trial 0 finished with value: 130465.4207477649 and parameters: {'n_neighbors': 13, 'scale': False}. Best is trial 0 with value: 130465.4207477649.
[I 2025-04-17 03:59:43,946] Trial 1 finished with value: 134917.0266492101 and parameters: {'n_neighbors': 4, 'scale': True}. Best is trial 0 with value: 130465.4207477649.
[I 2025-04-17 03:59:44,158] Trial 2 finished with value: 128910.987206718 and parameters: {'n_neighbors': 5, 'scale': True}. Best is trial 2 with value: 128910.987206718.
[I 2025-04-17 03:59:44,356] Trial 3 finished with value: 134917.0266492101 and parameters: {'n_neighbors': 4, 'scale': True}. Best is trial 2 with value: 128910.987206718.
[I 2025-04-17 03:59:44,553] Trial 4 finished with value: 126270.22596605406 and parameters: {'n_neighbors': 14, 'scale': True}. Best is trial 4 with value: 126270.22596605406.
[I 2025-04-17 03:59:44,718] Trial 5 finished with value: 155476.810668925 and parameters: {'n_neighbors': 3, 'scale': False}. Best is

Optimization completed!
Best Trial Hyperparameters:
  n_neighbors: 7
  scale: True
Best Objective Value (aggregated error): 125437.40255018426
Best hyperparameters for KNN: {'n_neighbors': 7, 'scale': True} with best agg error of 125437.40255018426

Optimizing hyperparameters for MICE...


  bachelor_preds = bachelor_preds.astype(_PRE_LINK_DATATYPE)
  candidate_preds = candidate_preds.astype(_PRE_LINK_DATATYPE)  # type: ignore
[W 2025-04-17 03:59:48,418] Trial 0 failed with parameters: {'iters': 12, 'strat': 'shap', 'scale': True} because of the following error: ValueError('data must be finite, check for nan or inf values').
Traceback (most recent call last):
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Matin\AppData\Local\Temp\ipykernel_36436\3942454051.py", line 216, in objective
    imputed_df = imputation_func(df_missing,
  File "C:\Users\Matin\AppData\Local\Temp\ipykernel_36436\3092344904.py", line 90, in do_mice
    kernel.mice(iterations=iters, verbose=False)
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\site-packages\miceforest\imputation_kernel.py", line 1186, in mice
    imputation_values = self._mean_ma

An error occurred while optimizing MICE: data must be finite, check for nan or inf values

Optimizing hyperparameters for MissForest...


 67%|██████▋   | 2/3 [00:14<00:07,  7.11s/it]
100%|██████████| 2/2 [00:00<00:00,  2.38it/s]
[I 2025-04-17 04:00:03,863] Trial 0 finished with value: 133132.05827693842 and parameters: {'iters': 3, 'scale': False}. Best is trial 0 with value: 133132.05827693842.
 67%|██████▋   | 2/3 [00:14<00:07,  7.30s/it]
100%|██████████| 2/2 [00:00<00:00,  2.97it/s]
[I 2025-04-17 04:00:19,520] Trial 1 finished with value: 133145.73115992727 and parameters: {'iters': 3, 'scale': True}. Best is trial 0 with value: 133132.05827693842.
 13%|█▎        | 2/15 [00:12<01:24,  6.47s/it]
100%|██████████| 2/2 [00:00<00:00,  3.11it/s]
[I 2025-04-17 04:00:33,439] Trial 2 finished with value: 133132.05827693842 and parameters: {'iters': 15, 'scale': False}. Best is trial 0 with value: 133132.05827693842.
 33%|███▎      | 2/6 [00:13<00:26,  6.73s/it]
100%|██████████| 2/2 [00:00<00:00,  3.12it/s]
[I 2025-04-17 04:00:47,882] Trial 3 finished with value: 133145.73115992727 and parameters: {'iters': 6, 'scale': True}. 

Optimization completed!
Best Trial Hyperparameters:
  iters: 3
  scale: False
Best Objective Value (aggregated error): 133132.05827693842
Best hyperparameters for MissForest: {'iters': 3, 'scale': False} with best agg error of 133132.05827693842

Optimizing hyperparameters for MIDAS...
Size index: [41, 2]

Computation graph constructed

Model initialised

Epoch: 0 , loss: 15.636027336120605
Epoch: 1 , loss: 12.337962044609917
Epoch: 2 , loss: 9.15252325269911
Epoch: 3 , loss: 8.159549713134766
Epoch: 4 , loss: 7.996186362372504
Epoch: 5 , loss: 7.822097937266032
Epoch: 6 , loss: 7.579149934980604
Epoch: 7 , loss: 7.499464909235637
Epoch: 8 , loss: 7.326467196146647
Epoch: 9 , loss: 7.373421165678236
Epoch: 10 , loss: 7.259845124350654
Epoch: 11 , loss: 7.192229244444105
Epoch: 12 , loss: 7.185925722122192
Epoch: 13 , loss: 7.138575024074978
Epoch: 14 , loss: 7.124434126747979
Epoch: 15 , loss: 7.085193819469875
Epoch: 16 , loss: 7.072419431474474
Epoch: 17 , loss: 7.10293001598782
Epoc

[I 2025-04-17 04:01:15,481] Trial 0 finished with value: 136012.90598441515 and parameters: {'layer': [512, 256], 'vae': False, 'samples': 16}. Best is trial 0 with value: 136012.90598441515.


Size index: [41, 2]

Computation graph constructed

Model initialised

Epoch: 0 , loss: 16.207295894622803
Epoch: 1 , loss: 14.146910137600369
Epoch: 2 , loss: 12.069238079918755
Epoch: 3 , loss: 9.983858479393852
Epoch: 4 , loss: 8.95934862560696
Epoch: 5 , loss: 8.43393784099155
Epoch: 6 , loss: 8.225883536868626
Epoch: 7 , loss: 7.938134670257568
Epoch: 8 , loss: 7.808457692464192
Epoch: 9 , loss: 7.727700445387098
Epoch: 10 , loss: 7.6380121443006725
Epoch: 11 , loss: 7.504967716005114
Epoch: 12 , loss: 7.495669523874919
Epoch: 13 , loss: 7.410564766989814
Epoch: 14 , loss: 7.345555861790975
Epoch: 15 , loss: 7.267838133705987
Epoch: 16 , loss: 7.2920101748572455
Epoch: 17 , loss: 7.245056258307563
Epoch: 18 , loss: 7.265906201468574
Epoch: 19 , loss: 7.193224694993761
Training complete. Saving file...
Model saved in file: tmp/MIDAS
INFO:tensorflow:Restoring parameters from tmp/MIDAS
Model restored.


[I 2025-04-17 04:01:29,750] Trial 1 finished with value: 139271.67633861475 and parameters: {'layer': [256, 256], 'vae': False, 'samples': 14}. Best is trial 0 with value: 136012.90598441515.


Size index: [41, 2]

Computation graph constructed

Model initialised

Epoch: 0 , loss: 16.207295682695175
Epoch: 1 , loss: 14.146891487969292
Epoch: 2 , loss: 12.069215774536133
Epoch: 3 , loss: 9.983822769588894
Epoch: 4 , loss: 8.959352440304226
Epoch: 5 , loss: 8.433939377466837
Epoch: 6 , loss: 8.22588555018107
Epoch: 7 , loss: 7.938143067889744
Epoch: 8 , loss: 7.808442539638943
Epoch: 9 , loss: 7.727703703774346
Epoch: 10 , loss: 7.63799622323778
Epoch: 11 , loss: 7.504962046941121
Epoch: 12 , loss: 7.495663298500909
Epoch: 13 , loss: 7.410571283764309
Epoch: 14 , loss: 7.34552706612481
Epoch: 15 , loss: 7.267837365468343
Epoch: 16 , loss: 7.291982730229695
Epoch: 17 , loss: 7.245056788126628
Epoch: 18 , loss: 7.265892399681939
Epoch: 19 , loss: 7.193230523003472
Training complete. Saving file...
Model saved in file: tmp/MIDAS
INFO:tensorflow:Restoring parameters from tmp/MIDAS
Model restored.


[I 2025-04-17 04:01:44,662] Trial 2 finished with value: 139274.98515212914 and parameters: {'layer': [256, 256], 'vae': False, 'samples': 15}. Best is trial 0 with value: 136012.90598441515.


Size index: [41, 2]

Computation graph constructed

Model initialised

Epoch: 0 , loss: 17.09950171576606
Epoch: 1 , loss: 15.891795688205296
Epoch: 2 , loss: 15.031748453776041
Epoch: 3 , loss: 14.102981885274252
Epoch: 4 , loss: 13.14046859741211
Epoch: 5 , loss: 12.020598305596245
Epoch: 6 , loss: 10.783609125349257
Epoch: 7 , loss: 9.804703447553846
Epoch: 8 , loss: 9.114121278127035
Epoch: 9 , loss: 8.790538999769423
Epoch: 10 , loss: 8.35461057557
Epoch: 11 , loss: 8.268216689427694
Epoch: 12 , loss: 8.033705208036634
Epoch: 13 , loss: 8.00889637735155
Epoch: 14 , loss: 7.863510343763563
Epoch: 15 , loss: 7.817133559121026
Epoch: 16 , loss: 7.667694383197361
Epoch: 17 , loss: 7.738150993982951
Epoch: 18 , loss: 7.59298578898112
Epoch: 19 , loss: 7.583406156963772
Training complete. Saving file...
Model saved in file: tmp/MIDAS
INFO:tensorflow:Restoring parameters from tmp/MIDAS
Model restored.


[I 2025-04-17 04:01:58,916] Trial 3 finished with value: 153096.41896828968 and parameters: {'layer': [128, 128], 'vae': False, 'samples': 12}. Best is trial 0 with value: 136012.90598441515.


Size index: [41, 2]

Computation graph constructed

Model initialised

Epoch: 0 , loss: 19.495845794677734
Epoch: 1 , loss: 18.502730051676433
Epoch: 2 , loss: 17.599118126763237
Epoch: 3 , loss: 16.298101160261368
Epoch: 4 , loss: 14.668783929612902
Epoch: 5 , loss: 12.127192444271511
Epoch: 6 , loss: 10.295042779710558
Epoch: 7 , loss: 9.102697902255589
Epoch: 8 , loss: 8.62790200445387
Epoch: 9 , loss: 8.59179589483473
Epoch: 10 , loss: 8.238357411490547
Epoch: 11 , loss: 8.169635746214125
Epoch: 12 , loss: 8.169391605589125
Epoch: 13 , loss: 7.897606902652317
Epoch: 14 , loss: 8.027411513858372
Epoch: 15 , loss: 7.657174958123101
Epoch: 16 , loss: 7.874516937467787
Epoch: 17 , loss: 7.530957645840115
Epoch: 18 , loss: 7.54287510448032
Epoch: 19 , loss: 7.634644481870863
Training complete. Saving file...
Model saved in file: tmp/MIDAS
INFO:tensorflow:Restoring parameters from tmp/MIDAS
Model restored.


[I 2025-04-17 04:02:17,092] Trial 4 finished with value: 144608.4053679733 and parameters: {'layer': [512, 256], 'vae': True, 'samples': 11}. Best is trial 0 with value: 136012.90598441515.


Optimization completed!
Best Trial Hyperparameters:
  layer: [512, 256]
  vae: False
  samples: 16
Best Objective Value (aggregated error): 136012.90598441515
Best hyperparameters for MIDAS: {'layer': [512, 256], 'vae': False, 'samples': 16} with best agg error of 136012.90598441515


 67%|██████▋   | 2/3 [00:15<00:07,  7.58s/it]
100%|██████████| 2/2 [00:00<00:00,  2.77it/s]


Size index: [41, 2]

Computation graph constructed

Model initialised

Epoch: 0 , loss: 15.636018011305067
Epoch: 1 , loss: 12.337935977511936
Epoch: 2 , loss: 9.15252192815145
Epoch: 3 , loss: 8.159546030892265
Epoch: 4 , loss: 7.996194495095147
Epoch: 5 , loss: 7.822096639209324
Epoch: 6 , loss: 7.579149934980604
Epoch: 7 , loss: 7.49990635448032
Epoch: 8 , loss: 7.326392465167576
Epoch: 9 , loss: 7.373434411154853
Epoch: 10 , loss: 7.259839322831896
Epoch: 11 , loss: 7.19220945570204
Epoch: 12 , loss: 7.1859373516506615
Epoch: 13 , loss: 7.138576904932658
Epoch: 14 , loss: 7.124428616629706
Epoch: 15 , loss: 7.08519098493788
Epoch: 16 , loss: 7.072405444251166
Epoch: 17 , loss: 7.102920293807983
Epoch: 18 , loss: 7.054459280437893
Epoch: 19 , loss: 6.948911666870117
Training complete. Saving file...
Model saved in file: tmp/MIDAS
INFO:tensorflow:Restoring parameters from tmp/MIDAS
Model restored.


In [29]:
table

Unnamed: 0,Column,Data Type,Best Method,Metric,Error_SD,Max_Error,Min_Error,Within_10pct
0,Dm2,Categorical,KNN,0.8888889,,,,
1,Dm4,Discrete,MIDAS_16,9.852941,9.439223,39.0,0.0,0.382353
2,E11,Discrete,KNN,7.206897,6.073239,25.0,0.0,0.62069
3,E12,Discrete,MissForest,6.217391,4.252876,14.0,1.0,0.652174
4,E21,Discrete,MissForest,1.685714,1.936817,8.0,0.0,0.714286
5,E22,Discrete,MissForest,4.464286,11.53251,62.0,0.0,0.607143
6,E31,Discrete,KNN,13.82143,13.2806,54.0,2.0,0.642857
7,E32,Discrete,KNN,7.583333,6.565603,31.0,0.0,0.833333
8,E41,Discrete,KNN,7.234043,5.418217,25.0,0.0,0.595745
9,E42,Discrete,KNN,7.230769,4.942127,17.0,1.0,0.576923
