In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import miceforest as mf
from missforest import MissForest
import optuna
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import time
from sklearn.metrics import mean_squared_error
import MIDASpy as md

### Data Preparation Function

# def prep(df: pd.DataFrame):
#     """
#     Preprocess the DataFrame by:
#     - Dropping rows with missing values and resetting the index.
#     - Converting object columns to categorical via LabelEncoder.
#     - Converting other columns to float (and then to int if >50% of values are integer-like).
#     - If any numeric column (not already marked as categorical) has only 2 unique values,
#       it is considered categorical and encoded.

#     Returns:
#         categorical_cols (list): List of columns encoded as categorical.
#         discrete_cols (list): List of columns that are numeric and integer-like.
#         cont_cols (list): List of remaining continuous numeric columns.
#         df_clean (DataFrame): The preprocessed DataFrame.
#         encoders (dict): Mapping from categorical column name to its LabelEncoder.
#     """
#     df_clean = df.dropna().reset_index(drop=True)
#     categorical_cols = []
#     discrete_cols = []
#     encoders = {}

#     for col in df_clean.columns:
#         if df_clean[col].dtype == 'object':
#             categorical_cols.append(col)
#             le = LabelEncoder()
#             df_clean[col] = le.fit_transform(df_clean[col])
#             encoders[col] = le
#         else:
#             try:
#                 df_clean[col] = df_clean[col].astype(float)
#                 if (np.isclose(df_clean[col] % 1, 0).mean() > 0.5):
#                     df_clean[col] = df_clean[col].astype(int)
#                     discrete_cols.append(col)
#             except (ValueError, TypeError):
#                 categorical_cols.append(col)
#                 le = LabelEncoder()
#                 df_clean[col] = le.fit_transform(df_clean[col])
#                 encoders[col] = le

#     for col in df_clean.columns:
#         if col not in categorical_cols and df_clean[col].nunique() == 2:
#             categorical_cols.append(col)
#             le = LabelEncoder()
#             df_clean[col] = le.fit_transform(df_clean[col])
#             encoders[col] = le

#     continuous_cols = [col for col in df_clean.columns if col not in categorical_cols + discrete_cols]

#     return continuous_cols, discrete_cols, categorical_cols, df_clean, encoders
def prep(df: pd.DataFrame):
    """
    Preprocess the DataFrame by:
    - Dropping rows with missing values and resetting the index.
    - Converting object columns to categorical via LabelEncoder.
    - Converting other columns to float (and then to int if >50% of values are integer-like).
    - If any numeric column (not already marked as categorical) has only 2 unique values,
      it is considered categorical and encoded.

    Returns:
        continuous_cols (list): List of remaining continuous numeric columns.
        discrete_cols (list): List of columns that are numeric and integer-like.
        categorical_cols (list): List of columns encoded as categorical.
        df_clean (DataFrame): The preprocessed DataFrame.
        encoders (dict): Mapping from categorical column name to its LabelEncoder.
    """
    # Drop rows with missing values.
    df_clean = df.dropna().reset_index(drop=True)
    categorical_cols = []
    discrete_cols = []
    encoders = {}

    # Loop over each column to check its type and convert accordingly.
    for col in df_clean.columns:
        # If the column type is object, encode it as a categorical variable.
        if df_clean[col].dtype == 'object' or df_clean[col].nunique() == 2:
            categorical_cols.append(col)
            le = LabelEncoder()
            df_clean[col] = le.fit_transform(df_clean[col])
            encoders[col] = le
        else:
            try:
                # Convert column to float first.
                df_clean[col] = df_clean[col].astype(float)
                # Check if most of the values are integer-like using np.isclose.
                # This computes the proportion of values where the modulus with 1 is nearly 0.
                if (np.isclose(df_clean[col] % 1, 0)).mean() > 0.5:
                    df_clean[col] = df_clean[col].astype(int)
                    discrete_cols.append(col)
            except (ValueError, TypeError):
                # If conversion to float fails, treat the column as categorical.
                categorical_cols.append(col)
                le = LabelEncoder()
                df_clean[col] = le.fit_transform(df_clean[col])
                encoders[col] = le

    # # After the loop, check for numeric columns that have only two unique values.
    # # If found, treat them as categorical.
    # for col in df_clean.columns:
    #     if col not in categorical_cols and df_clean[col].nunique() == 2:
    #         categorical_cols.append(col)
    #         le = LabelEncoder()
    #         df_clean[col] = le.fit_transform(df_clean[col])
    #         encoders[col] = le

    # Determine continuous columns as those not flagged as categorical or discrete.
    continuous_cols = [col for col in df_clean.columns if col not in categorical_cols + discrete_cols]

    return continuous_cols, discrete_cols, categorical_cols, df_clean, encoders

def reverse_encoding(df: pd.DataFrame, encoders: dict):
    """
    Reverse the LabelEncoder transformation on categorical columns.
    
    Parameters:
        df (pd.DataFrame): DataFrame with encoded categorical columns.
        encoders (dict): Dictionary mapping column names to their LabelEncoder.
    
    Returns:
        pd.DataFrame: A new DataFrame with the categorical columns decoded to their original labels.
    """
    df_decoded = df.copy()
    for col, le in encoders.items():
        df_decoded[col] = le.inverse_transform(df_decoded[col].astype(int))
    return df_decoded

def create_missings(df: pd.DataFrame, missingness: float, random_seed: float = 96):
    """
    Create random missingness in a DataFrame.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame.
        missingness (float): Percentage of missing values to introduce.
        random_seed (float): Seed for reproducibility.
    
    Returns:
        tuple: Original DataFrame, DataFrame with missing values, and a mask DataFrame.
    """
    np.random.seed(random_seed)
    mask = np.random.rand(*df.shape) < (missingness / 100)
    mask_df = pd.DataFrame(mask, columns=df.columns)
    df_missing = df.mask(mask)
    return df, df_missing, mask_df

def simulate_missingness(df, show_missingness=False):
    """
    Simulate missingness by dropping rows with missing values and reintroducing them.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame.
        show_missingness (bool): If True, prints missingness percentages.
    
    Returns:
        tuple: Original DataFrame without missing values, simulated DataFrame with missingness, and a mask.
    """
    missing_original = df.isna().mean()
    df2 = df.dropna().reset_index(drop=True)
    df3 = df2.copy()
    missing_mask = pd.DataFrame(False, index=df3.index, columns=df3.columns)

    for col in df3.columns:
        n_missing = int(round(missing_original[col] * len(df3)))
        if n_missing > 0:
            missing_indices = df3.sample(n=n_missing, random_state=42).index
            df3.loc[missing_indices, col] = np.nan
            missing_mask.loc[missing_indices, col] = True

    if show_missingness:
        missing_df3 = df3.isna().mean()
        print("Missingness Comparison:")
        for col in df.columns:
            print(f"Column '{col}': Original: {missing_original[col]*100:.2f}% \t -> \t df3: {missing_df3[col]*100:.2f}%")

    return df2, df3, missing_mask

In [18]:
df = pd.read_excel(r"F:\Work stuff\Opthalmology\berlin\raw.xlsx", 'raw')
df = df.drop(columns=['نامونامخانوادگی'],axis=1)
continuous_cols, discrete_cols, categorical_cols, df, encoders = prep(df)
df, df2, missing_mask = create_missings(df, 60)

In [2]:
def do_knn(df, continuous_cols=None, discrete_cols=None, categorical_cols=None, n_neighbors=5, scale=False):
    """
    Impute missing values using KNN imputation over all columns.

    Parameters:
        df (pd.DataFrame): DataFrame with missing values.
        continuous_cols (list): Names of continuous numeric columns.
        discrete_cols (list): Names of discrete numeric columns.
        categorical_cols (list): Names of categorical columns.
        n_neighbors (int): Number of neighbors for KNN.
        scale (bool): Whether to apply MinMaxScaler before imputation.

    Returns:
        pd.DataFrame: Imputed DataFrame.
    """
    df_imputed = df.copy()

    # Optionally scale all numeric columns
    if scale:
        scaler = MinMaxScaler()
        df_imputed[df.columns] = scaler.fit_transform(df_imputed)

    # Apply KNN imputation to the entire dataframe
    imputer = KNNImputer(n_neighbors=n_neighbors)
    df_imputed[df.columns] = imputer.fit_transform(df_imputed)

    # Reverse scale if needed
    if scale:
        df_imputed[df.columns] = scaler.inverse_transform(df_imputed)

    # Post-process: round discrete and categorical values
    if discrete_cols:
        df_imputed[discrete_cols] = np.round(df_imputed[discrete_cols]).astype(int)
    if categorical_cols:
        df_imputed[categorical_cols] = np.round(df_imputed[categorical_cols]).astype(int)

    return df_imputed

In [4]:
# knn_imputed = do_knn(df2, continuous_cols=None, discrete_cols=None, categorical_cols=None, n_neighbors=5, scale=False)

In [3]:
def do_mice(df, continuous_cols=None, discrete_cols=None, categorical_cols=None,
            iters=10, strat='normal', scale=False):
    """
    Impute missing values in a DataFrame using the MICE forest method.

    Parameters:
        df (pd.DataFrame): Input DataFrame with missing values.
        continuous_cols (list of str): Names of continuous numeric columns.
        discrete_cols (list of str): Names of discrete numeric columns.
        categorical_cols (list of str): Names of categorical columns.
        iters (int): Number of MICE iterations.
        strat: ['normal', 'shap', 'fast'] or a dictionary specifying the mean matching strategy.
        scale (bool): Whether to apply MinMaxScaler before imputation.

    Returns:
        pd.DataFrame: Imputed DataFrame.
    """
    df_imputed = df.copy()

    if scale:
        scaler = MinMaxScaler()
        df_imputed[continuous_cols] = scaler.fit_transform(df_imputed[continuous_cols])

    kernel = mf.ImputationKernel(
        df_imputed,
        random_state=0,
        mean_match_strategy=strat,
        variable_schema=None,  # Explicitly set variable_schema to None 
        )

    kernel.mice(iterations=iters, verbose=False)  # Disable verbose output
    df_completed = kernel.complete_data(dataset=0)

    if discrete_cols:
        df_completed[discrete_cols] = df_completed[discrete_cols].round().astype(int)
    if categorical_cols:
        df_completed[categorical_cols] = df_completed[categorical_cols].round().astype(int)

    if scale:
        scaler = MinMaxScaler()
        df_completed[continuous_cols] = scaler.inverse_transform(df_completed[continuous_cols])

    return df_completed

In [6]:
# mice_imputed = do_mice(df2, continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols,
                    #    iters=10, strat='normal', scale=False)

In [4]:

def do_mf(df, continuous_cols=None, discrete_cols=None, categorical_cols=None, iters=5, scale=False):
    """
    Impute missing values using MissForest.
    
    Parameters:
        df (pd.DataFrame): DataFrame with missing values.
        continuous_cols (list): Names of continuous numeric columns.
        discrete_cols (list): Names of discrete numeric columns.
        categorical_cols (list): Names of categorical columns.
        iters (int): Maximum number of iterations.
        scale (bool): Whether to apply MinMaxScaler before imputation.
    
    Returns:
        pd.DataFrame: Imputed DataFrame.
    """
    df_imputed = df.copy()
    
    if scale:
        scaler = MinMaxScaler()
        df_imputed[continuous_cols] = scaler.fit_transform(df_imputed[continuous_cols])
    
    imputer = MissForest(max_iter=iters, categorical=categorical_cols)
    df_imputed_result = imputer.fit_transform(df_imputed)
    
    if discrete_cols:
        df_imputed_result[discrete_cols] = df_imputed_result[discrete_cols].round().astype(int)
    
    if categorical_cols:
        df_imputed_result[categorical_cols] = df_imputed_result[categorical_cols].round().astype(int)
    
    if scale:
        # Reverse scaling for continuous columns
        df_imputed_result[continuous_cols] = scaler.inverse_transform(df_imputed_result[continuous_cols])
    
    return df_imputed_result

# mf_imputed = do_mf(df2, continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols, iters=5, scale=False)

In [5]:
def do_midas(df, continuous_cols=None, discrete_cols=None, categorical_cols=None,
              layer:list=[256,256], vae:bool=True, samples:int=10, random_seed:float=96 ):
    """
    Imputes missing values using the MIDAS model.
    
    Parameters:
      df (pd.DataFrame): Input dataframe.
      continuous_cols (list): List of continuous column names.
      discrete_cols (list): List of discrete (numeric but non-continuous) column names.
      categorical_cols (list): List of categorical column names.
      
    Returns:
      imps (list): A list of imputed dataframes.
    """
    # 1. Convert categorical columns and get categorical metadata.
    md_cat_data, md_cats = md.cat_conv(df[categorical_cols])
    
    # 2. Define the numeric columns.
    num_cols = discrete_cols + continuous_cols  # these are the numeric columns

    # 3. Drop original categorical columns and combine with the converted categorical data.
    df_copy = df.drop(columns=categorical_cols,axis=1)
    constructor_list = [df_copy, md_cat_data]
    data_in = pd.concat(constructor_list, axis=1)
    
    # 4. Scale non-categorical columns BEFORE imputation.
    scaler = MinMaxScaler()
    data_in[num_cols] = scaler.fit_transform(data_in[num_cols])
    
    # 5. Build and train the imputer using the scaled data.
    imputer = md.Midas(layer_structure=layer, vae_layer=vae, seed=random_seed, input_drop=0.75)
    # Use md_cats as softmax columns for categorical outputs.
    imputer.build_model(data_in, softmax_columns=md_cats)
    imputer.train_model(training_epochs=20)
    
    # 6. Generate imputations.
    imps = imputer.generate_samples(m=samples).output_list
    
    # 7. Post-process each imputed DataFrame.
    for idx, imp_df in enumerate(imps):
        # Reverse transform the numeric columns.
        imp_df[num_cols] = scaler.inverse_transform(imp_df[num_cols])
        
        # Process categorical columns.
        # For each softmax group in md_cats, choose the column with the highest probability.
        tmp_cat = []
        for group in md_cats:
            # idxmax returns the column name with maximum value per row for this group.
            tmp_cat.append(imp_df[group].idxmax(axis=1))
        # Assume the order of md_cats corresponds to categorical_cols.
        cat_df = pd.DataFrame({categorical_cols[j]: tmp_cat[j] for j in range(len(categorical_cols))})
        
        # Drop the softmax columns.
        flat_cats = [col for group in md_cats for col in group]
        tmp_cat = [imp_df[x].idxmax(axis=1) for x in md_cats]
        cat_df = pd.DataFrame({categorical_cols[j]: tmp_cat[j] for j in range(len(categorical_cols))})
        imp_df = pd.concat([imp_df, cat_df], axis=1).drop(columns=flat_cats, axis=1)
        
        # Handle discrete data by rounding the values.
        imp_df[discrete_cols] = imp_df[discrete_cols].round()
        
        # Replace the processed DataFrame in the list.
        imps[idx] = imp_df

        ### make method info
        method_info = f'MIDAS, params: samples={samples} ,layer={layer}, vae={vae}'
    return imps, method_info

In [9]:
# midas_imputed = do_midas(df2, continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

In [10]:
import optuna

In [5]:
# ------------------------------------------------------------------------------
# Missingness Creation Function
# ------------------------------------------------------------------------------
def create_missings(df: pd.DataFrame, missingness: float, random_seed: float = 96):
    """
    Create random missingness in a DataFrame.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame.
        missingness (float): Percentage of missing values to introduce.
        random_seed (float): Seed for reproducibility.
    
    Returns:
        tuple: (original DataFrame, DataFrame with missing values, mask DataFrame)
    """
    np.random.seed(random_seed)
    mask = np.random.rand(*df.shape) < (missingness / 100)
    mask_df = pd.DataFrame(mask, columns=df.columns)
    df_missing = df.mask(mask)
    return df, df_missing, mask_df

# ------------------------------------------------------------------------------
# Improved Evaluation Function
# ------------------------------------------------------------------------------
def select_best_imputations(imputed_dfs, original_df, mask_df, continuous_cols, discrete_cols, categorical_cols, method_info=None, method_names=None):
    """
    Evaluate one or several imputed DataFrames and determine an aggregated error.

    For each column with simulated missing data (per mask_df), numeric columns
    are scored using Mean Absolute Error (MAE) while categorical columns are scored
    by misclassification rate (1 - accuracy). An overall aggregated error is returned,
    which is the mean error over all evaluated columns.

    Parameters:
      imputed_dfs (list of pd.DataFrame): A list of imputed DataFrames.
      original_df (pd.DataFrame): The original (complete) DataFrame.
      mask_df (pd.DataFrame): Boolean DataFrame with True at positions where values are masked.
      continuous_cols (list): List of continuous numeric column names.
      discrete_cols (list): List of discrete numeric column names.
      categorical_cols (list): List of categorical column names.
      method_info (str, optional): Text description of the method and its hyperparameters.
      method_names (list, optional): List of names for each imputation method candidate.

    Returns:
      best_imputed_df (pd.DataFrame): A DataFrame where, for each column with missing values,
                                     the candidate with the lowest error is chosen.
      summary_table (pd.DataFrame): A summary table with metrics for each column.
      aggregated_error (float): The average error across columns (lower is better).
    """
    n_methods = len(imputed_dfs)
    
    if method_info is not None:
        parts = method_info.split(',')
        base_name = parts[0].strip()
        params = ','.join(parts[1:]).strip() if len(parts) > 1 else ""
        method_names = [f"{base_name} ({params})"] * n_methods
    elif method_names is None:
        method_names = [f"Method {i+1}" for i in range(n_methods)]
    
    summary_list = []
    best_method_per_col = {}

    for col in original_df.columns:
        if col in continuous_cols:
            col_type = "Continuous"
        elif col in discrete_cols:
            col_type = "Discrete"
        elif col in categorical_cols:
            col_type = "Categorical"
        else:
            col_type = str(original_df[col].dtype)

        if mask_df[col].sum() == 0:
            best_method_per_col[col] = None
            summary_list.append({
                'Column': col,
                'Data Type': col_type,
                'Best Method': None,
                'Metric': np.nan,  
            })
            continue

        col_errors = []
        for df_imp in imputed_dfs:
            if col_type in ["Continuous", "Discrete"]:
                try:
                    imp_vals = pd.to_numeric(df_imp[col][mask_df[col]], errors='coerce')
                    orig_vals = pd.to_numeric(original_df[col][mask_df[col]], errors='coerce')
                except Exception as e:
                    imp_vals = df_imp[col][mask_df[col]]
                    orig_vals = original_df[col][mask_df[col]]
                errors = np.abs(imp_vals - orig_vals)
                mae = errors.mean()
                col_errors.append(mae)
            else:
                correct = (df_imp[col][mask_df[col]] == original_df[col][mask_df[col]])
                accuracy = correct.mean()
                col_errors.append(1 - accuracy)

        if col_type in ["Continuous", "Discrete"]:
            best_idx = int(np.nanargmin(col_errors))
        else:
            best_idx = int(np.nanargmin(col_errors))
        best_method = method_names[best_idx]
        best_metric = col_errors[best_idx]

        best_method_per_col[col] = best_idx
        summary_list.append({
            'Column': col,
            'Data Type': col_type,
            'Best Method': best_method,
            'Metric': best_metric,
        })

    summary_table = pd.DataFrame(summary_list)
    
    best_imputed_df = original_df.copy()
    for col in original_df.columns:
        if mask_df[col].sum() > 0 and best_method_per_col[col] is not None:
            method_idx = best_method_per_col[col]
            best_imputed_df.loc[mask_df[col], col] = imputed_dfs[method_idx].loc[mask_df[col], col]

    errors = summary_table['Metric'].dropna().values
    aggregated_error = np.mean(errors) if len(errors) > 0 else np.nan

    return best_imputed_df, summary_table, aggregated_error

# ------------------------------------------------------------------------------
# Hyperparameter Optimization Function using Optuna
# ------------------------------------------------------------------------------
def optimize_imputation_hyperparams(imputation_func, 
                                    original_df, 
                                    missing_percent, 
                                    continuous_cols, 
                                    discrete_cols, 
                                    categorical_cols, 
                                    timelimit=600,    # in seconds
                                    min_trials=20,
                                    random_seed=96):
    """
    Optimize hyperparameters for an imputation function using Optuna.

    This function takes the complete (original) DataFrame and a missing percentage.
    It uses `create_missings` to generate a DataFrame with simulated missing values and
    a corresponding mask. Then it runs the candidate imputation method on the incomplete
    DataFrame, evaluates the imputed results against the original DataFrame using the mask,
    and guides the hyperparameter search based on an aggregated error (lower is better).

    Parameters:
        imputation_func (callable): An imputation function (do_knn, do_mice, do_mf, or do_midas).
        original_df (pd.DataFrame): The complete ground-truth DataFrame.
        missing_percent (float): Percentage of missing values to simulate.
        continuous_cols (list): List of continuous numeric column names.
        discrete_cols (list): List of discrete numeric column names.
        categorical_cols (list): List of categorical column names.
        timelimit (int): Maximum time in seconds to run the optimization.
        min_trials (int): Minimum number of Optuna trials to run.
        random_seed (int): Seed for generating missingness (passed to create_missings).

    Returns:
        best_trial: The best trial object from the study.
        best_value: The best (lowest) aggregated objective value.
    """
    # Generate missing values and mask using the provided function.
    _, df_missing, mask_df = create_missings(original_df, missingness=missing_percent, random_seed=random_seed)

    def objective(trial):
        func_name = imputation_func.__name__
        params = {}

        if func_name == "do_knn":
            params['n_neighbors'] = trial.suggest_int("n_neighbors", 3, 15)
            params['scale'] = trial.suggest_categorical("scale", [True, False])
            # Run imputation on df_missing, not the original complete data.
            imputed_df = imputation_func(df_missing, 
                                         continuous_cols=continuous_cols, 
                                         discrete_cols=discrete_cols, 
                                         categorical_cols=categorical_cols, 
                                         **params)
            imputed_dfs = [imputed_df]
            method_info = f"KNN, n_neighbors={params['n_neighbors']}, scale={params['scale']}"
        elif func_name == "do_mice":
            params['iters'] = trial.suggest_int("iters", 5, 20)
            params['strat'] = trial.suggest_categorical("strat", ['normal', 'shap', 'fast'])
            params['scale'] = trial.suggest_categorical("scale", [True, False])
            imputed_df = imputation_func(df_missing,
                                         continuous_cols=continuous_cols, 
                                         discrete_cols=discrete_cols, 
                                         categorical_cols=categorical_cols,
                                         **params)
            imputed_dfs = [imputed_df]
            method_info = f"MICE, iters={params['iters']}, strat={params['strat']}, scale={params['scale']}"
        elif func_name == "do_mf":
            params['iters'] = trial.suggest_int("iters", 3, 15)
            params['scale'] = trial.suggest_categorical("scale", [True, False])
            imputed_df = imputation_func(df_missing,
                                         continuous_cols=continuous_cols, 
                                         discrete_cols=discrete_cols, 
                                         categorical_cols=categorical_cols,
                                         **params)
            imputed_dfs = [imputed_df]
            method_info = f"MissForest, iters={params['iters']}, scale={params['scale']}"
        elif func_name == "do_midas":
            params['layer'] = trial.suggest_categorical("layer", [[256,256], [128,128], [512,256]])
            params['vae'] = trial.suggest_categorical("vae", [True, False])
            params['samples'] = trial.suggest_int("samples", 5, 20)
            imputed_dfs, method_info = imputation_func(df_missing,
                                                       continuous_cols=continuous_cols, 
                                                       discrete_cols=discrete_cols, 
                                                       categorical_cols=categorical_cols,
                                                       **params)
            imputed_dfs = [imputed_dfs[0]]
        else:
            raise ValueError(f"Unsupported imputation function: {func_name}")

        # Evaluate the imputed result by comparing against the original complete DataFrame.
        _, summary_table, aggregated_error = select_best_imputations(
            imputed_dfs, original_df, mask_df, continuous_cols, discrete_cols, categorical_cols,
            method_info=method_info
        )

        if np.isnan(aggregated_error):
            aggregated_error = 1e6

        return aggregated_error

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, timeout=timelimit, n_trials=min_trials)

    best_trial = study.best_trial
    best_value = best_trial.value

    print("Optimization completed!")
    print("Best Trial Hyperparameters:")
    for key, value in best_trial.params.items():
        print(f"  {key}: {value}")
    print(f"Best Objective Value (aggregated error): {best_value}")

    return best_trial, best_value

In [19]:
best_method, best_val = optimize_imputation_hyperparams(imputation_func=do_knn,original_df=df,missing_percent=20,
                                                        continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

[I 2025-04-14 11:25:51,470] A new study created in memory with name: no-name-5daafcc4-498d-4e71-9799-8a077f4c244d


[I 2025-04-14 11:25:52,005] Trial 0 finished with value: 12.147824823621622 and parameters: {'n_neighbors': 10, 'scale': True}. Best is trial 0 with value: 12.147824823621622.
[I 2025-04-14 11:25:52,494] Trial 1 finished with value: 12.844363889321443 and parameters: {'n_neighbors': 15, 'scale': True}. Best is trial 0 with value: 12.147824823621622.
[I 2025-04-14 11:25:52,996] Trial 2 finished with value: 11.079757802848631 and parameters: {'n_neighbors': 6, 'scale': True}. Best is trial 2 with value: 11.079757802848631.
[I 2025-04-14 11:25:53,372] Trial 3 finished with value: 10.64311847733934 and parameters: {'n_neighbors': 3, 'scale': False}. Best is trial 3 with value: 10.64311847733934.
[I 2025-04-14 11:25:53,797] Trial 4 finished with value: 11.096930132690385 and parameters: {'n_neighbors': 6, 'scale': False}. Best is trial 3 with value: 10.64311847733934.
[I 2025-04-14 11:25:54,202] Trial 5 finished with value: 11.096930132690385 and parameters: {'n_neighbors': 6, 'scale': Fals

Optimization completed!
Best Trial Hyperparameters:
  n_neighbors: 4
  scale: False
Best Objective Value (aggregated error): 10.47046431270568


In [None]:
best_method, best_val = optimize_imputation_hyperparams(imputation_func=do_mf,original_df=df,missing_percent=20,
                                                        continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

In [None]:
best_method, best_val = optimize_imputation_hyperparams(imputation_func=do_midas,original_df=df,missing_percent=20,timelimit=300,
                                                        continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

In [None]:
best_method, best_val = optimize_imputation_hyperparams(imputation_func=do_mice,original_df=df,missing_percent=20,timelimit=300,
                                                        continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

In [6]:
new_df = pd.read_excel(r"C:\Users\Matin\Downloads\Data for Dr.Matin.xlsx", 's1')
new_df.drop(['n', 'ID','Gen.code'],axis=1,inplace=True)
continuous_cols, discrete_cols, categorical_cols, df2, encoders = prep(new_df)


In [10]:
discrete_cols

['Dm4',
 'E11',
 'E12',
 'E21',
 'E22',
 'E31',
 'E32',
 'E41',
 'E42',
 'E24',
 'E25',
 'E26',
 'E27',
 'FastingBloodSugar',
 'Glucose2hpp',
 'Cholestrol',
 'Triglycerides',
 'HDL',
 'LDL',
 'W.B.C',
 'Platelets',
 'DBP',
 'SBP',
 'gdi',
 'work_activity',
 'transport',
 'lesiretime']

In [9]:
best_method, best_val = optimize_imputation_hyperparams(imputation_func=do_knn,original_df=df2,missing_percent=30,
                                                        continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

[I 2025-04-17 00:48:47,368] A new study created in memory with name: no-name-c7f21880-10ec-48f1-9a0c-390fa32a9351


[I 2025-04-17 00:48:48,711] Trial 0 finished with value: 128853.83079579413 and parameters: {'n_neighbors': 8, 'scale': False}. Best is trial 0 with value: 128853.83079579413.
[I 2025-04-17 00:48:50,077] Trial 1 finished with value: 127580.80474204637 and parameters: {'n_neighbors': 11, 'scale': False}. Best is trial 1 with value: 127580.80474204637.
[I 2025-04-17 00:48:51,376] Trial 2 finished with value: 123864.30822936769 and parameters: {'n_neighbors': 4, 'scale': True}. Best is trial 2 with value: 123864.30822936769.
[I 2025-04-17 00:48:52,691] Trial 3 finished with value: 122415.90668441085 and parameters: {'n_neighbors': 6, 'scale': True}. Best is trial 3 with value: 122415.90668441085.
[I 2025-04-17 00:48:54,019] Trial 4 finished with value: 120331.31492166221 and parameters: {'n_neighbors': 9, 'scale': True}. Best is trial 4 with value: 120331.31492166221.
[I 2025-04-17 00:48:55,349] Trial 5 finished with value: 126572.80990000315 and parameters: {'n_neighbors': 13, 'scale': F

Optimization completed!
Best Trial Hyperparameters:
  n_neighbors: 12
  scale: True
Best Objective Value (aggregated error): 119049.68301431432


In [26]:
df2

Unnamed: 0,n,Gen.code,ID,Dm2,Dm4,E11,E12,E21,E22,E31,...,Neutrophils,Lymphocyte,Mixed,Platelets,DBP,SBP,gdi,work_activity,transport,lesiretime
0,1,1,1090,1,36,80,75,30,20,130,...,35.8,52.6,11.6,249,67,120,6,34560,2391444,1440
1,2,0,743,0,59,65,62,16,16,132,...,52.5,36.4,11.1,226,76,128,5,7174332,360,7174332
2,3,0,242,0,58,83,85,17,17,120,...,58.8,32.1,9.1,288,70,120,5,4783608,480,7174332
3,4,1,33,0,61,69,70,29,29,140,...,52.4,37.8,9.8,240,70,135,7,4785408,2520,2520
4,5,1,1790,1,28,73,75,17,18,131,...,40.5,47.7,11.8,129,85,126,7,4799688,1680,7174332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1886,1887,0,219,1,62,81,86,18,18,109,...,51.1,37.0,11.9,247,74,109,3,4796328,840,4783608
1887,1888,0,499,0,58,81,78,17,18,127,...,52.3,39.7,8.0,197,74,121,5,7174332,840,7174332
1888,1889,2,317,0,30,87,87,17,18,100,...,57.9,34.9,7.2,223,60,100,5,960,240,4782888
1889,1890,0,1331,1,56,84,80,16,16,130,...,54.4,32.8,12.8,229,80,125,6,4783608,1680,17280


In [11]:
best_method, best_val = optimize_imputation_hyperparams(imputation_func=do_mf,original_df=new_df,missing_percent=30, timelimit=300,
                                                        continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

[I 2025-04-15 14:07:37,661] A new study created in memory with name: no-name-4b4570ca-8dde-44c1-8284-36f7b9ac25b2
100%|██████████| 3/3 [00:32<00:00, 10.84s/it]
100%|██████████| 3/3 [00:01<00:00,  2.46it/s]
[I 2025-04-15 14:08:11,777] Trial 0 finished with value: 114107.11227269446 and parameters: {'iters': 3, 'scale': False}. Best is trial 0 with value: 114107.11227269446.
100%|██████████| 12/12 [02:28<00:00, 12.41s/it]
100%|██████████| 12/12 [00:05<00:00,  2.06it/s]
[I 2025-04-15 14:10:47,191] Trial 1 finished with value: 111581.23221126181 and parameters: {'iters': 12, 'scale': True}. Best is trial 1 with value: 111581.23221126181.
100%|██████████| 11/11 [02:14<00:00, 12.26s/it]
100%|██████████| 11/11 [00:04<00:00,  2.34it/s]
[I 2025-04-15 14:13:07,417] Trial 2 finished with value: 111626.51788094059 and parameters: {'iters': 11, 'scale': True}. Best is trial 1 with value: 111581.23221126181.


Optimization completed!
Best Trial Hyperparameters:
  iters: 12
  scale: True
Best Objective Value (aggregated error): 111581.23221126181


In [16]:
def simulate_missingness(df, show_missingness=False, random_state=42):
    """
    Simulate missingness by dropping rows with missing values and reintroducing them.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame.
        show_missingness (bool): If True, prints missingness percentages.
    
    Returns:
        tuple: Original DataFrame without missing values, simulated DataFrame with missingness, and a mask.
    """
    missing_original = df.isna().mean()
    df2 = df.dropna().reset_index(drop=True)
    df3 = df2.copy()
    missing_mask = pd.DataFrame(False, index=df3.index, columns=df3.columns)

    for col in df3.columns:
        n_missing = int(round(missing_original[col] * len(df3)))
        if n_missing > 0:
            missing_indices = df3.sample(n=n_missing, random_state=random_state).index
            df3.loc[missing_indices, col] = np.nan
            missing_mask.loc[missing_indices, col] = True

    if show_missingness:
        missing_df3 = df3.isna().mean()
        print("Missingness Comparison:")
        for col in df.columns:
            print(f"Column '{col}': Original: {missing_original[col]*100:.2f}% \t -> \t df3: {missing_df3[col]*100:.2f}%")

    return df2, df3, missing_mask

def create_missings(df: pd.DataFrame, missingness: float, random_seed: float = 96):
    """
    Create random missingness in a DataFrame.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame.
        missingness (float): Percentage of missing values to introduce.
        random_seed (float): Seed for reproducibility.
    
    Returns:
        tuple: (original DataFrame, DataFrame with missing values, mask DataFrame)
    """
    np.random.seed(random_seed)
    mask = np.random.rand(*df.shape) < (missingness / 100)
    mask_df = pd.DataFrame(mask, columns=df.columns)
    df_missing = df.mask(mask)
    return df, df_missing, mask_df


def select_best_imputations(imputed_dfs, original_df, mask_df, continuous_cols, discrete_cols, categorical_cols, method_info=None, method_names=None):
    """
    Evaluate one or several imputed DataFrames and determine an aggregated error.

    For each column with simulated missing data (per mask_df), numeric columns
    are scored using Mean Absolute Error (MAE) while categorical columns are scored
    by misclassification rate (1 - accuracy). An overall aggregated error is returned,
    which is the mean error over all evaluated columns.

    Parameters:
      imputed_dfs (list of pd.DataFrame): A list of imputed DataFrames.
      original_df (pd.DataFrame): The original (complete) DataFrame.
      mask_df (pd.DataFrame): Boolean DataFrame with True at positions where values are masked.
      continuous_cols (list): List of continuous numeric column names.
      discrete_cols (list): List of discrete numeric column names.
      categorical_cols (list): List of categorical column names.
      method_info (str, optional): Text description of the method and its hyperparameters.
      method_names (list, optional): List of names for each imputation method candidate.

    Returns:
      best_imputed_df (pd.DataFrame): A DataFrame where, for each column with missing values,
                                     the candidate with the lowest error is chosen.
      summary_table (pd.DataFrame): A summary table with metrics for each column.
      aggregated_error (float): The average error across columns (lower is better).
    """
    n_methods = len(imputed_dfs)
    
    if method_info is not None:
        parts = method_info.split(',')
        base_name = parts[0].strip()
        params = ','.join(parts[1:]).strip() if len(parts) > 1 else ""
        method_names = [f"{base_name} ({params})"] * n_methods
    elif method_names is None:
        method_names = [f"Method {i+1}" for i in range(n_methods)]
    
    summary_list = []
    best_method_per_col = {}

    for col in original_df.columns:
        if col in continuous_cols:
            col_type = "Continuous"
        elif col in discrete_cols:
            col_type = "Discrete"
        elif col in categorical_cols:
            col_type = "Categorical"
        else:
            col_type = str(original_df[col].dtype)

        if mask_df[col].sum() == 0:
            best_method_per_col[col] = None
            summary_list.append({
                'Column': col,
                'Data Type': col_type,
                'Best Method': None,
                'Metric': np.nan,  
            })
            continue

        col_errors = []
        for df_imp in imputed_dfs:
            if col_type in ["Continuous", "Discrete"]:
                try:
                    imp_vals = pd.to_numeric(df_imp[col][mask_df[col]], errors='coerce')
                    orig_vals = pd.to_numeric(original_df[col][mask_df[col]], errors='coerce')
                except Exception as e:
                    imp_vals = df_imp[col][mask_df[col]]
                    orig_vals = original_df[col][mask_df[col]]
                errors = np.abs(imp_vals - orig_vals)
                mae = errors.mean()
                col_errors.append(mae)
            else:
                correct = (df_imp[col][mask_df[col]] == original_df[col][mask_df[col]])
                accuracy = correct.mean()
                col_errors.append(1 - accuracy)

        if col_type in ["Continuous", "Discrete"]:
            best_idx = int(np.nanargmin(col_errors))
        else:
            best_idx = int(np.nanargmin(col_errors))
        best_method = method_names[best_idx]
        best_metric = col_errors[best_idx]

        best_method_per_col[col] = best_idx
        summary_list.append({
            'Column': col,
            'Data Type': col_type,
            'Best Method': best_method,
            'Metric': best_metric,
        })

    summary_table = pd.DataFrame(summary_list)
    
    best_imputed_df = original_df.copy()
    for cat in categorical_cols:
        if cat in best_imputed_df:
            best_imputed_df[cat] = best_imputed_df[cat].astype(object)

    for col in original_df.columns:
        if mask_df[col].sum() > 0 and best_method_per_col[col] is not None:
            method_idx = best_method_per_col[col]
            best_imputed_df.loc[mask_df[col], col] = \
                imputed_dfs[method_idx].loc[mask_df[col], col]

    errors = summary_table['Metric'].dropna().values
    aggregated_error = np.mean(errors) if len(errors) > 0 else np.nan

    return best_imputed_df, summary_table, aggregated_error


def optimize_imputation_hyperparams(imputation_func, 
                                    original_df, 
                                    df_missing, 
                                    mask_df, 
                                    continuous_cols, 
                                    discrete_cols, 
                                    categorical_cols, 
                                    timelimit=600,    # in seconds
                                    min_trials=20,
                                    random_seed=96):
    """
    Optimize hyperparameters for an imputation function using Optuna.

    This function takes the complete (original) DataFrame and a missing percentage.
    It uses `create_missings` to generate a DataFrame with simulated missing values and
    a corresponding mask. Then it runs the candidate imputation method on the incomplete
    DataFrame, evaluates the imputed results against the original DataFrame using the mask,
    and guides the hyperparameter search based on an aggregated error (lower is better).

    Parameters:
        imputation_func (callable): An imputation function (do_knn, do_mice, do_mf, or do_midas).
        original_df (pd.DataFrame): The complete ground-truth DataFrame.
        missing_percent (float): Percentage of missing values to simulate.
        continuous_cols (list): List of continuous numeric column names.
        discrete_cols (list): List of discrete numeric column names.
        categorical_cols (list): List of categorical column names.
        timelimit (int): Maximum time in seconds to run the optimization.
        min_trials (int): Minimum number of Optuna trials to run.
        random_seed (int): Seed for generating missingness (passed to create_missings).

    Returns:
        best_trial: The best trial object from the study.
        best_value: The best (lowest) aggregated objective value.
    """
    # Generate missing values and mask using the provided function.
    # _, df_missing, mask_df = create_missings(original_df, missingness=missing_percent, random_seed=random_seed)

    def objective(trial):
        func_name = imputation_func.__name__
        params = {}

        if func_name == "do_knn":
            params['n_neighbors'] = trial.suggest_int("n_neighbors", 3, 15)
            params['scale'] = trial.suggest_categorical("scale", [True, False])
            # Run imputation on df_missing, not the original complete data.
            imputed_df = imputation_func(df_missing, 
                                         continuous_cols=continuous_cols, 
                                         discrete_cols=discrete_cols, 
                                         categorical_cols=categorical_cols, 
                                         **params)
            imputed_dfs = [imputed_df]
            method_info = f"KNN, n_neighbors={params['n_neighbors']}, scale={params['scale']}"
        elif func_name == "do_mice":
            params['iters'] = trial.suggest_int("iters", 5, 20)
            params['strat'] = trial.suggest_categorical("strat", ['normal', 'shap', 'fast'])
            params['scale'] = trial.suggest_categorical("scale", [True, False])
            imputed_df = imputation_func(df_missing,
                                         continuous_cols=continuous_cols, 
                                         discrete_cols=discrete_cols, 
                                         categorical_cols=categorical_cols,
                                         **params)
            imputed_dfs = [imputed_df]
            method_info = f"MICE, iters={params['iters']}, strat={params['strat']}, scale={params['scale']}"
        elif func_name == "do_mf":
            params['iters'] = trial.suggest_int("iters", 3, 15)
            params['scale'] = trial.suggest_categorical("scale", [True, False])
            imputed_df = imputation_func(df_missing,
                                         continuous_cols=continuous_cols, 
                                         discrete_cols=discrete_cols, 
                                         categorical_cols=categorical_cols,
                                         **params)
            imputed_dfs = [imputed_df]
            method_info = f"MissForest, iters={params['iters']}, scale={params['scale']}"
        elif func_name == "do_midas":
            params['layer'] = trial.suggest_categorical("layer", [[256,256], [128,128], [512,256]])
            params['vae'] = trial.suggest_categorical("vae", [True, False])
            params['samples'] = trial.suggest_int("samples", 5, 20)
            imputed_dfs, method_info = imputation_func(df_missing,
                                                       continuous_cols=continuous_cols, 
                                                       discrete_cols=discrete_cols, 
                                                       categorical_cols=categorical_cols,
                                                       **params)
            imputed_dfs = [imputed_dfs[0]]
        else:
            raise ValueError(f"Unsupported imputation function: {func_name}")

        # Evaluate the imputed result by comparing against the original complete DataFrame.
        _, _, aggregated_error = select_best_imputations(
            imputed_dfs, original_df, mask_df, continuous_cols, discrete_cols, categorical_cols,
            method_info=method_info
        )

        if np.isnan(aggregated_error):
            aggregated_error = 1e6

        return aggregated_error

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, timeout=timelimit, n_trials=min_trials)

    best_trial = study.best_trial
    best_value = best_trial.value

    print("Optimization completed!")
    print("Best Trial Hyperparameters:")
    for key, value in best_trial.params.items():
        print(f"  {key}: {value}")
    print(f"Best Objective Value (aggregated error): {best_value}")

    return best_trial, best_value

In [17]:
def do_midas(df,
             continuous_cols=None,
             discrete_cols=None,
             categorical_cols=None,
             layer: list = [256, 256],
             vae: bool = True,
             samples: int = 10,
             random_seed: float = 96):
    """
    Imputes missing values using the MIDAS model.

    Parameters:
      df (pd.DataFrame): Input dataframe with NaNs in both numeric & categorical.
      continuous_cols (list): List of continuous column names.
      discrete_cols (list): List of discrete (numeric but not continuous) column names.
      categorical_cols (list): List of categorical column names.

    Returns:
      imps (list): A list of imputed dataframes, with original dtypes restored.
      method_info (str): Summary of MIDAS params used.
    """
    # 1. One‑hot encode the categoricals
    md_cat_data, md_cats = md.cat_conv(df[categorical_cols])

    # 2. Build the “wide” DF: drop raw cats, append one‑hots
    df_num = df.drop(columns=categorical_cols)
    data_in = pd.concat([df_num, md_cat_data], axis=1)

    # 3. Record & re‑insert the NaN locations so MIDAS sees them as missing
    na_mask = data_in.isnull()
    data_in[na_mask] = np.nan

    # 4. Scale only the numeric columns in place
    num_cols = discrete_cols + continuous_cols
    scaler = MinMaxScaler()
    data_in[num_cols] = scaler.fit_transform(data_in[num_cols])

    # 5. Build & train the MIDAS model
    imputer = md.Midas(
        layer_structure=layer,
        vae_layer=vae,
        seed=random_seed,
        input_drop=0.75
    )
    imputer.build_model(data_in, softmax_columns=md_cats)
    imputer.train_model(training_epochs=20)

    # 6. Generate multiple imputations
    raw_imps = imputer.generate_samples(m=samples).output_list

    # 7. Decode each imputed DF back to original structure
    flat_cats = [c for grp in md_cats for c in grp]
    imps = []

    for imp_df in raw_imps:
        # 7a. inverse‑scale numeric cols
        imp_df[num_cols] = scaler.inverse_transform(imp_df[num_cols])

        # 7b. decode one‑hots (before dropping them!)
        decoded = {}
        for i, grp in enumerate(md_cats):
            # just in case, only keep those actually present
            present = [c for c in grp if c in imp_df.columns]
            # idxmax → gives the dummy column name with highest prob
            decoded[categorical_cols[i]] = imp_df[present].idxmax(axis=1)

        cat_df = pd.DataFrame(decoded, index=imp_df.index)

        # 7c. now drop the dummy cols
        base = imp_df.drop(columns=flat_cats, errors='ignore')

        # 7d. concat in your decoded cat columns
        merged = pd.concat([base, cat_df], axis=1)

        # 7e. round discrete cols
        merged[discrete_cols] = merged[discrete_cols].round().astype(int)

        imps.append(merged)

    method_info = f"MIDAS, params: samples={samples}, layer={layer}, vae={vae}"
    return imps, method_info


def run_full_pipeline(df: pd.DataFrame, 
                      simulate:bool=False,               # True for simulated missingness, False for random missingness
                      missingness_value: float = 10.0,   # used only for random missingness (percent)
                      show_missingness: bool = False,
                      timelimit: int = 600, 
                      min_trials: int = 20, 
                      random_seed: int = 96):
    """
    Run the full pipeline to find the best hyperparameters for each imputation method.

    The pipeline performs these steps:
    
      1. Preprocesses the DataFrame using `prep`, which cleans the data,
         encodes categorical variables, and splits features into continuous,
         discrete, and categorical lists.
      2. Introduces missingness using either simulated missingness (reintroducing missingness 
         based on the original NaN proportions) or random missingness (dropping values randomly
         given a specified missing percentage).
      3. Runs hyperparameter optimization (via `optimize_imputation_hyperparams`) for each candidate 
         imputation method (e.g., do_knn, do_mice, do_mf, do_midas).
         
    Parameters:
        df (pd.DataFrame): The input DataFrame.
        missing_type (str): "simulate" to simulate missingness using original missing proportions,
                            "random" to drop values randomly.
        missingness_value (float): Percentage of missingness (only used if missing_type == "random").
        show_missingness (bool): If True, prints missingness comparison when using simulate missingness.
        timelimit (int): Time limit (in seconds) for each hyperparameter optimization study.
        min_trials (int): Minimum number of trials for each study.
        random_seed (int): Random seed for reproducibility.

    Returns:
        dict: A dictionary where the keys are method names (strings) and the values are the best 
              hyperparameter dictionaries (from the best Optuna trial) for that method.
    """

    # Step 1: Preprocess Data
    continuous_cols, discrete_cols, categorical_cols, df_clean, encoders = prep(df)
    
    # Step 2: Create Missingness
    # Note: For simulation, the missing proportions are taken from the original df.
    if simulate: 
        # simulate_missingness returns: (complete_df, df_with_missing, missing_mask)
        original_complete, df_missing, mask_df = simulate_missingness(df, 
                                                                      show_missingness=show_missingness,
                                                                      random_state=random_seed)
    else:   
        original_complete, df_missing, mask_df = create_missings(df_clean, 
                                                                 missingness=missingness_value, 
                                                                 random_seed=random_seed)
        
    # The original_complete DataFrame is assumed to be ground truth for evaluation.
    # If missing_type=="simulate", original_complete is the complete-case subset from the original data.
    # For random, the df_clean (preprocessed and complete) is used and missingness is artificially introduced.

    # Step 3: Define candidate imputation methods to optimize.
    # It is assumed that these functions are defined: do_knn, do_mice, do_mf, do_midas.
    candidate_methods = {
        "KNN": do_knn,
        "MICE": do_mice,
        "MissForest": do_mf,
        "MIDAS": do_midas
    }
    
    best_hyperparams = {}
    
    # Optimize hyperparameters for each imputation method candidate.
    for method_name, imputation_func in candidate_methods.items():
        print(f"\nOptimizing hyperparameters for {method_name}...")
        try:
            best_trial, best_value = optimize_imputation_hyperparams(
                imputation_func=imputation_func,
                original_df=original_complete,
                df_missing=df_missing,
                mask_df=mask_df,
                continuous_cols=continuous_cols,
                discrete_cols=discrete_cols,
                categorical_cols=categorical_cols,
                timelimit=timelimit,
                min_trials=min_trials,
                random_seed=random_seed
            )
            best_hyperparams[method_name] = best_trial.params
        except Exception as e:
            print(f"An error occurred while optimizing {method_name}: {e}")
            best_hyperparams[method_name] = None
    
    return best_hyperparams


In [18]:
best_params = run_full_pipeline(new_df,timelimit=60,random_seed=96)

[I 2025-04-17 01:49:48,645] A new study created in memory with name: no-name-fd3a6ac4-36a8-4e26-b961-5db36ce10509



Optimizing hyperparameters for KNN...


[I 2025-04-17 01:49:49,551] Trial 0 finished with value: 118879.16867876558 and parameters: {'n_neighbors': 14, 'scale': True}. Best is trial 0 with value: 118879.16867876558.
[I 2025-04-17 01:49:50,359] Trial 1 finished with value: 119277.57665411044 and parameters: {'n_neighbors': 12, 'scale': True}. Best is trial 0 with value: 118879.16867876558.
[I 2025-04-17 01:49:51,152] Trial 2 finished with value: 120520.94788333244 and parameters: {'n_neighbors': 4, 'scale': True}. Best is trial 0 with value: 118879.16867876558.
[I 2025-04-17 01:49:52,028] Trial 3 finished with value: 119163.2803176755 and parameters: {'n_neighbors': 10, 'scale': True}. Best is trial 0 with value: 118879.16867876558.
[I 2025-04-17 01:49:52,828] Trial 4 finished with value: 118637.68422901373 and parameters: {'n_neighbors': 15, 'scale': True}. Best is trial 4 with value: 118637.68422901373.
[I 2025-04-17 01:49:53,634] Trial 5 finished with value: 131986.46211469793 and parameters: {'n_neighbors': 13, 'scale': F

Optimization completed!
Best Trial Hyperparameters:
  n_neighbors: 15
  scale: True
Best Objective Value (aggregated error): 118637.68422901373

Optimizing hyperparameters for MICE...


  bachelor_preds = bachelor_preds.astype(_PRE_LINK_DATATYPE)
  candidate_preds = candidate_preds.astype(_PRE_LINK_DATATYPE)  # type: ignore
[W 2025-04-17 01:50:16,150] Trial 0 failed with parameters: {'iters': 13, 'strat': 'shap', 'scale': True} because of the following error: ValueError('data must be finite, check for nan or inf values').
Traceback (most recent call last):
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Matin\AppData\Local\Temp\ipykernel_8740\3942454051.py", line 216, in objective
    imputed_df = imputation_func(df_missing,
  File "C:\Users\Matin\AppData\Local\Temp\ipykernel_8740\746599116.py", line 31, in do_mice
    kernel.mice(iterations=iters, verbose=False)  # Disable verbose output
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\site-packages\miceforest\imputation_kernel.py", line 1186, in mice
    imputation

An error occurred while optimizing MICE: data must be finite, check for nan or inf values

Optimizing hyperparameters for MissForest...


100%|██████████| 10/10 [01:50<00:00, 11.06s/it]
100%|██████████| 10/10 [00:03<00:00,  2.76it/s]
[I 2025-04-17 01:52:11,137] Trial 0 finished with value: 115055.00721726334 and parameters: {'iters': 10, 'scale': True}. Best is trial 0 with value: 115055.00721726334.
[I 2025-04-17 01:52:11,139] A new study created in memory with name: no-name-563287aa-6762-4cf4-8c8c-900e9008f337


Optimization completed!
Best Trial Hyperparameters:
  iters: 10
  scale: True
Best Objective Value (aggregated error): 115055.00721726334

Optimizing hyperparameters for MIDAS...
Size index: [41, 2]

Computation graph constructed

Model initialised

Epoch: 0 , loss: 15.93519032203545
Epoch: 1 , loss: 9.922870611740372
Epoch: 2 , loss: 8.405207900677697
Epoch: 3 , loss: 7.580558227280439
Epoch: 4 , loss: 7.005114943294202
Epoch: 5 , loss: 6.631417302762047
Epoch: 6 , loss: 6.278475389642231
Epoch: 7 , loss: 6.117289745201499
Epoch: 8 , loss: 5.962418370327707
Epoch: 9 , loss: 5.876365071636135
Epoch: 10 , loss: 5.787296319411973
Epoch: 11 , loss: 5.700790550749181
Epoch: 12 , loss: 5.653845500137846
Epoch: 13 , loss: 5.618617571006387
Epoch: 14 , loss: 5.613604367789575
Epoch: 15 , loss: 5.611704951625759
Epoch: 16 , loss: 5.554807873095497
Epoch: 17 , loss: 5.5319306365514205
Epoch: 18 , loss: 5.515414286467989
Epoch: 19 , loss: 5.493824595111912
Training complete. Saving file...
Model

[I 2025-04-17 01:52:55,098] Trial 0 finished with value: 137592.03328454602 and parameters: {'layer': [256, 256], 'vae': True, 'samples': 13}. Best is trial 0 with value: 137592.03328454602.


Size index: [41, 2]

Computation graph constructed

Model initialised

Epoch: 0 , loss: 14.557120808100295
Epoch: 1 , loss: 9.09260251562474
Epoch: 2 , loss: 7.8573198520531085
Epoch: 3 , loss: 7.113313852730444
Epoch: 4 , loss: 6.603931871511168
Epoch: 5 , loss: 6.285801665257599
Epoch: 6 , loss: 6.068915443905329
Epoch: 7 , loss: 5.876272076267307
Epoch: 8 , loss: 5.788735308889615
Epoch: 9 , loss: 5.729687084585933
Epoch: 10 , loss: 5.6512592970314675
Epoch: 11 , loss: 5.5855839696981135
Epoch: 12 , loss: 5.560512122461351
Epoch: 13 , loss: 5.528206421157061
Epoch: 14 , loss: 5.519819425324262
Epoch: 15 , loss: 5.5105722475860075
Epoch: 16 , loss: 5.473051515676207
Epoch: 17 , loss: 5.4573142851813365
Epoch: 18 , loss: 5.444874674586926
Epoch: 19 , loss: 5.424031071743722
Training complete. Saving file...
Model saved in file: tmp/MIDAS
INFO:tensorflow:Restoring parameters from tmp/MIDAS
Model restored.


[I 2025-04-17 01:53:54,803] Trial 1 finished with value: 132424.4626437458 and parameters: {'layer': [512, 256], 'vae': True, 'samples': 13}. Best is trial 1 with value: 132424.4626437458.


Optimization completed!
Best Trial Hyperparameters:
  layer: [512, 256]
  vae: True
  samples: 13
Best Objective Value (aggregated error): 132424.4626437458


In [17]:
best_params

{'KNN': {'n_neighbors': 6, 'scale': True},
 'MICE': {'iters': 6, 'strat': 'normal', 'scale': False},
 'MissForest': {'iters': 6, 'scale': False},
 'MIDAS': {'layer': [128, 128], 'vae': True, 'samples': 19}}

In [14]:
continuous_cols

['E23',
 'Hb.A1C',
 'CreatininUrine',
 'PotassiumUrineRandom',
 'SodiumUrineRandom',
 'R.B.C',
 'Hemoglobin',
 'Hematocrit',
 'MCV',
 'MCH',
 'MCHC',
 'Neutrophils',
 'Lymphocyte',
 'Mixed']

In [13]:
categorical_cols

['Dm2']

In [15]:
discrete_cols

['Dm4',
 'E11',
 'E12',
 'E21',
 'E22',
 'E31',
 'E32',
 'E41',
 'E42',
 'E24',
 'E25',
 'E26',
 'E27',
 'FastingBloodSugar',
 'Glucose2hpp',
 'Cholestrol',
 'Triglycerides',
 'HDL',
 'LDL',
 'W.B.C',
 'Platelets',
 'DBP',
 'SBP',
 'gdi',
 'work_activity',
 'transport',
 'lesiretime']