In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import miceforest as mf
from missforest import MissForest
import optuna
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import time
from sklearn.metrics import mean_squared_error
import MIDASpy as md

### Data Preparation Function

# def prep(df: pd.DataFrame):
#     """
#     Preprocess the DataFrame by:
#     - Dropping rows with missing values and resetting the index.
#     - Converting object columns to categorical via LabelEncoder.
#     - Converting other columns to float (and then to int if >50% of values are integer-like).
#     - If any numeric column (not already marked as categorical) has only 2 unique values,
#       it is considered categorical and encoded.

#     Returns:
#         categorical_cols (list): List of columns encoded as categorical.
#         discrete_cols (list): List of columns that are numeric and integer-like.
#         cont_cols (list): List of remaining continuous numeric columns.
#         df_clean (DataFrame): The preprocessed DataFrame.
#         encoders (dict): Mapping from categorical column name to its LabelEncoder.
#     """
#     df_clean = df.dropna().reset_index(drop=True)
#     categorical_cols = []
#     discrete_cols = []
#     encoders = {}

#     for col in df_clean.columns:
#         if df_clean[col].dtype == 'object':
#             categorical_cols.append(col)
#             le = LabelEncoder()
#             df_clean[col] = le.fit_transform(df_clean[col])
#             encoders[col] = le
#         else:
#             try:
#                 df_clean[col] = df_clean[col].astype(float)
#                 if (np.isclose(df_clean[col] % 1, 0).mean() > 0.5):
#                     df_clean[col] = df_clean[col].astype(int)
#                     discrete_cols.append(col)
#             except (ValueError, TypeError):
#                 categorical_cols.append(col)
#                 le = LabelEncoder()
#                 df_clean[col] = le.fit_transform(df_clean[col])
#                 encoders[col] = le

#     for col in df_clean.columns:
#         if col not in categorical_cols and df_clean[col].nunique() == 2:
#             categorical_cols.append(col)
#             le = LabelEncoder()
#             df_clean[col] = le.fit_transform(df_clean[col])
#             encoders[col] = le

#     continuous_cols = [col for col in df_clean.columns if col not in categorical_cols + discrete_cols]

#     return continuous_cols, discrete_cols, categorical_cols, df_clean, encoders
def prep(df: pd.DataFrame):
    """
    Preprocess the DataFrame by:
    - Dropping rows with missing values and resetting the index.
    - Converting object columns to categorical via LabelEncoder.
    - Converting other columns to float (and then to int if >50% of values are integer-like).
    - If any numeric column (not already marked as categorical) has only 2 unique values,
      it is considered categorical and encoded.

    Returns:
        continuous_cols (list): List of remaining continuous numeric columns.
        discrete_cols (list): List of columns that are numeric and integer-like.
        categorical_cols (list): List of columns encoded as categorical.
        df_clean (DataFrame): The preprocessed DataFrame.
        encoders (dict): Mapping from categorical column name to its LabelEncoder.
    """
    # Drop rows with missing values.
    df_clean = df.dropna().reset_index(drop=True)
    categorical_cols = []
    discrete_cols = []
    encoders = {}

    # Loop over each column to check its type and convert accordingly.
    for col in df_clean.columns:
        # If the column type is object, encode it as a categorical variable.
        if df_clean[col].dtype == 'object' or df_clean[col].nunique() == 2:
            categorical_cols.append(col)
            le = LabelEncoder()
            df_clean[col] = le.fit_transform(df_clean[col])
            encoders[col] = le
        else:
            try:
                # Convert column to float first.
                df_clean[col] = df_clean[col].astype(float)
                # Check if most of the values are integer-like using np.isclose.
                # This computes the proportion of values where the modulus with 1 is nearly 0.
                if (np.isclose(df_clean[col] % 1, 0)).mean() > 0.5:
                    df_clean[col] = df_clean[col].astype(int)
                    discrete_cols.append(col)
            except (ValueError, TypeError):
                # If conversion to float fails, treat the column as categorical.
                categorical_cols.append(col)
                le = LabelEncoder()
                df_clean[col] = le.fit_transform(df_clean[col])
                encoders[col] = le

    # # After the loop, check for numeric columns that have only two unique values.
    # # If found, treat them as categorical.
    # for col in df_clean.columns:
    #     if col not in categorical_cols and df_clean[col].nunique() == 2:
    #         categorical_cols.append(col)
    #         le = LabelEncoder()
    #         df_clean[col] = le.fit_transform(df_clean[col])
    #         encoders[col] = le

    # Determine continuous columns as those not flagged as categorical or discrete.
    continuous_cols = [col for col in df_clean.columns if col not in categorical_cols + discrete_cols]

    return continuous_cols, discrete_cols, categorical_cols, df_clean, encoders

def reverse_encoding(df: pd.DataFrame, encoders: dict):
    """
    Reverse the LabelEncoder transformation on categorical columns.
    
    Parameters:
        df (pd.DataFrame): DataFrame with encoded categorical columns.
        encoders (dict): Dictionary mapping column names to their LabelEncoder.
    
    Returns:
        pd.DataFrame: A new DataFrame with the categorical columns decoded to their original labels.
    """
    df_decoded = df.copy()
    for col, le in encoders.items():
        df_decoded[col] = le.inverse_transform(df_decoded[col].astype(int))
    return df_decoded

def create_missings(df: pd.DataFrame, missingness: float, random_seed: float = 96):
    """
    Create random missingness in a DataFrame.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame.
        missingness (float): Percentage of missing values to introduce.
        random_seed (float): Seed for reproducibility.
    
    Returns:
        tuple: Original DataFrame, DataFrame with missing values, and a mask DataFrame.
    """
    np.random.seed(random_seed)
    mask = np.random.rand(*df.shape) < (missingness / 100)
    mask_df = pd.DataFrame(mask, columns=df.columns)
    df_missing = df.mask(mask)
    return df, df_missing, mask_df

def simulate_missingness(df, show_missingness=False):
    """
    Simulate missingness by dropping rows with missing values and reintroducing them.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame.
        show_missingness (bool): If True, prints missingness percentages.
    
    Returns:
        tuple: Original DataFrame without missing values, simulated DataFrame with missingness, and a mask.
    """
    missing_original = df.isna().mean()
    df2 = df.dropna().reset_index(drop=True)
    df3 = df2.copy()
    missing_mask = pd.DataFrame(False, index=df3.index, columns=df3.columns)

    for col in df3.columns:
        n_missing = int(round(missing_original[col] * len(df3)))
        if n_missing > 0:
            missing_indices = df3.sample(n=n_missing, random_state=42).index
            df3.loc[missing_indices, col] = np.nan
            missing_mask.loc[missing_indices, col] = True

    if show_missingness:
        missing_df3 = df3.isna().mean()
        print("Missingness Comparison:")
        for col in df.columns:
            print(f"Column '{col}': Original: {missing_original[col]*100:.2f}% \t -> \t df3: {missing_df3[col]*100:.2f}%")

    return df2, df3, missing_mask

In [2]:
# df = pd.read_excel(r"F:\Work stuff\Opthalmology\berlin\raw.xlsx", 'raw')
# df = df.drop(columns=['نامونامخانوادگی'],axis=1)
# continuous_cols, discrete_cols, categorical_cols, df, encoders = prep(df)
# df, df2, missing_mask = create_missings(df, 60)

In [3]:
def do_knn(df, continuous_cols=None, discrete_cols=None, categorical_cols=None, n_neighbors=5, scale=False):
    """
    Impute missing values using KNN imputation over all columns.

    Parameters:
        df (pd.DataFrame): DataFrame with missing values.
        continuous_cols (list): Names of continuous numeric columns.
        discrete_cols (list): Names of discrete numeric columns.
        categorical_cols (list): Names of categorical columns.
        n_neighbors (int): Number of neighbors for KNN.
        scale (bool): Whether to apply MinMaxScaler before imputation.

    Returns:
        pd.DataFrame: Imputed DataFrame.
    """
    df_imputed = df.copy()

    # Optionally scale all numeric columns
    if scale:
        scaler = MinMaxScaler()
        df_imputed[df.columns] = scaler.fit_transform(df_imputed)

    # Apply KNN imputation to the entire dataframe
    imputer = KNNImputer(n_neighbors=n_neighbors)
    df_imputed[df.columns] = imputer.fit_transform(df_imputed)

    # Reverse scale if needed
    if scale:
        df_imputed[df.columns] = scaler.inverse_transform(df_imputed)

    # Post-process: round discrete and categorical values
    if discrete_cols:
        df_imputed[discrete_cols] = np.round(df_imputed[discrete_cols]).astype(int)
    if categorical_cols:
        df_imputed[categorical_cols] = np.round(df_imputed[categorical_cols]).astype(int)

    return df_imputed

In [4]:
# knn_imputed = do_knn(df2, continuous_cols=None, discrete_cols=None, categorical_cols=None, n_neighbors=5, scale=False)

In [5]:
def do_mice(df, continuous_cols=None, discrete_cols=None, categorical_cols=None,
            iters=10, strat='normal', scale=False):
    """
    Impute missing values in a DataFrame using the MICE forest method.

    Parameters:
        df (pd.DataFrame): Input DataFrame with missing values.
        continuous_cols (list of str): Names of continuous numeric columns.
        discrete_cols (list of str): Names of discrete numeric columns.
        categorical_cols (list of str): Names of categorical columns.
        iters (int): Number of MICE iterations.
        strat: ['normal', 'shap', 'fast'] or a dictionary specifying the mean matching strategy.
        scale (bool): Whether to apply MinMaxScaler before imputation.

    Returns:
        pd.DataFrame: Imputed DataFrame.
    """
    df_imputed = df.copy()

    if scale:
        scaler = MinMaxScaler()
        df_imputed[continuous_cols] = scaler.fit_transform(df_imputed[continuous_cols])

    kernel = mf.ImputationKernel(
        df_imputed,
        random_state=0,
        mean_match_strategy=strat,
        variable_schema=None,  # Explicitly set variable_schema to None 
        )

    kernel.mice(iterations=iters, verbose=False)  # Disable verbose output
    df_completed = kernel.complete_data(dataset=0)

    if discrete_cols:
        df_completed[discrete_cols] = df_completed[discrete_cols].round().astype(int)
    if categorical_cols:
        df_completed[categorical_cols] = df_completed[categorical_cols].round().astype(int)

    if scale:
        scaler = MinMaxScaler()
        df_completed[continuous_cols] = scaler.inverse_transform(df_completed[continuous_cols])

    return df_completed

In [6]:
# mice_imputed = do_mice(df2, continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols,
                    #    iters=10, strat='normal', scale=False)

In [7]:

def do_mf(df, continuous_cols=None, discrete_cols=None, categorical_cols=None, iters=5, scale=False):
    """
    Impute missing values using MissForest.
    
    Parameters:
        df (pd.DataFrame): DataFrame with missing values.
        continuous_cols (list): Names of continuous numeric columns.
        discrete_cols (list): Names of discrete numeric columns.
        categorical_cols (list): Names of categorical columns.
        iters (int): Maximum number of iterations.
        scale (bool): Whether to apply MinMaxScaler before imputation.
    
    Returns:
        pd.DataFrame: Imputed DataFrame.
    """
    df_imputed = df.copy()
    
    if scale:
        scaler = MinMaxScaler()
        df_imputed[continuous_cols] = scaler.fit_transform(df_imputed[continuous_cols])
    
    imputer = MissForest(max_iter=iters, categorical=categorical_cols)
    df_imputed_result = imputer.fit_transform(df_imputed)
    
    if discrete_cols:
        df_imputed_result[discrete_cols] = df_imputed_result[discrete_cols].round().astype(int)
    
    if categorical_cols:
        df_imputed_result[categorical_cols] = df_imputed_result[categorical_cols].round().astype(int)
    
    if scale:
        # Reverse scaling for continuous columns
        df_imputed_result[continuous_cols] = scaler.inverse_transform(df_imputed_result[continuous_cols])
    
    return df_imputed_result

# mf_imputed = do_mf(df2, continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols, iters=5, scale=False)

In [8]:
def do_midas(df, continuous_cols=None, discrete_cols=None, categorical_cols=None,
              layer:list=[256,256], vae:bool=True, samples:int=10, random_seed:float=96 ):
    """
    Imputes missing values using the MIDAS model.
    
    Parameters:
      df (pd.DataFrame): Input dataframe.
      continuous_cols (list): List of continuous column names.
      discrete_cols (list): List of discrete (numeric but non-continuous) column names.
      categorical_cols (list): List of categorical column names.
      
    Returns:
      imps (list): A list of imputed dataframes.
    """
    # 1. Convert categorical columns and get categorical metadata.
    md_cat_data, md_cats = md.cat_conv(df[categorical_cols])
    
    # 2. Define the numeric columns.
    num_cols = discrete_cols + continuous_cols  # these are the numeric columns

    # 3. Drop original categorical columns and combine with the converted categorical data.
    df_copy = df.drop(columns=categorical_cols,axis=1)
    constructor_list = [df_copy, md_cat_data]
    data_in = pd.concat(constructor_list, axis=1)
    
    # 4. Scale non-categorical columns BEFORE imputation.
    scaler = MinMaxScaler()
    data_in[num_cols] = scaler.fit_transform(data_in[num_cols])
    
    # 5. Build and train the imputer using the scaled data.
    imputer = md.Midas(layer_structure=layer, vae_layer=vae, seed=random_seed, input_drop=0.75)
    # Use md_cats as softmax columns for categorical outputs.
    imputer.build_model(data_in, softmax_columns=md_cats)
    imputer.train_model(training_epochs=20)
    
    # 6. Generate imputations.
    imps = imputer.generate_samples(m=samples).output_list
    
    # 7. Post-process each imputed DataFrame.
    for idx, imp_df in enumerate(imps):
        # Reverse transform the numeric columns.
        imp_df[num_cols] = scaler.inverse_transform(imp_df[num_cols])
        
        # Process categorical columns.
        # For each softmax group in md_cats, choose the column with the highest probability.
        tmp_cat = []
        for group in md_cats:
            # idxmax returns the column name with maximum value per row for this group.
            tmp_cat.append(imp_df[group].idxmax(axis=1))
        # Assume the order of md_cats corresponds to categorical_cols.
        cat_df = pd.DataFrame({categorical_cols[j]: tmp_cat[j] for j in range(len(categorical_cols))})
        
        # Drop the softmax columns.
        flat_cats = [col for group in md_cats for col in group]
        tmp_cat = [imp_df[x].idxmax(axis=1) for x in md_cats]
        cat_df = pd.DataFrame({categorical_cols[j]: tmp_cat[j] for j in range(len(categorical_cols))})
        imp_df = pd.concat([imp_df, cat_df], axis=1).drop(columns=flat_cats, axis=1)
        
        # Handle discrete data by rounding the values.
        imp_df[discrete_cols] = imp_df[discrete_cols].round()
        
        # Replace the processed DataFrame in the list.
        imps[idx] = imp_df

        ### make method info
        method_info = f'MIDAS, params: samples={samples} ,layer={layer}, vae={vae}'
    return imps, method_info

In [9]:
# midas_imputed = do_midas(df2, continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

In [10]:
import optuna

In [11]:
# ------------------------------------------------------------------------------
# Missingness Creation Function
# ------------------------------------------------------------------------------
def create_missings(df: pd.DataFrame, missingness: float, random_seed: float = 96):
    """
    Create random missingness in a DataFrame.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame.
        missingness (float): Percentage of missing values to introduce.
        random_seed (float): Seed for reproducibility.
    
    Returns:
        tuple: (original DataFrame, DataFrame with missing values, mask DataFrame)
    """
    np.random.seed(random_seed)
    mask = np.random.rand(*df.shape) < (missingness / 100)
    mask_df = pd.DataFrame(mask, columns=df.columns)
    df_missing = df.mask(mask)
    return df, df_missing, mask_df

# ------------------------------------------------------------------------------
# Improved Evaluation Function
# ------------------------------------------------------------------------------
def select_best_imputations(imputed_dfs, original_df, mask_df, continuous_cols, discrete_cols, categorical_cols, method_info=None, method_names=None):
    """
    Evaluate one or several imputed DataFrames and determine an aggregated error.

    For each column with simulated missing data (per mask_df), numeric columns
    are scored using Mean Absolute Error (MAE) while categorical columns are scored
    by misclassification rate (1 - accuracy). An overall aggregated error is returned,
    which is the mean error over all evaluated columns.

    Parameters:
      imputed_dfs (list of pd.DataFrame): A list of imputed DataFrames.
      original_df (pd.DataFrame): The original (complete) DataFrame.
      mask_df (pd.DataFrame): Boolean DataFrame with True at positions where values are masked.
      continuous_cols (list): List of continuous numeric column names.
      discrete_cols (list): List of discrete numeric column names.
      categorical_cols (list): List of categorical column names.
      method_info (str, optional): Text description of the method and its hyperparameters.
      method_names (list, optional): List of names for each imputation method candidate.

    Returns:
      best_imputed_df (pd.DataFrame): A DataFrame where, for each column with missing values,
                                     the candidate with the lowest error is chosen.
      summary_table (pd.DataFrame): A summary table with metrics for each column.
      aggregated_error (float): The average error across columns (lower is better).
    """
    n_methods = len(imputed_dfs)
    
    if method_info is not None:
        parts = method_info.split(',')
        base_name = parts[0].strip()
        params = ','.join(parts[1:]).strip() if len(parts) > 1 else ""
        method_names = [f"{base_name} ({params})"] * n_methods
    elif method_names is None:
        method_names = [f"Method {i+1}" for i in range(n_methods)]
    
    summary_list = []
    best_method_per_col = {}

    for col in original_df.columns:
        if col in continuous_cols:
            col_type = "Continuous"
        elif col in discrete_cols:
            col_type = "Discrete"
        elif col in categorical_cols:
            col_type = "Categorical"
        else:
            col_type = str(original_df[col].dtype)

        if mask_df[col].sum() == 0:
            best_method_per_col[col] = None
            summary_list.append({
                'Column': col,
                'Data Type': col_type,
                'Best Method': None,
                'Metric': np.nan,  
            })
            continue

        col_errors = []
        for df_imp in imputed_dfs:
            if col_type in ["Continuous", "Discrete"]:
                try:
                    imp_vals = pd.to_numeric(df_imp[col][mask_df[col]], errors='coerce')
                    orig_vals = pd.to_numeric(original_df[col][mask_df[col]], errors='coerce')
                except Exception as e:
                    imp_vals = df_imp[col][mask_df[col]]
                    orig_vals = original_df[col][mask_df[col]]
                errors = np.abs(imp_vals - orig_vals)
                mae = errors.mean()
                col_errors.append(mae)
            else:
                correct = (df_imp[col][mask_df[col]] == original_df[col][mask_df[col]])
                accuracy = correct.mean()
                col_errors.append(1 - accuracy)

        if col_type in ["Continuous", "Discrete"]:
            best_idx = int(np.nanargmin(col_errors))
        else:
            best_idx = int(np.nanargmin(col_errors))
        best_method = method_names[best_idx]
        best_metric = col_errors[best_idx]

        best_method_per_col[col] = best_idx
        summary_list.append({
            'Column': col,
            'Data Type': col_type,
            'Best Method': best_method,
            'Metric': best_metric,
        })

    summary_table = pd.DataFrame(summary_list)
    
    best_imputed_df = original_df.copy()
    for col in original_df.columns:
        if mask_df[col].sum() > 0 and best_method_per_col[col] is not None:
            method_idx = best_method_per_col[col]
            best_imputed_df.loc[mask_df[col], col] = imputed_dfs[method_idx].loc[mask_df[col], col]

    errors = summary_table['Metric'].dropna().values
    aggregated_error = np.mean(errors) if len(errors) > 0 else np.nan

    return best_imputed_df, summary_table, aggregated_error

# ------------------------------------------------------------------------------
# Hyperparameter Optimization Function using Optuna
# ------------------------------------------------------------------------------
def optimize_imputation_hyperparams(imputation_func, 
                                    original_df, 
                                    missing_percent, 
                                    continuous_cols, 
                                    discrete_cols, 
                                    categorical_cols, 
                                    timelimit=600,    # in seconds
                                    min_trials=20,
                                    random_seed=96):
    """
    Optimize hyperparameters for an imputation function using Optuna.

    This function takes the complete (original) DataFrame and a missing percentage.
    It uses `create_missings` to generate a DataFrame with simulated missing values and
    a corresponding mask. Then it runs the candidate imputation method on the incomplete
    DataFrame, evaluates the imputed results against the original DataFrame using the mask,
    and guides the hyperparameter search based on an aggregated error (lower is better).

    Parameters:
        imputation_func (callable): An imputation function (do_knn, do_mice, do_mf, or do_midas).
        original_df (pd.DataFrame): The complete ground-truth DataFrame.
        missing_percent (float): Percentage of missing values to simulate.
        continuous_cols (list): List of continuous numeric column names.
        discrete_cols (list): List of discrete numeric column names.
        categorical_cols (list): List of categorical column names.
        timelimit (int): Maximum time in seconds to run the optimization.
        min_trials (int): Minimum number of Optuna trials to run.
        random_seed (int): Seed for generating missingness (passed to create_missings).

    Returns:
        best_trial: The best trial object from the study.
        best_value: The best (lowest) aggregated objective value.
    """
    # Generate missing values and mask using the provided function.
    _, df_missing, mask_df = create_missings(original_df, missingness=missing_percent, random_seed=random_seed)

    def objective(trial):
        func_name = imputation_func.__name__
        params = {}

        if func_name == "do_knn":
            params['n_neighbors'] = trial.suggest_int("n_neighbors", 3, 15)
            params['scale'] = trial.suggest_categorical("scale", [True, False])
            # Run imputation on df_missing, not the original complete data.
            imputed_df = imputation_func(df_missing, 
                                         continuous_cols=continuous_cols, 
                                         discrete_cols=discrete_cols, 
                                         categorical_cols=categorical_cols, 
                                         **params)
            imputed_dfs = [imputed_df]
            method_info = f"KNN, n_neighbors={params['n_neighbors']}, scale={params['scale']}"
        elif func_name == "do_mice":
            params['iters'] = trial.suggest_int("iters", 5, 20)
            params['strat'] = trial.suggest_categorical("strat", ['normal', 'shap', 'fast'])
            params['scale'] = trial.suggest_categorical("scale", [True, False])
            imputed_df = imputation_func(df_missing,
                                         continuous_cols=continuous_cols, 
                                         discrete_cols=discrete_cols, 
                                         categorical_cols=categorical_cols,
                                         **params)
            imputed_dfs = [imputed_df]
            method_info = f"MICE, iters={params['iters']}, strat={params['strat']}, scale={params['scale']}"
        elif func_name == "do_mf":
            params['iters'] = trial.suggest_int("iters", 3, 15)
            params['scale'] = trial.suggest_categorical("scale", [True, False])
            imputed_df = imputation_func(df_missing,
                                         continuous_cols=continuous_cols, 
                                         discrete_cols=discrete_cols, 
                                         categorical_cols=categorical_cols,
                                         **params)
            imputed_dfs = [imputed_df]
            method_info = f"MissForest, iters={params['iters']}, scale={params['scale']}"
        elif func_name == "do_midas":
            params['layer'] = trial.suggest_categorical("layer", [[256,256], [128,128], [512,256]])
            params['vae'] = trial.suggest_categorical("vae", [True, False])
            params['samples'] = trial.suggest_int("samples", 5, 20)
            imputed_dfs, method_info = imputation_func(df_missing,
                                                       continuous_cols=continuous_cols, 
                                                       discrete_cols=discrete_cols, 
                                                       categorical_cols=categorical_cols,
                                                       **params)
            imputed_dfs = [imputed_dfs[0]]
        else:
            raise ValueError(f"Unsupported imputation function: {func_name}")

        # Evaluate the imputed result by comparing against the original complete DataFrame.
        _, summary_table, aggregated_error = select_best_imputations(
            imputed_dfs, original_df, mask_df, continuous_cols, discrete_cols, categorical_cols,
            method_info=method_info
        )

        if np.isnan(aggregated_error):
            aggregated_error = 1e6

        return aggregated_error

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, timeout=timelimit, n_trials=min_trials)

    best_trial = study.best_trial
    best_value = best_trial.value

    print("Optimization completed!")
    print("Best Trial Hyperparameters:")
    for key, value in best_trial.params.items():
        print(f"  {key}: {value}")
    print(f"Best Objective Value (aggregated error): {best_value}")

    return best_trial, best_value

In [12]:
# best_method, best_val = optimize_imputation_hyperparams(imputation_func=do_knn,original_df=df,missing_percent=20,
#                                                         continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

In [13]:
# best_method, best_val = optimize_imputation_hyperparams(imputation_func=do_mf,original_df=df,missing_percent=20,
#                                                         continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

In [14]:
# best_method, best_val = optimize_imputation_hyperparams(imputation_func=do_midas,original_df=df,missing_percent=20,timelimit=300,
#                                                         continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

In [15]:
# best_method, best_val = optimize_imputation_hyperparams(imputation_func=do_mice,original_df=df,missing_percent=20,timelimit=300,
#                                                         continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

In [16]:
new_df = pd.read_excel(r"C:\Users\Matin\Downloads\Data for Dr.Matin.xlsx", 's1')
new_df.drop(['n', 'ID','Gen.code'],axis=1,inplace=True)
new_df = new_df[:300]
# continuous_cols, discrete_cols, categorical_cols, df2, encoders = prep(new_df)


In [17]:
# best_method, best_val = optimize_imputation_hyperparams(imputation_func=do_knn,original_df=df2,missing_percent=30,
#                                                         continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

In [18]:
# best_method, best_val = optimize_imputation_hyperparams(imputation_func=do_mf,original_df=new_df,missing_percent=30, timelimit=300,
#                                                         continuous_cols=continuous_cols, discrete_cols=discrete_cols, categorical_cols=categorical_cols)

In [19]:
def simulate_missingness(df, show_missingness=False, random_state=42):
    """
    Simulate missingness by dropping rows with missing values and reintroducing them.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame.
        show_missingness (bool): If True, prints missingness percentages.
    
    Returns:
        tuple: Original DataFrame without missing values, simulated DataFrame with missingness, and a mask.
    """
    missing_original = df.isna().mean()
    df2 = df.dropna().reset_index(drop=True)
    df3 = df2.copy()
    missing_mask = pd.DataFrame(False, index=df3.index, columns=df3.columns)

    for col in df3.columns:
        n_missing = int(round(missing_original[col] * len(df3)))
        if n_missing > 0:
            missing_indices = df3.sample(n=n_missing, random_state=random_state).index
            df3.loc[missing_indices, col] = np.nan
            missing_mask.loc[missing_indices, col] = True

    if show_missingness:
        missing_df3 = df3.isna().mean()
        print("Missingness Comparison:")
        for col in df.columns:
            print(f"Column '{col}': Original: {missing_original[col]*100:.2f}% \t -> \t df3: {missing_df3[col]*100:.2f}%")

    return df2, df3, missing_mask

def create_missings(df: pd.DataFrame, missingness: float, random_seed: float = 96):
    """
    Create random missingness in a DataFrame.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame.
        missingness (float): Percentage of missing values to introduce.
        random_seed (float): Seed for reproducibility.
    
    Returns:
        tuple: (original DataFrame, DataFrame with missing values, mask DataFrame)
    """
    np.random.seed(random_seed)
    mask = np.random.rand(*df.shape) < (missingness / 100)
    mask_df = pd.DataFrame(mask, columns=df.columns)
    df_missing = df.mask(mask)
    return df, df_missing, mask_df


def select_best_imputations(imputed_dfs, original_df, mask_df, continuous_cols, discrete_cols, categorical_cols, method_info=None, method_names=None):
    """
    Evaluate one or several imputed DataFrames and determine an aggregated error.

    For each column with simulated missing data (per mask_df), numeric columns
    are scored using Mean Absolute Error (MAE) while categorical columns are scored
    by misclassification rate (1 - accuracy). An overall aggregated error is returned,
    which is the mean error over all evaluated columns.

    Parameters:
      imputed_dfs (list of pd.DataFrame): A list of imputed DataFrames.
      original_df (pd.DataFrame): The original (complete) DataFrame.
      mask_df (pd.DataFrame): Boolean DataFrame with True at positions where values are masked.
      continuous_cols (list): List of continuous numeric column names.
      discrete_cols (list): List of discrete numeric column names.
      categorical_cols (list): List of categorical column names.
      method_info (str, optional): Text description of the method and its hyperparameters.
      method_names (list, optional): List of names for each imputation method candidate.

    Returns:
      best_imputed_df (pd.DataFrame): A DataFrame where, for each column with missing values,
                                     the candidate with the lowest error is chosen.
      summary_table (pd.DataFrame): A summary table with metrics for each column.
      aggregated_error (float): The average error across columns (lower is better).
    """
    n_methods = len(imputed_dfs)
    
    if method_info is not None:
        parts = method_info.split(',')
        base_name = parts[0].strip()
        params = ','.join(parts[1:]).strip() if len(parts) > 1 else ""
        method_names = [f"{base_name} ({params})"] * n_methods
    elif method_names is None:
        method_names = [f"Method {i+1}" for i in range(n_methods)]
    
    summary_list = []
    best_method_per_col = {}

    for col in original_df.columns:
        if col in continuous_cols:
            col_type = "Continuous"
        elif col in discrete_cols:
            col_type = "Discrete"
        elif col in categorical_cols:
            col_type = "Categorical"
        else:
            col_type = str(original_df[col].dtype)

        if mask_df[col].sum() == 0:
            best_method_per_col[col] = None
            summary_list.append({
                'Column': col,
                'Data Type': col_type,
                'Best Method': None,
                'Metric': np.nan,  
            })
            continue

        col_errors = []
        for df_imp in imputed_dfs:
            if col_type in ["Continuous", "Discrete"]:
                try:
                    imp_vals = pd.to_numeric(df_imp[col][mask_df[col]], errors='coerce')
                    orig_vals = pd.to_numeric(original_df[col][mask_df[col]], errors='coerce')
                except Exception as e:
                    imp_vals = df_imp[col][mask_df[col]]
                    orig_vals = original_df[col][mask_df[col]]
                errors = np.abs(imp_vals - orig_vals)
                mae = errors.mean()
                col_errors.append(mae)
            else:
                correct = (df_imp[col][mask_df[col]] == original_df[col][mask_df[col]])
                accuracy = correct.mean()
                col_errors.append(1 - accuracy)

        if col_type in ["Continuous", "Discrete"]:
            best_idx = int(np.nanargmin(col_errors))
        else:
            best_idx = int(np.nanargmin(col_errors))
        best_method = method_names[best_idx]
        best_metric = col_errors[best_idx]

        best_method_per_col[col] = best_idx
        summary_list.append({
            'Column': col,
            'Data Type': col_type,
            'Best Method': best_method,
            'Metric': best_metric,
        })

    summary_table = pd.DataFrame(summary_list)
    
    best_imputed_df = original_df.copy()
    for cat in categorical_cols:
        if cat in best_imputed_df:
            best_imputed_df[cat] = best_imputed_df[cat].astype(object)

    for col in original_df.columns:
        if mask_df[col].sum() > 0 and best_method_per_col[col] is not None:
            method_idx = best_method_per_col[col]
            best_imputed_df.loc[mask_df[col], col] = \
                imputed_dfs[method_idx].loc[mask_df[col], col]

    errors = summary_table['Metric'].dropna().values
    aggregated_error = np.mean(errors) if len(errors) > 0 else np.nan

    return best_imputed_df, summary_table, aggregated_error


def optimize_imputation_hyperparams(imputation_func, 
                                    original_df, 
                                    df_missing, 
                                    mask_df, 
                                    continuous_cols, 
                                    discrete_cols, 
                                    categorical_cols, 
                                    timelimit=600,    # in seconds
                                    min_trials=20,
                                    random_seed=96):
    """
    Optimize hyperparameters for an imputation function using Optuna.

    This function takes the complete (original) DataFrame and a missing percentage.
    It uses `create_missings` to generate a DataFrame with simulated missing values and
    a corresponding mask. Then it runs the candidate imputation method on the incomplete
    DataFrame, evaluates the imputed results against the original DataFrame using the mask,
    and guides the hyperparameter search based on an aggregated error (lower is better).

    Parameters:
        imputation_func (callable): An imputation function (do_knn, do_mice, do_mf, or do_midas).
        original_df (pd.DataFrame): The complete ground-truth DataFrame.
        missing_percent (float): Percentage of missing values to simulate.
        continuous_cols (list): List of continuous numeric column names.
        discrete_cols (list): List of discrete numeric column names.
        categorical_cols (list): List of categorical column names.
        timelimit (int): Maximum time in seconds to run the optimization.
        min_trials (int): Minimum number of Optuna trials to run.
        random_seed (int): Seed for generating missingness (passed to create_missings).

    Returns:
        best_trial: The best trial object from the study.
        best_value: The best (lowest) aggregated objective value.
    """
    # Generate missing values and mask using the provided function.
    # _, df_missing, mask_df = create_missings(original_df, missingness=missing_percent, random_seed=random_seed)

    def objective(trial):
        func_name = imputation_func.__name__
        params = {}

        if func_name == "do_knn":
            params['n_neighbors'] = trial.suggest_int("n_neighbors", 3, 15)
            params['scale'] = trial.suggest_categorical("scale", [True, False])
            # Run imputation on df_missing, not the original complete data.
            imputed_df = imputation_func(df_missing, 
                                         continuous_cols=continuous_cols, 
                                         discrete_cols=discrete_cols, 
                                         categorical_cols=categorical_cols, 
                                         **params)
            imputed_dfs = [imputed_df]
            method_info = f"KNN, n_neighbors={params['n_neighbors']}, scale={params['scale']}"
        elif func_name == "do_mice":
            params['iters'] = trial.suggest_int("iters", 5, 20)
            params['strat'] = trial.suggest_categorical("strat", ['normal', 'shap', 'fast'])
            params['scale'] = trial.suggest_categorical("scale", [True, False])
            imputed_df = imputation_func(df_missing,
                                         continuous_cols=continuous_cols, 
                                         discrete_cols=discrete_cols, 
                                         categorical_cols=categorical_cols,
                                         **params)
            imputed_dfs = [imputed_df]
            method_info = f"MICE, iters={params['iters']}, strat={params['strat']}, scale={params['scale']}"
        elif func_name == "do_mf":
            params['iters'] = trial.suggest_int("iters", 3, 15)
            params['scale'] = trial.suggest_categorical("scale", [True, False])
            imputed_df = imputation_func(df_missing,
                                         continuous_cols=continuous_cols, 
                                         discrete_cols=discrete_cols, 
                                         categorical_cols=categorical_cols,
                                         **params)
            imputed_dfs = [imputed_df]
            method_info = f"MissForest, iters={params['iters']}, scale={params['scale']}"
        elif func_name == "do_midas":
            params['layer'] = trial.suggest_categorical("layer", [[256,256], [128,128], [512,256]])
            params['vae'] = trial.suggest_categorical("vae", [True, False])
            params['samples'] = trial.suggest_int("samples", 5, 20)
            imputed_dfs, method_info = imputation_func(df_missing,
                                                       continuous_cols=continuous_cols, 
                                                       discrete_cols=discrete_cols, 
                                                       categorical_cols=categorical_cols,
                                                       **params)
            imputed_dfs = [imputed_dfs[0]]
        else:
            raise ValueError(f"Unsupported imputation function: {func_name}")

        # Evaluate the imputed result by comparing against the original complete DataFrame.
        _, _, aggregated_error = select_best_imputations(
            imputed_dfs, original_df, mask_df, continuous_cols, discrete_cols, categorical_cols,
            method_info=method_info
        )

        if np.isnan(aggregated_error):
            aggregated_error = 1e6

        return aggregated_error

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, timeout=timelimit, n_trials=min_trials)

    best_trial = study.best_trial
    best_value = best_trial.value

    print("Optimization completed!")
    print("Best Trial Hyperparameters:")
    for key, value in best_trial.params.items():
        print(f"  {key}: {value}")
    print(f"Best Objective Value (aggregated error): {best_value}")

    return best_trial, best_value

In [20]:
def do_midas(df,
             continuous_cols=None,
             discrete_cols=None,
             categorical_cols=None,
             layer: list = [256, 256],
             vae: bool = True,
             samples: int = 10,
             random_seed: float = 96):
    """
    Imputes missing values using the MIDAS model.

    Parameters:
      df (pd.DataFrame): Input dataframe with NaNs in both numeric & categorical.
      continuous_cols (list): List of continuous column names.
      discrete_cols (list): List of discrete (numeric but not continuous) column names.
      categorical_cols (list): List of categorical column names.

    Returns:
      imps (list): A list of imputed dataframes, with original dtypes restored.
      method_info (str): Summary of MIDAS params used.
    """
    # 1. One‑hot encode the categoricals
    md_cat_data, md_cats = md.cat_conv(df[categorical_cols])

    # 2. Build the “wide” DF: drop raw cats, append one‑hots
    df_num = df.drop(columns=categorical_cols)
    data_in = pd.concat([df_num, md_cat_data], axis=1)

    # 3. Record & re‑insert the NaN locations so MIDAS sees them as missing
    na_mask = data_in.isnull()
    data_in[na_mask] = np.nan

    # 4. Scale only the numeric columns in place
    num_cols = discrete_cols + continuous_cols
    scaler = MinMaxScaler()
    data_in[num_cols] = scaler.fit_transform(data_in[num_cols])

    # 5. Build & train the MIDAS model
    imputer = md.Midas(
        layer_structure=layer,
        vae_layer=vae,
        seed=random_seed,
        input_drop=0.75
    )
    imputer.build_model(data_in, softmax_columns=md_cats)
    imputer.train_model(training_epochs=20)

    # 6. Generate multiple imputations
    raw_imps = imputer.generate_samples(m=samples).output_list

    # 7. Decode each imputed DF back to original structure
    flat_cats = [c for grp in md_cats for c in grp]
    imps = []

    for imp_df in raw_imps:
        # 7a. inverse‑scale numeric cols
        imp_df[num_cols] = scaler.inverse_transform(imp_df[num_cols])

        # 7b. decode one‑hots (before dropping them!)
        decoded = {}
        for i, grp in enumerate(md_cats):
            # just in case, only keep those actually present
            present = [c for c in grp if c in imp_df.columns]
            # idxmax → gives the dummy column name with highest prob
            decoded[categorical_cols[i]] = imp_df[present].idxmax(axis=1)

        cat_df = pd.DataFrame(decoded, index=imp_df.index)

        # 7c. now drop the dummy cols
        base = imp_df.drop(columns=flat_cats, errors='ignore')

        # 7d. concat in your decoded cat columns
        merged = pd.concat([base, cat_df], axis=1)

        # 7e. round discrete cols
        merged[discrete_cols] = merged[discrete_cols].round().astype(int)

        imps.append(merged)

    method_info = f"MIDAS, params: samples={samples}, layer={layer}, vae={vae}"
    return imps, method_info


def run_full_pipeline(df: pd.DataFrame, 
                      simulate:bool=False,               # True for simulated missingness, False for random missingness
                      missingness_value: float = 10.0,   # used only for random missingness (percent)
                      show_missingness: bool = False,
                      timelimit: int = 600, 
                      min_trials: int = 20, 
                      random_seed: int = 96):
    """
    Run the full pipeline to find the best hyperparameters for each imputation method.

    The pipeline performs these steps:
    
      1. Preprocesses the DataFrame using `prep`, which cleans the data,
         encodes categorical variables, and splits features into continuous,
         discrete, and categorical lists.
      2. Introduces missingness using either simulated missingness (reintroducing missingness 
         based on the original NaN proportions) or random missingness (dropping values randomly
         given a specified missing percentage).
      3. Runs hyperparameter optimization (via `optimize_imputation_hyperparams`) for each candidate 
         imputation method (e.g., do_knn, do_mice, do_mf, do_midas).
         
    Parameters:
        df (pd.DataFrame): The input DataFrame.
        missing_type (str): "simulate" to simulate missingness using original missing proportions,
                            "random" to drop values randomly.
        missingness_value (float): Percentage of missingness (only used if missing_type == "random").
        show_missingness (bool): If True, prints missingness comparison when using simulate missingness.
        timelimit (int): Time limit (in seconds) for each hyperparameter optimization study.
        min_trials (int): Minimum number of trials for each study.
        random_seed (int): Random seed for reproducibility.

    Returns:
        dict: A dictionary where the keys are method names (strings) and the values are the best 
              hyperparameter dictionaries (from the best Optuna trial) for that method.
    """

    # Step 1: Preprocess Data
    continuous_cols, discrete_cols, categorical_cols, df_clean, encoders = prep(df)
    
    # Step 2: Create Missingness
    # Note: For simulation, the missing proportions are taken from the original df.
    if simulate: 
        # simulate_missingness returns: (complete_df, df_with_missing, missing_mask)
        original_complete, df_missing, mask_df = simulate_missingness(df, 
                                                                      show_missingness=show_missingness,
                                                                      random_state=random_seed)
    else:   
        original_complete, df_missing, mask_df = create_missings(df_clean, 
                                                                 missingness=missingness_value, 
                                                                 random_seed=random_seed)
        
    # The original_complete DataFrame is assumed to be ground truth for evaluation.
    # If missing_type=="simulate", original_complete is the complete-case subset from the original data.
    # For random, the df_clean (preprocessed and complete) is used and missingness is artificially introduced.

    # Step 3: Define candidate imputation methods to optimize.
    # It is assumed that these functions are defined: do_knn, do_mice, do_mf, do_midas.
    candidate_methods = {
        "KNN": do_knn,
        "MICE": do_mice,
        "MissForest": do_mf,
        "MIDAS": do_midas
    }
    
    best_hyperparams = {}
    
    # Optimize hyperparameters for each imputation method candidate.
    for method_name, imputation_func in candidate_methods.items():
        print(f"\nOptimizing hyperparameters for {method_name}...")
        try:
            best_trial, best_value = optimize_imputation_hyperparams(
                imputation_func=imputation_func,
                original_df=original_complete,
                df_missing=df_missing,
                mask_df=mask_df,
                continuous_cols=continuous_cols,
                discrete_cols=discrete_cols,
                categorical_cols=categorical_cols,
                timelimit=timelimit,
                min_trials=min_trials,
                random_seed=random_seed
            )
            best_hyperparams[method_name] = best_trial.params
        except Exception as e:
            print(f"An error occurred while optimizing {method_name}: {e}")
            best_hyperparams[method_name] = None
    
    return best_hyperparams


In [21]:
best_params = run_full_pipeline(new_df,timelimit=60,random_seed=96)

[I 2025-04-17 02:57:14,415] A new study created in memory with name: no-name-969bd64b-c97c-4495-8f64-b81de3088888
[I 2025-04-17 02:57:14,533] Trial 0 finished with value: 131131.6001578852 and parameters: {'n_neighbors': 15, 'scale': False}. Best is trial 0 with value: 131131.6001578852.



Optimizing hyperparameters for KNN...


[I 2025-04-17 02:57:14,682] Trial 1 finished with value: 131029.3234563309 and parameters: {'n_neighbors': 9, 'scale': False}. Best is trial 1 with value: 131029.3234563309.
[I 2025-04-17 02:57:14,816] Trial 2 finished with value: 131074.15471079925 and parameters: {'n_neighbors': 6, 'scale': False}. Best is trial 1 with value: 131029.3234563309.
[I 2025-04-17 02:57:14,948] Trial 3 finished with value: 125217.44248023287 and parameters: {'n_neighbors': 7, 'scale': True}. Best is trial 3 with value: 125217.44248023287.
[I 2025-04-17 02:57:15,067] Trial 4 finished with value: 130164.61403298599 and parameters: {'n_neighbors': 8, 'scale': False}. Best is trial 3 with value: 125217.44248023287.
[I 2025-04-17 02:57:15,182] Trial 5 finished with value: 130465.42082056598 and parameters: {'n_neighbors': 13, 'scale': False}. Best is trial 3 with value: 125217.44248023287.
[I 2025-04-17 02:57:15,300] Trial 6 finished with value: 130557.66283120723 and parameters: {'n_neighbors': 14, 'scale': Fa

Optimization completed!
Best Trial Hyperparameters:
  n_neighbors: 7
  scale: True
Best Objective Value (aggregated error): 125217.44248023287

Optimizing hyperparameters for MICE...


  bachelor_preds = bachelor_preds.astype(_PRE_LINK_DATATYPE)
  candidate_preds = candidate_preds.astype(_PRE_LINK_DATATYPE)  # type: ignore
[W 2025-04-17 02:57:18,066] Trial 0 failed with parameters: {'iters': 13, 'strat': 'shap', 'scale': True} because of the following error: ValueError('data must be finite, check for nan or inf values').
Traceback (most recent call last):
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Matin\AppData\Local\Temp\ipykernel_38656\3942454051.py", line 216, in objective
    imputed_df = imputation_func(df_missing,
  File "C:\Users\Matin\AppData\Local\Temp\ipykernel_38656\746599116.py", line 31, in do_mice
    kernel.mice(iterations=iters, verbose=False)  # Disable verbose output
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\site-packages\miceforest\imputation_kernel.py", line 1186, in mice
    imputati

An error occurred while optimizing MICE: data must be finite, check for nan or inf values

Optimizing hyperparameters for MissForest...


[WinError 2] The system cannot find the file specified
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 1456, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
100%|██████████| 4/4 [00:20<00:00,  5.10s/it]
100%|██████████| 4/4 [00:01<00:00,  2.72it/s]
[I 2025-04-17 02:57:40,290] Trial 0 finished with value: 133577.8825688047 and parameters: {'iters': 4, 'scale': True}. Best is trial 0 with value: 133577.8825688047.
10

Optimization completed!
Best Trial Hyperparameters:
  iters: 14
  scale: True
Best Objective Value (aggregated error): 131308.20920312364

Optimizing hyperparameters for MIDAS...
Size index: [41, 2]

Computation graph constructed

Model initialised

Epoch: 0 , loss: 17.09939373864068
Epoch: 1 , loss: 15.89198440975613
Epoch: 2 , loss: 15.03160466088189
Epoch: 3 , loss: 14.103373103671604
Epoch: 4 , loss: 13.141355302598742
Epoch: 5 , loss: 12.021252473195394
Epoch: 6 , loss: 10.784110228220621
Epoch: 7 , loss: 9.804625352223715
Epoch: 8 , loss: 9.114099661509195
Epoch: 9 , loss: 8.790817101796469
Epoch: 10 , loss: 8.354380819532606
Epoch: 11 , loss: 8.267691956626045
Epoch: 12 , loss: 8.033540566762289
Epoch: 13 , loss: 8.008234659830729
Epoch: 14 , loss: 7.8632139629787865
Epoch: 15 , loss: 7.817143652174208
Epoch: 16 , loss: 7.667036745283339
Epoch: 17 , loss: 7.737847646077474
Epoch: 18 , loss: 7.592597617043389
Epoch: 19 , loss: 7.582941638098823
Training complete. Saving file...
M

[I 2025-04-17 02:59:17,365] Trial 0 finished with value: 153066.86984544783 and parameters: {'layer': [128, 128], 'vae': False, 'samples': 18}. Best is trial 0 with value: 153066.86984544783.


Size index: [41, 2]

Computation graph constructed

Model initialised

Epoch: 0 , loss: 17.09940348731147
Epoch: 1 , loss: 15.892036596934
Epoch: 2 , loss: 15.03161350886027
Epoch: 3 , loss: 14.103606383005777
Epoch: 4 , loss: 13.141473611195883
Epoch: 5 , loss: 12.021337774064806
Epoch: 6 , loss: 10.784339480929905
Epoch: 7 , loss: 9.804694599575466
Epoch: 8 , loss: 9.11409616470337
Epoch: 9 , loss: 8.790806982252333
Epoch: 10 , loss: 8.354431867599487
Epoch: 11 , loss: 8.267699506547716
Epoch: 12 , loss: 8.03358358807034
Epoch: 13 , loss: 8.008200142118666
Epoch: 14 , loss: 7.863242864608765
Epoch: 15 , loss: 7.817123174667358
Epoch: 16 , loss: 7.667099131478204
Epoch: 17 , loss: 7.7378054989708795
Epoch: 18 , loss: 7.592677487267388
Epoch: 19 , loss: 7.582981454001533
Training complete. Saving file...
Model saved in file: tmp/MIDAS
INFO:tensorflow:Restoring parameters from tmp/MIDAS
Model restored.


[I 2025-04-17 02:59:29,784] Trial 1 finished with value: 153064.00260203923 and parameters: {'layer': [128, 128], 'vae': False, 'samples': 15}. Best is trial 1 with value: 153064.00260203923.


Size index: [41, 2]

Computation graph constructed

Model initialised

Epoch: 0 , loss: 15.636044979095459
Epoch: 1 , loss: 12.337956216600206
Epoch: 2 , loss: 9.152097278171116
Epoch: 3 , loss: 8.159153832329643
Epoch: 4 , loss: 7.995209879345364
Epoch: 5 , loss: 7.821314281887478
Epoch: 6 , loss: 7.578500562243992
Epoch: 7 , loss: 7.499377727508545
Epoch: 8 , loss: 7.326302263471815
Epoch: 9 , loss: 7.373110347323948
Epoch: 10 , loss: 7.259395254982842
Epoch: 11 , loss: 7.191484822167291
Epoch: 12 , loss: 7.1857996516757545
Epoch: 13 , loss: 7.138086981243557
Epoch: 14 , loss: 7.124410152435303
Epoch: 15 , loss: 7.08532452583313
Epoch: 16 , loss: 7.071972052256267
Epoch: 17 , loss: 7.102564917670356
Epoch: 18 , loss: 7.054099639256795
Epoch: 19 , loss: 6.9482744799719915
Training complete. Saving file...
Model saved in file: tmp/MIDAS
INFO:tensorflow:Restoring parameters from tmp/MIDAS
Model restored.


[I 2025-04-17 02:59:43,731] Trial 2 finished with value: 135966.18949143495 and parameters: {'layer': [512, 256], 'vae': False, 'samples': 19}. Best is trial 2 with value: 135966.18949143495.


Size index: [41, 2]
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.





Computation graph constructed

Model initialised

Epoch: 0 , loss: 19.757271342807346
Epoch: 1 , loss: 18.933769543965656
Epoch: 2 , loss: 18.220343907674152
Epoch: 3 , loss: 17.564837879604763
Epoch: 4 , loss: 16.581482516394722
Epoch: 5 , loss: 15.225187884436714
Epoch: 6 , loss: 14.058073997497559
Epoch: 7 , loss: 12.559790134429932
Epoch: 8 , loss: 10.63216914070977
Epoch: 9 , loss: 9.486226929558647
Epoch: 10 , loss: 8.934253268771702
Epoch: 11 , loss: 8.783985959159004
Epoch: 12 , loss: 8.473406394322714
Epoch: 13 , loss: 8.297548744413588
Epoch: 14 , loss: 8.247168911827934
Epoch: 15 , loss: 7.972810771730211
Epoch: 16 , loss: 7.967378642823961
Epoch: 17 , loss: 7.908386256959703
Epoch: 18 , loss: 7.903189049826728
Epoch: 19 , loss: 7.577220439910889
Training complete. Saving file...
Model saved in file: tmp/MIDAS
INFO:tensorflow:Restoring parameters from tmp/MIDAS
Model restored.


[I 2025-04-17 02:59:57,610] Trial 3 finished with value: 157856.84558781178 and parameters: {'layer': [256, 256], 'vae': True, 'samples': 8}. Best is trial 2 with value: 135966.18949143495.


Size index: [41, 2]

Computation graph constructed

Model initialised

Epoch: 0 , loss: 15.636044979095459
Epoch: 1 , loss: 12.337956216600206
Epoch: 2 , loss: 9.152117093404135
Epoch: 3 , loss: 8.15916763411628
Epoch: 4 , loss: 7.9952109919654
Epoch: 5 , loss: 7.821316374672784
Epoch: 6 , loss: 7.5784858862559
Epoch: 7 , loss: 7.4993639257219105
Epoch: 8 , loss: 7.32630467414856
Epoch: 9 , loss: 7.373118082682292
Epoch: 10 , loss: 7.2594194412231445
Epoch: 11 , loss: 7.191487815645006
Epoch: 12 , loss: 7.18579543961419
Epoch: 13 , loss: 7.138092279434204
Epoch: 14 , loss: 7.124441782633464
Epoch: 15 , loss: 7.08531994289822
Epoch: 16 , loss: 7.072125487857395
Epoch: 17 , loss: 7.102713982264201
Epoch: 18 , loss: 7.054127640194363
Epoch: 19 , loss: 6.94827405611674
Training complete. Saving file...
Model saved in file: tmp/MIDAS
INFO:tensorflow:Restoring parameters from tmp/MIDAS
Model restored.


[I 2025-04-17 03:00:11,366] Trial 4 finished with value: 135966.2376930405 and parameters: {'layer': [512, 256], 'vae': False, 'samples': 15}. Best is trial 2 with value: 135966.18949143495.


Optimization completed!
Best Trial Hyperparameters:
  layer: [512, 256]
  vae: False
  samples: 19
Best Objective Value (aggregated error): 135966.18949143495


In [22]:
best_params///

SyntaxError: invalid syntax (4226623454.py, line 1)

=====

In [None]:
from impute2 import *
import pandas as pd

new_df = pd.read_excel(r"C:\Users\Matin\Downloads\Data for Dr.Matin.xlsx", 's1')
new_df.drop(['n', 'ID','Gen.code'],axis=1,inplace=True)
# new_df = new_df[:700]
best, table = run_full_pipeline(new_df, simulate=False, missingness_value=23, timelimit=60, random_seed=96)


[I 2025-04-17 02:32:49,853] A new study created in memory with name: no-name-8e306423-1a8b-47fa-a73e-e45c841ff5f9



Optimizing hyperparameters for KNN...


[I 2025-04-17 02:32:51,046] Trial 0 finished with value: 5.412117090289622 and parameters: {'n_neighbors': 4, 'scale': True, 'wei': 'uniform'}. Best is trial 0 with value: 5.412117090289622.
[I 2025-04-17 02:32:52,200] Trial 1 finished with value: 5.588127724315016 and parameters: {'n_neighbors': 7, 'scale': False, 'wei': 'uniform'}. Best is trial 0 with value: 5.412117090289622.
[I 2025-04-17 02:32:53,366] Trial 2 finished with value: 5.351060168064008 and parameters: {'n_neighbors': 5, 'scale': True, 'wei': 'uniform'}. Best is trial 2 with value: 5.351060168064008.
[I 2025-04-17 02:32:54,586] Trial 3 finished with value: 5.5436973334476685 and parameters: {'n_neighbors': 8, 'scale': False, 'wei': 'distance'}. Best is trial 2 with value: 5.351060168064008.
[I 2025-04-17 02:32:55,784] Trial 4 finished with value: 5.278361991712946 and parameters: {'n_neighbors': 7, 'scale': True, 'wei': 'distance'}. Best is trial 4 with value: 5.278361991712946.
[I 2025-04-17 02:32:56,992] Trial 5 fini

Optimization completed!
Best Trial Hyperparameters:
  n_neighbors: 15
  scale: True
  wei: distance
Best Objective Value (aggregated error): 5.153632864620103
Best hyperparameters for KNN: {'n_neighbors': 15, 'scale': True, 'wei': 'distance'} with best agg error of 5.153632864620103

Optimizing hyperparameters for MissForest...


[WinError 2] The system cannot find the file specified
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 1456, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
  0%|          | 0/7 [00:10<?, ?it/s]
[W 2025-04-17 02:33:24,210] Trial 0 failed with parameters: {'iters': 7, 'scale': True} because of the following error: ValueError('Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects dis

An error occurred while optimizing MissForest: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

Optimizing hyperparameters for MICE...


[W 2025-04-17 02:33:30,107] Trial 0 failed with parameters: {'iters': 12, 'strat': 'fast', 'scale': True} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "d:\Prog\AutoStats\impute2.py", line 566, in objective
    imputed_df = imputation_func(df_missing, random_seed=random_seed,
  File "d:\Prog\AutoStats\impute2.py", line 201, in do_mice
    kernel.mice(iterations=iters, verbose=False)  # Disable verbose output
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\site-packages\miceforest\imputation_kernel.py", line 1170, in mice
    current_model = train(
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python310\lib\site-packages\lightgbm\engine.py", line 307, in train
    booster.update(fobj=fobj)
  File "c:\Users\Matin\AppData\Local\Programs\Python\Python3

KeyboardInterrupt: 