In [35]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import IterativeImputer
import miceforest as mf
from missforest import MissForest
import MIDASpy
from sklearn.preprocessing import MinMaxScaler
# from missingpy import MissForest
# from tensorflow.keras.models import Model, Input, Dense
# from tensorflow.keras.optimizers import Adam

In [52]:
# Load the worksheet named 'raw' from the source workbook into `df`.
df = pd.read_excel(r"F:\Work stuff\Opthalmology\berlin\raw.xlsx", sheet_name='raw')

def prep(df: pd.DataFrame):
    """
    Preprocess a DataFrame and classify its columns for imputation benchmarks.

    Steps:
      - Drop every row containing a missing value and reset the index.
      - Label-encode columns of object dtype (marked categorical).
      - Cast the remaining columns to float; when more than 50% of a column's
        values are integer-like the column is cast to int and marked discrete.
        Note that ``astype(int)`` truncates any non-integer values such a
        column still contains.
      - Columns that cannot be cast to float are label-encoded as categorical.
      - Any non-categorical column with exactly 2 unique values is
        label-encoded and moved to the categorical list (it is removed from
        the discrete list so no column is reported in two groups).

    Parameters:
      df (pd.DataFrame): Raw input data. It is not modified.

    Returns (in this order):
      continuous_cols (list): Columns that are neither categorical nor discrete.
      discrete_cols (list): Numeric, integer-like columns.
      categorical_cols (list): Columns encoded with a LabelEncoder.
      df_clean (DataFrame): The preprocessed DataFrame.
      encoders (dict): Mapping from categorical column name to its LabelEncoder.
    """
    # dropna() + reset_index() returns a new frame, so the caller's df is untouched.
    df_clean = df.dropna().reset_index(drop=True)

    categorical_cols = []
    discrete_cols = []
    encoders = {}

    def _encode_as_categorical(col):
        # Fit a dedicated encoder per column so it can be inverted later.
        le = LabelEncoder()
        df_clean[col] = le.fit_transform(df_clean[col])
        encoders[col] = le
        categorical_cols.append(col)

    for col in df_clean.columns:
        if df_clean[col].dtype == 'object':
            _encode_as_categorical(col)
            continue
        try:
            df_clean[col] = df_clean[col].astype(float)
        except (ValueError, TypeError):
            # Not convertible to a number: treat as categorical.
            _encode_as_categorical(col)
            continue
        # If >50% of values are integer-like, cast the column to int.
        if np.isclose(df_clean[col] % 1, 0).mean() > 0.5:
            df_clean[col] = df_clean[col].astype(int)
            discrete_cols.append(col)

    # Binary numeric columns are categorical, not discrete. Remove them from
    # discrete_cols so downstream code does not impute/score them twice.
    for col in df_clean.columns:
        if col not in categorical_cols and df_clean[col].nunique() == 2:
            if col in discrete_cols:
                discrete_cols.remove(col)
            _encode_as_categorical(col)

    # Continuous columns are those not marked as categorical or discrete.
    assigned = set(categorical_cols) | set(discrete_cols)
    continuous_cols = [col for col in df_clean.columns if col not in assigned]

    return continuous_cols, discrete_cols, categorical_cols, df_clean, encoders

def reverse_encoding(df: pd.DataFrame, encoders: dict):
    """
    Map label-encoded categorical columns back to their original labels.

    Parameters:
      df (pd.DataFrame): DataFrame whose categorical columns hold encoder codes.
      encoders (dict): Column name -> fitted encoder exposing ``inverse_transform``.

    Returns:
      pd.DataFrame: A copy of ``df`` in which every column listed in
      ``encoders`` has been decoded; the input frame is left unchanged.
    """
    decoded = df.copy()
    for column in encoders:
        # Codes may arrive as floats (e.g. after imputation); cast before decoding.
        codes = decoded[column].astype(int)
        decoded[column] = encoders[column].inverse_transform(codes)
    return decoded

# Remove the 'نامونامخانوادگی' column (presumably a free-text name field — confirm)
# before the frame is preprocessed and split into column groups.
df.drop(columns='نامونامخانوادگی', inplace=True)
cont, dis, cat, df2, enc = prep(df)


In [53]:
cat

['side']

In [54]:
cont

['flv1',
 'glv1',
 'supwholevd1',
 'supvdsup1',
 'supvdinf1',
 'supvdfo1',
 'supvdpa1',
 'supvdsup1_A',
 'supvdinf1_A',
 'supvdtem1',
 'supvdsup1_B',
 'supvdnasal1',
 'supvdinfe1',
 'supvdperi1',
 'supvdsup1_C',
 'supvdinf1_B',
 'supvdtempo1',
 'supvdsup1_D',
 'supvdnasal1_A',
 'supvdinf1_C',
 'deepvdwh1',
 'deepvdsup1',
 'deepvdinf1',
 'deepvdfo1',
 'deepvdpara1',
 'deepvdsup1_A',
 'deepvdinf1_A',
 'deepvdtempo1',
 'deepvdsupO1',
 'deepvdnasal1',
 'deepvdinf1_B',
 'deepvdperi1',
 'deepvdsup1_B',
 'deepvdinf1_C',
 'deepvdt1',
 'deepvdsup1_C',
 'deepvdnasal1_A',
 'deepvdinf1_D',
 'SSI1',
 'FAZ1',
 'PERIM1',
 'FD1',
 'flv1_injured',
 'glv1_injured',
 'supwholevd1_injured',
 'supvdsup1_injured',
 'supvdinf1_injured',
 'supvdfo1_injured',
 'supvdpa1_injured',
 'supvdsup1_injured_A',
 'supvdinf1_injured_A',
 'supvdtem1_injured',
 'supvdsup1_injured_B',
 'supvdnasal1_injured',
 'supvdinfe1_injured',
 'supvdperi1_injured',
 'supvdsup1_injured_C',
 'supvdinf1_injured_B',
 'supvdtempo1_injured'

In [43]:
def create_missings(df: pd.DataFrame, missingness: float, random_seed: int = 96):
    """
    Blank out a random subset of cells so imputers can be scored against
    the known original values.

    Parameters:
      df (pd.DataFrame): Complete data to corrupt. It is not modified.
      missingness (float): Percentage (0-100) used as the per-cell probability
        of being set to NaN.
      random_seed (int): Seed passed to ``np.random.seed`` (this reseeds
        NumPy's global generator).

    Returns:
      df_missing (pd.DataFrame): Copy of ``df`` with the selected cells masked.
      mask_df (pd.DataFrame): Boolean frame, True where a cell was masked. It
        carries the same index and columns as ``df`` so it can be used for
        label-aligned boolean selection on ``df``.
    """
    np.random.seed(random_seed)
    mask = np.random.rand(*df.shape) < (missingness / 100)
    # Reuse df's index: a default RangeIndex would not align with a frame
    # whose index is anything else when the mask is used with .loc.
    mask_df = pd.DataFrame(mask, index=df.index, columns=df.columns)
    df_missing = df.mask(mask)
    return df_missing, mask_df


In [57]:
def calculate_metrics(df_clean, mask_df, imputed_df, method_name, continuous_cols, discrete_cols, categorical_cols):
    """
    Score an imputed frame on the cells that were artificially masked.

    For every continuous/discrete column with at least one masked cell the
    mean absolute error over the masked cells is computed; 'MAE' is the
    unweighted mean of those per-column values. For every categorical column
    with at least one masked cell the share of exactly matching masked cells
    is computed; 'Accuracy' is the unweighted mean of those shares. Either
    value is NaN when no column contributed.

    Returns:
      dict: {'Method': method_name, 'MAE': ..., 'Accuracy': ...}
    """
    def _masked_pair(column):
        # Imputed and true values on the masked rows, or None if nothing was masked.
        hidden = mask_df[column]
        if hidden.sum() > 0:
            return imputed_df.loc[hidden, column], df_clean.loc[hidden, column]
        return None

    numeric_pairs = [_masked_pair(c) for c in continuous_cols + discrete_cols]
    column_maes = [
        np.abs(pair[0] - pair[1]).mean()
        for pair in numeric_pairs
        if pair is not None
    ]

    categorical_pairs = [_masked_pair(c) for c in categorical_cols]
    column_accs = [
        (pair[0] == pair[1]).mean()
        for pair in categorical_pairs
        if pair is not None
    ]

    return {
        'Method': method_name,
        'MAE': np.mean(column_maes) if column_maes else np.nan,
        'Accuracy': np.mean(column_accs) if column_accs else np.nan,
    }

In [46]:
# Mask cells of the preprocessed frame with a 20 percent per-cell probability.
df3, df3_mask = create_missings(df2, missingness=20)
df3

Unnamed: 0,side,gccaverarage1,supgcc1,infgcc1,intraeye1,flv1,glv1,wholethick1,thicksuphemi1,thickinfhem1,...,deepvdinf1_injured_D,SSI1_injured,FAZ1_injured,PERIM1_injured,FD1_injured,N1000_injured,N500_injured,CENTR_injured,T500_injured,T1000_injured
0,1.0,98.0,96.0,100.0,-4.0,,,278.0,280.0,,...,41.7,,0.361,,48.28,282.0,266.0,256.0,253.0,
1,,105.0,104.0,106.0,-2.0,0.04,0.07,314.0,314.0,315.0,...,43.2,0.6,0.26,1.911,54.63,389.0,392.0,416.0,395.0,386.0
2,1.0,101.0,102.0,,2.0,0.27,0.45,284.0,286.0,,...,28.6,0.7,0.262,2.045,54.68,185.0,,194.0,179.0,170.0
3,,90.0,88.0,92.0,-4.0,,,274.0,,274.0,...,39.9,0.3,1.044,4.173,29.61,247.0,272.0,285.0,277.0,
4,0.0,105.0,105.0,,1.0,,0.84,,288.0,295.0,...,2.9,0.1,0.304,,3.86,285.0,315.0,341.0,307.0,294.0
5,1.0,106.0,103.0,,-5.0,0.01,0.1,,310.0,,...,33.6,,0.244,1.92,49.63,204.0,210.0,225.0,208.0,193.0
6,0.0,108.0,107.0,109.0,-2.0,0.23,,285.0,288.0,283.0,...,49.5,,,2.029,52.78,176.0,222.0,241.0,205.0,167.0
7,0.0,92.0,92.0,,0.0,0.26,3.35,281.0,284.0,279.0,...,35.5,,0.148,1.478,27.84,185.0,217.0,233.0,193.0,159.0
8,1.0,103.0,100.0,106.0,-6.0,0.05,0.29,294.0,294.0,,...,50.1,0.4,0.151,1.468,58.54,182.0,198.0,207.0,173.0,153.0
9,1.0,95.0,,,-2.0,,,282.0,,277.0,...,57.6,0.8,0.374,2.312,59.15,336.0,352.0,386.0,,339.0


In [49]:

def knn_impute_df(df, continuous_cols=None, discrete_cols=None, categorical_cols=None, n_neighbors=5):
    """
    Fill missing values with KNN imputation, one column group at a time.

    Each non-empty group (continuous, then discrete, then categorical) is
    passed to its own ``KNNImputer``, so neighbours are found using only the
    columns of that group. Continuous results are stored as returned;
    discrete and categorical results are rounded and cast to int.

    Parameters:
      df (pd.DataFrame): Input DataFrame with missing values (not modified).
      continuous_cols (list of str): Names of continuous numeric columns.
      discrete_cols (list of str): Names of discrete numeric columns.
      categorical_cols (list of str): Names of categorical columns; they must
        already be numeric (e.g. label encoded) with NaN for missing entries.
      n_neighbors (int): Number of neighbours for every KNNImputer.

    Returns:
      pd.DataFrame: Copy of ``df`` with the listed columns imputed.
    """
    result = df.copy()

    # (columns, round-to-int?) in the order the groups are processed.
    column_groups = (
        (continuous_cols, False),
        (discrete_cols, True),
        (categorical_cols, True),
    )
    for columns, as_integer in column_groups:
        if not columns:
            continue
        filled = KNNImputer(n_neighbors=n_neighbors).fit_transform(result[columns])
        result[columns] = np.round(filled).astype(int) if as_integer else filled

    return result

# Impute the masked frame group by group using the column lists returned by prep().
knn_imputed = knn_impute_df(
    df3,
    continuous_cols=cont,
    discrete_cols=dis,
    categorical_cols=cat,
)
knn_imputed

Unnamed: 0,side,gccaverarage1,supgcc1,infgcc1,intraeye1,flv1,glv1,wholethick1,thicksuphemi1,thickinfhem1,...,deepvdinf1_injured_D,SSI1_injured,FAZ1_injured,PERIM1_injured,FD1_injured,N1000_injured,N500_injured,CENTR_injured,T500_injured,T1000_injured
0,1,98,96,100,-4,0.26,1.29,278,280,286,...,41.7,0.68,0.361,2.1812,48.28,282,266,256,253,241
1,0,105,104,106,-2,0.04,0.07,314,314,315,...,43.2,0.6,0.26,1.911,54.63,389,392,416,395,386
2,1,101,102,106,2,0.27,0.45,284,286,284,...,28.6,0.7,0.262,2.045,54.68,185,214,194,179,170
3,0,90,88,92,-4,0.232,4.484,274,283,274,...,39.9,0.3,1.044,4.173,29.61,247,272,285,277,283
4,0,105,105,99,1,0.378,0.84,288,288,295,...,2.9,0.1,0.304,4.3108,3.86,285,315,341,307,294
5,1,106,103,109,-5,0.01,0.1,291,310,294,...,33.6,0.7,0.244,1.92,49.63,204,210,225,208,193
6,0,108,107,109,-2,0.23,0.34,285,288,283,...,49.5,0.68,0.2652,2.029,52.78,176,222,241,205,167
7,0,92,92,106,0,0.26,3.35,281,284,279,...,35.5,0.72,0.148,1.478,27.84,185,217,233,193,159
8,1,103,100,106,-6,0.05,0.29,294,294,288,...,50.1,0.4,0.151,1.468,58.54,182,198,207,173,153
9,1,95,98,99,-2,0.268,1.228,282,288,277,...,57.6,0.8,0.374,2.312,59.15,336,352,386,324,339


In [58]:
# Score the KNN result against the unmasked frame on the masked cells only.
calculate_metrics(
    df_clean=df2,
    mask_df=df3_mask,
    imputed_df=knn_imputed,
    method_name='knn_5n',
    continuous_cols=cont,
    discrete_cols=dis,
    categorical_cols=cat,
)

{'Method': 'knn_5n', 'MAE': 10.462393108622457, 'Accuracy': 0.375}

In [None]:

def evaluate_imputation(df, percentage, random_state=96, epochs=50):
    """
    Benchmark several imputers on artificially masked data.

    The frame is reduced to complete rows, columns are encoded/typed, a random
    ``percentage`` of cells is masked, and each imputer is scored on the
    masked cells: mean absolute error for numeric columns and exact-match
    accuracy for categorical columns (both averaged per column).

    Methods evaluated, in order: 'KNN' (KNNImputer on numeric columns, mode
    for categorical), 'MICE' (IterativeImputer on numeric columns, mode for
    categorical), 'Miss Forest' and 'MIDASpy'.

    Parameters:
      df (pd.DataFrame): Raw input data. It is not modified.
      percentage (float): Per-cell masking probability in percent (0-100).
      random_state (int): Seed for the mask (``np.random.seed``), for
        IterativeImputer and for MIDAS.
      epochs (int): Number of MIDAS training epochs.

    Returns:
      pd.DataFrame: One row per method with columns 'Method', 'MAE', 'Accuracy'.

    Raises:
      ValueError: If no complete row remains after dropping missing values, or
        if the MIDAS one-hot columns cannot be matched to category values.
    """
    # Drop rows with any NaN values and reset the index.
    df_clean = df.dropna().reset_index(drop=True)
    if df_clean.empty:
        raise ValueError("No complete rows left after dropping missing values.")

    categorical_cols = []

    # Object columns (and anything not castable to float) become integer codes.
    for col in df_clean.columns:
        if df_clean[col].dtype == 'object':
            categorical_cols.append(col)
            df_clean[col] = pd.Categorical(df_clean[col]).codes
        else:
            try:
                df_clean[col] = df_clean[col].astype(float)
                # If more than 50% of values are integer-like, cast column to int.
                if np.isclose(df_clean[col] % 1, 0).mean() > 0.5:
                    df_clean[col] = df_clean[col].astype(int)
            except (ValueError, TypeError):
                categorical_cols.append(col)
                df_clean[col] = pd.Categorical(df_clean[col]).codes

    # Numeric columns with only 2 unique values are treated as categorical.
    for col in df_clean.columns:
        if col not in categorical_cols and df_clean[col].nunique() == 2:
            categorical_cols.append(col)

    numeric_cols = [col for col in df_clean.columns if col not in categorical_cols]

    # Create random missingness; the mask shares df_clean's index for .loc alignment.
    np.random.seed(random_state)
    mask = np.random.rand(*df_clean.shape) < (percentage / 100)
    mask_df = pd.DataFrame(mask, index=df_clean.index, columns=df_clean.columns)
    df_missing = df_clean.mask(mask)

    results = []

    def calculate_metrics(imputed_df, method_name):
        # MAE over numeric columns, accuracy over categorical columns,
        # both restricted to the masked cells and averaged per column.
        mae_list = []
        for col in numeric_cols:
            col_mask = mask_df[col]
            if col_mask.sum() > 0:
                error = np.abs(imputed_df.loc[col_mask, col] - df_clean.loc[col_mask, col])
                mae_list.append(error.mean())
        overall_mae = np.mean(mae_list) if mae_list else np.nan

        acc_list = []
        for col in categorical_cols:
            col_mask = mask_df[col]
            if col_mask.sum() > 0:
                acc = (imputed_df.loc[col_mask, col] == df_clean.loc[col_mask, col]).mean()
                acc_list.append(acc)
        overall_acc = np.mean(acc_list) if acc_list else np.nan

        return {'Method': method_name, 'MAE': overall_mae, 'Accuracy': overall_acc}

    def fill_with_mode(column):
        # mode() is empty when every value is NaN; leave such a column as is.
        modes = column.mode()
        return column.fillna(modes.iloc[0]) if len(modes) else column

    def impute_numeric_with(imputer):
        # The sklearn imputer handles numeric columns; categorical ones get their mode.
        numeric_part = pd.DataFrame(
            imputer.fit_transform(df_missing[numeric_cols]),
            index=df_missing.index,
            columns=numeric_cols,
        )
        categorical_part = df_missing[categorical_cols].apply(fill_with_mode)
        return pd.concat([numeric_part, categorical_part], axis=1)[df_clean.columns]

    # --- KNN Imputation ---
    results.append(calculate_metrics(impute_numeric_with(KNNImputer()), 'KNN'))

    # --- MICE Imputation (IterativeImputer) ---
    results.append(calculate_metrics(
        impute_numeric_with(IterativeImputer(random_state=random_state)), 'MICE'))

    # --- Miss Forest ---
    # Categorical columns are already integer codes at this point.
    mf_imputer = MissForest(categorical=categorical_cols)
    df_missforest = pd.DataFrame(mf_imputer.fit_transform(df_missing), columns=df_clean.columns)
    results.append(calculate_metrics(df_missforest, 'Miss Forest'))

    # --- MIDASpy ---
    # Categorical columns are one-hot encoded and appended as extra COLUMNS.
    if categorical_cols:
        data_cat, cat_cols = MIDASpy.cat_conv(df_missing[categorical_cols])
        df_no_cats = df_missing.drop(categorical_cols, axis=1)
        df_in = pd.concat([df_no_cats, data_cat], axis=1)
    else:
        cat_cols = []
        df_in = df_missing.copy()
    na_loc = df_in.isnull()
    df_in[na_loc] = np.nan

    midas_imputer = MIDASpy.Midas(vae_layer=True, seed=random_state)
    midas_imputer.build_model(df_in, softmax_columns=cat_cols if cat_cols else None)
    midas_imputer.train_model(training_epochs=epochs)
    # Draw a single completed dataset; it has df_in's (one-hot) column layout.
    midas_raw = midas_imputer.generate_samples(m=1).output_list[0]
    midas_out = pd.DataFrame(np.asarray(midas_raw), index=df_in.index, columns=df_in.columns)

    # Collapse each one-hot group back to a single column of category codes.
    decoded = {col: midas_out[col] for col in numeric_cols}
    for col, dummy_cols in zip(categorical_cols, cat_cols):
        if len(dummy_cols) == 0:
            # Every value of this column was masked, so no category was seen.
            decoded[col] = pd.Series(np.nan, index=midas_out.index)
            continue
        # Assumes the dummy columns follow the sorted observed category values
        # (pandas.get_dummies ordering) — TODO confirm for the installed MIDASpy.
        levels = np.sort(df_missing[col].dropna().unique())
        if len(levels) != len(dummy_cols):
            raise ValueError(
                f"Cannot map one-hot columns back to categories for column {col!r}.")
        winners = midas_out[list(dummy_cols)].to_numpy().argmax(axis=1)
        decoded[col] = pd.Series(levels[winners], index=midas_out.index)
    df_midas = pd.DataFrame(decoded)[df_clean.columns]
    results.append(calculate_metrics(df_midas, 'MIDASpy'))

    return pd.DataFrame(results)
 