In [35]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import IterativeImputer
import miceforest as mf
from missforest import MissForest
import MIDASpy
from sklearn.preprocessing import MinMaxScaler
# from missingpy import MissForest
# from tensorflow.keras.models import Model, Input, Dense
# from tensorflow.keras.optimizers import Adam

In [52]:
df = pd.read_excel(r"F:\Work stuff\Opthalmology\berlin\raw.xlsx", 'raw')

def prep(df: pd.DataFrame):
    """
    Preprocess the DataFrame by:
      - Dropping rows with missing values and resetting the index.
      - Converting object columns to categorical via LabelEncoder.
      - Converting other columns to float (and then to int if >50% of values are integer-like).
      - If any numeric column (not already marked as categorical) has only 2 unique values,
        it is considered categorical and encoded.
    
    Returns:
      categorical_cols (list): List of columns encoded as categorical.
      discrete_cols (list): List of columns that are numeric and integer-like.
      cont_cols (list): List of remaining continuous numeric columns.
      df_clean (DataFrame): The preprocessed DataFrame.
      encoders (dict): Mapping from categorical column name to its LabelEncoder.
    """
    # Drop rows with missing values and reset the index.
    df_clean = df.dropna().reset_index(drop=True)

    categorical_cols = []
    discrete_cols = []
    encoders = {}

    # Process each column.
    for col in df_clean.columns:
        if df_clean[col].dtype == 'object':
            # Mark as categorical and encode using LabelEncoder.
            categorical_cols.append(col)
            le = LabelEncoder()
            df_clean[col] = le.fit_transform(df_clean[col])
            encoders[col] = le
        else:
            try:
                # Convert column to float.
                df_clean[col] = df_clean[col].astype(float)
                # If >50% of values are integer-like, cast column to int.
                if (np.isclose(df_clean[col] % 1, 0).mean() > 0.5):
                    df_clean[col] = df_clean[col].astype(int)
                    discrete_cols.append(col)
            except (ValueError, TypeError):
                # If conversion fails, treat the column as categorical.
                categorical_cols.append(col)
                le = LabelEncoder()
                df_clean[col] = le.fit_transform(df_clean[col])
                encoders[col] = le

    # Additionally, if any numeric column (not already marked as categorical) has only 2 unique values,
    # treat it as categorical and encode it.
    for col in df_clean.columns:
        if col not in categorical_cols and df_clean[col].nunique() == 2:
            categorical_cols.append(col)
            le = LabelEncoder()
            df_clean[col] = le.fit_transform(df_clean[col])
            encoders[col] = le

    # Continuous columns are those not marked as categorical or discrete.
    continuous_cols = [col for col in df_clean.columns if col not in categorical_cols + discrete_cols]

    return continuous_cols, discrete_cols, categorical_cols, df_clean, encoders

def reverse_encoding(df: pd.DataFrame, encoders: dict):
    """
    Reverse the LabelEncoder transformation on categorical columns.

    Parameters:
      df (pd.DataFrame): DataFrame with encoded categorical columns.
      encoders (dict): Dictionary mapping column names to their LabelEncoder.

    Returns:
      pd.DataFrame: A new DataFrame with the categorical columns decoded to their original labels.
    """
    df_decoded = df.copy()
    for col, le in encoders.items():
        # Ensure that the column is integer type before inverse transforming.
        df_decoded[col] = le.inverse_transform(df_decoded[col].astype(int))
    return df_decoded

df.drop('نامونامخانوادگی', axis=1,inplace=True)
cont, dis, cat, df2, enc = prep(df)


In [53]:
cat

['side']

In [54]:
cont

['flv1',
 'glv1',
 'supwholevd1',
 'supvdsup1',
 'supvdinf1',
 'supvdfo1',
 'supvdpa1',
 'supvdsup1_A',
 'supvdinf1_A',
 'supvdtem1',
 'supvdsup1_B',
 'supvdnasal1',
 'supvdinfe1',
 'supvdperi1',
 'supvdsup1_C',
 'supvdinf1_B',
 'supvdtempo1',
 'supvdsup1_D',
 'supvdnasal1_A',
 'supvdinf1_C',
 'deepvdwh1',
 'deepvdsup1',
 'deepvdinf1',
 'deepvdfo1',
 'deepvdpara1',
 'deepvdsup1_A',
 'deepvdinf1_A',
 'deepvdtempo1',
 'deepvdsupO1',
 'deepvdnasal1',
 'deepvdinf1_B',
 'deepvdperi1',
 'deepvdsup1_B',
 'deepvdinf1_C',
 'deepvdt1',
 'deepvdsup1_C',
 'deepvdnasal1_A',
 'deepvdinf1_D',
 'SSI1',
 'FAZ1',
 'PERIM1',
 'FD1',
 'flv1_injured',
 'glv1_injured',
 'supwholevd1_injured',
 'supvdsup1_injured',
 'supvdinf1_injured',
 'supvdfo1_injured',
 'supvdpa1_injured',
 'supvdsup1_injured_A',
 'supvdinf1_injured_A',
 'supvdtem1_injured',
 'supvdsup1_injured_B',
 'supvdnasal1_injured',
 'supvdinfe1_injured',
 'supvdperi1_injured',
 'supvdsup1_injured_C',
 'supvdinf1_injured_B',
 'supvdtempo1_injured'

In [43]:
def create_missings(df:pd.DataFrame, missingness:float, random_seed:float=96):
    # Create random missingness.
    np.random.seed(random_seed)
    mask = np.random.rand(*df.shape) < (missingness / 100)
    mask_df = pd.DataFrame(mask, columns=df.columns)
    df_missing = df.mask(mask)
    return df_missing, mask_df


In [57]:
def calculate_metrics(df_clean, mask_df, imputed_df, method_name, continuous_cols, discrete_cols, categorical_cols):    
    mae_list = []
    for col in continuous_cols + discrete_cols:
        col_mask = mask_df[col]
        if col_mask.sum() > 0:
            error = np.abs(imputed_df.loc[col_mask, col] - df_clean.loc[col_mask, col])
            mae_list.append(error.mean())
    overall_mae = np.mean(mae_list) if mae_list else np.nan
    
    acc_list = []
    for col in categorical_cols:
        col_mask = mask_df[col]
        if col_mask.sum() > 0:
            acc = (imputed_df.loc[col_mask, col] == df_clean.loc[col_mask, col]).mean()
            acc_list.append(acc)
    overall_acc = np.mean(acc_list) if acc_list else np.nan

    return {'Method': method_name, 'MAE': overall_mae, 'Accuracy': overall_acc}

In [46]:
df3, df3_mask = create_missings(df2,20)
df3

Unnamed: 0,side,gccaverarage1,supgcc1,infgcc1,intraeye1,flv1,glv1,wholethick1,thicksuphemi1,thickinfhem1,...,deepvdinf1_injured_D,SSI1_injured,FAZ1_injured,PERIM1_injured,FD1_injured,N1000_injured,N500_injured,CENTR_injured,T500_injured,T1000_injured
0,1.0,98.0,96.0,100.0,-4.0,,,278.0,280.0,,...,41.7,,0.361,,48.28,282.0,266.0,256.0,253.0,
1,,105.0,104.0,106.0,-2.0,0.04,0.07,314.0,314.0,315.0,...,43.2,0.6,0.26,1.911,54.63,389.0,392.0,416.0,395.0,386.0
2,1.0,101.0,102.0,,2.0,0.27,0.45,284.0,286.0,,...,28.6,0.7,0.262,2.045,54.68,185.0,,194.0,179.0,170.0
3,,90.0,88.0,92.0,-4.0,,,274.0,,274.0,...,39.9,0.3,1.044,4.173,29.61,247.0,272.0,285.0,277.0,
4,0.0,105.0,105.0,,1.0,,0.84,,288.0,295.0,...,2.9,0.1,0.304,,3.86,285.0,315.0,341.0,307.0,294.0
5,1.0,106.0,103.0,,-5.0,0.01,0.1,,310.0,,...,33.6,,0.244,1.92,49.63,204.0,210.0,225.0,208.0,193.0
6,0.0,108.0,107.0,109.0,-2.0,0.23,,285.0,288.0,283.0,...,49.5,,,2.029,52.78,176.0,222.0,241.0,205.0,167.0
7,0.0,92.0,92.0,,0.0,0.26,3.35,281.0,284.0,279.0,...,35.5,,0.148,1.478,27.84,185.0,217.0,233.0,193.0,159.0
8,1.0,103.0,100.0,106.0,-6.0,0.05,0.29,294.0,294.0,,...,50.1,0.4,0.151,1.468,58.54,182.0,198.0,207.0,173.0,153.0
9,1.0,95.0,,,-2.0,,,282.0,,277.0,...,57.6,0.8,0.374,2.312,59.15,336.0,352.0,386.0,,339.0


In [49]:

def knn_impute_df(df, continuous_cols=None, discrete_cols=None, categorical_cols=None, n_neighbors=5):
    """
    Impute missing values in a DataFrame using KNN imputation.
    
    Assumes:
      - Continuous columns are numeric.
      - Discrete columns are numeric and integer-like.
      - Categorical columns have been label encoded using sklearn's LabelEncoder 
        (with missing values represented as np.nan).
    
    Parameters:
      df (pd.DataFrame): Input DataFrame with missing values.
      continuous_cols (list of str): Names of continuous numeric columns.
      discrete_cols (list of str): Names of discrete numeric columns.
      categorical_cols (list of str): Names of categorical columns (label encoded).
      n_neighbors (int): Number of neighbors for KNN imputation.
    
    Returns:
      pd.DataFrame: New DataFrame with imputed values.
    """
    df_imputed = df.copy()
    
    # Impute continuous columns
    if continuous_cols:
        imputer_cont = KNNImputer(n_neighbors=n_neighbors)
        df_imputed[continuous_cols] = imputer_cont.fit_transform(df_imputed[continuous_cols])
    
    # Impute discrete columns and round to integer
    if discrete_cols:
        imputer_disc = KNNImputer(n_neighbors=n_neighbors)
        imputed_disc = imputer_disc.fit_transform(df_imputed[discrete_cols])
        df_imputed[discrete_cols] = np.round(imputed_disc).astype(int)
    
    # Impute categorical columns (assumed to be label encoded)
    if categorical_cols:
        imputer_cat = KNNImputer(n_neighbors=n_neighbors)
        imputed_cat = imputer_cat.fit_transform(df_imputed[categorical_cols])
        df_imputed[categorical_cols] = np.round(imputed_cat).astype(int)
    
    return df_imputed

knn_imputed = knn_impute_df(df3, continuous_cols=cont, discrete_cols=dis, categorical_cols=cat)
knn_imputed

Unnamed: 0,side,gccaverarage1,supgcc1,infgcc1,intraeye1,flv1,glv1,wholethick1,thicksuphemi1,thickinfhem1,...,deepvdinf1_injured_D,SSI1_injured,FAZ1_injured,PERIM1_injured,FD1_injured,N1000_injured,N500_injured,CENTR_injured,T500_injured,T1000_injured
0,1,98,96,100,-4,0.26,1.29,278,280,286,...,41.7,0.68,0.361,2.1812,48.28,282,266,256,253,241
1,0,105,104,106,-2,0.04,0.07,314,314,315,...,43.2,0.6,0.26,1.911,54.63,389,392,416,395,386
2,1,101,102,106,2,0.27,0.45,284,286,284,...,28.6,0.7,0.262,2.045,54.68,185,214,194,179,170
3,0,90,88,92,-4,0.232,4.484,274,283,274,...,39.9,0.3,1.044,4.173,29.61,247,272,285,277,283
4,0,105,105,99,1,0.378,0.84,288,288,295,...,2.9,0.1,0.304,4.3108,3.86,285,315,341,307,294
5,1,106,103,109,-5,0.01,0.1,291,310,294,...,33.6,0.7,0.244,1.92,49.63,204,210,225,208,193
6,0,108,107,109,-2,0.23,0.34,285,288,283,...,49.5,0.68,0.2652,2.029,52.78,176,222,241,205,167
7,0,92,92,106,0,0.26,3.35,281,284,279,...,35.5,0.72,0.148,1.478,27.84,185,217,233,193,159
8,1,103,100,106,-6,0.05,0.29,294,294,288,...,50.1,0.4,0.151,1.468,58.54,182,198,207,173,153
9,1,95,98,99,-2,0.268,1.228,282,288,277,...,57.6,0.8,0.374,2.312,59.15,336,352,386,324,339


In [58]:
calculate_metrics(df2, df3_mask, knn_imputed,'knn_5n',cont, dis, cat)

{'Method': 'knn_5n', 'MAE': 10.462393108622457, 'Accuracy': 0.375}

In [None]:
def mice_forest_impute_df(df, continuous_cols=None, discrete_cols=None, categorical_cols=None, iters=10, strat='normal'):
    """
    Impute missing values in a DataFrame using the MICE forest method.
    
    Assumes:
      - Continuous columns are numeric.
      - Discrete columns are numeric and integer-like.
      - Categorical columns have been label encoded using sklearn's LabelEncoder 
        (with missing values represented as np.nan).
      
    Note:
      - The parameter n_neighbors is repurposed as the number of MICE iterations.
    
    Parameters:
      df (pd.DataFrame): Input DataFrame with missing values.
      continuous_cols (list of str): Names of continuous numeric columns.
      discrete_cols (list of str): Names of discrete numeric columns.
      categorical_cols (list of str): Names of categorical columns (label encoded).
      iters (int): Number of MICE iterations.
      strat: ['normal', 'shap', 'fast'] or a dictionary 
    
    Returns:
      pd.DataFrame: New DataFrame with imputed values. Discrete and categorical 
                  columns are rounded to integers.
    """
    # Create a copy of the DataFrame to avoid modifying the original data.
    df_imputed = df.copy()
    
    # Create an imputation kernel using miceforest.
    # The entire DataFrame is used here since all columns are numeric after preprocessing.
    kernel = mf.ImputationKernel(
        df_imputed,
        random_state=0, 
        mean_match_strategy=strat
    )
    
    # Run the MICE algorithm for the specified number of iterations.
    kernel.mice(iterations=iters)
    
    # Retrieve the completed data (imputed dataset).
    df_completed = kernel.complete_data(dataset=0)
    
    # For discrete and categorical columns, round the imputed values to integers.
    if discrete_cols:
        df_completed[discrete_cols] = df_completed[discrete_cols].round().astype(int)
    if categorical_cols:
        df_completed[categorical_cols] = df_completed[categorical_cols].round().astype(int)
    
    return df_completed

mice_imputed = mice_forest_impute_df(df3, cont, dis, cat, 2, strat='shap')
mice_imputed

  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][assign_col_index] = candidate_preds
  self.candidate_preds[variable][a

Unnamed: 0,side,gccaverarage1,supgcc1,infgcc1,intraeye1,flv1,glv1,wholethick1,thicksuphemi1,thickinfhem1,...,deepvdinf1_injured_D,SSI1_injured,FAZ1_injured,PERIM1_injured,FD1_injured,N1000_injured,N500_injured,CENTR_injured,T500_injured,T1000_injured
0,1,98,96,100,-4,1.12,1.26,278,280,279,...,41.7,0.3,0.361,2.132,48.28,282,266,256,253,262
1,1,105,104,106,-2,0.04,0.07,314,314,315,...,43.2,0.6,0.26,1.911,54.63,389,392,416,395,386
2,1,101,102,104,2,0.27,0.45,284,286,277,...,28.6,0.7,0.262,2.045,54.68,185,225,194,179,170
3,0,90,88,92,-4,0.37,5.92,274,281,274,...,39.9,0.3,1.044,4.173,29.61,247,272,285,277,271
4,0,105,105,102,1,0.03,0.84,296,288,295,...,2.9,0.1,0.304,2.271,3.86,285,315,341,307,294
5,1,106,103,104,-5,0.01,0.1,294,310,295,...,33.6,0.6,0.244,1.92,49.63,204,210,225,208,193
6,0,108,107,109,-2,0.23,0.1,285,288,283,...,49.5,0.8,0.347,2.029,52.78,176,222,241,205,167
7,0,92,92,94,0,0.26,3.35,281,284,279,...,35.5,0.4,0.148,1.478,27.84,185,217,233,193,159
8,1,103,100,106,-6,0.05,0.29,294,294,295,...,50.1,0.4,0.151,1.468,58.54,182,198,207,173,153
9,1,95,91,92,-2,0.23,1.08,282,288,277,...,57.6,0.8,0.374,2.312,59.15,336,352,386,307,339


In [62]:
calculate_metrics(df2, df3_mask, mice_imputed, 'mice_1_2', cont, dis, cat)

{'Method': 'mice_1_2', 'MAE': 9.217934048037309, 'Accuracy': 0.625}

In [66]:
calculate_metrics(df2, df3_mask, mice_imputed, 'mice_shap_1_2', cont, dis, cat)

{'Method': 'mice_shap_1_2', 'MAE': 10.00640548079135, 'Accuracy': 0.5}

In [None]:
def missforest_impute_df(df, continuous_cols=None, discrete_cols=None, categorical_cols=None, iters=5):
    df_imputed = df.copy()
    
    imputer = MissForest(max_iter=iters, categorical=categorical_cols )
    
    df_imputed_result = imputer.fit_transform(df_imputed)
    
    # # For discrete and categorical columns, round the imputed values to integers.
    if discrete_cols:
        df_imputed_result[discrete_cols] = df_imputed_result[discrete_cols].round().astype(int)

    return df_imputed_result

mf_imputed = missforest_impute_df(df3, cont, dis, cat )
mf_imputed

100%|██████████| 5/5 [00:30<00:00,  6.20s/it]
100%|██████████| 5/5 [00:12<00:00,  2.45s/it]


Unnamed: 0,intraeye1,deepvdinf1_injured_C,N1000,deepvdwh1_injured,supvdinf1_injured_C,N1000_injured,supvdsup1_C,deepvdsup1_injured_C,supvdsup1,supvdinf1_injured_A,...,infgcc1_injured,wholethick1_injured,T1000,supwholevd1,deepvdsupO1,supvdpa1_injured,flv1,thicktempo1_injured_A,deepvdwh1,supvdinfe1_injured
0,-4,44.8,163,46.3,44.1,282,51.2,47.7,50.7,43.4,...,104,276,185,74.199975,55.2,45.99727,0.069765,255,49.3,43.0
1,-2,43.5,305,43.9,52.7,389,52.8,43.1,53.6,53.5,...,112,314,223,54.1,57.6,52.5,0.04,273,52.1,52.4
2,2,31.7,230,42.255177,48.1,185,51.8,33.3,50.3,48.5,...,107,284,219,50.3,48.0,45.99727,0.27,273,43.4,48.1
3,-4,34.7,284,30.2,41.3,247,49.851726,30.1,38.7,34.0,...,104,276,262,74.199975,43.6,32.1,0.069765,273,47.527481,45.012155
4,1,5.7,256,5.6,48.651724,285,57.3,6.5,56.5,11.0,...,120,293,263,55.8,52.737706,45.99727,0.069765,273,43.0,8.3
5,-5,36.4,194,36.8,41.5,204,50.0,36.1,58.162061,48.3,...,104,276,185,74.199975,52.737706,47.7,0.01,291,47.9,49.4
6,-2,50.3,179,49.5,54.0,176,53.5,49.9,53.2,55.3,...,111,276,167,74.199975,54.6,55.6,0.23,273,47.527481,56.3
7,0,37.3,213,35.6,40.8,185,47.1,25.2,58.162061,39.5,...,104,284,196,74.199975,57.5,45.99727,0.26,259,49.1,37.3
8,-6,51.6,173,48.1,56.5,182,49.851726,51.5,54.4,58.7,...,104,296,223,54.6,56.2,58.5,0.05,277,55.1,45.012155
9,-2,56.1,253,53.1,53.8,336,47.7,53.1,51.0,56.3,...,104,276,247,51.1,56.0,54.5,0.069765,266,47.8,54.6


In [79]:
calculate_metrics(df2, df3_mask, mf_imputed, 'missforest', cont, dis, cat)

{'Method': 'missforest', 'MAE': 14.086886039323858, 'Accuracy': 0.375}