<h1><center><u> Code Implementation </u></center></h1>

# cross_validation

In [None]:
#importing Libraries
import os
import time
import pickle
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from icecream import ic
import tensorflow as tf
from scipy.stats import skew
from tensorflow import keras
from tabulate import tabulate
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from tensorflow.keras import Model
from collections import defaultdict
import tensorflow.keras.backend as K
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from tensorflow.keras.losses import mse
from tensorflow.keras import Sequential
from sklearn.impute import SimpleImputer
from scipy.stats import levene, f_oneway
from tensorflow.keras import backend as K
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import L2
from tensorflow.keras.optimizers import Adam
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error
from tensorflow.keras.initializers import he_uniform
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import mse as keras_mse
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from tensorflow.keras.layers import Lambda, Layer, Input, Dense
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.model_selection import RepeatedKFold, train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.layers import Dense, Input, GaussianNoise, Layer
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from tensorflow.keras import Sequential, layers, models, optimizers, regularizers
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, PowerTransformer, PolynomialFeatures, FunctionTransformer

from cross_val_models_statistics import *
from layersconfig import *

# initialization
from pandarallel import pandarallel
pandarallel.initialize()
from dhs_modelling_functions_new import final_ds_droping_cols, fold_generator

In [None]:
# Seed every random number generator used in this notebook (TensorFlow,
# Python's `random` module and NumPy) with the same value for reproducibility.
# The GAIN helpers further down sample through both `np.random` and `tf.random`.
tf.random.set_seed(6688)
random.seed(6688)
np.random.seed(6688)

# Dataset loading

Reference: https://github.com/gheisenberg/FoodSecurity/tree/main/DHS

# configurations and directory creation

In [None]:
# Global run configuration shared by the functions of this notebook.
config = {
    # prepare_data() applies a StandardScaler (fit on the training split) when True
    'scale_numerical_data': True,
    # NOTE(review): not read in the visible part of the notebook — confirm where it is used
    'masking': True,
    # NOTE(review): not read in the visible part of the notebook; country dropping in
    # load_data() is gated by 'egypt_dropping' instead — confirm whether this flag is still needed
    'drop_countries': True,
    # when True, load_data() removes the rows of every country listed in 'countries_to_drop'
    'egypt_dropping': True,
    # NaN policy applied by load_data() and used by setup_directories() to pick the output folder;
    # one of 'drop_all_nans', 'numerical_only' or 'numerical_only_drop_20_percentage_nans'
    'process_nans': 'numerical_only_drop_20_percentage_nans',
    # columns removed from the splits by prepare_data() in 'drop_all_nans' mode
    'drop_columns': ['Meta; adm0_gaul', 'Meta; GEID_init', 'Meta; year'],
    'countries_to_drop': ['Egypt', 'Comoros', 'Central African Republic'],
    # read by run_scenario(); presumably the fraction of cells masked artificially — TODO confirm
    'missingness_fraction': 0.3,
    # overwritten by setup_directories() with the folder that matches 'process_nans'
    'base_dir': '/home/myuser/prj/code/final_computation/cross_validation_github'
}
def setup_directories(config):
    """
    Create the output directory tree for the configured NaN-handling mode.

    The base directory is selected from ``config['process_nans']`` and written
    back into ``config['base_dir']`` (the dict is modified in place). Inside it
    the sub-directories ``with_masking`` and ``without_masking`` are created if
    they do not exist yet.

    Args:
        config (dict): Run configuration; must contain the key 'process_nans'.

    Returns:
        tuple[str, str]: Paths of the ``with_masking`` and ``without_masking`` directories.

    Raises:
        ValueError: If ``config['process_nans']`` is not one of the supported modes.
    """
    root_dir = '/home/myuser/prj/code/final_computation/cross_validation_github'
    subdir_by_mode = {
        'numerical_only': 'keeping_all_numerical',
        'drop_all_nans': 'drop_all_nans',
        'numerical_only_drop_20_percentage_nans': 'drop_20_percentage',
    }

    # fail early with a clear message instead of hitting an unbound `base_dir` below
    mode = config['process_nans']
    if mode not in subdir_by_mode:
        raise ValueError(
            f"Unsupported process_nans value: {mode!r}; expected one of {sorted(subdir_by_mode)}"
        )
    base_dir = f"{root_dir}/{subdir_by_mode[mode]}"

    # updating the base directory in config
    config['base_dir'] = base_dir

    # creation of masking directories (parents are created as needed)
    with_masking_dir = os.path.join(base_dir, 'with_masking')
    without_masking_dir = os.path.join(base_dir, 'without_masking')
    os.makedirs(with_masking_dir, exist_ok=True)
    os.makedirs(without_masking_dir, exist_ok=True)

    return with_masking_dir, without_masking_dir

# Create the output folders once at start-up; the call also rewrites config['base_dir'].
with_masking_dir, without_masking_dir = setup_directories(config)
# parsing layer configuration
def parse_layer_config(folder_name):
    """
    Turn an underscore-separated name such as ``"475_855_855"`` into a list of ints.

    Returns ``None`` as soon as one of the parts cannot be converted with ``int``.
    """
    layer_sizes = []
    for token in folder_name.split('_'):
        try:
            layer_sizes.append(int(token))
        except ValueError:
            return None
    return layer_sizes

## dropping all nans from dataset

In [None]:
def drop_all_nans(df):
    """
    Return the complete cases of ``df``: every row containing at least one NaN is removed.

    Rows are dropped as a whole so that the values of a returned row all come
    from the same original observation. (Compacting each column separately and
    re-joining the columns by position would pair values of unrelated rows.)

    Args:
        df (DataFrame): Input data, possibly containing NaNs. It is not modified.

    Returns:
        DataFrame: Copy of the rows without any NaN, re-indexed from 0.
    """
    actual_values_df = df.dropna(axis=0, how="any").reset_index(drop=True)
    rows = actual_values_df.shape
    print(f"Shape after dropping all NaNs: {rows}")

    return actual_values_df

In [None]:
def calculate_initial_missingness(df):
    """Return the share of null cells in ``df`` (number of null cells divided by ``df.size``)."""
    nulls_per_column = df.isnull().sum()
    return nulls_per_column.sum() / df.size

# loading data

In [None]:
def load_data(config):
    """
    Load the grouped DHS dataframe from disk and filter it according to ``config``.

    Steps: read the pickle, measure the missingness of the raw dataframe, prune
    columns through ``final_ds_droping_cols``, drop further columns and
    (optionally) countries, then apply the NaN policy named by
    ``config['process_nans']``.

    Args:
        config (dict): Run configuration. The keys read here are
            'egypt_dropping', 'countries_to_drop' and 'process_nans'.

    Returns:
        tuple: ``(df, initial_missingness)`` where ``initial_missingness`` is the
        fraction of null cells of the dataframe as loaded, i.e. before any
        column or row has been removed.
    """
    # hard-coded location and naming parts of the preprocessed input file
    input_dir = "/home/myuser/data/preprocessed_data/DHS_n_more/"
    dataset_type = 'HR'
    group_by_col = 'adm2_gaul'
    urban_rural_all_mode = 'all'
    drop_agriculture = False

    in_f = f"{input_dir}5_grouped_df_V3_{dataset_type}_{group_by_col}_joined_with_ipc_{urban_rural_all_mode}.pkl"
    df = pd.read_pickle(in_f)

    # measured on the raw dataframe, before any filtering below
    initial_missingness= calculate_initial_missingness(df)

    # project helper that prunes columns.
    # NOTE(review): drop_meta=True and retain_GEID_init=False are passed, yet the code
    # below still reads 'Meta; adm0_gaul' and 'Meta; GEID_init' — presumably the helper
    # keeps those columns; confirm against dhs_modelling_functions_new.
    df = final_ds_droping_cols(df, drop_meta=True, drop_food_help=True, drop_perc=40,
                               retain_month=False, drop_highly_correlated_cols=False, drop_region=True, 
                               drop_data_sets=['Meta one-hot encoding', 'Meta frequency encoding'], 
                               use_NAN_amount_and_replace_NANs_in_categorical=False, drop_agricultural_cols=drop_agriculture, 
                               drop_below_version=False, numerical_data=['mean'], retain_adm=False, 
                               retain_GEID_init=False, verbose=3)
    
    # drop every 'FS;' column except those whose name contains '0-2y'
    drop_cols = [c for c in df.columns if 'FS;' in c and '0-2y' not in c]
    df = df.drop(columns=drop_cols)
    # both translator columns must be present: DataFrame.drop raises KeyError for missing labels
    df.drop(columns=['DHS Cat; translator used: not at all', 'DHS Cat; translator used: yes'], axis=1, inplace=True)

    # despite its name, the 'egypt_dropping' flag removes every country in 'countries_to_drop'
    if config['egypt_dropping']:
        dropping = config['countries_to_drop']
        df = df[~df['Meta; adm0_gaul'].isin(dropping)]

    # handling NaNs based on the process_nans condition
    if config['process_nans'] == 'drop_all_nans':
        # keep only NaN-free data and return the dataframe immediately
        df = drop_all_nans(df)
        return df, initial_missingness
    
    elif config['process_nans'] == 'numerical_only':
        # no NaN handling in this mode: return the dataframe as filtered so far
        return df, initial_missingness
    
    elif config['process_nans'] == 'numerical_only_drop_20_percentage_nans':
        # remove whole surveys whose average missingness exceeds 20%
        if 'Meta; GEID_init' in df.columns:
            # per survey: NaN fraction of each column, averaged over the columns
            survey_missingness = df.groupby('Meta; GEID_init').apply(lambda x: x.isna().mean().mean())
            print(f"Original dataframe shape: {df.shape}")
            
            # keep the surveys whose missingness is at most 20%
            surveys_to_keep = survey_missingness[survey_missingness <= 0.2].index
            filtered_df = df[df['Meta; GEID_init'].isin(surveys_to_keep)]
            
            df = filtered_df.copy()
            print(f"Filtered dataframe shape: {filtered_df.shape}")
        else:
            print("Warning: 'Meta; GEID_init' column not found. Skipping survey filtering.")

        return df, initial_missingness

    # reached only when 'process_nans' matches none of the modes above
    return df, initial_missingness

In [None]:
# Load the dataset at notebook level; `input_df.shape` is only the displayed output of this cell.
# run_scenario() below calls load_data(config) again for its own copy.
input_df, initial_missingness =load_data(config)
input_df.shape

## customized fold generator function for cross-val step

In [None]:
def fold_generator_3_independent_indices(data, split_type, n_splits=5, verbose=1, val_size=0.2, random_state=42):
    """
    Generate index labels for train, validation and test sets based on the specified split type.

    For the grouped split types every group (country, survey or rounded survey
    year) lands in exactly one of the three sets of a fold. ``data`` is never
    modified: the grouping key for 'year' is computed as a separate Series and
    is not added as a column.

    Parameters:
    data (DataFrame): The input dataset.
    split_type (str): The type of split - 'country', 'survey', 'year' or 'unconditional'
        ('unconditional' splits the rows directly, without grouping).
    n_splits (int): Number of splits/folds for the outer cross-validation. For grouped
        splits, -1 or a value above the number of groups means one fold per group.
    verbose (int): Level of verbosity.
    val_size (float): Proportion of the train+validation part to include in the validation split.
    random_state (int): Seed for the fold shuffling and for the train/validation split.

    Yields:
    tuple: (train, validation, test) index labels of ``data`` for each fold. They are
        pandas Index objects for 'unconditional' and NumPy arrays otherwise.

    Raises:
    ValueError: If ``split_type`` is not one of the supported values (raised when the
        generator is first advanced).
    """
    if split_type == 'unconditional':
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        for train_val_idx, test_idx in kf.split(data):
            # Split the train_val positions into training and validation positions
            train_idx, val_idx = train_test_split(train_val_idx, test_size=val_size, random_state=random_state)
            yield data.index[train_idx], data.index[val_idx], data.index[test_idx]
        return

    if split_type == 'country':
        group_key = data['Meta; adm0_gaul']
    elif split_type == 'survey':
        group_key = data['Meta; GEID_init']
    elif split_type == 'year':
        # Mean survey year, rounded, per survey. Kept as a stand-alone Series so that
        # no helper column leaks into the caller's dataframe (and into the features).
        group_key = data.groupby('Meta; GEID_init')['Meta; year'].transform(lambda x: round(x.mean()))
    else:
        raise ValueError(f'Invalid split_type: {split_type}')

    unique_combinations = group_key.drop_duplicates().values

    # Adjust maximum n_splits based on the number of unique combinations
    if len(unique_combinations) < n_splits or n_splits == -1:
        n_splits = len(unique_combinations)
        if verbose:
            print(f'Adjusting n_splits to the length of unique combinations ({n_splits}) for', split_type)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for train_val_combinations, test_combinations in kf.split(unique_combinations):
        # Split the train_val groups into training and validation groups
        train_combinations, val_combinations = train_test_split(
            train_val_combinations, test_size=val_size, random_state=random_state
        )

        # Create row masks for training, validation, and test sets
        train_mask = group_key.isin(unique_combinations[train_combinations])
        val_mask = group_key.isin(unique_combinations[val_combinations])
        test_mask = group_key.isin(unique_combinations[test_combinations])

        # Yielding the index labels for train, validation and test sets
        yield data[train_mask].index.values, data[val_mask].index.values, data[test_mask].index.values

## Standardizing the data when `scale_numerical_data` is true

In [None]:
def prepare_data(X_train, X_val, X_test, config):
    """
    Optionally drop the configured columns and standardize the three splits.

    In 'drop_all_nans' mode the columns listed in ``config['drop_columns']`` are
    removed from every split first. When ``config['scale_numerical_data']`` is
    set, a StandardScaler is fitted on the training split only and applied to
    all three; the scaled results are new DataFrames that keep the column names
    but get a fresh default index.

    Returns:
        tuple: (train, validation, test) splits after the steps above.
    """
    if config['process_nans'] == 'drop_all_nans':
        unwanted = config['drop_columns']
        X_train = X_train.drop(columns=unwanted)
        X_val = X_val.drop(columns=unwanted)
        X_test = X_test.drop(columns=unwanted)

    # nothing else to do when scaling is switched off
    if not config['scale_numerical_data']:
        return X_train, X_val, X_test

    scaler = StandardScaler()
    train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
    val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
    test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    return train_scaled, val_scaled, test_scaled

# GAIN method

In [None]:
# adaptation from: https://github.com/DeltaFloflo/imputation_comparison/blob/main/build/class_gain.py
# -----------------------
# Normalization Utilities
# -----------------------

def normalization(data, norm_params=None):
    """
    Min-max scale every column of a 2-D array, ignoring NaNs when fitting.

    Each column is mapped with ``(x - min) / (max - min + 1e-6)``.

    Args:
        data (numpy array): 2-D input data to normalize; it is not modified.
        norm_params (dict): Optional dict with the keys "min_val" and "max_val"
            (one entry per column). When omitted, the NaN-aware minimum and
            maximum of each column of ``data`` are used.

    Returns:
        norm_data (numpy array): Normalized copy of ``data``.
        norm_params (dict): The parameters that were applied (the object passed
            in, or a newly built dict).
    """
    _, n_cols = data.shape
    fit_params = norm_params is None

    if fit_params:
        lows = [np.nanmin(data[:, col]) for col in range(n_cols)]
        highs = [np.nanmax(data[:, col]) for col in range(n_cols)]
    else:
        lows = norm_params["min_val"]
        highs = norm_params["max_val"]

    norm_data = data.copy()
    for col in range(n_cols):
        lo, hi = lows[col], highs[col]
        norm_data[:, col] = (data[:, col] - lo) / (hi - lo + 1e-6)

    if fit_params:
        norm_params = {
            "min_val": np.array(lows, dtype=float),
            "max_val": np.array(highs, dtype=float),
        }
    return norm_data, norm_params

def renormalization(norm_data, norm_params):
    """
    Undo the min-max scaling of ``normalization`` column by column.

    Each column is mapped with ``x * (max - min + 1e-6) + min``.

    Args:
        norm_data (numpy array): 2-D normalized data; it is not modified.
        norm_params (dict): Dict with the keys "min_val" and "max_val" (one entry per column).

    Returns:
        data (numpy array): Copy of ``norm_data`` rescaled to the original range.
    """
    _, n_cols = norm_data.shape
    lows, highs = norm_params["min_val"], norm_params["max_val"]

    restored = norm_data.copy()
    for col in range(n_cols):
        lo = lows[col]
        span = highs[col] - lo + 1e-6
        restored[:, col] = norm_data[:, col] * span + lo
    return restored

# ----------------------
# Model Reset Utility
# ----------------------

def reset_weights(model):
    """
    Re-draw the parameters of all dense and batch-normalization layers of ``model``.

    Layers are recognised by the prefix of their name ("dense" /
    "batch_normalization"); every other layer is left untouched.
    """
    for layer in model.layers:
        layer_name = layer.name

        if layer_name.startswith("dense"):
            # kernel shape is (input dimension, output dimension)
            kernel_shape = layer.kernel.shape
            fan_in, fan_out = kernel_shape[0], kernel_shape[1]
            bound = np.sqrt(6.0 / (fan_in + fan_out))

            # uniform kernel in [-bound, bound), zero bias
            new_kernel = np.random.uniform(-bound, bound, size=kernel_shape)
            new_bias = np.zeros(shape=layer.bias.shape)
            layer.set_weights([new_kernel, new_bias])

        elif layer_name.startswith("batch_normalization"):
            # all four parameter vectors share the shape of gamma
            param_shape = layer.gamma.shape
            layer.set_weights([
                np.ones(shape=param_shape),   # gamma
                np.zeros(shape=param_shape),  # beta
                np.zeros(shape=param_shape),  # moving mean
                np.ones(shape=param_shape),   # moving variance
            ])

# ----------------------------------------
# Mask Distribution and Sampling Functions
# ----------------------------------------

def maskDistribution(dataset):
    """
    Tabulate the distinct missingness patterns of a 2-D array.

    Every row is turned into a 0/1 mask (1 = observed, 0 = NaN); the distinct
    masks and how often each one occurs are returned.

    Args:
        dataset (numpy array): 2-D data that may contain NaNs.

    Returns:
        unique_masks (numpy array): Distinct row masks, in the sorted order produced by ``np.unique``.
        count_masks (numpy array): Number of rows showing each mask, aligned with ``unique_masks``.
    """
    mask = (~np.isnan(dataset)).astype("int")
    # np.unique counts the rows itself, so no per-row search through the unique masks is needed
    unique_masks, count_masks = np.unique(mask, axis=0, return_counts=True)
    return unique_masks, count_masks.astype("int")

def drawMasks(unique_masks, probs, N):
    """
    Sample N masks from ``unique_masks`` according to ``probs``.

    unique_masks: array of unique masks from which to choose
    probs: vector of probabilities, one per mask (should sum up to one)
    N: number of samples to draw
    Returns an array with one drawn mask per row.
    """
    # one single-trial multinomial draw per sample: every row is one-hot
    one_hot_draws = np.random.multinomial(n=1, pvals=probs, size=N)
    _, chosen = np.nonzero(one_hot_draws == 1)
    return unique_masks[chosen]

def drawHintMatrix(p, nb_rows, nb_cols):
    """
    Draw a random 0/1 hint matrix for GAIN training.

    Args:
        p (float): Probability that an entry is 1.
        nb_rows (int): Number of rows of the matrix.
        nb_cols (int): Number of columns of the matrix.

    Returns:
        H (numpy array): Float matrix of shape (nb_rows, nb_cols) holding 0.0 and 1.0.
    """
    uniform_draws = np.random.uniform(0., 1., size=(nb_rows, nb_cols))
    return (uniform_draws < p).astype(np.float64)

# --------------------------------------------
# GAIN Generator and Discriminator Builders
# --------------------------------------------

# generator network of GAIN for num+cat dataset
def make_GAINgen(dim, hidden_units=(475, 855, 855)):
    """
    Create the generator model for GAIN.

    The network takes the data concatenated with its mask (``2 * dim`` inputs).
    Every hidden block is a Dense layer with ELU activation followed by batch
    normalization; the output is a linear Dense layer of width ``dim``.

    Args:
        dim (int): Dimensionality of the input data.
        hidden_units (sequence of int): Width of each hidden Dense layer, in order.
            The default reproduces the previously hard-coded 475-855-855 layout.

    Returns:
        model (tf.keras.Sequential): Generator model.

    Raises:
        ValueError: If ``hidden_units`` is empty.
    """
    if not hidden_units:
        raise ValueError("hidden_units must contain at least one layer width")

    model = Sequential()
    for position, units in enumerate(hidden_units):
        # only the first layer declares the input shape
        first_layer_kwargs = {"input_shape": (2 * dim,)} if position == 0 else {}
        model.add(Dense(units, activation="elu", **first_layer_kwargs))
        model.add(BatchNormalization())
    model.add(Dense(dim, activation="linear"))
    return model

# discriminator network of GAIN
def make_GAINdisc(dim, hidden_units=(475, 855, 855), dropout_rate=0.4):
    """
    Create the discriminator model for GAIN.

    The network takes the imputed data concatenated with the hint matrix
    (``2 * dim`` inputs). Every hidden block is a Dense layer with ELU
    activation followed by dropout; the output is a sigmoid Dense layer of
    width ``dim``.

    Args:
        dim (int): Dimensionality of the input data.
        hidden_units (sequence of int): Width of each hidden Dense layer, in order.
            The default (475-855-855) was configured manually for the dataset
            mixing categorical and numerical features.
        dropout_rate (float): Dropout rate applied after every hidden layer.

    Returns:
        model (tf.keras.Sequential): Discriminator model.

    Raises:
        ValueError: If ``hidden_units`` is empty.
    """
    if not hidden_units:
        raise ValueError("hidden_units must contain at least one layer width")

    model = Sequential()
    for position, units in enumerate(hidden_units):
        # only the first layer declares the input shape
        first_layer_kwargs = {"input_shape": (2 * dim,)} if position == 0 else {}
        model.add(Dense(units, activation="elu", **first_layer_kwargs))
        model.add(Dropout(rate=dropout_rate))
    model.add(Dense(dim, activation="sigmoid"))
    return model

# -------------------------------------------
# GAIN Code Version 2 (Training + Imputation)
# -------------------------------------------

class GAIN_code_v2:
    """
    GAIN (Generative Adversarial Imputation Networks) implementation for imputing missing data.

    The generator fills the missing cells of a batch; the discriminator tries to
    tell, cell by cell, which values were observed and which were generated,
    helped by a partially revealed mask (the hint matrix).
    """
    def __init__(self, dim):
        """
        Initialize the GAIN model with generator and discriminator.
        Args:
            dim (int): Dimensionality of the input data.
        """
        self.dim = dim
        self.G = make_GAINgen(dim)  # generator
        self.D = make_GAINdisc(dim)  # discriminator
        self.Goptim = tf.keras.optimizers.Adam(0.001)  # generator optimizer
        self.Doptim = tf.keras.optimizers.Adam(0.001)  # discriminator optimizer
        self.alpha = 50  # weight of the reconstruction loss relative to the adversarial loss
        self.hint_rate = 0.9  # probability that a cell of the mask is revealed to the discriminator
        self.trained = False  # set to True by train() once at least one epoch has run
        self.nb_epochs = 0  # number of epochs trained since the last (re)initialization
        self.Gloss1 = []  # adversarial loss, one mean value per epoch
        self.Gloss2 = []  # reconstruction loss, one mean value per epoch
        self.Dloss = []  # discriminator loss, one mean value per epoch

    # discriminator loss: Binary cross-entropy with hint masking
    @staticmethod
    def compute_D_loss(D_output, M, H):
        """
        Compute the discriminator loss during training.

        Binary cross-entropy between the discriminator output and the mask M,
        averaged over the cells whose hint is hidden (H == 0.5). Returns 0.0
        when no cell is hidden.
        """
        L1 = M * tf.math.log(D_output + 1e-6)
        L2 = (1.0 - M) * tf.math.log(1.0 - D_output + 1e-6)
        L = - (L1 + L2) * tf.cast((H == 0.5), dtype=tf.float32)
        nb_cells = tf.math.reduce_sum(tf.cast((H == 0.5), dtype=tf.float32))
        return tf.math.reduce_sum(L) / nb_cells if nb_cells > 0 else 0.0

    # generator loss: fool discriminator + minimize reconstruction error
    @staticmethod
    def compute_G_loss(G_output, D_output, X, M, H):
        """
        Compute the generator loss during training.

        Returns:
            loss1: Adversarial term, computed on missing cells whose hint is hidden (H == 0.5).
            loss2: Mean squared reconstruction error on the observed cells (M == 1).
        """
        Ltemp = - ((1.0 - M) * tf.math.log(D_output + 1e-6))
        L = Ltemp * tf.cast((H == 0.5), dtype=tf.float32)
        nb_cells1 = tf.math.reduce_sum(tf.cast((H == 0.5), dtype=tf.float32))
        loss1 = tf.math.reduce_sum(L) / nb_cells1 if nb_cells1 > 0 else 0.0
        squared_err = ((X - G_output) ** 2) * M
        nb_cells2 = tf.math.reduce_sum(M)
        loss2 = tf.math.reduce_sum(squared_err) / nb_cells2 if nb_cells2 > 0 else 0.0
        return loss1, loss2

    # reset model for fresh training
    def reinitialize(self):
        """
        Reinitialize the weights of both generator and discriminator models.

        This is useful for resetting the models to their initial states before retraining.
        Also clears the training history (losses and epoch counter).
        """
        reset_weights(self.G)
        reset_weights(self.D)
        self.trained = False
        self.nb_epochs = 0
        self.Gloss1 = []
        self.Gloss2 = []
        self.Dloss = []

    # single training step (compiled for speed with @tf.function)
    @tf.function
    def train_step(self, batch_data):
        """
        Perform a single training step for the GAIN model.

        Args:
            batch_data (tf.Tensor): Batch of data with missing values (NaNs).

        Returns:
            G_loss1 (float): Generator adversarial loss for the batch.
            G_loss2 (float): Generator reconstruction loss for the batch.
            D_loss (float): Discriminator loss for the batch.

        Steps:
        1. Generate a mask matrix `M` indicating observed values (1) and missing values (0).
        2. Replace missing values with random noise to create `X`.
        3. Draw a fresh hint matrix `H` for this step.
        4. Train the generator (`G`) and discriminator (`D`) using separate gradient updates.
        5. Calculate and return the losses for monitoring.
        """
        cur_batch_size = batch_data.shape[0]
        noise = tf.random.normal([cur_batch_size, self.dim], dtype=tf.float32)
        batch_data = tf.cast(batch_data, dtype=tf.float32)  # Ensure batch_data is float32
        is_missing = tf.math.is_nan(batch_data)
        M = 1.0 - tf.cast(is_missing, dtype=tf.float32)  # 0=NaN, 1=obs.
        X = tf.where(is_missing, noise, batch_data)
        G_input = tf.concat((X, M), axis=1)

        # The hint must be sampled with TensorFlow ops: inside a tf.function a NumPy
        # draw is executed only once, while the graph is traced, and would then be
        # reused unchanged for every later step.
        uniform_draws = tf.random.uniform([cur_batch_size, self.dim], 0.0, 1.0, dtype=tf.float32)
        Htemp = tf.cast(uniform_draws < self.hint_rate, dtype=tf.float32)
        H = M * Htemp + 0.5 * (1.0 - Htemp)  # revealed cells carry M, hidden cells carry 0.5

        with tf.GradientTape() as G_tape, tf.GradientTape() as D_tape:
            G_output = self.G(G_input, training=True)
            X_hat = X * M + G_output * (1.0 - M)
            D_input = tf.concat((X_hat, H), axis=1)
            D_output = self.D(D_input, training=True)

            D_loss = self.compute_D_loss(D_output, M, H)
            G_loss1, G_loss2 = self.compute_G_loss(G_output, D_output, X, M, H)
            G_loss = G_loss1 + self.alpha * G_loss2

        # both gradients are taken before either network is updated
        G_gradients = G_tape.gradient(G_loss, self.G.trainable_variables)
        D_gradients = D_tape.gradient(D_loss, self.D.trainable_variables)

        self.Goptim.apply_gradients(zip(G_gradients, self.G.trainable_variables))
        self.Doptim.apply_gradients(zip(D_gradients, self.D.trainable_variables))

        return G_loss1, G_loss2, D_loss

    # full training loop
    def train(self, dataset, batch_size, epochs):
        """
        Train the GAIN model on the given dataset.

        Args:
            dataset (numpy array): Dataset containing missing values (NaNs).
            batch_size (int): Number of samples per batch.
            epochs (int): Number of epochs for training.

        Process:
        - For each epoch, the dataset is divided into consecutive batches (no shuffling).
        - Each batch is passed through `train_step` to update the model weights.
        - Losses (adversarial, reconstruction, and discriminator) are recorded for each epoch.
        - `nb_epochs` is increased per epoch and `trained` is set once an epoch has run.
        """
        for epoch in range(epochs):
            G_temp1, G_temp2, D_temp = [], [], []
            for batch_idx in range(0, dataset.shape[0], batch_size):
                batch_data = dataset[batch_idx:batch_idx + batch_size]
                G_loss1, G_loss2, D_loss = self.train_step(batch_data)
                G_temp1.append(G_loss1.numpy())
                G_temp2.append(G_loss2.numpy())
                D_temp.append(D_loss.numpy())
            self.Gloss1.append(np.mean(G_temp1))
            self.Gloss2.append(np.mean(G_temp2))
            self.Dloss.append(np.mean(D_temp))
            self.nb_epochs += 1
            self.trained = True

    # imputing missing values using the trained generator
    def impute(self, nandata):
        """
        Impute missing values in the dataset using the trained GAIN model.

        Args:
            nandata (numpy array): Dataset containing missing values (NaNs).

        Returns:
            imputed_data (numpy array): float32 dataset with missing values replaced by imputed values.

        Process:
        - Missing values are replaced by the generator's output.
        - Observed values remain unchanged.
        """
        # same explicit float32 cast as in train_step, so every tensor below shares one dtype
        nandata = np.asarray(nandata, dtype=np.float32)
        is_missing = np.isnan(nandata)
        M_impute = (1.0 - is_missing).astype(np.float32)
        noise = tf.random.normal([nandata.shape[0], self.dim], dtype=tf.float32)
        X_impute = tf.where(is_missing, noise, nandata)
        G_input = tf.concat((X_impute, M_impute), axis=1)
        G_output = self.G(G_input, training=False)
        imputed_data = (X_impute * M_impute + G_output * (1.0 - M_impute)).numpy()
        return imputed_data

## Step 1: Running Final Cross-Validation for Best-Tuned Architectures

In [None]:
# this section initializes fold-level execution across all imputers (AE, DAE, VAE, GAIN, etc.)
# using 5-fold cross-validation to benchmark methods' performances.

def _knn_impute_numeric_fold(X_train, X_val, X_test, fold, masking_dir):
    """
    Return KNN-imputed numeric train/val/test frames for one fold, cached on disk.

    Only numeric columns are kept. If all three pickle files for this fold
    already exist under ``masking_dir`` they are loaded; otherwise a
    ``KNNImputer`` is fitted on the training split, applied to all three
    splits, and the results are pickled for later runs.

    Args:
        X_train, X_val, X_test (pd.DataFrame): Raw fold splits.
        fold (int): Fold number, used in the cache file names.
        masking_dir (str): Scenario directory that holds the cache sub-directory.

    Returns:
        tuple: ``(X_train, X_val, X_test)`` after imputation (or as loaded from cache).
    """
    X_train = X_train.select_dtypes(include=[np.number])
    X_val = X_val.select_dtypes(include=[np.number])
    X_test = X_test.select_dtypes(include=[np.number])

    # directory for saving imputed data
    imputation_dir_numerical = os.path.join(masking_dir, 'numerical_only_initial_fold_imputation_step_1')
    os.makedirs(imputation_dir_numerical, exist_ok=True)

    cache_files = {
        split: os.path.join(imputation_dir_numerical, f"{split}_imputed_fold_{fold}.pkl")
        for split in ('train', 'val', 'test')
    }

    if all(os.path.exists(path) for path in cache_files.values()):
        print(f"Loading pre-imputed data for fold {fold}...")
        # NOTE: pickle must only be used on files written by this pipeline;
        # never point this cache directory at untrusted data.
        loaded = {}
        for split, path in cache_files.items():
            with open(path, 'rb') as f:
                loaded[split] = pickle.load(f)
        return loaded['train'], loaded['val'], loaded['test']

    print(f"Applying KNN imputation for fold {fold}...")

    # fit on the training split only, then apply the same imputer to val/test
    # NOTE(review): by default KNNImputer drops columns that are entirely NaN at
    # fit time, which would not match `columns=X_train.columns` — confirm no
    # such column reaches this point.
    imputer = KNNImputer(n_neighbors=5)
    X_train = pd.DataFrame(imputer.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
    X_val = pd.DataFrame(imputer.transform(X_val), index=X_val.index, columns=X_val.columns)
    X_test = pd.DataFrame(imputer.transform(X_test), index=X_test.index, columns=X_test.columns)

    # saving the imputed data
    for split, frame in (('train', X_train), ('val', X_val), ('test', X_test)):
        with open(cache_files[split], 'wb') as f:
            pickle.dump(frame, f)
    print(f"Imputed data saved for fold {fold}.")

    return X_train, X_val, X_test


def run_scenario(config, masking=True):
    """
    Run the full cross-validation pipeline for one masking scenario.

    For every fold the function prepares the data (optional KNN pre-imputation,
    scaling, masking), runs the autoencoder family via ``run_models``, the
    baseline imputers via ``run_baseline_imputations`` and GAIN_v2 via
    ``run_gain_method_v2``, and collects their metrics. After all folds it
    computes Levene/ANOVA statistics and hands everything to
    ``process_and_save_fold_statistics``.

    Args:
        config (dict): Pipeline configuration. This function reads the keys
            'base_dir', 'missingness_fraction' and 'process_nans' and passes the
            whole dict on to ``load_data``, ``prepare_data`` and ``save_fold_data``.
        masking (bool): Selects the 'with_masking' / 'without_masking' output
            directory and is forwarded to the masking and evaluation helpers.

    Returns:
        tuple: ``(results_df, average_metrics, fold_info_df, survey_r2_df,
        column_metrics_df)`` as returned by ``process_and_save_fold_statistics``.

    Notes:
        Relies on the module-level names ``optimizer_configs`` and ``activations``.
    """
    # --- 1.1 Setup directories and read input data
    masking_dir = os.path.join(config['base_dir'], 'with_masking' if masking else 'without_masking')

    imputation_dir_knn = os.path.join(masking_dir, 'baseline_imputations')
    os.makedirs(imputation_dir_knn, exist_ok=True)

    imputation_dir_gain_v2 = os.path.join(masking_dir, 'gain_imputation_v2')
    os.makedirs(imputation_dir_gain_v2, exist_ok=True)

    input_df, initial_missingness = load_data(config)
    rows = input_df.shape[0]
    fold_gen = list(fold_generator_3_independent_indices(input_df, split_type='survey', n_splits=5))
    print(f"Cross-validation is executing with masking={masking}...")

    # --- 1.2 Initialize metric containers
    results_list, survey_r2_excel_list, column_metrics = [], [], []
    fold_info_list, folds_data = [], []
    overall_rmse_values, overall_r2_values, overall_correlation_values = [], [], []
    all_folds_stats = {}  # fold statistics, filled by calculate_fold_statistics

    results_dir = config['base_dir']
    missingness_fraction = config['missingness_fraction']

    for fold, (train_index, val_index, test_index) in enumerate(fold_gen):
        print(f"Processing fold {fold} with masking={masking}")
        # --- 1.3 Load and preprocess the fold data
        X_train, X_val, X_test = input_df.loc[train_index], input_df.loc[val_index], input_df.loc[test_index]
        test_surveys = X_test['Meta; GEID_init'].copy()
        # untouched copies keep the meta columns needed for the fold info below
        X_train_ori = X_train.copy()
        X_val_ori = X_val.copy()
        X_test_ori = X_test.copy()

        if config['process_nans'] in ('numerical_only', 'numerical_only_drop_20_percentage_nans'):
            X_train, X_val, X_test = _knn_impute_numeric_fold(X_train, X_val, X_test, fold, masking_dir)
        else:
            print(f"Skipping KNN imputation and drop_all_nans for fold {fold}. No imputation will be performed.")

        # storing fold information
        fold_info_list.append({
            'Fold': fold,
            'Train Shape': X_train_ori.shape,
            'Validation Shape': X_val_ori.shape,
            'Test Shape': X_test_ori.shape,
            'Surveys in Train': X_train_ori['Meta; GEID_init'].unique().tolist(),
            'Surveys in Validation': X_val_ori['Meta; GEID_init'].unique().tolist(),
            'Surveys in Test': X_test_ori['Meta; GEID_init'].unique().tolist(),
            'Countries in Train': X_train_ori['Meta; adm0_gaul'].unique().tolist(),
            'Countries in Validation': X_val_ori['Meta; adm0_gaul'].unique().tolist(),
            'Countries in Test': X_test_ori['Meta; adm0_gaul'].unique().tolist()
        })

        # --- 1.4 Scale and mask data
        X_train_scaled, X_val_scaled, X_test_scaled = prepare_data(X_train, X_val, X_test, config)

        # masking copies of the scaled validation and test data
        X_val_with_missing, _ = apply_masking(X_val_scaled.copy(), masking, missingness_fraction)
        X_test_with_missing, _ = apply_masking(X_test_scaled.copy(), masking, missingness_fraction)

        # --- 1.5 Initial KNN for AE/DAE/VAE input
        X_val_imputed, X_test_imputed = initial_knn_imputed_data(X_train_scaled, X_val_with_missing, X_test_with_missing, fold, imputation_dir_knn)
        # saving fold data
        save_fold_data(fold, X_train, X_val, X_test, X_train_scaled, X_val_scaled, X_test_scaled, X_val_imputed, X_test_imputed, config, masking)

        # --- 1.6 Prepare noisy input for DAE
        noise_factor = 0.2
        X_train_scaled_noisy = X_train_scaled + noise_factor * np.random.normal(loc=0.0, scale=1.0, size=X_train_scaled.shape)
        # NOTE(review): the clip assumes the scaled data lies in [0, 1] — confirm against prepare_data
        X_train_scaled_noisy = np.clip(X_train_scaled_noisy, 0., 1.)

        ############################## aes' imputation and evaluation of aes' performance #########################

        # clear any previous TensorFlow state
        tf.keras.backend.clear_session()

        # --- 1.7 Run AE, DAE, VAE
        # Auto + Manual both (recommended default)
        # NOTE(review): input_columns_amount is hard-coded to 95 while GAIN below
        # uses X_train_scaled.shape[1] — confirm both describe the same width.
        layers_configurations, vae_layers_configuration = get_layer_configurations(config, input_columns_amount=95, use_automated_layers=True,
                                                                                  combine_layers=True,  # True = auto + manual, False = only auto
                                                                                  max_configs=10)

        aes_rmse, aes_r2, aes_corr = run_models(X_train_scaled, X_val_imputed, X_test_imputed, X_val_scaled, X_train_scaled_noisy,
                                                layers_configurations, vae_layers_configuration, optimizer_configs,
                                                activations, X_test_scaled, test_surveys, fold, masking_dir,
                                                rows, initial_missingness, missingness_fraction, masking,
                                                results_list, survey_r2_excel_list, column_metrics)
        # storing metrics for ae, dae, and vae
        overall_rmse_values.append(aes_rmse)
        overall_r2_values.append(aes_r2)
        overall_correlation_values.append(aes_corr)

        # --- 1.8 Run baseline imputers
        rmse_single, r2_single, corr_single = run_baseline_imputations(X_train_scaled, X_test_with_missing, X_val_with_missing, X_test_scaled, X_val_scaled,
                                              test_surveys, fold, masking, rows, initial_missingness, missingness_fraction,
                                              imputation_dir_knn, results_list, survey_r2_excel_list, column_metrics)

        # storing metrics
        overall_rmse_values.append(rmse_single)
        overall_r2_values.append(r2_single)
        overall_correlation_values.append(corr_single)

        ###################################### imputing data using GAIN_v2 #######################################
        # clear any previous TensorFlow state
        tf.keras.backend.clear_session()

        gain_model_v2 = GAIN_code_v2(dim=X_train_scaled.shape[1])

        # --- 1.9 Run GAIN_v2
        gain_rmse_v2, gain_r2_v2, gain_corr_v2 = run_gain_method_v2(X_train_scaled, X_val_scaled, X_test_scaled, X_test_with_missing,
                                                 gain_model_v2, test_surveys, fold, results_list, survey_r2_excel_list,
                                                 column_metrics, "GAIN_v2", rows, initial_missingness,
                                                 missingness_fraction, masking, imputation_dir_gain_v2)

        # storing the results
        overall_rmse_values.append(gain_rmse_v2)
        overall_r2_values.append(gain_r2_v2)
        overall_correlation_values.append(gain_corr_v2)

        ########################################## end of GAIN_v2 section ########################################

        # --- 1.10 Fold statistics and ANOVA/Levene data
        all_folds_stats = calculate_fold_statistics(X_test, all_folds_stats, fold)
        # storing the test data for Levene and ANOVA calculations
        folds_data.append(X_test_scaled)

    # --- 1.11 Levene's and ANOVA Tests
    levene_df, anova_df = calculate_levene_anova_stats(folds_data)

    # --- 1.12 Save results and return
    # processing and saving fold statistics, Levene's Test and ANOVA
    results_df, average_metrics, fold_info_df, survey_r2_df, column_metrics_df = process_and_save_fold_statistics(
            results_dir, all_folds_stats, overall_rmse_values, overall_r2_values, overall_correlation_values,
            results_list, fold_info_list, survey_r2_excel_list, column_metrics, masking,
            levene_df, anova_df)

    return results_df, average_metrics, fold_info_df, survey_r2_df, column_metrics_df

## Step 2: Execute Masked and Unmasked Scenarios and Save Final Cross-Validation Results

The following Excel file summarizes the evaluation of imputation methods with and without masking.
It contains **five key sheets**, each serving a distinct purpose:

---

## 1. Sheet: `Combined Results`
- Contains all per-fold evaluation metrics across all models.
- Includes both masked and unmasked missingness scenarios.
- Metrics include:
  - RMSE
  - R2 Score
  - Correlation
  - Method name
  - Fold number
  - Missingness type

---

## 2. Sheet: `Average Results`
- Shows **mean and standard deviation** of RMSE, R2 and correlation.
- Grouped by method and missingness type (with/without masking).
- Helps in comparing overall performance of methods.

---

## 3. Sheet: `Fold Info`
- Stores metadata about each fold:
  - Fold number
  - Train/Val/Test shapes
  - Surveys and countries in each split
- Useful for ensuring fair and consistent fold splits.

---

## 4. Sheet: `Survey R2 Scores`
- R2 scores **per survey** for each method.
- Helps track method performance on specific countries/surveys.
- Important for fairness and generalization analysis.

---

## 5. Sheet: `Column Metrics`
- Evaluation metrics **per column** across folds and methods.
- Shows how well each method imputes individual features.
- Useful for feature-wise imputation quality insights.


In [None]:
# this part runs the complete scenario with and without masking, aggregates results,
# and exports all evaluation metrics into an excel file.

def main():
    """
    Run the cross-validation pipeline with and without masking and export the results.

    Builds the configuration, prepares the directories, calls ``run_scenario``
    once per masking setting, stacks the matching result frames of both runs
    and writes them to one Excel workbook (one sheet per result frame) inside
    ``config['base_dir']``.
    """
    config = {
        'scale_numerical_data': True,
        'masking': True,
        'drop_countries': True,
        'egypt_dropping': True,
        'process_nans': 'numerical_only_drop_20_percentage_nans',
        'drop_columns': ['Meta; adm0_gaul', 'Meta; GEID_init', 'Meta; year'],
        'countries_to_drop': ['Egypt', 'Comoros', 'Central African Republic'],
        'missingness_fraction': 0.3,
        'base_dir': '/home/myuser/prj/code/final_computation/cross_validation_github'
    }

    # prepare the directory layout for this configuration
    setup_directories(config)

    # one full run per masking setting; each returns the same sequence of frames
    masked_outputs = run_scenario(config, masking=True)
    unmasked_outputs = run_scenario(config, masking=False)

    # stack the masked and unmasked version of every result frame
    merged_frames = []
    for masked_df, unmasked_df in zip(masked_outputs, unmasked_outputs):
        merged_frames.append(pd.concat([masked_df, unmasked_df], ignore_index=True))

    # sheet titles in the order in which run_scenario returns its frames
    sheet_order = (
        'Combined Results',
        'Average Results',
        'Fold Info',
        'Survey R2 Scores',
        'Column Metrics',
    )

    workbook_path = os.path.join(config['base_dir'], 'combined_cross_results.xlsx')
    with pd.ExcelWriter(workbook_path) as writer:
        for position, sheet_title in enumerate(sheet_order):
            merged_frames[position].to_excel(writer, sheet_name=sheet_title, index=False)

    print(f"Combined results have been saved to {workbook_path}")

# Script entry point: run the full pipeline only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()