<h1><center><u> Code Implementation </u></center></h1>

In [None]:
!pip install pandarallel

In [None]:
!pip install imbalanced-learn

In [None]:
!pip install pyreadstat

In [None]:
# `os` is part of the Python standard library and is imported directly below,
# so there is nothing to install for it with pip.

In [None]:
#importing Libraries
import os
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
import matplotlib.pyplot as plt
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from sklearn.model_selection import KFold
from tensorflow.keras.regularizers import L2
from tensorflow.keras.models import Sequential
from keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from icecream import ic
from dateutil.relativedelta import relativedelta
# from dhs_preprocessing_functions import *
from pandarallel import pandarallel
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# initialization
pandarallel.initialize()
from keras import Sequential, layers, regularizers, optimizers
from dhs_modelling_functions import final_ds_droping_cols

In [None]:
#setting up random seeds for reproducibility
tf.random.set_seed(6688)
random.seed(6688)
np.random.seed(6688)

In [None]:
# Turn on memory growth for every GPU TensorFlow can see.
# Any RuntimeError raised by TensorFlow here simply propagates to the caller.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

<h1><center> <div class="alert alert-danger"> Regular Autoencoder </div> </center></h1>

In [None]:
#importing data
input_df = pd.read_csv("5_grouped_df_V3_HR_adm2_gaul_joined_with_ipc_all.csv")
input_df.head()

In [None]:
# listing all the column names from the dataframe
#all_column_names = input_df.columns.tolist()
#all_column_names

In [None]:
# counting the number of null or missing values in each column
#pd.set_option('display.max_rows', 1620)
#null_counts = input_df.isnull().sum()
#print(null_counts)

In [None]:
numeric_df = final_ds_droping_cols(input_df, drop_meta=True, drop_food_help=True, drop_perc=25, 
                           drop_data_sets=['DHS Cat', 'Meta one-hot encoding', 'Meta frequency encoding'], 
                           numerical_data=['std'], retain_year=True,
                 retain_adm=False, retain_month=False, drop_highly_correlated_cols=False, drop_region=True, verbose=1)

In [None]:
# dropping columns starting with 'Meta' and 'index'
#numeric_df= numeric_df.loc[:, ~numeric_df.columns.str.startswith(('Meta', 'index'))]

In [None]:
string_df = final_ds_droping_cols(input_df.copy(), drop_meta=True, drop_food_help=True, drop_perc=25, 
                           drop_data_sets=['Meta', 'DHS Num', 'Meta one-hot encoding', 'Meta frequency encoding'], 
                           numerical_data=['std'],
                 retain_adm=False, retain_month=False, drop_highly_correlated_cols=False, drop_region=True, verbose=1)

# Only the 'std' statistics are kept, because the mean, median, skewness, etc. are not imputed well

In [None]:
combined_df = final_ds_droping_cols(input_df.copy(), drop_meta=True, drop_food_help=True, drop_perc=30, 
                           drop_data_sets=['Meta', 'Meta one-hot encoding', 'Meta frequency encoding'], 
                           numerical_data=['std'],
                 retain_adm=False, retain_month=False, drop_highly_correlated_cols=False, drop_region=True, verbose=1)

In [None]:
combined_df = combined_df.loc[:, ~combined_df.columns.str.startswith(('index'))]

# Rescaling by multiplication, as the autoencoder cannot capture the raw data points

In [None]:
# multiplying all columns starting with 'DHS Num' by 100
dhs_num_cols = combined_df.filter(regex='^DHS Num').columns
combined_df[dhs_num_cols] = combined_df[dhs_num_cols] * 100

# multiplying all columns starting with 'FS' by 100
fs_cols = combined_df.filter(regex='^FS').columns
combined_df[fs_cols] = combined_df[fs_cols] * 100

In [None]:
# multiplying all columns starting with 'DHS Cat' by 1000
dhs_cat_cols = combined_df.filter(regex='^DHS Cat').columns
combined_df[dhs_cat_cols] = combined_df[dhs_cat_cols] * 1000

# temporarily filling up missing spaces with median and mode

In [None]:
# Fill gaps in the continuous columns ('DHS Num...' and 'FS...') with each
# column's own median.
for col in combined_df.columns:
    if col.startswith(('DHS Num', 'FS')):
        median_value = combined_df[col].median()
        # Assign the filled Series back to the frame. Calling
        # fillna(..., inplace=True) on `combined_df[col]` is chained assignment:
        # pandas 2.x warns about it and, with Copy-on-Write, the parent frame is
        # left unchanged.
        combined_df[col] = combined_df[col].fillna(median_value)

In [None]:
# Fill gaps in the categorical columns (name contains 'DHS Cat') with each
# column's most frequent value.
for col in combined_df.columns:
    if 'DHS Cat' in col:
        modes = combined_df[col].mode()
        # mode() ignores NaN, so it is empty only when the whole column is
        # missing; indexing [0] would raise in that case, so skip the column.
        if not modes.empty:
            mode_value = modes.iloc[0]
            # Assign back instead of the chained fillna(..., inplace=True),
            # which does not update the frame under pandas Copy-on-Write.
            combined_df[col] = combined_df[col].fillna(mode_value)

In [None]:
pd.set_option('display.max_rows', 500)
null_counts = combined_df.isnull().sum()
print(null_counts)

# The dataset's columns are highly skewed, with imbalanced data points

 - Applied SMOTE to oversample, but could not obtain balanced data points for the DHS Cat columns

In [None]:
for col in combined_df.columns:
    #plot histogram for the current column
    plt.hist(combined_df[col], bins=30, edgecolor='black')
    plt.title(f'Histogram of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
for col in combined_df.columns:
    print(col, len(combined_df[col].dropna())/len(combined_df) * 100) #the proportion of non-missing values in the column

In [None]:
pd.set_option('display.max_rows', 500)
null_counts = combined_df.isnull().sum()
print(null_counts)

In [None]:
# Work on a copy so the preprocessed frame `combined_df` stays untouched.
df1=combined_df.copy()
#splitting the data into train, test, and validation sets
# First split: 80% train / 20% test.
train1, test1 = train_test_split(df1, test_size=0.2, random_state=42)
# Second split: 20% of the remaining training rows are set aside as `val1`.
# NOTE(review): `val1` is not referenced again in the visible notebook; the model
# is fitted with validation_split=0.2 on `train1` instead — confirm this is intended.
train1, val1= train_test_split(train1, test_size=0.2, random_state=42)
# Snapshot of the test rows taken before any values are masked; used later as the
# ground truth when scoring the reconstruction.
actual_ae1=test1.copy()

# artificial missingness

In [None]:
# Simulate missing data: blank out a few randomly chosen cells in a random
# subset of the test rows.

# the proportion of rows that will have missing values
missing_row_proportion = 0.3  # 30% of the rows

# the range of the number of columns to have missing values in each row
min_missing_columns = 1  # minimum number of columns with missing values
# int(0.3 * 18) == 5. This value is passed to np.random.randint as the
# *exclusive* upper bound, so each selected row loses between 1 and 4 values.
max_missing_columns = int(0.3 * 18)

# selecting the rows that will have missing values
n_rows_with_missing = int(test1.shape[0] * missing_row_proportion)
rows_to_have_missing = np.random.choice(test1.index, size=n_rows_with_missing, replace=False)

# Columns that may be masked. The list is the same for every row, so it is
# built once instead of on each loop iteration.
maskable_columns = [col for col in test1.columns if col not in ['Meta; year', 'index']]

for i in rows_to_have_missing:
    # a random number of columns for missing values for each row
    n_missing_columns = np.random.randint(min_missing_columns, max_missing_columns)
    cols_to_have_missing = np.random.choice(maskable_columns, size=n_missing_columns, replace=False)
    test1.loc[i, cols_to_have_missing] = np.nan

In [None]:
test1.head(10)

In [None]:
pd.set_option('display.max_rows', 500)
null_counts = test1.isnull().sum()
print(null_counts)

# filling up missing values temporarily

In [None]:
# Temporarily plug the masked cells of the continuous columns ('DHS Num...' and
# 'FS...') with the column median computed on the masked test set itself.
for col in test1.columns:
    if col.startswith(('DHS Num', 'FS')):
        median_value = test1[col].median()
        # Assign the result back: the chained fillna(..., inplace=True) form is
        # flagged by pandas 2.x and leaves the frame unchanged under Copy-on-Write.
        test1[col] = test1[col].fillna(median_value)

In [None]:
# Temporarily plug the masked cells of the 'DHS Cat...' columns.
# NOTE(review): earlier in this notebook the 'DHS Cat' columns of the full
# dataset are filled with the mode, while the test set uses the median here —
# confirm the difference is intended.
for col in test1.columns:
    if col.startswith('DHS Cat'):
        median_value = test1[col].median()
        # Assign the result back: the chained fillna(..., inplace=True) form is
        # flagged by pandas 2.x and leaves the frame unchanged under Copy-on-Write.
        test1[col] = test1[col].fillna(median_value)

In [None]:
pd.set_option('display.max_rows', 500)
null_counts = test1.isnull().sum()
print(null_counts)

# simple autoencoder

In [None]:
# Plain fully-connected autoencoder:
#   encoder 128 -> 64, bottleneck 32, decoder 64 -> 128 -> input_dim.
# The two encoder layers carry an L2 activity penalty of 0.001.
# Wider variants (extra 256/128-unit encoder layers and 256/512-unit decoder
# layers) were tried at some point and are not part of this model.
input_dim = train1.shape[1]
bottleneck_size = 32  # width of the latent representation

final_ae = Sequential([
    # encoder
    layers.Dense(128, activation='relu', input_dim=input_dim, kernel_initializer='he_uniform',
                 activity_regularizer=regularizers.l2(0.001)),
    layers.Dense(64, activation='relu', kernel_initializer='he_uniform',
                 activity_regularizer=regularizers.l2(0.001)),
    # bottleneck
    layers.Dense(bottleneck_size, activation='relu', name='bottleneck'),
    # decoder, mirroring the encoder
    layers.Dense(64, activation='relu', kernel_initializer='he_uniform'),
    layers.Dense(128, activation='relu', kernel_initializer='he_uniform'),
    # output layer: one unit per input feature (adjust the activation if needed)
    layers.Dense(input_dim, activation='relu'),
])
# customized RMSE function
def root_mean_squared_error(y_true, y_pred):
    """Return the root of the mean squared difference between y_pred and y_true.

    The mean is taken over every element of the tensors, so the result is a
    single scalar tensor.
    """
    residual = y_pred - y_true
    mean_squared = tf.reduce_mean(tf.square(residual))
    return tf.sqrt(mean_squared)

# Compile with Adam and an MSE reconstruction loss; accuracy, MAE and the custom
# RMSE are tracked as additional metrics.
final_ae.compile(
    optimizer=optimizers.Adam(learning_rate=0.001),
    loss='mse',
    metrics=['accuracy', 'mean_absolute_error', root_mean_squared_error],
)
final_ae.summary()

# Stop once the validation loss has not improved for 50 consecutive epochs.
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)

# The autoencoder learns to reproduce its own input, so `train1` is both the
# input and the target; 20% of it is held out for validation by Keras.
start_time = time.time()
history = final_ae.fit(
    train1,
    train1,
    epochs=500,
    batch_size=1024,
    shuffle=True,
    callbacks=[es],
    validation_split=0.2,
    verbose=1,
)
end_time = time.time()

print(f"Training time: {end_time - start_time} seconds")

In [None]:
#imputing the missing values in the test set
start_time_imp1 = time.time()
imputed_test_f1 = final_ae.predict(test1)
end_time_imp1 = time.time()

In [None]:
imputed_test_f1

In [None]:
#calculating mse for each column
mse_ae = ((actual_ae1 - imputed_test_f1) ** 2).mean()
#calculating rmse for each column
rmse_ae = np.sqrt(mse_ae)
#displaying rmse and mse values
#print("MSE:", mse_ae)
#print("RMSE:", rmse_ae)

In [None]:
#accuracy for simple autoencoder
# "Accuracy" here is the percentage of cells whose reconstructed value equals the
# original value exactly.
# The comparison is done on plain NumPy arrays so the mean is always a single
# scalar over all cells: np.mean on a boolean DataFrame defers to DataFrame.mean,
# whose axis=None behaviour differs between pandas 1.x (per-column Series) and
# pandas 2.x (one scalar).
accuracy_simple = np.mean(actual_ae1.to_numpy() == imputed_test_f1) * 100
print("Accuracy for Simple Autoencoder:", accuracy_simple)

In [None]:
# Training history of the simple autoencoder: one figure per tracked quantity,
# training curve in red and validation curve in green.
ae_curves = [
    ('loss', 'MSE for AE', 'MSE'),
    ('accuracy', 'Accuracy for AE', 'Accuracy'),
    ('mean_absolute_error', 'MAE for AE', 'MAE'),
]
for key, title, y_label in ae_curves:
    plt.plot(history.history[key], color='red')
    plt.plot(history.history['val_' + key], color='green')
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper right')
    plt.show()

# Actual vs Imputed Values

In [None]:
#plotting actual vs imputed values
# One scatter plot per column, showing test rows 1000-2999.
# Slicing clips the window when the test set has fewer than 3000 rows, and the
# x positions are sized from the window itself, so both scatter calls always
# receive arrays of matching length.
row_window = slice(1000, 3000)
for j, col in enumerate(actual_ae1.columns):
    actual_values = np.asarray(actual_ae1[col])[row_window]
    imputed_values = imputed_test_f1[row_window, j]
    if j == 2:
        # NOTE(review): as in the original code, the third column's imputed
        # values are plotted multiplied by 1.5. The reason is not documented —
        # confirm this scaling is intended.
        imputed_values = imputed_values * 1.5
    x_positions = np.arange(len(actual_values))
    plt.scatter(x_positions, actual_values, color="red")
    plt.scatter(x_positions, imputed_values, color="green")
    plt.legend(["Actual Values", "Imputed Values"])
    plt.title(f"Actual vs Imputed for column {col}")
    plt.show()

# comments:
## In most columns a single value occurs far more often than any other. Because the dataset is so imbalanced, the missing values in those columns are not imputed well. To address this, the dataset has to be balanced first.

<h1><center> <div class="alert alert-success"> De-noising Autoencoder </div> </center></h1>

# De-noising Autoencoder

In [None]:
#!pip install --upgrade pydot
#!pip uninstall pydot -y
#!pip install pydot

In [None]:
#!pip install --upgrade graphviz

In [None]:
#importing Libraries
import os
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
import matplotlib.pyplot as plt
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from sklearn.model_selection import KFold
from tensorflow.keras.regularizers import L2
from tensorflow.keras.models import Sequential
from keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from icecream import ic
from dateutil.relativedelta import relativedelta
# from dhs_preprocessing_functions import *
from pandarallel import pandarallel
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# initialization
pandarallel.initialize()
from keras import Sequential, layers, regularizers, optimizers
from dhs_modelling_functions import final_ds_droping_cols

#setting up random seeds for reproducibility
tf.random.set_seed(6688)
random.seed(6688)
np.random.seed(6688)

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu,True)
    except RuntimeError as e:
        raise e

#importing data
input_df2 = pd.read_csv("5_grouped_df_V3_HR_adm2_gaul_joined_with_ipc_all.csv")
input_df2.head()

combined_df2 = final_ds_droping_cols(input_df2.copy(), drop_meta=True, drop_food_help=True, drop_perc=30, 
                           drop_data_sets=['Meta', 'Meta one-hot encoding', 'Meta frequency encoding'], 
                           numerical_data=['std'],
                 retain_adm=False, retain_month=False, drop_highly_correlated_cols=False, drop_region=True, verbose=1)

combined_df2 = combined_df2.loc[:, ~combined_df2.columns.str.startswith(('index'))]

# multiplying all columns starting with 'DHS Num' by 100
dhs_num_cols = combined_df2.filter(regex='^DHS Num').columns
combined_df2[dhs_num_cols] = combined_df2[dhs_num_cols] * 100

# multiplying all columns starting with 'FS' by 100
fs_cols = combined_df2.filter(regex='^FS').columns
combined_df2[fs_cols] = combined_df2[fs_cols] * 100

# multiplying all columns starting with 'DHS Cat' by 1000
dhs_cat_cols = combined_df2.filter(regex='^DHS Cat').columns
combined_df2[dhs_cat_cols] = combined_df2[dhs_cat_cols] * 1000

# Temporary imputation of the full dataset before it is split.
# The filled Series is assigned back to the frame in both loops: the chained
# form `frame[col].fillna(..., inplace=True)` is flagged by pandas 2.x and
# leaves the frame unchanged under Copy-on-Write.

# Continuous columns ('DHS Num...' and 'FS...'): column median.
for col in combined_df2.columns:
    if col.startswith(('DHS Num', 'FS')):
        median_value = combined_df2[col].median()
        combined_df2[col] = combined_df2[col].fillna(median_value)

# Categorical columns (name contains 'DHS Cat'): most frequent value.
for col in combined_df2.columns:
    if 'DHS Cat' in col:
        modes = combined_df2[col].mode()
        # mode() ignores NaN, so it is empty only when the whole column is
        # missing; indexing [0] would raise in that case, so skip the column.
        if not modes.empty:
            mode_value = modes.iloc[0]
            combined_df2[col] = combined_df2[col].fillna(mode_value)

df2=combined_df2.copy()
#splitting the data into train, test, and validation sets
# 80/20 train/test split, then 20% of the training rows are set aside as `val2`.
# NOTE(review): `val2` is not referenced again in the visible notebook; the model
# is fitted with validation_split=0.2 instead — confirm this is intended.
train2, test2 = train_test_split(df2, test_size=0.2, random_state=42)
train2, val2= train_test_split(train2, test_size=0.2, random_state=42)
# Snapshot of the test rows before masking; ground truth for the later scoring.
actual_dae2=test2.copy()

# Simulate missing data: blank out a few randomly chosen cells in a random
# subset of the test rows.

# the proportion of rows that will have missing values
missing_row_proportion = 0.3  # 30% of the rows

# the range of the number of columns to have missing values in each row
min_missing_columns = 1  # minimum number of columns with missing values
# int(0.3 * 18) == 5. This value is passed to np.random.randint as the
# *exclusive* upper bound, so each selected row loses between 1 and 4 values.
max_missing_columns = int(0.3 * 18)

# selecting the rows that will have missing values
n_rows_with_missing = int(test2.shape[0] * missing_row_proportion)
rows_to_have_missing = np.random.choice(test2.index, size=n_rows_with_missing, replace=False)

# Columns that may be masked. The list is the same for every row, so it is
# built once instead of on each loop iteration.
maskable_columns = [col for col in test2.columns if col not in ['Meta; year', 'index']]

for i in rows_to_have_missing:
    # a random number of columns for missing values for each row
    n_missing_columns = np.random.randint(min_missing_columns, max_missing_columns)
    cols_to_have_missing = np.random.choice(maskable_columns, size=n_missing_columns, replace=False)
    test2.loc[i, cols_to_have_missing] = np.nan

# Temporarily plug the masked test cells so the network receives complete rows.
# The filled Series is assigned back to the frame: the chained form
# `frame[col].fillna(..., inplace=True)` is flagged by pandas 2.x and leaves the
# frame unchanged under Copy-on-Write.

# Continuous columns ('DHS Num...' and 'FS...'): median of the masked test set.
for col in test2.columns:
    if col.startswith(('DHS Num', 'FS')):
        median_value = test2[col].median()
        test2[col] = test2[col].fillna(median_value)

# 'DHS Cat...' columns.
# NOTE(review): the full dataset's 'DHS Cat' columns are filled with the mode
# earlier in this cell, while the test set uses the median here — confirm the
# difference is intended.
for col in test2.columns:
    if col.startswith('DHS Cat'):
        median_value = test2[col].median()
        test2[col] = test2[col].fillna(median_value)

# Building the de-noising autoencoder (it is trained further down).
# Layout shown here: 128 -> 64 -> 32 (bottleneck) -> 64 -> 128 -> input_dim,
# every layer ReLU with He-uniform initialisation; the two encoder layers carry
# an L2 activity penalty of 0.001. Other layer-size combinations were compared
# separately and are not part of this model.
input_dim = train2.shape[1]
final_dae = keras.Sequential([
    # encoder
    layers.Dense(128, activation='relu', input_dim=input_dim, kernel_initializer='he_uniform',
                 activity_regularizer=L2(0.001)),
    layers.Dense(64, activation='relu', kernel_initializer='he_uniform',
                 activity_regularizer=L2(0.001)),
    # bottleneck
    layers.Dense(32, activation='relu', kernel_initializer='he_uniform', name='bottleneck'),
    # decoder
    layers.Dense(64, activation='relu', kernel_initializer='he_uniform'),
    layers.Dense(128, activation='relu', kernel_initializer='he_uniform'),
    layers.Dense(input_dim, activation='relu', kernel_initializer='he_uniform'),
])

# Adam with an MSE reconstruction loss; accuracy and MAE are tracked as metrics.
optimizer = keras.optimizers.Adam(learning_rate=0.001)
final_dae.compile(
    optimizer=optimizer,
    loss='mse',
    metrics=['accuracy', 'mean_absolute_error'],  # or 'mae'
)
final_dae.summary()

# Stop once the validation loss has not improved for 50 consecutive epochs.
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)
# Optional architecture diagram:
#tf.keras.utils.plot_model(final_dae, show_shapes=True,rankdir='LR')

#the input data is first corrupted by adding random noise to it
def make_noisy(np_data, noise_factor=0.1):
    """Return `np_data` with zero-mean Gaussian noise added element-wise.

    Args:
        np_data: array to corrupt; must expose `.shape`. It is not modified.
        noise_factor: standard deviation of the noise drawn from NumPy's
            global random generator.

    Returns:
        A new array with the same shape as `np_data`.
    """
    return np_data + np.random.normal(0.0, noise_factor, np_data.shape)

#the corrupted input data
noise_X = make_noisy(train2.values)

#training the autoencoder model: noisy rows are the input, the clean rows are the
#target; Keras holds out 20% of the rows for validation
start_time2 = time.time()
his_dae2 = final_dae.fit(noise_X, train2.values, epochs=500, batch_size=1024, shuffle=True, callbacks=[es], validation_split=0.2, verbose=1)
end_time2 = time.time()

#imputing missing values in the test set
# The prediction is run exactly once. A second, identical predict call used to
# follow and stored its timing in start_time_imp1 / end_time_imp1, overwriting
# the timing recorded for the simple autoencoder.
start_time_imp2 = time.time()
imputed_test2 = final_dae.predict(test2)
end_time_imp2 = time.time()

#calculating mse for each column
mse_dae = ((actual_dae2 - imputed_test2) ** 2).mean()
#calculating rmse for each column
rmse_dae = np.sqrt(mse_dae)
#displaying rmse and mse values
#print("MSE:", mse_dae)
#print("RMSE:", rmse_dae)

#accuracy for de-noising autoencoder
# "Accuracy" here is the percentage of cells whose reconstructed value equals the
# original value exactly.
# The comparison is done on plain NumPy arrays so the mean is always a single
# scalar over all cells: np.mean on a boolean DataFrame defers to DataFrame.mean,
# whose axis=None behaviour differs between pandas 1.x (per-column Series) and
# pandas 2.x (one scalar).
accuracy_dae = np.mean(actual_dae2.to_numpy() == imputed_test2) * 100
# The label names the model actually being scored in this section.
print("Accuracy for De-noising Autoencoder:", accuracy_dae)

# Training history of the de-noising autoencoder: one figure per tracked
# quantity, training curve in red and validation curve in green.
# The titles name the DAE (they previously read "... for AE").
dae_curves = [
    ('loss', 'MSE for DAE', 'MSE'),
    ('accuracy', 'Accuracy for DAE', 'Accuracy'),
    ('mean_absolute_error', 'MAE for DAE', 'MAE'),
]
for key, title, y_label in dae_curves:
    plt.plot(his_dae2.history[key], color='red')
    plt.plot(his_dae2.history['val_' + key], color='green')
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper right')
    plt.show()

#plotting actual vs imputed values
# One scatter plot per column, showing test rows 1000-2999.
# Slicing clips the window when the test set has fewer than 3000 rows, and the
# x positions are sized from the window itself, so both scatter calls always
# receive arrays of matching length.
row_window = slice(1000, 3000)
for j, col in enumerate(actual_dae2.columns):
    actual_values = np.asarray(actual_dae2[col])[row_window]
    imputed_values = imputed_test2[row_window, j]
    if j == 2:
        # NOTE(review): as in the original code, the third column's imputed
        # values are plotted multiplied by 1.5. The reason is not documented —
        # confirm this scaling is intended.
        imputed_values = imputed_values * 1.5
    x_positions = np.arange(len(actual_values))
    plt.scatter(x_positions, actual_values, color="red")
    plt.scatter(x_positions, imputed_values, color="green")
    plt.legend(["Actual Values", "Imputed Values"])
    plt.title(f"Actual vs Imputed for column {col}")
    plt.show()


<h1><center> <div class="alert alert-warning"> Variational Autoencoder </div> </center></h1>

# Variational Autoencoder

In [None]:
#importing Libraries
import os
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
import matplotlib.pyplot as plt
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from sklearn.model_selection import KFold
from tensorflow.keras.regularizers import L2
from tensorflow.keras.models import Sequential
from keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from icecream import ic
from dateutil.relativedelta import relativedelta
# from dhs_preprocessing_functions import *
from pandarallel import pandarallel
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# initialization
pandarallel.initialize()
from keras import Sequential, layers, regularizers, optimizers
from dhs_modelling_functions import final_ds_droping_cols

#setting up random seeds for reproducibility
tf.random.set_seed(6688)
random.seed(6688)
np.random.seed(6688)

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu,True)
    except RuntimeError as e:
        raise e

#importing data
input_df3 = pd.read_csv("5_grouped_df_V3_HR_adm2_gaul_joined_with_ipc_all.csv")
input_df3.head()

combined_df3 = final_ds_droping_cols(input_df3.copy(), drop_meta=True, drop_food_help=True, drop_perc=30, 
                           drop_data_sets=['Meta', 'Meta one-hot encoding', 'Meta frequency encoding'], 
                           numerical_data=['std'],
                 retain_adm=False, retain_month=False, drop_highly_correlated_cols=False, drop_region=True, verbose=1)

combined_df3 = combined_df3.loc[:, ~combined_df3.columns.str.startswith(('index'))]

# multiplying all columns starting with 'DHS Num' by 100
dhs_num_cols = combined_df3.filter(regex='^DHS Num').columns
combined_df3[dhs_num_cols] = combined_df3[dhs_num_cols] * 100

# multiplying all columns starting with 'FS' by 100
fs_cols = combined_df3.filter(regex='^FS').columns
combined_df3[fs_cols] = combined_df3[fs_cols] * 100

# multiplying all columns starting with 'DHS Cat' by 1000
dhs_cat_cols = combined_df3.filter(regex='^DHS Cat').columns
combined_df3[dhs_cat_cols] = combined_df3[dhs_cat_cols] * 1000

# Temporary imputation of the full dataset before it is split.
# The filled Series is assigned back to the frame in both loops: the chained
# form `frame[col].fillna(..., inplace=True)` is flagged by pandas 2.x and
# leaves the frame unchanged under Copy-on-Write.

# Continuous columns ('DHS Num...' and 'FS...'): column median.
for col in combined_df3.columns:
    if col.startswith(('DHS Num', 'FS')):
        median_value = combined_df3[col].median()
        combined_df3[col] = combined_df3[col].fillna(median_value)

# Categorical columns (name contains 'DHS Cat'): most frequent value.
for col in combined_df3.columns:
    if 'DHS Cat' in col:
        modes = combined_df3[col].mode()
        # mode() ignores NaN, so it is empty only when the whole column is
        # missing; indexing [0] would raise in that case, so skip the column.
        if not modes.empty:
            mode_value = modes.iloc[0]
            combined_df3[col] = combined_df3[col].fillna(mode_value)

df3=combined_df3.copy()
#splitting the data into train, test, and validation sets
# 80/20 train/test split, then 20% of the training rows are set aside as `val3`.
# NOTE(review): `val3` is not referenced again in the visible notebook; the model
# is fitted with validation_split=0.2 instead — confirm this is intended.
train3, test3 = train_test_split(df3, test_size=0.2, random_state=42)
train3, val3= train_test_split(train3, test_size=0.2, random_state=42)
# Snapshot of the test rows before masking; ground truth for the later scoring.
actual_vae3=test3.copy()

# Simulate missing data: blank out a few randomly chosen cells in a random
# subset of the test rows.

# the proportion of rows that will have missing values
missing_row_proportion = 0.3  # 30% of the rows

# the range of the number of columns to have missing values in each row
min_missing_columns = 1  # minimum number of columns with missing values
# int(0.3 * 18) == 5. This value is passed to np.random.randint as the
# *exclusive* upper bound, so each selected row loses between 1 and 4 values.
max_missing_columns = int(0.3 * 18)

# selecting the rows that will have missing values
n_rows_with_missing = int(test3.shape[0] * missing_row_proportion)
rows_to_have_missing = np.random.choice(test3.index, size=n_rows_with_missing, replace=False)

# Columns that may be masked. The list is the same for every row, so it is
# built once instead of on each loop iteration.
maskable_columns = [col for col in test3.columns if col not in ['Meta; year', 'index']]

for i in rows_to_have_missing:
    # a random number of columns for missing values for each row
    n_missing_columns = np.random.randint(min_missing_columns, max_missing_columns)
    cols_to_have_missing = np.random.choice(maskable_columns, size=n_missing_columns, replace=False)
    test3.loc[i, cols_to_have_missing] = np.nan

# Temporarily plug the masked test cells so the network receives complete rows.
# The filled Series is assigned back to the frame: the chained form
# `frame[col].fillna(..., inplace=True)` is flagged by pandas 2.x and leaves the
# frame unchanged under Copy-on-Write.

# Continuous columns ('DHS Num...' and 'FS...'): median of the masked test set.
for col in test3.columns:
    if col.startswith(('DHS Num', 'FS')):
        median_value = test3[col].median()
        test3[col] = test3[col].fillna(median_value)

# 'DHS Cat...' columns.
# NOTE(review): the full dataset's 'DHS Cat' columns are filled with the mode
# earlier in this cell, while the test set uses the median here — confirm the
# difference is intended.
for col in test3.columns:
    if col.startswith('DHS Cat'):
        median_value = test3[col].median()
        test3[col] = test3[col].fillna(median_value)

#different combination of layer sizes have been compared and stored and finally (64,10) architecture has been shown in ipynb file
#constructing the vae model
#defining the encoder network
from tensorflow.keras import regularizers, initializers

def make_encoder_model(input_shape, latent_dim):
    """Build the encoder network: Dense 128 -> 64 -> latent_dim.

    Every layer uses ReLU, He-uniform initialisation and an L2 kernel penalty
    of 0.01.

    Args:
        input_shape: number of input features as a plain integer, or an
            already-formed shape tuple such as ``(n_features,)``.
        latent_dim: number of units in the final (latent) layer.

    Returns:
        An uncompiled ``keras.Sequential`` model.
    """
    # keras documents `shape` as a tuple, so wrap a bare feature count instead
    # of relying on the framework to coerce an int.
    if np.isscalar(input_shape):
        shape = (int(input_shape),)
    else:
        shape = tuple(input_shape)

    def encoder_dense(units):
        # All encoder layers share the same activation / regulariser / initialiser.
        return layers.Dense(
            units,
            activation='relu',
            kernel_regularizer=regularizers.l2(0.01),
            kernel_initializer=initializers.he_uniform(),
        )

    model0 = keras.Sequential([
        layers.Input(shape=shape),
        encoder_dense(128),
        encoder_dense(64),
        encoder_dense(latent_dim),
    ])
    return model0

#defining the decoder network
def make_decoder_model(latent_dim, output_shape):
    """Build the decoder network: Dense 64 -> 128 -> output_shape, all ReLU.

    Args:
        latent_dim: size of the latent vector fed into the decoder.
        output_shape: number of units in the last layer (one per reconstructed
            feature).

    Returns:
        An uncompiled ``keras.Sequential`` model.
    """
    decoder = keras.Sequential()
    decoder.add(layers.Input(shape=(latent_dim,)))
    for units in (64, 128, output_shape):
        decoder.add(layers.Dense(units, activation='relu'))
    return decoder

#vae_final
def make_vae_model(input_shape, latent_dim):
    """Chain encoder and decoder into one model with a combined loss attached.

    The encoder output `z` is passed straight to the decoder; there is no
    sampling step between them. The loss added to the model is the
    reconstruction MSE plus a KL-style term computed from the mean and variance
    taken over all entries of `z`.

    Args:
        input_shape: number of input features (an integer; it is also used as
            the number of units of the decoder's output layer).
        latent_dim: size of the latent layer.

    Returns:
        An uncompiled ``keras.Model`` whose training objective has already been
        registered with ``add_loss``.
    """
    encoder = make_encoder_model(input_shape, latent_dim)
    decoder = make_decoder_model(latent_dim, input_shape)
    # keras documents `shape` as a tuple, so wrap the feature count.
    inputs = layers.Input(shape=(input_shape,))
    z = encoder(inputs)
    reconstruction = decoder(z)
    vae_final = keras.Model(inputs, reconstruction)
    reconstruction_loss = tf.keras.losses.MeanSquaredError()

    # KL-style penalty: -0.5 * (1 + log(var) - mean^2 - var).
    # The variance is taken directly rather than as square(reduce_std(z)): the
    # square root inside reduce_std has an undefined gradient when z is constant
    # (for example when every ReLU unit outputs 0). The small epsilon keeps the
    # logarithm finite in that same case.
    z_mean = tf.math.reduce_mean(z)
    z_variance = tf.math.reduce_variance(z)
    kl_loss = -0.5 * (1 + tf.math.log(z_variance + 1e-8) - tf.square(z_mean) - z_variance)

    #total loss adding both
    vae_loss = reconstruction_loss(inputs, reconstruction) + kl_loss
    vae_final.add_loss(vae_loss)
    return vae_final

# Size the network from the training frame (as the other two models in this
# notebook do) rather than from a hard-coded column count, so the cell keeps
# working if preprocessing yields a different number of columns.
input_shape = train3.shape[1]
output_shape = input_shape  # kept for parity with the original cell; not used below
latent_dim = 32

vae_final = make_vae_model(input_shape, latent_dim)
# No `loss=` argument: the training objective is the tensor registered with
# add_loss inside make_vae_model. Accuracy and MAE are tracked as metrics only.
vae_final.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=['accuracy', 'mean_absolute_error'])

# Stop once the validation loss has not improved for 50 consecutive epochs.
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)

#training the VAE
start_time3 = time.time()
vae_final1 = vae_final.fit(train3, train3, epochs=500, batch_size=1024, shuffle= True, callbacks=[es], validation_split=0.2, verbose=1)
end_time3 = time.time()

#imputing missing values in the test set
start_time_imp3 = time.time()
imputed_test3 = vae_final.predict(test3)
end_time_imp3 = time.time()

#calculating mse for each column
mse_vae = ((actual_vae3 - imputed_test3) ** 2).mean()
#calculating rmse for each column
rmse_vae = np.sqrt(mse_vae)
#displaying rmse and mse values
#print("MSE:", mse_vae)
#print("RMSE:", rmse_vae)

#accuracy for variational autoencoder
# "Accuracy" here is the percentage of cells whose reconstructed value equals the
# original value exactly.
# The comparison is done on plain NumPy arrays so the mean is always a single
# scalar over all cells: np.mean on a boolean DataFrame defers to DataFrame.mean,
# whose axis=None behaviour differs between pandas 1.x (per-column Series) and
# pandas 2.x (one scalar).
accuracy_vae = np.mean(actual_vae3.to_numpy() == imputed_test3) * 100
# The label names the model actually being scored in this section.
print("Accuracy for Variational Autoencoder:", accuracy_vae)

# Training history of the variational autoencoder: one figure per tracked
# quantity, training curve in red and validation curve in green.
# The titles name the VAE (they previously read "... for AE"). The 'loss' curve
# is labelled as a combined loss because this model's objective is the
# reconstruction MSE plus the KL-style term registered via add_loss, not the MSE
# alone.
vae_curves = [
    ('loss', 'Loss (MSE + KL) for VAE', 'Loss'),
    ('accuracy', 'Accuracy for VAE', 'Accuracy'),
    ('mean_absolute_error', 'MAE for VAE', 'MAE'),
]
for key, title, y_label in vae_curves:
    plt.plot(vae_final1.history[key], color='red')
    plt.plot(vae_final1.history['val_' + key], color='green')
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper right')
    plt.show()

#plotting actual vs imputed values
# One scatter plot per column, showing test rows 1000-2999.
# Slicing clips the window when the test set has fewer than 3000 rows, and the
# x positions are sized from the window itself, so both scatter calls always
# receive arrays of matching length.
row_window = slice(1000, 3000)
for j, col in enumerate(actual_vae3.columns):
    actual_values = np.asarray(actual_vae3[col])[row_window]
    imputed_values = imputed_test3[row_window, j]
    if j == 2:
        # NOTE(review): as in the original code, the third column's imputed
        # values are plotted multiplied by 1.5. The reason is not documented —
        # confirm this scaling is intended.
        imputed_values = imputed_values * 1.5
    x_positions = np.arange(len(actual_values))
    plt.scatter(x_positions, actual_values, color="red")
    plt.scatter(x_positions, imputed_values, color="green")
    plt.legend(["Actual Values", "Imputed Values"])
    plt.title(f"Actual vs Imputed for column {col}")
    plt.show()

# comparison between AE, DAE and VAE

In [None]:
# Side-by-side comparison of the three models on the validation split:
# one panel each for the loss, the accuracy and the MAE.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

# (history key, panel title, y-axis label)
panels = [
    ('val_loss', 'MSE for Validation Set', 'MSE'),
    ('val_accuracy', 'Accuracy for Validation Set', 'Accuracy'),
    ('val_mean_absolute_error', 'MAE for Validation Set', 'MAE'),
]
# (Keras History object, line colour, legend name)
runs = [
    (history, 'blue', 'AE'),
    (his_dae2, 'red', 'DAE'),
    (vae_final1, 'green', 'VAE'),
]

for ax, (metric, title, y_label) in zip(axes, panels):
    for run, color, name in runs:
        ax.plot(run.history[metric], label=f'{name} Validation', color=color)
    ax.set_title(title)
    ax.set_ylabel(y_label)
    ax.set_xlabel('Epoch')
    ax.legend(loc='upper right')

fig.tight_layout()
plt.show()

# 