In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

from src.utils import *
from src.algorithms import * 
from src.metrics_FAMD import *

import warnings

In [2]:
# Function to compute metrics for the generated datasets
def compute_metrics(df, cat_idx, n_it, n_components, proba_non_missing):
    """Impute masked copies of ``df`` with iterative FAMD and score the result.

    For every probability in ``proba_non_missing`` a masked copy of ``df`` is
    created with ``create_missingness`` and dummy-encoded with
    ``encode_dummy_variables``. Each masked copy is then imputed with
    ``IterativeFAMDImputer`` and compared with the dummy-encoded complete data.

    Args:
        df (pd.DataFrame): Complete dataframe in which missing values are injected.
        cat_idx (list or pd.Index): Labels of the categorical columns of ``df``;
            every other column is treated as continuous.
        n_it (int): Value passed to ``IterativeFAMDImputer.impute``
            (presumably the maximum number of iterations -- TODO confirm).
        n_components (int): ``n_components`` passed to ``IterativeFAMDImputer``.
        proba_non_missing (list of float): Probabilities that a value is not
            missing; one masked dataset is generated and scored per entry.

    Returns:
        tuple: ``(fc_rate, nmrse, idx_j)`` where
            fc_rate (np.ndarray): ``metric_fc`` result for each probability,
                in the order of ``proba_non_missing``.
            nmrse (np.ndarray): ``compute_nrmse_weighted`` result for each
                probability, in the same order.
            idx_j: Dummy-column labels returned by ``encode_dummy_variables``
                for the last probability (``None`` if ``proba_non_missing``
                is empty).
    """
    # Categorical variables
    idx_k2 = pd.Index(cat_idx)
    # Continuous variables
    idx_k1 = df.columns.difference(idx_k2)

    # All masked datasets are generated first and imputed afterwards, so the
    # order of the calls into the helpers is the same as before.
    # A list (rather than a dict keyed by p) keeps one entry per requested
    # probability even when the same probability is listed twice.
    idx_j = None
    datasets = []
    for p in proba_non_missing:
        df_missing = create_missingness(df, p)

        # Encode dummy variables in the dataframe with missing values and in the complete dataframe
        df_missing_dummy, idx_j, nb_values_per_cat_df = encode_dummy_variables(df_missing, idx_k2)
        df_dummy = encode_dummy_variables(df, idx_k2)[0]
        datasets.append((idx_k1, idx_j, df_missing_dummy, df_dummy, nb_values_per_cat_df))

    fc_rate = []
    nmrse = []

    for k1, k_j, df_missing, df_true, nb_val_per_cat in datasets:
        # Masks are taken before the imputer is given the masked dataframe
        missing_mask = df_missing.isna()
        C0_missing = missing_mask[k1].to_numpy()
        Categ_missing = missing_mask[k_j].to_numpy()

        # Computation of iterative FAMD
        ifamd_df = IterativeFAMDImputer(n_components=n_components, data=df_missing, k1=k1, k2=k_j,
                                        nb_values_per_cat=nb_val_per_cat)
        ifamd_df.impute(n_it)
        # Kept under its own name so the ``df`` argument is not overwritten
        df_imputed = ifamd_df.df

        # Encode categories into 0/1: within the dummy columns of one
        # categorical variable, the column(s) holding the row maximum get 1.
        # The dummy labels and level counts are the ones stored for THIS
        # dataset, not the ones left over from the last encoded dataset.
        res = df_imputed[ifamd_df.k2].copy()
        pos = 0
        for h in range(len(idx_k2)):
            n_levels = nb_val_per_cat[h]
            col = [k_j[pos + i] for i in range(n_levels)]
            row_max = df_imputed[col].max(axis=1)
            for value in col:
                res[value] = (res[value] == row_max).astype(int)
            pos += n_levels

        # Categorical variables: score the entries selected by the missingness mask
        fc_rate.append(metric_fc(res[Categ_missing], df_true[k_j][Categ_missing]))

        # Continuous variables: same selection, weighted NRMSE
        nmrse.append(compute_nrmse_weighted(df_imputed[k1][C0_missing], df_true[k1][C0_missing]))

    return np.array(fc_rate), np.array(nmrse), idx_j


In [3]:
# Set the random seed so that the results are reproducible
np.random.seed(21032024)

In [4]:
# Parameters of the first dataset created in the paper (3.1 Relationships between continuous and categorical variables)

n = 100 # Sample size
S = 2  # number of variables s indexed by K (s in {1,...,S})
K = [1,3]  # K[s] = number of times the variable s (s in {1,...,S}) is duplicated in the dataset
cat = 2 # number of categorical variables
cat_idx = [1,2] # index of the categorical variables
nb_of_cat_per_var = [4,4] # number of categories for each categorical variable

# The two calls differ only in the last argument (1 vs 3); presumably the signal-to-noise
# ratio, matching the snr1 / snr3 suffixes -- TODO confirm against create_dataset.
df_3_1_snr1 = create_dataset(n, S, K, cat, cat_idx, nb_of_cat_per_var, 1)
df_3_1_snr3 = create_dataset(n, S, K, cat, cat_idx, nb_of_cat_per_var, 3)

Initialisation

In [5]:
# Value passed as n_it to compute_metrics, which forwards it to IterativeFAMDImputer.impute
n_it = 1000
# n_components passed to IterativeFAMDImputer
n_components = 3

# Probabilities that a value is NOT missing; compute_metrics builds one masked dataset per entry
proba_non_missing = [0.7, 0.8, 0.9]

In [6]:
# Number of datasets generated and scored by the simulation loop
n_simulations = 20

In [7]:
# Parameters of dataset 3.1 (the sample size n comes from the earlier parameter cell)
S = 2
K = [1, 3]  # K[s] = number of times the variable s (s in {1,...,S}) is duplicated in the dataset
cat = 2  # number of categorical variables
cat_idx = [1, 2]  # index of the categorical variables
nb_of_cat_per_var = [4, 4]  # number of categories for each categorical variable

# Results collected across simulations: one entry is appended per generated dataset
mean_fc_rate_list_3_1_snr3 = []
rnmse_list_3_1_snr3 = []

for n_sim in range(n_simulations):
    # Generate a fresh dataset, then mask, impute and score it for every missingness level
    df_3_1_snr3 = create_dataset(n, S, K, cat, cat_idx, nb_of_cat_per_var, 3)
    fc_rate, nmrse, idx_j = compute_metrics(
        df=df_3_1_snr3,
        cat_idx=pd.Index(cat_idx).map(str),
        n_it=n_it,
        n_components=n_components,
        proba_non_missing=proba_non_missing,
    )

    # One averaged falsely-classified rate per entry of proba_non_missing
    mean_fc_rate = [np.mean(fc_rate[i]) for i, _ in enumerate(proba_non_missing)]

    mean_fc_rate_list_3_1_snr3.append(mean_fc_rate)
    rnmse_list_3_1_snr3.append(nmrse)

Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2
Converged in 2


In [8]:
# One inner list per simulation; each holds one mean metric_fc value per entry of proba_non_missing
mean_fc_rate_list_3_1_snr3

[[0.25, 0.2692307692307692, 0.26136363636363635],
 [0.33064516129032256, 0.3026315789473684, 0.3076923076923077],
 [0.2675438596491228, 0.2717391304347826, 0.26041666666666663],
 [0.26315789473684215, 0.22857142857142856, 0.2647058823529411],
 [0.2943548387096774, 0.24358974358974358, 0.30625],
 [0.28515625, 0.2426470588235294, 0.3035714285714286],
 [0.2579365079365079, 0.28125, 0.25],
 [0.2916666666666667, 0.2777777777777778, 0.26136363636363635],
 [0.25943396226415094, 0.30000000000000004, 0.25],
 [0.31, 0.29583333333333334, 0.2884615384615385],
 [0.2653846153846154, 0.28888888888888886, 0.3577586206896552],
 [0.2619047619047619, 0.34444444444444444, 0.2258064516129032],
 [0.3035714285714286, 0.2441860465116279, 0.3],
 [0.2647058823529412, 0.2894736842105263, 0.2894736842105263],
 [0.25735294117647056, 0.24166666666666667, 0.30000000000000004],
 [0.28409090909090906, 0.29069767441860467, 0.24],
 [0.2958333333333333, 0.26111111111111107, 0.28125],
 [0.3194444444444445, 0.2797619047619

In [9]:
# One array per simulation; each holds one compute_nrmse_weighted value per entry of proba_non_missing
rnmse_list_3_1_snr3

[array([9.26987327e+07, 1.19036329e+08, 3.34351001e+07]),
 array([1.64932710e+08, 6.35629320e+07, 1.19316007e+08]),
 array([1.10747464e+08, 3.89870850e+07, 5.37069432e+07]),
 array([38027909.76484077, 41985056.98693085, 90248595.30150166]),
 array([2.56605762e+07, 1.60431047e+08, 1.49232366e+08]),
 array([69149674.78995372, 80092921.8872097 , 43579089.66499741]),
 array([23215381.2935992 , 98311288.50957365, 40875325.3728473 ]),
 array([2.53680348e+07, 1.25695542e+08, 8.45542258e+07]),
 array([1.62362346e+08, 2.83996358e+07, 6.63640285e+07]),
 array([33596071.72729644, 78915601.42319325, 31208741.86409141]),
 array([6.30792613e+07, 1.68751002e+08, 1.30210740e+08]),
 array([1.29396549e+08, 1.25705819e+08, 4.77030291e+07]),
 array([6.84224936e+07, 8.46466306e+07, 1.38919056e+08]),
 array([1.37825500e+08, 8.07472888e+07, 3.92232367e+08]),
 array([5.09820817e+07, 7.81146912e+07, 3.25122028e+08]),
 array([5.25110570e+07, 6.96237682e+07, 1.69197501e+08]),
 array([62038830.45151004, 47709743.

In [None]:
# Stack the per-simulation results: one row per simulation, one column per
# entry of proba_non_missing.
fc_rates_3_1_snr3 = np.asarray(mean_fc_rate_list_3_1_snr3, dtype=float)
nrmse_3_1_snr3 = np.asarray(rnmse_list_3_1_snr3, dtype=float)

# The two metrics live on very different scales, so the y axis is not shared.
fig, ax = plt.subplots(figsize=(20, 5), ncols=2)
fig.suptitle('Iterative FAMD imputation accuracy on data with SNR 3')

panels = [
    (ax[0], fc_rates_3_1_snr3, "Falsely classified rate", "Categorical variables"),
    (ax[1], nrmse_3_1_snr3, "NRMSE", "Continuous variables"),
]

for axis, values, ylabel, title in panels:
    simulation_ids = np.arange(values.shape[0])
    # One colour per missingness level: points are the individual simulations,
    # the dashed horizontal line is their mean.
    for j, (p, column) in enumerate(zip(proba_non_missing, values.T)):
        color = f"C{j}"
        axis.scatter(simulation_ids, column, color=color, label=f"p(non missing) = {p}")
        axis.axhline(column.mean(), color=color, linestyle="--")
    axis.set_xlabel("Simulation")
    axis.set_ylabel(ylabel)
    axis.set_title(title)
    axis.legend()

#fig.savefig('images/PFC_categories.png')
plt.show()

In [None]:
# Parameters of the second dataset created in the paper (3.2.1 Linear and nonlinear relationships)

S = 1   # number of variables s indexed by K (s in {1,...,S})
K = [5]  # K[s] = number of times the variable s (s in {1,...,S}) is duplicated in the dataset

cat = 3 # number of categorical variables
cat_idx = [3,4,5] # index of the categorical variables
# NOTE(review): cat = 3 but only two category counts are listed below -- confirm whether [4,4,4] is intended.
nb_of_cat_per_var = [4,4] # number of categories for each categorical variable

In [None]:
# Scratch setup: reuse the most recently generated SNR-3 dataset of section 3.1
df = df_3_1_snr3

# NOTE(review): when the notebook is run top to bottom, cat_idx is [3,4,5] here (set by the 3.2.1
# parameter cell), whereas df_3_1_snr3 was created with cat_idx = [1,2] -- confirm which is intended.
cat_idx = pd.Index(cat_idx).map(str)
n_it = n_it  # no-op self-assignment: n_it keeps the value set earlier
n_components =2
proba_non_missing = [0.7,0.8,0.9]

In [None]:
# Top-level copy of the first half of compute_metrics, kept for step-by-step inspection.
# Column split: categorical labels come from cat_idx, every other column is continuous.
idx_k2 = pd.Index(cat_idx)
idx_k1 = df.columns.difference(idx_k2)

# One entry per probability of a value being non-missing:
#   p -> [continuous columns, dummy columns, masked dummy frame, complete dummy frame, values per categorical variable]
dict_dfs = {}
for p in proba_non_missing:
    # Mask the data first, then dummy-encode the masked and the complete frame
    df_missing = create_missingness(df, p)
    df_missing_dummy, idx_j, nb_values_per_cat_df = encode_dummy_variables(df_missing, idx_k2)
    df_dummy = encode_dummy_variables(df, idx_k2)[0]

    dict_dfs[p] = [idx_k1, idx_j, df_missing_dummy, df_dummy, nb_values_per_cat_df]


In [None]:
 #IFAMD
# Top-level copy of the scoring loop of compute_metrics, kept for step-by-step inspection.
# Metrics are both stored (one entry per probability) and printed.
fc_rate = []
nmrse = []

for p, values in dict_dfs.items():
    k1, k_j, df_missing, df_true, nb_val_per_car = values
    # Masks are taken before the imputer is given the masked dataframe
    C0_missing, Categ_missing = df_missing.isna()[k1].to_numpy(), df_missing.isna()[k_j].to_numpy()

    # Computation of iterative FAMD
    ifamd_df = IterativeFAMDImputer(n_components=n_components, data=df_missing, k1=k1, k2=k_j, nb_values_per_cat=nb_val_per_car)
    ifamd_df.impute(n_it)
    # The following cells read `df`, `k1`, `C0_missing` and `df_true` from the last iteration
    df = ifamd_df.df
    # Preview only: printing the whole imputed frame floods the cell output
    print(df.head())

    # Encode categories into 0/1: within the dummy columns of one categorical
    # variable, the column(s) holding the row maximum get 1.
    # The dummy labels and counts are the ones stored for THIS probability
    # (k_j, nb_val_per_car), not the leftovers of the last encoded dataset.
    res = ifamd_df.df[ifamd_df.k2].copy()
    pos = 0
    for h in range(len(idx_k2)):
        n_levels = nb_val_per_car[h]
        col = [k_j[pos + i] for i in range(n_levels)]
        row_max = ifamd_df.df[col].max(axis=1)
        for value in col:
            res[value] = (res[value] == row_max).astype(int)
        pos += n_levels

    # Categorical variables
    fc_rate.append(metric_fc(res[Categ_missing], df_true[k_j][Categ_missing]))
    print(fc_rate[-1])

    # Continuous variables: imputed values are compared as they are (no integer
    # truncation), consistently with compute_metrics.
    nmrse.append(compute_nrmse_weighted(df[k1][C0_missing], df_true[k1][C0_missing]))
    print(nmrse[-1])

In [None]:

# Re-evaluates the continuous-variable metric using df, k1, C0_missing and df_true as left by the
# last iteration of the loop in the previous cell
compute_nrmse_weighted(df[k1][C0_missing], df_true[k1][C0_missing])

In [None]:
# Per-column sum and standard deviation of the true continuous values selected by C0_missing
# NOTE(review): the weights are signed sums, so they can be negative or close to zero -- confirm this is intended.
weights = df_true[k1][C0_missing].sum(axis=0)
std_df = df_true[k1][C0_missing].std(axis=0)

In [None]:
# Manual NRMSE: for each continuous column, add weight * sum of squared (true - imputed) differences,
# each difference being divided by the column's standard deviation
nrmse_numerator = 0 
for c in df_true[k1][C0_missing].columns.to_numpy(): 
    nrmse_numerator+= (weights[c] * ((df_true[k1][C0_missing][c] - df[k1][C0_missing][c])/std_df[c])**2).sum()
# Compute NRMSE denominator
# NOTE(review): if the signed weights nearly cancel, this denominator approaches zero and the NRMSE
# becomes very large; possibly related to the ~1e8 values displayed earlier -- TODO confirm.
nrmse_denominator = np.sum(weights)
# Compute NRMSE
nrmse = np.sqrt(nrmse_numerator / nrmse_denominator)    