Let's generate synthetic datasets!

In [77]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

from src.utils import *
from src.algorithms import * 
from src.metrics_FAMD import *

In [78]:
# Function to compute metrics for the generated datasets
def compute_metrics(df, cat_idx, n_it, n_components, proba_non_missing):
    """Score iterative FAMD imputation of ``df`` at several missingness levels.

    For every probability ``p`` in ``proba_non_missing``, a masked copy of ``df``
    is produced with ``create_missingness``, its categorical columns are
    dummy-encoded with ``encode_dummy_variables``, the result is imputed with
    ``IterativeFAMDImputer`` and the cells that were missing are compared with
    the dummy-encoded original dataframe.

    Args:
        df (pd.DataFrame): Reference dataframe; it is masked for each ``p`` and
            also used (dummy-encoded) as the ground truth.
        cat_idx (iterable): Labels of the categorical columns of ``df``. Every
            other column is treated as continuous.
        n_it (int): Number of iterations, passed to ``IterativeFAMDImputer.impute``.
        n_components (int): Passed to ``IterativeFAMDImputer`` as ``n_components``.
        proba_non_missing (list of float): List of probabilities that a value is
            not missing.

    Returns:
        tuple: ``(fc_rate, nmrse, idx_j)`` where

        * ``fc_rate`` (np.ndarray) stacks the outputs of ``metric_fc``, one per
          probability, computed on the dummy cells that were missing;
        * ``nmrse`` (np.ndarray) stacks the outputs of ``compute_nrmse_weighted``,
          one per probability, computed on the continuous cells that were missing;
        * ``idx_j`` is the dummy-column index returned by ``encode_dummy_variables``
          for the last probability (an empty ``pd.Index`` when
          ``proba_non_missing`` is empty).
    """
    # Categorical variables
    idx_k2 = pd.Index(cat_idx)
    # Continuous variables: every column that is not categorical
    idx_k1 = df.columns.difference(idx_k2)

    # Defined up front so the final ``return`` also works for an empty list.
    idx_j = pd.Index([])

    # First pass: build every masked dataset before any imputation is run, so the
    # order in which the helpers are called is the same as it has always been.
    dict_dfs = {}
    for p in proba_non_missing:
        df_missing = create_missingness(df, p)
        # Encode dummy variables in the dataframe with missing values and in the original one
        df_missing_dummy, idx_j, nb_values_per_cat_df = encode_dummy_variables(df_missing, idx_k2)
        df_dummy = encode_dummy_variables(df, idx_k2)[0]
        dict_dfs[p] = (idx_k1, idx_j, df_missing_dummy, df_dummy, nb_values_per_cat_df)

    fc_rate = []
    nmrse = []

    # Second pass: impute and score each masked dataset.
    for k1, k_j, df_missing, df_true, nb_val_per_cat in dict_dfs.values():
        # Masks of the cells to score, taken before the imputer sees the data
        missing_mask = df_missing.isna()
        C0_missing = missing_mask[k1].to_numpy()
        Categ_missing = missing_mask[k_j].to_numpy()

        # Computation of iterative FAMD
        ifamd_df = IterativeFAMDImputer(n_components=n_components, data=df_missing, k1=k1, k2=k_j, nb_values_per_cat=nb_val_per_cat)
        ifamd_df.impute(n_it)
        # Separate name: the ``df`` argument must stay the reference dataframe.
        df_imputed = ifamd_df.df

        # Encode categories into 0/1: within the dummy columns of one categorical
        # variable, the column(s) holding the row-wise maximum become 1, the others 0.
        # The grouping uses this dataset's own dummy index and category counts
        # (``k_j`` / ``nb_val_per_cat``), not those of the last dataset built above.
        res = df_imputed[ifamd_df.k2].copy()
        pos = 0
        for h in range(len(idx_k2)):
            width = nb_val_per_cat[h]
            cols = [k_j[pos + i] for i in range(width)]
            row_max = df_imputed[cols].max(axis=1)
            for col in cols:
                res[col] = (res[col] == row_max).astype(int)
            pos += width
        res = res[ifamd_df.k2]

        # Categorical variables: score only the dummy cells that were missing
        fc_rate.append(metric_fc(res[Categ_missing], df_true[k_j][Categ_missing]))

        # Continuous variables: score only the cells that were missing.
        # NOTE(review): ``astype(int)`` truncates the imputed continuous values before
        # the error is computed — kept as is, confirm this is intended.
        nmrse.append(compute_nrmse_weighted(df_imputed[k1][C0_missing].astype(int), df_true[k1][C0_missing]))

    fc_rate = np.array(fc_rate)
    nmrse = np.array(nmrse)

    return fc_rate, nmrse, idx_j


In [79]:
# Seed NumPy's global random generator so that results are reproducible
np.random.seed(21032024)

# Sample size, passed as the first argument to create_dataset
n = 100

In [4]:
# Parameters of the first dataset created in the paper (3.1 Relationships between continuous and categorical variables)

S = 2   
K = [1,3]  # K[s] = number of times the variable s (s in {1,...,S}) is duplicated in the dataset
cat = 2 # number of categorical variables
cat_idx = [1,2] # index of the categorical variables
nb_of_cat_per_var = [4,4] # number of categories for each categorical variable

# Two datasets that differ only in the last argument of create_dataset (1 vs 3);
# presumably the signal-to-noise ratio, going by the `snr1` / `snr3` names — confirm.
df_3_1_snr1 = create_dataset(n, S, K, cat, cat_idx, nb_of_cat_per_var, 1)
df_3_1_snr3 = create_dataset(n, S, K, cat, cat_idx, nb_of_cat_per_var, 3)
# Optional CSV export of the generated datasets (disabled)
#df_3_1_snr1.to_csv("datasets/df_3_1_snr1.csv", index=False, quoting=csv.QUOTE_NONNUMERIC)
#df_3_1_snr3.to_csv("datasets/df_3_1_snr3.csv", index=False, quoting=csv.QUOTE_NONNUMERIC)

Initialisation

In [80]:
# Number of iterations handed to the imputer and number of components of the FAMD
n_it = 1000
n_components = 8

# Probabilities that a value is kept (not masked) when missing values are injected into the dataframe
proba_non_missing = [0.7, 0.8, 0.9]

In [81]:
# Number of datasets generated and evaluated by the simulation loop
n_simulations = 20

In [82]:
# Design of dataset 3.1, evaluated here with 3 as the last create_dataset argument
S = 2
K = [1, 3]  # K[s]: how many copies of variable s (s in {1,...,S}) the dataset holds
cat = 2  # count of categorical variables
cat_idx = [1, 2]  # positions of the categorical variables
nb_of_cat_per_var = [4, 4]  # category count of every categorical variable

# One entry per completed simulation is appended to each list
mean_fc_rate_list_3_1_snr3 = []
rnmse_list_3_1_snr3 = []

for n_sim in range(n_simulations):
    # A fresh synthetic dataset is drawn for every repetition
    df_3_1_snr3 = create_dataset(n, S, K, cat, cat_idx, nb_of_cat_per_var, 3)
    # The categorical labels are handed over as strings
    fc_rate, nmrse, idx_j = compute_metrics(
        df_3_1_snr3,
        pd.Index(cat_idx).map(str),
        n_it,
        n_components,
        proba_non_missing,
    )
    # Collapse the rates of each missingness level into a single mean
    mean_fc_rate = [np.mean(fc_rate[level]) for level in range(len(proba_non_missing))]
    mean_fc_rate_list_3_1_snr3.append(mean_fc_rate)
    rnmse_list_3_1_snr3.append(nmrse)



Converged in 22
Converged in 10
Converged in 5
Converged in 9
Converged in 9
Converged in 6
Converged in 12
Converged in 9
Converged in 5
Converged in 18
Converged in 14
Converged in 4
Converged in 16
Converged in 8
Converged in 4
Converged in 14
Converged in 15
Converged in 5
Converged in 14
Converged in 8
Converged in 6
Converged in 13
Converged in 9
Converged in 6


  nrmse = np.sqrt(nrmse_numerator / nrmse_denominator)


Converged in 10
Converged in 9
Converged in 5


  nrmse = np.sqrt(nrmse_numerator / nrmse_denominator)


Converged in 10
Converged in 10
Converged in 5
Converged in 14
Converged in 8
Converged in 5
Converged in 13
Converged in 10
Converged in 6
Converged in 14
Converged in 9
Converged in 7
Converged in 17
Converged in 8
Converged in 5
Converged in 14
Converged in 8
Converged in 6
Converged in 14
Converged in 15
Converged in 6
Converged in 11
Converged in 7
Converged in 5


IndexError: index 1 is out of bounds for axis 0 with size 1

In [83]:
# Mean falsely-classified rate: one row per completed simulation, one entry per value of proba_non_missing
mean_fc_rate_list_3_1_snr3

[[0.24107142857142858, 0.24285714285714285, 0.25],
 [0.2798507462686567, 0.20652173913043478, 0.20833333333333334],
 [0.225, 0.21621621621621623, 0.25],
 [0.23828125, 0.23369565217391303, 0.14423076923076922],
 [0.24206349206349206, 0.24999999999999997, 0.23076923076923078],
 [0.21428571428571427, 0.20833333333333331, 0.2],
 [0.21929824561403508, 0.2421875, 0.22222222222222224],
 [0.2543103448275862, 0.2361111111111111, 0.16999999999999998],
 [0.2379032258064516, 0.19444444444444445, 0.19117647058823528],
 [0.26492537313432835, 0.195, 0.21875],
 [0.2246376811594203, 0.21794871794871792, 0.25],
 [0.28787878787878785, 0.2717391304347826, 0.19047619047619047],
 [0.25462962962962965, 0.20714285714285713, 0.17857142857142855],
 [0.25396825396825395, 0.2134146341463415, 0.18055555555555552],
 [0.26102941176470584, 0.20833333333333331, 0.20588235294117646],
 [0.25, 0.25, 0.23214285714285715]]

In [84]:
# Output of compute_nrmse_weighted: one array per completed simulation, one entry per value of proba_non_missing
rnmse_list_3_1_snr3

[array([8.21800631, 4.8099604 , 3.29206349]),
 array([8.28602728, 4.91995867, 3.63937399]),
 array([8.12660429, 6.46212016, 3.6963096 ]),
 array([ 8.99519589, 11.72256196,  3.88374781]),
 array([8.18742098, 5.06517235, 2.29525209]),
 array([7.74568497, 6.21914811, 3.9255391 ]),
 array([5.56500358, 6.46864019, 4.60545009]),
 array([5.5002604 ,        nan, 5.11135614]),
 array([8.10343588, 5.6077011 ,        nan]),
 array([8.60774128, 6.38604435, 4.79551877]),
 array([6.74762131, 5.18094344, 4.32727483]),
 array([7.86671997, 4.69928281, 3.29657117]),
 array([9.16106704, 6.46012257, 4.08551604]),
 array([8.70683613, 5.29165599, 3.33158773]),
 array([7.50765776, 5.9748232 , 3.97687402]),
 array([7.72567287, 5.12566095, 3.16162066])]

In [None]:
def _plot_fc_rates(axis, fc_rates, dummy_idx, title):
    """Draw the falsely-classified rates of one experiment on ``axis``.

    Args:
        axis (matplotlib.axes.Axes): Panel to draw on.
        fc_rates (iterable of np.ndarray): One vector of rates per entry of
            ``proba_non_missing``, in the same order.
        dummy_idx (pd.Index): Labels used as x positions of the rates.
        title (str): Panel title.
    """
    x = dummy_idx.to_numpy()
    for i, vect in enumerate(fc_rates):
        axis.scatter(x, vect, label=proba_non_missing[i])
        # Horizontal line at the mean of this series; np.full does not depend on
        # the dtype of the label array, unlike np.ones_like(x) * mean.
        axis.plot(x, np.full(len(x), vect.mean()))
    axis.set_ylabel("Falsely classified rate")
    axis.set_title(title)
    axis.legend()


# NOTE(review): fc_rate_3_1_snr1, idx_j_3_1_snr1, fc_rate_3_1_snr3 and idx_j_3_1_snr3
# are not assigned by any cell visible above — they must be defined before this cell runs.
# NOTE(review): four panels are created but only the first two are filled.
fig, ax = plt.subplots(figsize=(20,5),ncols=4, sharey=True)
fig.suptitle('Imputation techniques accuracy for categories')

_plot_fc_rates(ax[0], fc_rate_3_1_snr1, idx_j_3_1_snr1, "iterative FAMD on data with SNR 1")
# The second panel shows the SNR 3 results (its title previously said "SNR 1").
_plot_fc_rates(ax[1], fc_rate_3_1_snr3, idx_j_3_1_snr3, "iterative FAMD on data with SNR 3")

#fig.savefig('images/PFC_categories.png')
plt.show()

In [None]:
# Parameters of the second dataset created in the paper (3.2.1 Linear and nonlinear relationships)

S = 1
K = [5]  # K[s] = number of times the variable s (s in {1,...,S}) is duplicated in the dataset

cat = 3  # number of categorical variables
cat_idx = [3,4,5]  # index of the categorical variables
# Number of categories for each categorical variable: one entry per categorical
# variable, so three entries here (the list previously had only two).
# NOTE(review): the third variable is assumed to have 4 categories like the
# other two — confirm against the paper's 3.2.1 setup.
nb_of_cat_per_var = [4,4,4]

3.2.1 Linear and nonlinear relationships