This notebook aims to present a gentle introduction to the model's functionalities, pre-processings, computation of iFAMD and metrics. Such introduction will be held on a synthetically generated dataset. 

### 0 - Library importation

In [1]:
import numpy as np
import pandas as pd
from src.algorithms import * 
from src.utils import * 
from src.metrics_FAMD import *

### 1- Initialization

#### 1.1 - Let's generate a synthetic dataset! 

In [5]:
np.random.seed(21032024) # Set random seed to create reproductible results

# Defintion of parameter for synthetic dataset 

n = 100 #  Number of samples 
S = 2  # Underlying dimensions
K = [1,3]  # K[s] = number of times the variable s (s in {1,...,S}) is duplicated in the dataset
cat = 2 # Number of categorical variables
cat_idx = [1,2] # Index of the categorical variables
nb_of_cat_per_var = [4,4] # Number of categories for each categorical variable
SNR = 3 # Signal noise ratio

In [6]:
# Creation of synthetic dataset
df_ref = create_dataset(n, S, K, cat, cat_idx, nb_of_cat_per_var, SNR)

np.where(df_ref.isna()==True) # Check that there are no missing values 

(array([], dtype=int64), array([], dtype=int64))

#### 1.2- Definition of parameters

In [15]:
# Shape of dataset (before dummy transformation)
I, J = df_ref.shape

# Categorical Variables :
idx_k2 = pd.Index(cat_idx).map(str)

# Continuous Variables
idx_k1 = df_ref.columns.difference(idx_k2)

#### 1.3- Inject missing values

In [17]:
proba_non_missing = 0.8
dfref_missing = create_missingness(df_ref, proba_non_missing)

#### 1.4 - Encode dummy variables

In [18]:
# Complete dataframe
df_missing_dummy, idx_j, nb_values_per_cat = encode_dummy_variables(dfref_missing, idx_k2)

# Dataframe with missing values
dfref_dummy = encode_dummy_variables(df_ref, idx_k2)[0]

df_missing_dummy

Unnamed: 0,0,3,4,5,1_3.0,1_1.0,1_2.0,1_0.0,2_1.0,2_2.0,2_3.0,2_0.0
0,,-1.355385,-1.48824,,1,0,0,0,,,,
1,0.883629,0.70227,1.450796,,,,,,1,0,0,0
2,-1.117298,-0.406702,-0.39529,-0.523693,0,1,0,0,0,1,0,0
3,-0.558345,-0.916434,,,0,0,1,0,0,0,1,0
4,0.053069,-1.825799,,-1.6404,,,,,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.535187,1.615637,1.518405,1.472676,0,1,0,0,0,0,0,1
96,-0.458293,0.414979,0.640392,0.669035,0,1,0,0,0,0,0,1
97,0.778979,-1.446206,,-1.644596,0,0,0,1,0,0,1,0
98,-0.294975,-0.011132,-0.538607,0.24137,1,0,0,0,0,0,0,1


In [19]:
#Check proportion of missing data :
n_missing = dfref_missing.isna().sum().sum()
n_missing/(dfref_missing.shape[0]*dfref_missing.shape[1])

0.19833333333333333

### 2- Implement iterative FAMD

In [20]:
# Maximum number of iterations 
n_it = 1000

In [21]:
# Class definition
ifamd = IterativeFAMDImputer(n_components=4, data=df_missing_dummy, k1=idx_k1, k2=idx_j, nb_values_per_cat = nb_values_per_cat)

In [22]:
# Run iterative FAMD
ifamd.impute(n_it)

df = ifamd.df

Converged in 6


### 3 - Metric scores

To ensure that we are only considering TPR and NRMSE scores on imputed data, we will compute the scores among imputed values

In [23]:
C0_missing = df_missing_dummy.isna()[ifamd.k1].to_numpy()
Categ_missing = df_missing_dummy.isna()[ifamd.k2].to_numpy()

#### Falsely classified Rate: 

In [24]:
# We encode categories into 0,1
res = (df[ifamd.k2].copy()>=0.5).astype(int)

per_categ = metric_fc(res[Categ_missing], dfref_dummy[ifamd.k2][Categ_missing])
rate = per_categ.mean()

print("rate of falsely classified values: ", rate)

rate of falsely classified values:  0.17567567567567566


#### NRMSE

In [25]:
nrsme = compute_nrmse_weighted(df[ifamd.k1][C0_missing].astype(int), dfref_dummy[ifamd.k1][C0_missing])

print("normalized rmse: ", nrsme)

normalized rmse:  0.28337438646756036
