In [2]:
import numpy as np
import pandas as pd
from src.utils import * 
from src.metrics_FAMD import *

In [3]:
gbsg = pd.read_csv("gbsg.csv")

# On a un jeu de données complet: 
np.where(gbsg.isna()==True)

(array([], dtype=int64), array([], dtype=int64))

On détermine $I$ et $J$: 

In [4]:
I, J = gbsg.shape

Trouver $K_1$, $K_2$

In [5]:
# Categorical Variables :
idx_k2_gbsg = pd.Index(["meno","hormon","status"])

# Continuous Variables
idx_k1_gbsg = gbsg.columns.difference(idx_k2_gbsg)

In [6]:
# Inject missing values into the gbsg dataframe :
proba_non_missing = 0.8
gbsg_missing = create_missingness(gbsg, proba_non_missing)

# Encode dummy variables in gbsg dataframe and in the gbsg dataframe with missing values :

gbsg_missing_dummy, idx_j_gbsg, nb_values_per_cat_gbsg = encode_dummy_variables(gbsg_missing, idx_k2_gbsg)
gbsg_dummy = encode_dummy_variables(gbsg, idx_k2_gbsg)[0]

gbsg_missing_dummy

Unnamed: 0.1,Unnamed: 0,pid,age,size,grade,nodes,pgr,er,rfstime,meno_0,meno_1,hormon_0,hormon_1,status_0,status_1
0,1,132,49,18,2,,0,,1838,1,0,1,0,1,0
1,2,1575,55,20,3,,0,,,0,1,1,0,0,1
2,3,1140,56,40,3,3,,0,1603,0,1,1,0,1,0
3,4,769,45,25,3,,0,4,177,1,0,,,1,0
4,5,130,65,30,2,5,0,,,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,,586,51,30,,2,1152,38,,,,,,1,0
682,683,1273,64,26,2,2,1356,1144,1152,,,0,1,1,0
683,684,1525,57,35,3,1,1490,209,1342,0,1,0,1,1,0
684,685,736,,21,2,3,1600,70,629,,,1,0,1,0


In [7]:
#Check the proportion of missing data :
n_missing = gbsg_missing.isna().sum().sum()
n_missing/(gbsg_missing.shape[0]*gbsg_missing.shape[1])

0.1890184645286686

In [8]:
#FAMD :

famd_algo = FAMD(data=gbsg_missing_dummy, k1=idx_k1_gbsg, k2=idx_j_gbsg)
#famd_algo.run_famd()

In [9]:
#IFAMD :

n_it = 100
ifamd_gbsg = IterativeFAMDImputer(n_components=4, data=gbsg_missing_dummy, k1=idx_k1_gbsg, k2=idx_j_gbsg, nb_values_per_cat = nb_values_per_cat_gbsg)
ifamd_gbsg.impute(n_it)

Maximum iterations reached


Unnamed: 0.1,Unnamed: 0,age,er,grade,nodes,pgr,pid,rfstime,size,meno_0,meno_1,hormon_0,hormon_1,status_0,status_1
0,1.000000,49.00000,98.731844,2.000000,4.784810,-180.246263,132.000000,1976.711513,18.000000,1.000000,0.000000,1.000000,0.000000,1.0,0.0
1,2.000000,55.00000,98.731844,3.000000,4.784810,-245.939250,1575.000000,1042.260750,24.759048,0.000000,1.000000,1.000000,0.000000,0.0,1.0
2,3.000000,56.00000,0.000000,3.000000,3.000000,118.078153,-1347.144335,1603.000000,40.000000,0.000000,1.000000,1.000000,0.000000,1.0,0.0
3,4.000000,45.00000,4.000000,3.000000,4.784810,-104.446238,769.000000,177.000000,25.000000,1.000000,0.000000,0.728463,0.271537,1.0,0.0
4,5.000000,65.00000,98.731844,2.000000,5.000000,0.000000,130.000000,447.829182,31.758113,0.000000,1.000000,0.000000,1.000000,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,1547.513805,51.00000,38.000000,2.127698,3.627972,1152.000000,586.000000,1115.794376,41.499613,0.620125,0.379875,0.549728,0.450272,1.0,0.0
682,683.000000,64.00000,1144.000000,2.000000,2.000000,1356.000000,1273.000000,1152.000000,26.000000,-0.182759,1.182759,0.000000,1.000000,1.0,0.0
683,684.000000,57.00000,209.000000,3.000000,1.000000,1490.000000,1525.000000,1342.000000,35.000000,0.000000,1.000000,0.000000,1.000000,1.0,0.0
684,685.000000,53.34103,3103.125081,2.000000,3.000000,1600.000000,736.000000,629.000000,21.000000,-1.322981,2.322981,1.000000,0.000000,1.0,0.0


#### To ensure that we are only considering TPR and NRMSE scores on imputed data: 

In [10]:
C0_missing = gbsg_missing_dummy.isna()[ifamd_gbsg.k1].to_numpy()
Categ_missing = gbsg_missing_dummy.isna()[ifamd_gbsg.k2].to_numpy()

#### Falsely classified Rate: 

In [11]:
# We encode categories into 0,1
res = (ifamd_gbsg.df[ifamd_gbsg.k2].copy()>=0.5).astype(int)

metric_fc(res[Categ_missing], gbsg_dummy[ifamd_gbsg.k2][Categ_missing])

array([0.19354839, 0.19354839, 0.13709677, 0.13709677, 0.17204301,
       0.17204301])

#### NRMSE

In [12]:
compute_nrmse_weighted(gbsg_dummy[ifamd_gbsg.k1][C0_missing], ifamd_gbsg.df[ifamd_gbsg.k1][C0_missing])

27.566793878927406

## TEST on a generated Dataset

In [13]:
df = pd.read_csv("df_3_1.csv") # first dataset created in the paper : (3.1 Relationships between continuous and categorical variables)

print(df.shape)

(100, 6)


In [14]:
# Categorical Variables :
idx_k2_df = pd.Index(["1","2"])

# Continuous Variables
idx_k1_df = df.columns.difference(idx_k2_df)


In [15]:
# Inject missing values into the gbsg dataframe :
proba_non_missing = 0.8
df_missing = create_missingness(df, proba_non_missing)

# Encode dummy variables in gbsg dataframe and in the gbsg dataframe with missing values :
df_missing_dummy, idx_j_df, nb_values_per_cat_df = encode_dummy_variables(df_missing, idx_k2_df)
df_dummy = encode_dummy_variables(df, idx_k2_df)[0]

In [16]:
ifamd_df = IterativeFAMDImputer(n_components=4, data=df_missing_dummy, k1=idx_k1_df, k2=idx_j_df, nb_values_per_cat = nb_values_per_cat_df)
ifamd_df.impute(n_it)

Converged in 9


Unnamed: 0,0,3,4,5,1_0.0,1_1.0,1_2.0,1_3.0,2_0.0,2_2.0,2_3.0,2_1.0
0,0.093337,0.135113,0.167152,0.049261,1.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
1,-1.753978,1.046237,1.167125,1.105409,0.000000,1.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
2,0.177369,0.414308,0.415502,0.471683,1.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
3,0.224164,0.327232,0.439984,0.327426,0.000000,0.000000,1.000000,0.000000,0.000000,1.000000,0.000000,0.000000
4,1.775482,-0.293381,-0.434007,-0.561626,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.289194,-0.401571,-0.320624,-0.361769,0.000000,0.000000,0.000000,1.000000,1.000000,0.000000,0.000000,0.000000
96,-0.009185,-0.143916,-0.155891,-0.145247,-0.570963,0.555759,0.644035,0.371169,0.000000,0.000000,1.000000,0.000000
97,-1.009023,-0.264789,-0.186147,-0.259537,1.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
98,0.324053,-1.140274,-1.182114,-1.254136,0.000000,0.000000,0.000000,1.000000,0.492256,-0.174182,0.336643,0.345283


In [17]:
C0_missing = df_missing_dummy.isna()[ifamd_df.k1].to_numpy()
Categ_missing = df_missing_dummy.isna()[ifamd_df.k2].to_numpy()

# We encode categories into 0,1

res = ifamd_df.df[ifamd_df.k2].copy()
pos = 0
for h in range (len(idx_k2_df)) :
    col = [idx_j_df[pos+i] for i in range (nb_values_per_cat_df[h])]
    res["max_value"] = ifamd_df.df[col].max(axis = 1)
    for value in col:
        res[value] = (res[value] == res["max_value"]).astype(int)

    pos += nb_values_per_cat_df[h]

res = res[ifamd_df.k2] 

metric_fc(res[Categ_missing], df_dummy[ifamd_df.k2][Categ_missing])

array([0.23809524, 0.19047619, 0.33333333, 0.23809524, 0.28571429,
       0.26190476, 0.14285714, 0.4047619 ])

In [18]:
#NRMSE
compute_nrmse_weighted(df_dummy[ifamd_df.k1][C0_missing], ifamd_df.df[ifamd_df.k1][C0_missing])

11.051097827941053