In [1]:
import numpy as np
import pandas as pd
from src.algorithms import * 
from src.utils import * 
from src.metrics_FAMD import *

In [2]:
gbsg = pd.read_csv("../GBSG2")

# On a un jeu de données complet: 
np.where(gbsg.isna()==True)

(array([], dtype=int64), array([], dtype=int64))

On détermine $I$ et $J$: 

In [3]:
I, J = gbsg.shape

Trouver $K_1$, $K_2$

In [4]:
# Categorical Variables :
idx_k2_gbsg = pd.Index(["meno","hormon","status"])

# Continuous Variables
idx_k1_gbsg = gbsg.columns.difference(idx_k2_gbsg)

In [5]:
# Inject missing values into the gbsg dataframe :
proba_non_missing = 0.8
gbsg_missing = create_missingness(gbsg, proba_non_missing)

# Encode dummy variables in gbsg dataframe and in the gbsg dataframe with missing values :
gbsg_missing_dummy, idx_j_gbsg, nb_values_per_cat_gbsg = encode_dummy_variables(gbsg_missing, idx_k2_gbsg)
gbsg_dummy = encode_dummy_variables(gbsg, idx_k2_gbsg)[0]

gbsg_missing_dummy

Unnamed: 0,pid,age,size,grade,nodes,pgr,er,rfstime,meno_0,meno_1,hormon_0,hormon_1,status_0,status_1
0,132,,18,2,,0,,1838,1,0,1,0,1,0
1,1575,55,20,3,16,0,0,403,0,1,1,0,,
2,1140,56,40,3,,0,0,1603,0,1,1,0,1,0
3,769,45,25,3,1,0,,177,,,1,0,1,0
4,130,,,,,0,36,1855,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,,51,30,3,2,1152,38,1760,1,0,0,1,1,0
682,1273,64,,2,2,1356,1144,,,,0,1,1,0
683,1525,57,35,,,1490,209,1342,0,1,,,1,0
684,,44,,2,3,1600,70,629,1,0,1,0,1,0


In [6]:
#Check the proportion of missing data :
n_missing = gbsg_missing.isna().sum().sum()
n_missing/(gbsg_missing.shape[0]*gbsg_missing.shape[1])

0.21137026239067055

In [7]:
#FAMD :
famd_algo = FAMD(data=gbsg_missing_dummy, k1=idx_k1_gbsg, k2=idx_j_gbsg)
#famd_algo.run_famd()

In [8]:
#IFAMD :

n_it = 1000
ifamd_gbsg = IterativeFAMDImputer(n_components=4, data=gbsg_missing_dummy, k1=idx_k1_gbsg, k2=idx_j_gbsg, nb_values_per_cat = nb_values_per_cat_gbsg)
ifamd_gbsg.impute(n_it)

df = ifamd_gbsg.df

Maximum iterations reached


#### To ensure that we are only considering TPR and NRMSE scores on imputed data: 

In [9]:
C0_missing = gbsg_missing_dummy.isna()[ifamd_gbsg.k1].to_numpy()
Categ_missing = gbsg_missing_dummy.isna()[ifamd_gbsg.k2].to_numpy()

#### Falsely classified Rate: 

In [10]:
# We encode categories into 0,1
res = (df[ifamd_gbsg.k2].copy()>=0.5).astype(int)

per_categ = metric_fc(res[Categ_missing], gbsg_dummy[ifamd_gbsg.k2][Categ_missing])
rate = per_categ.mean()

In [11]:
rate

0.1677753141167775

#### NRMSE

In [12]:
compute_nrmse_weighted(df[ifamd_gbsg.k1][C0_missing].astype(int), gbsg_dummy[ifamd_gbsg.k1][C0_missing])

0.07364459581144753