### 0 - Library importation

In [1]:
import numpy as np
import pandas as pd
from src.algorithms import * 
from src.utils import * 
from src.metrics_FAMD import *

### 1- Initialization

#### 1.1 - Load dataset

In [2]:
gbsg = pd.read_csv("../GBSG2")

np.where(gbsg.isna()==True) # Check that there are no missing values 

(array([], dtype=int64), array([], dtype=int64))

#### 1.2- Definition of parameters

In [3]:
# Shape of dataset (before dummy transformation)
I, J = gbsg.shape

# Categorical Variables :
idx_k2_gbsg = pd.Index(["meno","hormon","status", "grade"])

# Continuous Variables
idx_k1_gbsg = gbsg.columns.difference(idx_k2_gbsg)

#### 1.3- Inject missing values

In [4]:
proba_non_missing = 0.8
gbsg_missing = create_missingness(gbsg, proba_non_missing)

#### 1.4 - Encode dummy variables

In [5]:
# Complete dataframe
gbsg_missing_dummy, idx_j_gbsg, nb_values_per_cat_gbsg = encode_dummy_variables(gbsg_missing, idx_k2_gbsg)

# Dataframe with missing values
gbsg_dummy = encode_dummy_variables(gbsg, idx_k2_gbsg)[0]

gbsg_missing_dummy

Unnamed: 0,pid,age,size,nodes,pgr,er,rfstime,meno_0,meno_1,hormon_0,hormon_1,status_0,status_1,grade_2,grade_3,grade_1
0,132,49,18,2,0,,1838,1,0,1,0,1,0,1,0,0
1,,55,,16,0,,,0,1,,,0,1,0,1,0
2,,56,40,3,0,0,,0,1,1,0,1,0,0,1,0
3,,45,25,1,0,,177,1,0,1,0,1,0,0,1,0
4,130,65,30,5,0,36,,0,1,,,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,586,51,30,,1152,38,1760,,,0,1,1,0,0,1,0
682,,64,26,2,1356,1144,,,,0,1,1,0,1,0,0
683,1525,57,35,1,1490,209,1342,0,1,0,1,1,0,0,1,0
684,736,44,21,3,1600,70,629,1,0,1,0,1,0,1,0,0


In [6]:
#Check proportion of missing data :
n_missing = gbsg_missing.isna().sum().sum()
n_missing/(gbsg_missing.shape[0]*gbsg_missing.shape[1])

0.19374503047972436

### 2- Implement iterative FAMD

In [7]:
# Maximum number of iterations 
n_it = 1000

In [9]:
# Class definition

ifamd_gbsg = IterativeFAMDImputer(n_components=4, data=gbsg_missing_dummy, k1=idx_k1_gbsg, k2=idx_j_gbsg, nb_values_per_cat = nb_values_per_cat_gbsg)

In [10]:
# Run iterative FAMD
ifamd_gbsg.impute(n_it)

df = ifamd_gbsg.df

Converged in 54


### 3 - Metric scores

To ensure that we are only considering TPR and NRMSE scores on imputed data, we will compute the scores among imputed values

In [11]:
C0_missing = gbsg_missing_dummy.isna()[ifamd_gbsg.k1].to_numpy()
Categ_missing = gbsg_missing_dummy.isna()[ifamd_gbsg.k2].to_numpy()

#### Falsely classified Rate: 

In [12]:
# We encode categories into 0,1
res = (df[ifamd_gbsg.k2].copy()>=0.5).astype(int)

per_categ = metric_fc(res[Categ_missing], gbsg_dummy[ifamd_gbsg.k2][Categ_missing])
rate = per_categ.mean()

print("rate of falsely classified values: ", rate)

rate of falsely classified values:  0.11582720241463873


#### NRMSE

In [13]:
nrsme = compute_nrmse_weighted(df[ifamd_gbsg.k1][C0_missing].astype(int), gbsg_dummy[ifamd_gbsg.k1][C0_missing])

print("normalized rmse: ", nrsme)

normalized rmse:  0.25125276247459283
