In [1]:
import numpy as np
import pandas as pd
from src.utils import * 

In [2]:
gbsg = pd.read_csv("../GBSG2")

# On a un jeu de données complet: 
np.where(gbsg.isna()==True)

(array([], dtype=int64), array([], dtype=int64))

On détermine $I$ et $J$: 

In [3]:
I, J = gbsg.shape

Trouver $K_1$, $K_2$

In [4]:
# Variables continues
idx_k1 = gbsg.columns[(gbsg.isin([0, 1])==False).any()]

In [5]:
for i in gbsg.columns[(gbsg.isin([0, 1])).all()].to_numpy(): 
    gbsg[i+'_'] = 1-gbsg[i]

#Variables encodées en dummy variables 
idx_j = gbsg.columns[(gbsg.isin([0, 1])).all()]

In [24]:
data_missing = gbsg.mask(np.random.random(size=gbsg.shape)>0.5)

n_missing = data_missing.isna().sum().sum()
n_missing/(data_missing.shape[0]*data_missing.shape[1])

0.5053102873802582

In [25]:
class IterativeFAMDImputer(FAMD):
    def __init__(self, data, k1, k2, n_components=2):
        super().__init__(data, k1, k2)
        """Initialisation
        Args:
            n_components (int, optional): _description_. Defaults to 2.
            data (_type_, optional): _description_. Defaults to None.
        """
        self.n_components = n_components
        
    def inital_impute(self, data):
        #Pour les variables continues
        self.sj = self.df_C0.std(axis=0).to_numpy()
        Ximp_C0 = self.df_C0.copy()
        for c in self.df_C0.columns.to_numpy(): 
            Ximp_C0[c] = Ximp_C0[c].fillna(self.df_C0[c].mean())  
        self.df_C0 = Ximp_C0

        # Pour les variables catégorielles 
        self.sqrt_pj = np.sqrt(self.df_categ.sum(axis=0)/self.df_categ.shape[0]).to_numpy()
        Ximp_categ = self.df_categ.copy() 
        Ximp_categ =Ximp_categ.fillna(np.sqrt(self.df_categ.sum(axis=0)/self.df_categ.shape[0]))  
        res = Ximp_categ.copy()
        for h in range(3):
            col = [Ximp_categ.columns[h],Ximp_categ.columns[h+3]]
            somme = Ximp_categ[col].sum(axis=1)
            for j in range(self.df.shape[0]):
                res.loc[j, col] = Ximp_categ[col].iloc[j]/somme[j]
        self.df_categ = res
        self.data_concat()
        pass 

    def ponderation_gsbs(self): 
        self.df_C0 = self.df[self.k1] # redefini df_C0 avec le df actuel
        self.sj = self.df_C0.std(axis=0).to_numpy()

        self.df_categ = self.df[self.k2]
        self.sqrt_pj = np.sqrt(self.df_categ.sum(axis=0)/self.df_categ.shape[0]).to_numpy()
        res = self.df_categ.copy()
        for h in range(3):
            col = [self.df_categ.columns[h],self.df_categ.columns[h+3]]
            somme = self.df_categ[col].sum(axis=1)
            for j in range(self.df.shape[0]):
                res.loc[j, col] = self.df_categ[col].iloc[j]/somme[j]
        self.df_categ = res
        self.data_concat() # mise a jour de df_categ

    
    def impute(self, n_it, tol=1e-4,verbose=False):
        # Initialisation 
        idx_NA = 1- self.df.isna().astype(int).to_numpy() # 1 si obs 0 sinon
        #Initial imputation 
        self.inital_impute(self.df)
        self.data_concat() # on construit les données avec df_C0 et df_categ remplis 

        diff = np.inf
        last_chap = np.inf*np.ones_like(idx_NA)
        i= 0
        while i < n_it and diff > tol: 
            Z_p = self.step3() #Updating D, M already inside
            X_chap = (Z_p + self.M)@np.sqrt(self.D)
            df = pd.DataFrame(idx_NA*(self.df).to_numpy() + + (1- idx_NA)*X_chap.to_numpy(), columns=self.df.columns)
            self.df= df
            self.ponderation_gsbs()
            print(self.df)
            diff = ((X_chap - last_chap)**2).mean().mean()
            last_chap = X_chap
            i +=1 
        if i < n_it: 
            print('Converged in', i)
        else: 
              print('Maximum iterations reached')    
        return self.df 

In [26]:
famd_algo = FAMD(data=gbsg, k1=idx_k1, k2=idx_j)
# famd_algo.run_famd()

In [29]:
n_it = 1e4

In [30]:
ifamd = IterativeFAMDImputer(n_components=4, data=data_missing, k1=idx_k1, k2=idx_j)
ifamd.impute(n_it)

          pid       age      size     grade     nodes        pgr        er  \
0    0.380474  7.097760  4.498540  7.426933  0.744370   0.703882  0.003235   
1    4.539747  7.598254  4.751391  7.655109  2.221688   0.703882 -0.000918   
2    3.285912  7.736404  4.614817  7.485747  1.811107   0.703882  0.010214   
3    2.773525  6.216753  4.809820  7.673089  1.945317   0.004473  0.034842   
4    0.374709  8.979754  4.635008  7.134023  1.817001   0.700813  0.336746   
..        ...       ...       ...       ...       ...        ...       ...   
681  1.689074  7.045653  4.721678  7.673089  1.371767   0.703643  0.814244   
682  2.774119  8.841604  4.092121  7.134023  0.744370   0.703882  9.959898   
683  4.395628  7.230616  3.383283  7.048498  1.817001   8.994841  1.820516   
684  2.773739  6.972678  3.371759  7.134023  1.116554   9.663918  0.820059   
685  2.773990  6.797869  2.778605  7.134023  1.817001  14.379674  0.820059   

      rfstime      meno    hormon    status     meno_   hormon_

  result = getattr(ufunc, method)(*inputs, **kwargs)


          pid       age      size     grade     nodes        pgr        er  \
0    0.192988  3.022651  1.930245  4.180507  0.471149   0.855852 -0.051910   
1    2.302695  3.583408  2.124925  3.176399  0.949966   0.855852  1.462177   
2    1.666713  3.648561  1.905062  3.332583  0.979513   0.855852  1.136962   
3    0.832243  2.931879  2.136401  3.677795  1.231051   0.822976  0.029888   
4    0.190064  4.234936  1.843842  3.419415  1.150072   1.221815  0.277594   
..        ...       ...       ...       ...       ...        ...       ...   
681  0.856749  3.322796  2.296109  3.677795  1.081498   1.427319  0.575548   
682  2.478450  4.169784  1.989962  3.419415  0.471149   0.855852  1.721113   
683  2.229594  3.734625  1.953742  3.083060  1.150072   1.063323  1.561651   
684 -3.687303  1.233250 -3.462821  3.419415  0.706723  11.750373  0.703452   
685  1.184080  3.331800  1.758831  3.419415  1.150072   1.551649  0.703452   

       rfstime      meno    hormon    status     meno_   hormon

LinAlgError: SVD did not converge