In [1]:
import torch
import torchvision
import torch.nn as nn
import numpy as np
import scipy.stats
import scipy.io
import scipy.sparse
from scipy.io import loadmat
import pandas as pd
import matplotlib.pyplot as plt
import torch.distributions as td

import seaborn as sns
import datetime as dt
from tqdm import tqdm

from time import time
from hyperimpute.plugins.imputers import Imputers

tex_fonts = {
    # Use LaTeX to write all text
    "text.usetex": True,
    "font.family": "serif",
    # Use 10pt font in plots, to match 10pt font in document
    "axes.labelsize": 10,
    "font.size": 10,
    # Make the legend/label fonts a little smaller
    "legend.fontsize": 8,
    "xtick.labelsize": 8,
    "ytick.labelsize": 8
}
plt.rcParams.update(tex_fonts)

from torch import nn, optim
from torch.nn import functional as F
from torchvision import datasets, transforms
from torchvision.utils import save_image

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.linear_model import BayesianRidge
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.impute import SimpleImputer

from source.auxiliary_functions import remove_outliers_DDC, apply_estimator, contaminate_bernoulli, low_rank

def estimate_cov_MAR(X, mask):
    sigma = np.cov(X.T)
    deltas = np.sum(mask, axis=0)/mask.shape[0]
    cor_mat = 1/np.outer(deltas, deltas)
    cor_2 = np.eye(deltas.shape[0]) * 1/deltas
    sigma_tilde = (cor_2 - cor_mat) * np.eye(sigma.shape[0]) * sigma.diagonal() + cor_mat * sigma
    return sigma_tilde


def contaminate_MAR(X, deltas, intensity=0):
    Y = np.zeros(X.shape)
    mask = np.zeros(X.shape)
    for i,delta in enumerate(deltas):
        bernoulli_mask = np.random.binomial(p=delta, n=1, size=X.shape[0])
        mask[:,i] = bernoulli_mask
    conta = mask*X + (1-mask)*intensity
    return mask*X, mask

# Contaminated cell-wise at random

In [4]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

def rel_dist(A,B):
    if A is None or B is None:
        return None
    norm = max(np.linalg.norm(A, ord=2), np.linalg.norm(B, ord=2))
    return np.linalg.norm(A-B, ord=2)/norm

def test_estimators_missing(data, nrep):
    print("dim : ", data.shape[1])
    print("nsamples : ", data.shape[0])
    truth = np.cov(data.T)
    
    np.random.seed(39)
    #deltas = np.random.uniform(0.6, 1., size=data.shape[1])
    deltas = 1 - sigmoid(np.sum(data[:15,:], axis=0)/15)
    
    C = {}
    
    C['classical'] = np.zeros((nrep, data.shape[1], data.shape[1]))
    C['MV'] = np.zeros((nrep, data.shape[1], data.shape[1]))
    C['II'] = np.zeros((nrep, data.shape[1], data.shape[1]))
    C['KNN'] = np.zeros((nrep, data.shape[1], data.shape[1]))
    C['MIWAE'] = np.zeros((nrep, data.shape[1], data.shape[1]))
    C['GAIN'] = np.zeros((nrep, data.shape[1], data.shape[1]))

    for k in tqdm(range(nrep)):
        np.random.seed(k)
        
        conta_data, mask = contaminate_MAR(data, deltas)
        
        data_nan = conta_data*mask
        data_nan[data_nan == 0] = np.nan
        
        C['classical'][k] = np.cov(conta_data.T)
        
        # DDC 99
        C['MV'][k] = estimate_cov_MAR(conta_data, mask)
            
        # GAIN
        gain = Imputers().get('gain')
        x_gain = gain.fit_transform(data_nan)
        C['GAIN'][k] = np.cov(x_gain.T)
        
        #MIWAE
        miwae = Imputers().get('miwae')
        x_miwae = miwae.fit_transform(data_nan)
        C['MIWAE'][k] = np.cov(x_miwae.T)

        #3. multiple imputation (inspired by MICE)
        imputer = IterativeImputer(max_iter=10, random_state=0)
        MI_data = imputer.fit_transform(data_nan)
        C['II'] = np.cov(MI_data.T)

        #4. knnimpute
        imputer = KNNImputer(n_neighbors=5, weights="uniform")
        knn_data = imputer.fit_transform(data_nan)  
        C['KNN'] = np.cov(knn_data.T)

    cols = ['classical','MV', 'II', 'KNN', 'MIWAE','GAIN']
    index = ['Truth','classical','MV', 'II', 'KNN', 'MIWAE']
    results = pd.DataFrame('-', columns=cols,
                                index=index)
    stds = pd.DataFrame('-', columns=cols,
                                index=index)   
    
    for c in cols:
        results.loc['Truth',c] = np.mean([rel_dist(truth, C[c][k]) for k in range(nrep)])*100
        stds.loc['Truth',c] = np.std([rel_dist(truth, C[c][k]) for k in range(nrep)])*100

    for i,c1 in enumerate(cols[:-1]):
        for c2 in cols[i+1:]:
            results.loc[c1, c2] = np.mean([rel_dist(C[c1][k], C[c2][k]) for k in range(nrep)])*100
            stds.loc[c1, c2] = np.std([rel_dist(C[c1][k], C[c2][k]) for k in range(nrep)])*100

    return results, stds

In [5]:
from source.real_experiment import clean_abalone

data = pd.read_csv('datasets/abalone.data', sep=',', header=None)
data, _ = clean_abalone()

test_estimators_missing(data, 5)

dim :  7
nsamples :  4173
[0.59246331 0.58889257 0.60970577 0.61786702 0.64564068 0.63500372
 0.57420827]


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [05:49<00:00, 69.97s/it]


(           classical         MV          II         KNN      MIWAE       GAIN
 Truth      59.326807   1.870525    4.280006    4.547378   8.602209  16.537331
 classical          -  59.299447  138.070906  136.802583  55.623436  51.325908
 MV                 -          -    4.809351    5.061317   8.684462  16.577879
 II                 -          -           -    1.329209   6.948025  16.444822
 KNN                -          -           -           -   6.424047  15.766778
 MIWAE              -          -           -           -          -   9.445398,
           classical        MV        II       KNN     MIWAE      GAIN
 Truth      0.584246  0.572091  1.445084  1.310665  0.625796  2.093862
 classical         -  0.264014  3.337113  3.973836  0.377929  1.333995
 MV                -         -  1.655874  1.563614  0.821969  2.286093
 II                -         -         -  0.172358  0.781207  2.605383
 KNN               -         -         -         -  1.066904   3.33805
 MIWAE             -

In [6]:
from source.real_experiment import clean_breast_cancer

data, _ = clean_breast_cancer()

test_estimators_missing(data, 5)

dim :  30
nsamples :  569
[0.36840377 0.46269955 0.3542269  0.38172801 0.30758036 0.21709856
 0.28135382 0.27643322 0.29035039 0.27979016 0.38286465 0.50789481
 0.35400085 0.3969512  0.5255049  0.29998938 0.40385511 0.31752147
 0.39284664 0.32315373 0.34704685 0.42986816 0.33650191 0.36653389
 0.3086377  0.19428268 0.27274002 0.2400313  0.21947384 0.17401771]


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:03<00:00, 24.78s/it]


(           classical        MV          II         KNN       MIWAE       GAIN
 Truth      88.629152  21.74112   69.786231   61.972473   87.959387   37.45866
 classical          -  88.38854  462.371178  339.230948   12.823254  83.663582
 MV                 -         -   71.627347   64.454023   87.651948  40.278171
 II                 -         -           -   43.475742   451.84346  86.961383
 KNN                -         -           -           -  312.843322  62.250764
 MIWAE              -         -           -           -           -  82.645671,
           classical        MV         II        KNN      MIWAE       GAIN
 Truth       0.46263  3.000551    2.53098  10.039639   0.430076    5.77819
 classical         -  0.334218  46.164052   95.07513   0.932186   1.464802
 MV                -         -   2.588496   9.289193    0.36079   3.301311
 II                -         -          -   3.621349  55.737244  17.633547
 KNN               -         -          -          -  84.111752    5.02