# Code comparing different models and imputation methods for different missingness mechanisms.

In [1]:
import pandas as pd
import numpy as np
from paired_ttest import *
from linear_regression import *

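Baseline values: the variance of the target `y` for each simulation setting, used below to convert each score into an R².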

In [2]:
# Variance of the response y, keyed by simulation setting name.
# Each value was measured in R; the recipe recorded with this table is:
#   > results <- make_data3bis(dim=9, size=200000)
#   > var(results$y)
# NOTE(review): only `make_data3bis` is named here; presumably the other
# settings use their own generator -- confirm before re-deriving values.
y_variances = {
    # Missingness-mechanism settings.
    'mcar': 33,  # an earlier inline note read "10" -- presumably an older value
    'mnar': 33,
    'pred': 35,  # an earlier inline note read "10" -- presumably an older value
    # Regression-shape settings.
    'linearlinear': 25.4,
    'linearnonlinear': 1710,
    'nonlinearnonlinear': 1082,
}

In [3]:
def compute_range(data):
    """Return the smallest and largest value of `data` as a (min, max) tuple.

    `data` is passed straight to `np.min` / `np.max`, so anything those
    functions accept (list, array, pandas Series, ...) works.
    """
    lowest = np.min(data)
    highest = np.max(data)
    return lowest, highest

In [4]:
# Root directory of the experiment being analysed. Both paths below are derived
# from it, so switching to another experiment means editing a single line and
# the input and output locations can never point at different runs.
RESULTS_ROOT = '../final_results/results-miss6-rho5/'

# Directory holding the `scores_*.csv` files read by the cells below.
data_file_path = RESULTS_ROOT + 'results/'

# Directory where the generated PDF tables / figures are written.
output_file_path = RESULTS_ROOT + 'figures/'

### Compare different imputation methods across all models for each missingness setting (`mcar`, `mnar`, `pred`).

We would like to find out which imputation method is the most effective for each learning algorithm. Therefore, we loop over each learning algorithm that we used (decision tree, random forest, XGBoost, SVM, k-NN) and perform the linear regression test for each one.

The code is implemented in separate Python files.

In [5]:
# header=0 tells pandas that the first row of the file is a header row; since
# `names` is also given, that row is discarded and replaced by `names`.
# Without header=0 the first row would be read as a data row.
data = pd.read_csv(data_file_path + 'scores_mcar.csv', header=0,
                   names=['index', 'score', 'method', 'forest'])

# Knowing the variance of y, we can extract the R2: R2 = 1 - score / Var(y).
# NOTE(review): this assumes `score` is a mean squared error -- confirm.
data['R2'] = 1 - data['score'] / y_variances['mcar']
# The fold number is encoded as the trailing digits of the index name.
# The pattern is a raw string because '\d' in a plain literal is an invalid
# escape sequence; expand=False returns a Series (one column to assign).
data['fold'] = data['index'].str.extract(r'(\d+)$', expand=False).astype(int)

# Print the observed R2 range: a bare expression in the middle of a cell is
# evaluated but never displayed, so the value would otherwise be lost.
print('R2 range (min, max):', compute_range(data['R2']))

create_coefficient_table(data, output_file_path + 'coef_table_mcar.pdf')

In [6]:
# header=0 tells pandas that the first row of the file is a header row; since
# `names` is also given, that row is discarded and replaced by `names`.
# Without header=0 the first row would be read as a data row.
data = pd.read_csv(data_file_path + 'scores_mnar.csv', header=0,
                   names=['index', 'score', 'method', 'forest'])

# Knowing the variance of y, we can extract the R2: R2 = 1 - score / Var(y).
# NOTE(review): this assumes `score` is a mean squared error -- confirm.
data['R2'] = 1 - data['score'] / y_variances['mnar']
# The fold number is encoded as the trailing digits of the index name.
# The pattern is a raw string because '\d' in a plain literal is an invalid
# escape sequence; expand=False returns a Series (one column to assign).
data['fold'] = data['index'].str.extract(r'(\d+)$', expand=False).astype(int)

# Print the observed R2 range: a bare expression in the middle of a cell is
# evaluated but never displayed, so the value would otherwise be lost.
print('R2 range (min, max):', compute_range(data['R2']))

create_coefficient_table(data, output_file_path + 'coef_table_mnar.pdf')

In [7]:
# header=0 tells pandas that the first row of the file is a header row; since
# `names` is also given, that row is discarded and replaced by `names`.
# Without header=0 the first row would be read as a data row.
data = pd.read_csv(data_file_path + 'scores_pred.csv', header=0,
                   names=['index', 'score', 'method', 'forest'])

# Knowing the variance of y, we can extract the R2: R2 = 1 - score / Var(y).
# NOTE(review): this assumes `score` is a mean squared error -- confirm.
data['R2'] = 1 - data['score'] / y_variances['pred']
# The fold number is encoded as the trailing digits of the index name.
# The pattern is a raw string because '\d' in a plain literal is an invalid
# escape sequence; expand=False returns a Series (one column to assign).
data['fold'] = data['index'].str.extract(r'(\d+)$', expand=False).astype(int)

# Print the observed R2 range: a bare expression in the middle of a cell is
# evaluated but never displayed, so the value would otherwise be lost.
print('R2 range (min, max):', compute_range(data['R2']))

create_coefficient_table(data, output_file_path + 'coef_table_pred.pdf')

### Paired t-tests

Perform paired t-tests between each pair of methods. Save the p-values into a matrix.

The following code runs paired t-tests between five different methods and creates a heatmap showing the resulting p-values.

In [8]:
# header=0 tells pandas that the first row of the file is a header row; since
# `names` is also given, that row is discarded and replaced by `names`.
# Without header=0 the first row would be read as a data row.
data = pd.read_csv(data_file_path + 'scores_mcar.csv', header=0,
                   names=['index', 'score', 'method', 'forest'])

# Knowing the variance of y, we can extract the R2: R2 = 1 - score / Var(y).
# NOTE(review): this assumes `score` is a mean squared error -- confirm.
data['R2'] = 1 - data['score'] / y_variances['mcar']
# The fold number is encoded as the trailing digits of the index name.
# The pattern is a raw string because '\d' in a plain literal is an invalid
# escape sequence; expand=False returns a Series (one column to assign).
data['fold'] = data['index'].str.extract(r'(\d+)$', expand=False).astype(int)

paired_ttest(data, output_file_path + 'paired_ttests_matrices_mcar.pdf')

In [9]:
# header=0 tells pandas that the first row of the file is a header row; since
# `names` is also given, that row is discarded and replaced by `names`.
# Without header=0 the first row would be read as a data row.
data = pd.read_csv(data_file_path + 'scores_mnar.csv', header=0,
                   names=['index', 'score', 'method', 'forest'])

# Knowing the variance of y, we can extract the R2: R2 = 1 - score / Var(y).
# NOTE(review): this assumes `score` is a mean squared error -- confirm.
data['R2'] = 1 - data['score'] / y_variances['mnar']
# The fold number is encoded as the trailing digits of the index name.
# The pattern is a raw string because '\d' in a plain literal is an invalid
# escape sequence; expand=False returns a Series (one column to assign).
data['fold'] = data['index'].str.extract(r'(\d+)$', expand=False).astype(int)

paired_ttest(data, output_file_path + 'paired_ttests_matrices_mnar.pdf')

In [10]:
# header=0 tells pandas that the first row of the file is a header row; since
# `names` is also given, that row is discarded and replaced by `names`.
# Without header=0 the first row would be read as a data row.
data = pd.read_csv(data_file_path + 'scores_pred.csv', header=0,
                   names=['index', 'score', 'method', 'forest'])

# Knowing the variance of y, we can extract the R2: R2 = 1 - score / Var(y).
# NOTE(review): this assumes `score` is a mean squared error -- confirm.
data['R2'] = 1 - data['score'] / y_variances['pred']
# The fold number is encoded as the trailing digits of the index name.
# The pattern is a raw string because '\d' in a plain literal is an invalid
# escape sequence; expand=False returns a Series (one column to assign).
data['fold'] = data['index'].str.extract(r'(\d+)$', expand=False).astype(int)

paired_ttest(data, output_file_path + 'paired_ttests_matrices_pred.pdf')