In [1]:
import pandas as pd
import numpy as np
import os
from scipy import stats
import math

### FUNCTIONS

In [2]:
def get_dataset(dataset: str):
    dir_path = 'output/'
    path_num = "results_Pen-based(num)"
    path_mxd =  "results_Hypothyroid(mxd)"
    
    path_num_folds = 'results_fold_Pen-based(num)'
    path_mxd_folds = 'results_fold_Hypothyroid(mxd)'

    path_num_folds_ENN = 'results_reduction_fold_Pen-based(num)_ENN'
    path_mxd_folds_ENN = 'results_reduction_fold_Hypothyroid(mxd)_ENN'

    path_num_folds_DROP3 ='results_reduction_fold_Pen-based(num)_DROP3'
    path_mxd_folds_DROP3 ='results_reduction_fold_Hypothyroid(mxd)_DROP3'

    file_list = os.listdir(dir_path)
    for file in file_list:
        if dataset == 'num_folds' and path_num_folds in file:
            return pd.read_csv(dir_path + file)
        elif dataset == 'mxd_folds' and path_mxd_folds in file:
            return pd.read_csv(dir_path + file)
            
        elif dataset == 'num_ENN' and path_num_folds_ENN in file:
            return pd.read_csv(dir_path + file)
        elif dataset == 'mxd_ENN' and path_mxd_folds_ENN in file:
            return pd.read_csv(dir_path + file)
            
        elif dataset == 'num_DROP3' and path_num_folds_DROP3 in file:
            return pd.read_csv(dir_path + file)
        elif dataset == 'mxd_DROP3' and path_mxd_folds_DROP3 in file:
            return pd.read_csv(dir_path + file)
        
        elif dataset == 'num' and path_num in file:
            return pd.read_csv(dir_path + file)
        elif dataset == 'mxd' and path_mxd in file:
            return pd.read_csv(dir_path + file)

In [3]:
def get_corresponding_fold_config(dataset, title, number_of_k, distance, policy, weight, long_title = True):
    if weight is None:
        row = dataset[(dataset['number_of_k'] == number_of_k) & (dataset['distance'] == distance) &
                                 (dataset['policy'] == policy) & dataset['weight'].isnull()]
    else:
        row = dataset[(dataset['number_of_k'] == number_of_k) & (dataset['distance'] == distance) &
                                 (dataset['policy'] == policy) & (dataset['weight'] == weight)]
        
    if long_title:
        col = pd.DataFrame(row.iloc[:, 4:].T.values, columns=[f'{title} [{number_of_k}, {distance}, {policy}, {weight}]'])        
    else:
        col = pd.DataFrame(row.iloc[:, 4:].T.values, columns=[title])
    
    return col

In [4]:
def get_corresponding_config_reduced(dataset, title):
    dataset = dataset.drop(columns=['reduction_technique'])
    return pd.DataFrame(dataset.iloc[:, 4:].T.values, columns=[f'{title}'])

In [5]:
def paired_t_test(best_column, columns, row_title='Best Model'):
    alpha = 0.05
    results = []
    results_t_p = []
    result_df = pd.DataFrame(columns=[c.columns[0] for c in columns])
    for column in columns:
        
        stat, p = stats.ttest_rel(column_BEST.values, column.values, nan_policy='raise')
        
        if math.isnan(stat[0]) and math.isnan(p[0]): 
            results.append(True)
            results_t_p.append(('-','-'))
        else:
            results.append(p[0] > alpha)
            results_t_p.append((f'{stat[0]:.3f}',f'{p[0]:.3f}'))

            """
            if p > alpha:
                print('Same distributions (fail to reject H0)')
            else:
                print('Different distributions (reject H0)')
            """
    result_df_t_p = result_df.copy()
    result_df_t_p.loc[-1] = results_t_p
    result_df.loc[-1] = results
    result_df = result_df.rename(index={-1: row_title})
    result_df_t_p = result_df_t_p.rename(index={-1: row_title})
    return result_df, result_df_t_p

### NUMERICAL BEST vs 9

In [6]:
dataset_num_folds = get_dataset('num_folds')
#display(dataset_num_folds)

column_BEST = get_corresponding_fold_config(dataset_num_folds,'BEST', 1, 'euclidean', 'majority', 'ig', False)
column_K_3 = get_corresponding_fold_config(dataset_num_folds,'K=3', 3, 'euclidean', 'inverse_distance', 'ig', False)
column_K_5 = get_corresponding_fold_config(dataset_num_folds,'K=5', 5, 'euclidean', 'majority', 'ig', False)
column_K_7 = get_corresponding_fold_config(dataset_num_folds,'K=7', 7, 'euclidean', 'inverse_distance', 'ig', False)
column_dis_manh = get_corresponding_fold_config(dataset_num_folds,'Dist=manh', 1, 'manhattan', 'majority', 'ig', False)
column_dis_cheb = get_corresponding_fold_config(dataset_num_folds,'Dist=cheb', 1, 'chebyshev', 'majority', 'ig', False)
column_pol_invd = get_corresponding_fold_config(dataset_num_folds,'Pol=inv_d', 1, 'euclidean', 'inverse_distance', 'ig', False)
column_pol_shep = get_corresponding_fold_config(dataset_num_folds,'Pol=shep', 1, 'euclidean', 'sheppard', 'ig', False)
column_wei_none = get_corresponding_fold_config(dataset_num_folds,'Wei=none', 1, 'manhattan', 'majority', None, False)
column_wei_rel = get_corresponding_fold_config(dataset_num_folds,'Wei=relf', 1, 'euclidean', 'majority', 'relieff', False)

In [7]:
columns_9 = [column_K_3, column_K_5, column_K_7, column_dis_manh, column_dis_cheb, column_pol_invd, 
column_pol_shep, column_wei_none, column_wei_rel]
#statistical_test = column_BEST.join(columns)
#display(statistical_test)
dataframe_9_num, t_p_values_9_num = paired_t_test(column_BEST, columns_9)
display(dataframe_9_num)
display(t_p_values_9_num)

Unnamed: 0,K=3,K=5,K=7,Dist=manh,Dist=cheb,Pol=inv_d,Pol=shep,Wei=none,Wei=relf
Best Model,True,True,True,True,False,True,True,True,False


Unnamed: 0,K=3,K=5,K=7,Dist=manh,Dist=cheb,Pol=inv_d,Pol=shep,Wei=none,Wei=relf
Best Model,"(1.490, 0.170)","(0.784, 0.453)","(1.299, 0.226)","(0.764, 0.464)","(4.343, 0.002)","(-, -)","(-, -)","(1.404, 0.194)","(5.690, 0.000)"


### NUMERICAL BEST vs 13

In [8]:
dataset_num_folds = get_dataset('num_folds')
#display(dataset_num_folds)

column_BEST = get_corresponding_fold_config(dataset_num_folds,'BEST', 1, 'euclidean', 'majority', 'ig', False)
column_K_1 = get_corresponding_fold_config(dataset_num_folds,'*K=1', 1, 'euclidean', 'inverse_distance', 'ig', False)
column_K_3 = get_corresponding_fold_config(dataset_num_folds,'K=3', 3, 'euclidean', 'inverse_distance', 'ig', False)
column_K_5 = get_corresponding_fold_config(dataset_num_folds,'K=5', 5, 'euclidean', 'majority', 'ig', False)
column_K_7 = get_corresponding_fold_config(dataset_num_folds,'K=7', 7, 'euclidean', 'inverse_distance', 'ig', False)

column_dis_eucl = get_corresponding_fold_config(dataset_num_folds,'*Dist=eucl', 1, 'euclidean', 'inverse_distance', 'ig', False)
column_dis_manh = get_corresponding_fold_config(dataset_num_folds,'Dist=manh', 1, 'manhattan', 'majority', 'ig', False)
column_dis_cheb = get_corresponding_fold_config(dataset_num_folds,'Dist=cheb', 1, 'chebyshev', 'majority', 'ig', False)

column_pol_maj = get_corresponding_fold_config(dataset_num_folds,'Pol=maj', 1, 'manhattan', 'majority', 'ig', False)
column_pol_invd = get_corresponding_fold_config(dataset_num_folds,'Pol=inv_d', 1, 'euclidean', 'inverse_distance', 'ig', False)
column_pol_shep = get_corresponding_fold_config(dataset_num_folds,'Pol=shep', 1, 'euclidean', 'sheppard', 'ig', False)

column_wei_ig = get_corresponding_fold_config(dataset_num_folds,'*Wei=ig', 1, 'manhattan', 'inverse_distance', 'ig', False)
column_wei_none = get_corresponding_fold_config(dataset_num_folds,'Wei=none', 1, 'manhattan', 'majority', None, False)
column_wei_rel = get_corresponding_fold_config(dataset_num_folds,'Wei=relf', 1, 'euclidean', 'majority', 'relieff', False)

In [9]:
columns_13 = [column_K_1, column_K_3, column_K_5, column_K_7, column_dis_eucl, column_dis_manh, column_dis_cheb, 
              column_pol_maj, column_pol_invd, column_pol_shep, 
              column_wei_none, column_wei_ig, column_wei_rel]
#statistical_test = column_BEST.join(columns)
#display(statistical_test)
dataframe_13_num, t_p_values_13_num = paired_t_test(column_BEST, columns_13)
display(dataframe_13_num)
display(t_p_values_13_num)

Unnamed: 0,*K=1,K=3,K=5,K=7,*Dist=eucl,Dist=manh,Dist=cheb,Pol=maj,Pol=inv_d,Pol=shep,Wei=none,*Wei=ig,Wei=relf
Best Model,True,True,True,True,True,True,False,True,True,True,True,True,False


Unnamed: 0,*K=1,K=3,K=5,K=7,*Dist=eucl,Dist=manh,Dist=cheb,Pol=maj,Pol=inv_d,Pol=shep,Wei=none,*Wei=ig,Wei=relf
Best Model,"(-, -)","(1.490, 0.170)","(0.784, 0.453)","(1.299, 0.226)","(-, -)","(0.764, 0.464)","(4.343, 0.002)","(0.764, 0.464)","(-, -)","(-, -)","(1.404, 0.194)","(0.764, 0.464)","(5.690, 0.000)"


### MIXED BEST vs 9

In [10]:
dataset_mxd_folds = get_dataset('mxd_folds')
#display(dataset_num_folds)

column_BEST = get_corresponding_fold_config(dataset_mxd_folds,'BEST', 1, 'manhattan', 'majority', 'ig', False)

column_K_3 = get_corresponding_fold_config(dataset_mxd_folds,'K=3', 3, 'manhattan', 'inverse_distance', 'ig', False)
column_K_5 = get_corresponding_fold_config(dataset_mxd_folds,'K=5', 5, 'manhattan', 'inverse_distance', 'ig', False)
column_K_7 = get_corresponding_fold_config(dataset_mxd_folds,'K=7', 7, 'manhattan', 'inverse_distance', 'ig', False)
column_dis_eucl = get_corresponding_fold_config(dataset_mxd_folds,'Dist=eucl', 3, 'euclidean', 'inverse_distance', 'ig', False)
column_dis_cheb = get_corresponding_fold_config(dataset_mxd_folds,'Dist=cheb', 3, 'chebyshev', 'inverse_distance', 'ig', False)
column_pol_invd = get_corresponding_fold_config(dataset_mxd_folds,'Pol=inv_d', 1, 'manhattan', 'inverse_distance', 'ig', False)
column_pol_shep = get_corresponding_fold_config(dataset_mxd_folds,'Pol=shep', 1, 'manhattan', 'sheppard', 'ig', False)
column_wei_none = get_corresponding_fold_config(dataset_mxd_folds,'Wei=none', 5, 'manhattan', 'inverse_distance', None, False)
column_wei_rel = get_corresponding_fold_config(dataset_mxd_folds,'Wei=relf', 5, 'manhattan', 'inverse_distance', 'relieff', False)

In [11]:
columns_9 = [column_K_3, column_K_5, column_K_7, column_dis_eucl, column_dis_cheb, column_pol_invd, 
column_pol_shep, column_wei_none, column_wei_rel]
#statistical_test = column_BEST.join(columns)
#display(statistical_test)
dataframe_9_mxd, t_p_values_9_mxd = paired_t_test(column_BEST, columns_9)
display(dataframe_9_mxd)
display(t_p_values_9_mxd)

Unnamed: 0,K=3,K=5,K=7,Dist=eucl,Dist=cheb,Pol=inv_d,Pol=shep,Wei=none,Wei=relf
Best Model,True,True,False,False,True,True,True,False,False


Unnamed: 0,K=3,K=5,K=7,Dist=eucl,Dist=cheb,Pol=inv_d,Pol=shep,Wei=none,Wei=relf
Best Model,"(1.136, 0.285)","(1.059, 0.317)","(2.340, 0.044)","(2.632, 0.027)","(2.170, 0.058)","(-, -)","(-, -)","(6.437, 0.000)","(6.437, 0.000)"


### MIXED BEST vs 13

In [12]:
dataset_mxd_folds = get_dataset('mxd_folds')
#display(dataset_num_folds)

column_BEST = get_corresponding_fold_config(dataset_mxd_folds,'BEST', 1, 'manhattan', 'majority', 'ig', False)

column_K_1 = get_corresponding_fold_config(dataset_mxd_folds,'*K=1', 1, 'manhattan', 'inverse_distance', 'ig', False)
column_K_3 = get_corresponding_fold_config(dataset_mxd_folds,'K=3', 3, 'manhattan', 'inverse_distance', 'ig', False)
column_K_5 = get_corresponding_fold_config(dataset_mxd_folds,'K=5', 5, 'manhattan', 'inverse_distance', 'ig', False)
column_K_7 = get_corresponding_fold_config(dataset_mxd_folds,'K=7', 7, 'manhattan', 'inverse_distance', 'ig', False)

column_dis_manh = get_corresponding_fold_config(dataset_mxd_folds,'*Dist=manh', 1, 'manhattan', 'inverse_distance', 'ig', False)
column_dis_eucl = get_corresponding_fold_config(dataset_mxd_folds,'Dist=eucl', 3, 'euclidean', 'inverse_distance', 'ig', False)
column_dis_cheb = get_corresponding_fold_config(dataset_mxd_folds,'Dist=cheb', 3, 'chebyshev', 'inverse_distance', 'ig', False)

column_pol_maj = get_corresponding_fold_config(dataset_mxd_folds,'Pol=maj', 5, 'manhattan', 'majority', 'ig', False)
column_pol_invd = get_corresponding_fold_config(dataset_mxd_folds,'Pol=inv_d', 1, 'manhattan', 'inverse_distance', 'ig', False)
column_pol_shep = get_corresponding_fold_config(dataset_mxd_folds,'Pol=shep', 1, 'manhattan', 'sheppard', 'ig', False)

column_wei_ig = get_corresponding_fold_config(dataset_mxd_folds,'*Wei=ig', 1, 'manhattan', 'inverse_distance', 'ig', False)
column_wei_none = get_corresponding_fold_config(dataset_mxd_folds,'Wei=none', 5, 'manhattan', 'inverse_distance', None, False)
column_wei_rel = get_corresponding_fold_config(dataset_mxd_folds,'Wei=relF', 5, 'manhattan', 'inverse_distance', 'relieff', False)

In [13]:
columns_13 = [column_K_1, column_K_3, column_K_5, column_K_7, column_dis_eucl, column_dis_manh, column_dis_cheb, 
              column_pol_maj, column_pol_invd, column_pol_shep, 
              column_wei_none, column_wei_ig, column_wei_rel]
#statistical_test = column_BEST.join(columns)
#display(statistical_test)
dataframe_13_mxd, t_p_values_13_mxd = paired_t_test(column_BEST, columns_13)
display(dataframe_13_mxd)
display(t_p_values_13_mxd)

Unnamed: 0,*K=1,K=3,K=5,K=7,Dist=eucl,*Dist=manh,Dist=cheb,Pol=maj,Pol=inv_d,Pol=shep,Wei=none,*Wei=ig,Wei=relF
Best Model,True,True,True,False,False,True,True,True,True,True,False,True,False


Unnamed: 0,*K=1,K=3,K=5,K=7,Dist=eucl,*Dist=manh,Dist=cheb,Pol=maj,Pol=inv_d,Pol=shep,Wei=none,*Wei=ig,Wei=relF
Best Model,"(-, -)","(1.136, 0.285)","(1.059, 0.317)","(2.340, 0.044)","(2.632, 0.027)","(-, -)","(2.170, 0.058)","(1.547, 0.156)","(-, -)","(-, -)","(6.437, 0.000)","(-, -)","(6.437, 0.000)"


### REDUCED NUMERICAL

In [14]:
dataset_enn_num = get_dataset('num_ENN')
dataset_drop3_num = get_dataset('num_DROP3')
# display(dataset_enn_num)
# display(dataset_drop3_num)

enn_num = get_corresponding_config_reduced(dataset_enn_num, 'ENN')
drop3_num = get_corresponding_config_reduced(dataset_drop3_num, 'DROP3')

In [15]:
dataframe_red_num, t_p_values_red_num = paired_t_test(enn_num, [drop3_num], 'ENN')
display(dataframe_red_num)
display(t_p_values_red_num)

Unnamed: 0,DROP3
ENN,False


Unnamed: 0,DROP3
ENN,"(23.925, 0.000)"


### REDUCED MIXED

In [16]:
dataset_enn_mxd = get_dataset('mxd_ENN')
dataset_drop3_mxd = get_dataset('mxd_DROP3')
# display(dataset_enn_mxd)
# display(dataset_drop3_mxd)

enn_mxd = get_corresponding_config_reduced(dataset_enn_mxd, 'ENN')
drop3_mxd = get_corresponding_config_reduced(dataset_drop3_mxd, 'DROP3')

In [17]:
dataframe_red_mxd, t_p_values_red_mxd = paired_t_test(enn_mxd, [drop3_mxd], 'ENN')
display(dataframe_red_mxd)
display(t_p_values_red_mxd)

Unnamed: 0,DROP3
ENN,False


Unnamed: 0,DROP3
ENN,"(6.348, 0.000)"
