In [1]:
import pandas as pd
import numpy as np
import os
from scipy import stats

### FUNCTIONS

In [2]:
def get_dataset(dataset: str):
    dir_path = 'output/'
    path_num = "results_Pen-based(num)"
    path_mxd =  "results_Hypothyroid(mxd)"
    
    path_num_folds = 'results_fold_Pen-based(num)'
    path_mxd_folds = 'results_fold_Hypothyroid(mxd)'

    path_num_folds_ENN = 'results_reduction_fold_Pen-based(num)_ENN'
    path_mxd_folds_ENN = 'results_reduction_fold_Hypothyroid(mxd)_ENN'

    path_num_folds_DROP3 ='results_reduction_fold_Pen-based(num)_DROP3'
    path_mxd_folds_DROP3 ='results_reduction_fold_Hypothyroid(mxd)_DROP3'

    file_list = os.listdir(dir_path)
    for file in file_list:
        if dataset == 'num' and path_num in file:
            return pd.read_csv(dir_path + file)
        if dataset == 'mxd' and path_mxd in file:
            return pd.read_csv(dir_path + file)
        
        if dataset == 'num_folds' and path_num_folds in file:
            return pd.read_csv(dir_path + file)
        if dataset == 'mxd_folds' and path_mxd_folds in file:
            return pd.read_csv(dir_path + file)
            
        if dataset == 'num_ENN' and path_num_folds_ENN in file:
            return pd.read_csv(dir_path + file)
        if dataset == 'mxd_ENN' and path_mxd_folds_ENN in file:
            return pd.read_csv(dir_path + file)
            
        if dataset == 'num_DROP3' and path_mxd_folds in file:
            return pd.read_csv(dir_path + file)
        if dataset == 'mxd_DROP3' and path_mxd_folds in file:
            return pd.read_csv(dir_path + file)    

In [3]:
def get_corresponding_fold_config(dataset, title, number_of_k, distance, policy, weight):
    if weight is None:
        row = dataset[(dataset['number_of_k'] == number_of_k) & (dataset['distance'] == distance) &
                                 (dataset['policy'] == policy) & dataset['weight'].isnull()]
    else:
        row = dataset[(dataset['number_of_k'] == number_of_k) & (dataset['distance'] == distance) &
                                 (dataset['policy'] == policy) & (dataset['weight'] == weight)]
    
    
    return pd.DataFrame(row.iloc[:, 4:].T.values, columns=[f'{title} [{number_of_k}, {distance}, {policy}, {weight}]'])

In [4]:
def get_corresponding_config_reduced(dataset, title):
    dataset = dataset.drop(columns=['reduction_technique'])
    return pd.DataFrame(dataset.iloc[:, 4:].T.values, columns=[f'{title}'])

In [5]:
def get_corresponding_avr_config(datasetA, datasetB, title, number_of_k, distance, policy, weight):
    if weight is None:
        rowA = datasetA[(datasetA['number_of_k'] == number_of_k) & (datasetA['distance'] == distance) &
                                 (datasetA['policy'] == policy) & datasetA['weight'].isnull()]
        rowB = datasetB[(datasetB['number_of_k'] == number_of_k) & (datasetB['distance'] == distance) &
                                 (datasetB['policy'] == policy) & datasetB['weight'].isnull()]
    else:
        rowA = datasetA[(datasetA['number_of_k'] == number_of_k) & (datasetA['distance'] == distance) &
                                 (datasetA['policy'] == policy) & (datasetA['weight'] == weight)]
        rowB = datasetB[(datasetB['number_of_k'] == number_of_k) & (datasetB['distance'] == distance) &
                                 (datasetB['policy'] == policy) & (datasetB['weight'] == weight)]
    
    colA = pd.DataFrame(rowA['average_accuracy'].T.values, columns=[f'{title} [{number_of_k}, {distance}, {policy}, {weight}]'])
    colB = pd.DataFrame(rowB['average_accuracy'].T.values, columns=[f'{title} [{number_of_k}, {distance}, {policy}, {weight}]'])
    return pd.concat([colA,colB], ignore_index=True)

In [6]:
def ranking_model(results_aggregate):
    ranking = pd.DataFrame(columns=results_aggregate.columns)
    for i in range(results_aggregate.shape[0]):
        ranking.loc[i, results_aggregate.iloc[i].rank(ascending=False).index]=results_aggregate.iloc[i].rank(ascending=False)
    return ranking

In [7]:
def friedman_test(*args):
    column0 = args[0]
    column1 = args[1]
    column2 = args[2]
    statistical_test = column0.join([column1,column2])
    if len(args) == 4:
        column3 = args[3]
        statistical_test = column0.join([column1,column2,column3])
        print(4)
    if len(args) == 5:
        column3 = args[3]
        column4 = args[4]
        statistical_test = column0.join([column1,column2,column3,column4])
        print(5)
    
    display(statistical_test)

    ranking = ranking_model(statistical_test)
    display(ranking)
    ranking_col = [ranking[column].values.tolist() for column in ranking.columns]

    stat, p = stats.friedmanchisquare(ranking_col[0],ranking_col[1],ranking_col[2])
    if len(args) == 4:
        print(4)
        stats.friedmanchisquare(ranking_col[0],ranking_col[1],ranking_col[2],ranking_col[3])
    if len(args) == 5:
        print(5)
        stats.friedmanchisquare(ranking_col[0],ranking_col[1],ranking_col[2],ranking_col[3],ranking_col[4])
    
    print("stat =",stat ,"p =", p)
    # interpret
    alpha = 0.05
    if p > alpha:
        print('Same distributions (fail to reject H0)')
    else:
        print('Different distributions (reject H0)')

    return stat, p

### NUMERICAL

In [8]:
#NUMERICAL BEST
# best K  ==>  3 euclidean majority 0.9937239520518476
# best distance ==> 3 manhattan majority 0.9935416374007999 it should be the best k(2%), but this last less then half 66.18204426765442
# best policy ==> 3 euclidean inverse_distance 0.9938151097546735
# best weight ==> 3	euclidean majority ig 0.9934499832262802 for 131.53082447052003

dataset_num_folds = get_dataset('num_folds')
display(dataset_num_folds)

column0 = get_corresponding_fold_config(dataset_num_folds,'ABS. BEST ', 1, 'euclidean', 'majority', 'ig')
#column1 = get_corresponding_fold_config(dataset,'BEST K conf.', 3, 'euclidean', 'majority', None)
column1 = get_corresponding_fold_config(dataset_num_folds,'BEST K conf ', 1, 'euclidean', 'inverse_distance', 'ig')
column2 = get_corresponding_fold_config(dataset_num_folds,'BEST Distance ', 3, 'manhattan', 'majority', None)
column3 = get_corresponding_fold_config(dataset_num_folds,'BEST Policy ', 3, 'euclidean', 'inverse_distance', None)
column4 = get_corresponding_fold_config(dataset_num_folds,'BEST Weight', 3, 'euclidean', 'majority', 'ig')

stat, p = friedman_test(column0, column1, column2, column3, column4)

Unnamed: 0,number_of_k,distance,policy,weight,acc_fold0,acc_fold1,acc_fold2,acc_fold3,acc_fold4,acc_fold5,acc_fold6,acc_fold7,acc_fold8,acc_fold9
0,1,manhattan,majority,relieff,0.989071,0.986351,0.987238,0.984545,0.989982,0.989982,0.989111,0.986364,0.992714,0.988203
1,3,manhattan,majority,relieff,0.988160,0.988171,0.990884,0.985455,0.984517,0.990893,0.991833,0.984545,0.991803,0.987296
2,5,manhattan,majority,relieff,0.987250,0.989081,0.993619,0.983636,0.984517,0.989071,0.983666,0.982727,0.991803,0.987296
3,7,manhattan,majority,relieff,0.983607,0.985441,0.989061,0.983636,0.981785,0.989071,0.986388,0.982727,0.990893,0.984574
4,1,chebyshev,majority,relieff,0.981785,0.981802,0.980857,0.976364,0.976321,0.972678,0.981851,0.980909,0.980874,0.985481
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,7,chebyshev,majority,,0.989071,0.985441,0.989061,0.981818,0.979964,0.989982,0.989111,0.988182,0.989982,0.987296
100,1,euclidean,inverse_distance,ig,0.991803,0.993631,0.996354,0.992727,0.995446,0.994536,0.995463,0.990909,0.996357,0.995463
101,3,euclidean,inverse_distance,ig,0.992714,0.991811,0.995442,0.991818,0.992714,0.994536,0.996370,0.992727,0.994536,0.991833
102,5,euclidean,inverse_distance,ig,0.995446,0.989991,0.993619,0.991818,0.991803,0.996357,0.995463,0.993636,0.996357,0.990018


5


Unnamed: 0,"ABS. BEST [1, euclidean, majority, ig]","BEST K conf [1, euclidean, inverse_distance, ig]","BEST Distance [3, manhattan, majority, None]","BEST Policy [3, euclidean, inverse_distance, None]","BEST Weight [3, euclidean, majority, ig]"
0,0.991803,0.991803,0.993625,0.996357,0.992714
1,0.993631,0.993631,0.990901,0.991811,0.991811
2,0.996354,0.996354,0.996354,0.996354,0.994531
3,0.992727,0.992727,0.991818,0.99,0.991818
4,0.995446,0.995446,0.991803,0.993625,0.992714
5,0.994536,0.994536,0.994536,0.995446,0.995446
6,0.995463,0.995463,0.995463,0.994555,0.99637
7,0.990909,0.990909,0.992727,0.992727,0.992727
8,0.996357,0.996357,0.996357,0.994536,0.994536
9,0.995463,0.995463,0.991833,0.99274,0.991833


Unnamed: 0,"ABS. BEST [1, euclidean, majority, ig]","BEST K conf [1, euclidean, inverse_distance, ig]","BEST Distance [3, manhattan, majority, None]","BEST Policy [3, euclidean, inverse_distance, None]","BEST Weight [3, euclidean, majority, ig]"
0,4.5,4.5,2.0,1.0,3.0
1,1.5,1.5,5.0,3.5,3.5
2,2.5,2.5,2.5,2.5,5.0
3,1.5,1.5,3.5,5.0,3.5
4,1.5,1.5,5.0,3.0,4.0
5,4.0,4.0,4.0,1.5,1.5
6,3.0,3.0,3.0,5.0,1.0
7,4.5,4.5,2.0,2.0,2.0
8,2.0,2.0,2.0,4.5,4.5
9,1.5,1.5,4.5,3.0,4.5


5
stat = 1.3333333333333524 p = 0.5134171190325874
Same distributions (fail to reject H0)


### MIXED

### Same model on Numerical and Mixed comparison 

In [9]:
dataset_num = get_dataset('num')
dataset_mxd = get_dataset('mxd')
col1 = get_corresponding_avr_config(dataset_num, dataset_mxd, 'ABS. BEST NUM ', 1, 'euclidean', 'majority', 'ig')
col2 = get_corresponding_avr_config(dataset_num, dataset_mxd, 'ABS. BEST MXD ', 1, 'manhattan', 'majority', 'ig')
col3 = get_corresponding_avr_config(dataset_num, dataset_mxd, 'ABS. BEST MXD ', 1, 'manhattan', 'inverse_distance', 'ig')

display(dataset_num)
display(dataset_mxd)
display(col1)
display(col2)
display(col3)
stat, p = friedman_test(col1, col2, col3)
#column = get_corresponding_configuration_reduced(dataset_num_red, 'ENN')
#display(column)

Unnamed: 0,number_of_k,distance,policy,weight,average_accuracy,average_efficiency
0,1,euclidean,majority,,0.993359,134.402183
1,3,euclidean,majority,,0.993724,135.011167
2,5,euclidean,majority,,0.992632,138.311276
3,7,euclidean,majority,,0.992086,140.711696
4,1,euclidean,inverse_distance,,0.993359,133.151110
...,...,...,...,...,...,...
103,7,chebyshev,inverse_distance,relieff,0.979257,90.165783
104,1,chebyshev,sheppard,relieff,0.979892,90.113688
105,3,chebyshev,sheppard,relieff,0.980896,90.822161
106,5,chebyshev,sheppard,relieff,0.979530,90.250236


Unnamed: 0,number_of_k,distance,policy,weight,average_accuracy,average_efficiency
0,1,euclidean,majority,,0.915982,15.763588
1,3,euclidean,majority,,0.935859,15.020269
2,5,euclidean,majority,,0.935593,14.755662
3,7,euclidean,majority,,0.935854,14.827329
4,1,euclidean,inverse_distance,,0.915982,14.920233
...,...,...,...,...,...,...
103,7,chebyshev,inverse_distance,relieff,0.928701,10.381037
104,1,chebyshev,sheppard,relieff,0.910405,10.253037
105,3,chebyshev,sheppard,relieff,0.924726,10.440198
106,5,chebyshev,sheppard,relieff,0.930287,10.391979


Unnamed: 0,"ABS. BEST NUM [1, euclidean, majority, ig]"
0,0.994269
1,0.948836


Unnamed: 0,"ABS. BEST MXD [1, manhattan, majority, ig]"
0,0.993814
1,0.96103


Unnamed: 0,"ABS. BEST MXD [1, manhattan, inverse_distance, ig]"
0,0.993814
1,0.96103


Unnamed: 0,"ABS. BEST NUM [1, euclidean, majority, ig]","ABS. BEST MXD [1, manhattan, majority, ig]","ABS. BEST MXD [1, manhattan, inverse_distance, ig]"
0,0.994269,0.993814,0.993814
1,0.948836,0.96103,0.96103


Unnamed: 0,"ABS. BEST NUM [1, euclidean, majority, ig]","ABS. BEST MXD [1, manhattan, majority, ig]","ABS. BEST MXD [1, manhattan, inverse_distance, ig]"
0,1,2.5,2.5
1,3,1.5,1.5


stat = 0.0 p = 1.0
Same distributions (fail to reject H0)
