In [1]:
import pickle
import numpy as np
from scipy import stats
## stats.ttest_ind, stats.ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2, equal_var=True, alternative='two-sided')
import pandas as pd

In [2]:
def which_is_better(mean1, std1, nobs1, mean2, std2, nobs2, alpha_one_tail=0.025):
    # Test whether mean(M1) > mean(M2) using Welch's t-test
    # at default alpha = 0.025 significance level for the one tail test
    #
    # M1 - relevant
    # M2 - irrelevant
    #
    # Note that stats.ttest_ind return the two-tailed p-value
    #
    # Null hypothesis: mean(M1) <= mean(M2)
    # Alternative: mean(M1) > mean(M2) 
    #
    # REJECT if pvalue/2 < alpha
    
    # tstat, pvalue = stats.ttest_ind(M1_values, M2_values, equal_var = False)
    tstat, pvalue = stats.ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2, equal_var = False)    
    print('tstat = ', tstat)
    print('pvalue = ', pvalue)
    if pvalue/2 < alpha_one_tail:
        if tstat < 0.0:
            print('Null rejected: more confident that mean(M1) < mean(M2)')
            return 0
        else:
            # you want to be here
            print('Null rejected: more confident that mean(M1) > mean(M2)')
            return 1-pvalue
        return True
    else:
        print('Cannot reject null: no confidence which one is better')
        return 0

In [3]:
list_dt_st = [('nonlinear', 'mlp')] ## [('nonlinear', 'mlp'), ('linear', 'mlp')]
list_n = [200, 1000] ## [200, 1000]
list_d = [20] ## [10, 20]
list_s0_factor = [1, 4] ## [1, 4]
list_gt = ['ER', 'SF'] ## ['ER', 'SF']
list_should_std = [False, True] ## [False, True]
n_trials = 50 ## 10

In [4]:
d_result = {}

for dt, st in list_dt_st:
    for n in list_n:
        for d in list_d:
            for s0_factor in list_s0_factor:
                for gt in list_gt:
                    for should_std in list_should_std:
                        with open('datasets/d_result_' + str(n) + '_' + str(d) + '_' + str(s0_factor) + '_' + str(gt) + '_' + str(should_std) + '.pickle', 'rb') as handle:
                            d_result_local = pickle.load(handle)
                            d_result.update(d_result_local)

In [5]:
for dt, st in list_dt_st:
    for n in list_n:
        for d in list_d:
            for s0_factor in list_s0_factor:
                for gt in list_gt:
                    for should_std in list_should_std:
                        
                        fdr, tpr, fpr, shd, nnz = [], [], [], [], []
                        for tn in range(n_trials):
                            key = (n, d, s0_factor, gt, should_std, tn, 'X')
                            value = d_result[key]
                            if value[0] != '-':
                                fdr.append(value[0])
                            if value[1] != '-':
                                tpr.append(value[1])
                            if value[2] != '-':
                                fpr.append(value[2])
                            if value[3] != '-':
                                shd.append(value[3])
                            if value[4] != '-':
                                nnz.append(value[4])
                        nvalid = len(tpr)
                        assert nvalid == len(fdr) == len(fpr) == len(shd) == len(nnz)
                            
                        fdr2, tpr2, fpr2, shd2, nnz2 = [], [], [], [], []
                        for tn in range(n_trials):
                            key = (n, d, s0_factor, gt, should_std, tn, 'X_fake')
                            value = d_result[key]
                            if value[0] != '-':
                                fdr2.append(value[0])
                            if value[1] != '-':
                                tpr2.append(value[1])
                            if value[2] != '-':
                                fpr2.append(value[2])
                            if value[3] != '-':
                                shd2.append(value[3])
                            if value[4] != '-':
                                nnz2.append(value[4])      
                        nvalid2 = len(tpr2)
                        assert nvalid2 == len(fdr2) == len(fpr2) == len(shd2) == len(nnz2)
                                
                          
                        # print("{} ==> {:0.2f} ± {:0.2f} vs {:0.2f} ± {:0.2f}".format(key[:-2], np.mean(fdr), np.std(fdr), np.mean(fdr2), np.std(fdr2)))
                        # which_is_better(np.mean(fdr), np.std(fdr), nvalid, np.mean(fdr2), np.std(fdr2), nvalid2)                                                
                        print("{} ==> {:0.2f} ± {:0.2f} vs {:0.2f} ± {:0.2f}".format(key[:-2], np.mean(tpr), np.std(tpr), np.mean(tpr2), np.std(tpr2)))
                        which_is_better(np.mean(tpr), np.std(tpr), nvalid, np.mean(tpr2), np.std(tpr2), nvalid2)
                        # print("{} ==> {:0.2f} ± {:0.2f} vs {:0.2f} ± {:0.2f}".format(key[:-2], np.mean(fpr), np.std(fpr), np.mean(fpr2), np.std(fpr2)))
                        # which_is_better(np.mean(fpr), np.std(fpr), nvalid, np.mean(fpr2), np.std(fpr2), nvalid2)                        
                        print("{} ==> {:0.2f} ± {:0.2f} vs {:0.2f} ± {:0.2f}".format(key[:-2], np.mean(shd), np.std(shd), np.mean(shd2), np.std(shd2)))  
                        which_is_better(np.mean(shd), np.std(shd), nvalid, np.mean(shd2), np.std(shd2), nvalid2)                    
                        print("{} ==> {:0.2f} ± {:0.2f} vs {:0.2f} ± {:0.2f}".format(key[:-2], np.mean(nnz), np.std(nnz), np.mean(nnz2), np.std(nnz2)))   
                        print(key[:-2], nvalid, nvalid2)
                        print()
                        print()

(200, 20, 1, 'ER', False) ==> 0.17 ± 0.10 vs 0.17 ± 0.08
tstat =  -0.17931690777662912
pvalue =  0.8584155623643861
Cannot reject null: no confidence which one is better
(200, 20, 1, 'ER', False) ==> 155.59 ± 7.29 vs 175.27 ± 4.22
tstat =  -12.08071952108361
pvalue =  3.013736808251381e-15
Null rejected: more confident that mean(M1) < mean(M2)
(200, 20, 1, 'ER', False) ==> 154.85 ± 8.28 vs 176.88 ± 4.56
(200, 20, 1, 'ER', False) 27 26


(200, 20, 1, 'ER', True) ==> 0.47 ± 0.14 vs 0.49 ± 0.13
tstat =  -0.8297291686595111
pvalue =  0.4092076101058688
Cannot reject null: no confidence which one is better
(200, 20, 1, 'ER', True) ==> 151.33 ± 9.93 vs 139.03 ± 17.22
tstat =  3.8712642011318334
pvalue =  0.0002789067133925325
Null rejected: more confident that mean(M1) > mean(M2)
(200, 20, 1, 'ER', True) ==> 154.02 ± 10.68 vs 145.24 ± 19.78
(200, 20, 1, 'ER', True) 43 38


(200, 20, 1, 'SF', False) ==> 0.13 ± 0.05 vs 0.16 ± 0.06
tstat =  -1.4530197704315686
pvalue =  0.15231130296638018
Cann