In [33]:
import os
import glob
import pdb
import pandas as pd
import numpy as np
from IPython.display import display, HTML
from scipy import stats

# Pairwise comparison of `test_acc` between every two `semantic_change`
# settings found in the result CSVs. Each filled cell of the rendered table
# reads "<avg row>/<avg col>/<relative gap>%", followed by a significance
# marker from a paired t-test ('‡' for p < 0.001, '†' for p < 0.05).
all_files = glob.glob(os.path.join('../result/', '*.csv'))
all_df = pd.concat([pd.read_csv(x) for x in all_files])
is_change_apply_to_test = False

# Keep only the rows whose `is_change_apply_to_test` flag matches the setting
# above and whose `char_freq_range` is 0.
all_df = all_df[all_df['is_change_apply_to_test'] == is_change_apply_to_test]
all_df = all_df[all_df['char_freq_range'] == 0]

all_semantic_change = set(all_df['semantic_change'].values)

# Group once and reuse the groups for both loops: the grouping does not change
# between iterations. The table is sized from the groups (not from the set
# above) so that its shape always matches the row/column labels, even if
# `groupby` drops keys such as NaN.
semantic_change_groups = list(all_df.groupby('semantic_change'))
n_groups = len(semantic_change_groups)

# `np.object` was removed in NumPy 1.24; the builtin `object` is the dtype it
# aliased. Unfilled cells keep the integer 0 produced by `np.zeros`.
table_relative_acc = np.zeros((n_groups, n_groups), dtype=object)
# Allocated but never filled in this cell; kept in case another cell reads it.
table_p_value = np.zeros((n_groups, n_groups), dtype=object)

row_semantic_changes = [name for name, _ in semantic_change_groups]
# Never populated in this cell; kept in case another cell reads it.
col_semantic_changes = []

for i, (semantic_change1, semantic_change_df1) in enumerate(semantic_change_groups):
    for j, (semantic_change2, semantic_change_df2) in enumerate(semantic_change_groups):
        # Only the upper triangle is filled: the diagonal compares a setting
        # with itself and (j, i) would merely mirror (i, j).
        if j <= i:
            continue

        test_acc1 = semantic_change_df1['test_acc'].values
        test_acc2 = semantic_change_df2['test_acc'].values

        # NOTE(review): `ttest_rel` is a *paired* test. It requires both groups
        # to have the same number of runs and pairs them by position, which
        # follows the order in which `glob` returned the files -- confirm that
        # the k-th run of each group is really a matched pair (e.g. same seed).
        t, p_value = stats.ttest_rel(test_acc1, test_acc2)

        avg_acc1 = np.average(test_acc1)
        avg_acc2 = np.average(test_acc2)

        # The gap is expressed relative to the smaller of the two averages.
        if avg_acc1 - avg_acc2 > 0:
            denominator = avg_acc2
        else:
            denominator = avg_acc1
        avg_acc_gap = (avg_acc1 - avg_acc2) / denominator

        cell_value = f'{avg_acc1:.3f}/{avg_acc2:.3f}/{avg_acc_gap*100:.3f}%'
        if p_value < 0.001:
            cell_value += '‡'
        elif p_value < 0.05:
            cell_value += '†'
        table_relative_acc[i, j] = cell_value

df = pd.DataFrame(table_relative_acc)
df.columns = row_semantic_changes
df.index = row_semantic_changes
display(HTML(df.to_html()))

    

Unnamed: 0,['None'],['char_deduplicate'],['reorder_freq_high2low'],['reorder_freq_low2high'],"['reorder_shuffle', 'char_deduplicate']",['reorder_shuffle']
['None'],0,0.963/0.492/95.556%‡,0.963/0.493/95.275%‡,0.963/0.493/95.368%‡,0.963/0.493/95.509%‡,0.963/0.625/54.127%†
['char_deduplicate'],0,0,0.492/0.493/-0.144%,0.492/0.493/-0.096%,0.492/0.493/-0.024%,0.492/0.625/-26.880%†
['reorder_freq_high2low'],0,0,0,0.493/0.493/0.048%,0.493/0.493/0.120%,0.493/0.625/-26.697%†
['reorder_freq_low2high'],0,0,0,0,0.493/0.493/0.072%,0.493/0.625/-26.758%†
"['reorder_shuffle', 'char_deduplicate']",0,0,0,0,0,0.493/0.625/-26.849%†
['reorder_shuffle'],0,0,0,0,0,0


In [42]:
# pre-train vs non pre-train

import os
import glob
import pdb
import numpy as np
import collections

# Load every result CSV into one frame.
all_files = glob.glob(os.path.join('../result/', '*.csv'))
all_df = pd.concat([pd.read_csv(csv_path) for csv_path in all_files])
is_change_apply_to_test = True

# Keep the rows whose `is_change_apply_to_test` flag matches the setting above
# and whose `char_freq_range` is 0.
flag_matches = all_df['is_change_apply_to_test'] == is_change_apply_to_test
zero_freq_range = all_df['char_freq_range'] == 0
all_df = all_df[flag_matches & zero_freq_range]

# Split the remaining rows by classifier name.
pretrain_df = all_df[all_df['classifier_name'] == 'cn_roberta']
no_pretrain_df = all_df[all_df['classifier_name'] == 'cn_roberta_no_pretrain']

# For each split, map every `semantic_change` value to the list of its
# `test_acc` values.
pretrain_result = collections.defaultdict(list)
no_pretrain_result = collections.defaultdict(list)
for split_df, split_result in (
    (pretrain_df, pretrain_result),
    (no_pretrain_df, no_pretrain_result),
):
    for semantic_change, tmp_df in split_df.groupby('semantic_change'):
        split_result[semantic_change].extend(tmp_df['test_acc'].values)

print(f"pretrain_df: {pretrain_df.shape}")
print(f"no_pretrain_df: {no_pretrain_df.shape}")

# Report mean and standard deviation per `semantic_change`, one section per split.
for banner, split_result in (
    ("-------------------------------PRETRAIN RESULT-------------------------------", pretrain_result),
    ("-------------------------------NON PRETRAIN RESULT-------------------------------", no_pretrain_result),
):
    print(banner)
    for k, v in split_result.items():
        print(f"{k}, avg: {np.average(v)}, std: {np.std(v)}   |   {v}")

pretrain_df: (6, 13)
no_pretrain_df: (0, 13)
-------------------------------PRETRAIN RESULT-------------------------------
['None'], avg: 0.9629761059853325, std: 0.002359822254939643   |   [0.9662881476224272, 0.9616749467707594, 0.9609652235628106]
['char_deduplicate'], avg: 0.934232316063402, std: 0.01306203526233803   |   [0.947835344215756, 0.9382540809084458, 0.9166075230660042]
-------------------------------NON PRETRAIN RESULT-------------------------------


In [47]:
# apply_to_test vs not apply_to_test

# (only 'cn_roberta' runs are used here; this cell does not compare against 'cn_roberta_no_pretrain')

import os
import glob
import pdb
import numpy as np
import collections

# Load every result CSV into one frame.
all_files = glob.glob(os.path.join('../result/', '*.csv'))
all_df = pd.concat([pd.read_csv(csv_path) for csv_path in all_files])
is_change_apply_to_test = True  # not read by the filters below, which use literal True/False

# Keep only the 'cn_roberta' rows whose `char_freq_range` is 0.
is_cn_roberta = all_df['classifier_name'].eq('cn_roberta')
zero_freq_range = all_df['char_freq_range'].eq(0)
all_df = all_df[is_cn_roberta & zero_freq_range]

# Split on the `is_change_apply_to_test` flag. The "not applied" split also
# leaves out the rows whose `semantic_change` is "['None']".
applied_flag = all_df['is_change_apply_to_test']
apply_totest_df = all_df[applied_flag.eq(True)]
no_apply_totest_df = all_df[applied_flag.eq(False) & all_df['semantic_change'].ne("['None']")]

# For each split, map every `semantic_change` value to the list of its
# `test_acc` values.
apply_totest_result = collections.defaultdict(list)
no_apply_totest_result = collections.defaultdict(list)
for split_df, split_result in (
    (apply_totest_df, apply_totest_result),
    (no_apply_totest_df, no_apply_totest_result),
):
    for semantic_change, tmp_df in split_df.groupby('semantic_change'):
        split_result[semantic_change].extend(tmp_df['test_acc'].values)

print(f"apply_totest_df: {apply_totest_df.shape}")
print(f"no_apply_totest_df: {no_apply_totest_df.shape}")

# Report mean ± standard deviation per `semantic_change`, one section per split.
for banner, split_result in (
    ("-------------------------------APPLY TO TEST RESULT-------------------------------", apply_totest_result),
    ("-------------------------------NON APPLY TO TEST RESULT-------------------------------", no_apply_totest_result),
):
    print(banner)
    for k, v in split_result.items():
        print(f"{k}, avg: {np.average(v):.3f}±{np.std(v):.3f}   |   {v}")

apply_totest_df: (6, 13)
no_apply_totest_df: (15, 13)
-------------------------------APPLY TO TEST RESULT-------------------------------
['None'], avg: 0.963±0.002   |   [0.9662881476224272, 0.9616749467707594, 0.9609652235628106]
['char_deduplicate'], avg: 0.934±0.013   |   [0.947835344215756, 0.9382540809084458, 0.9166075230660042]
-------------------------------NON APPLY TO TEST RESULT-------------------------------
['char_deduplicate'], avg: 0.492±0.005   |   [0.4964513839602555, 0.48580553584102204, 0.4950319375443577]
['reorder_freq_high2low'], avg: 0.493±0.004   |   [0.4971611071682044, 0.4872249822569198, 0.4950319375443577]
['reorder_freq_low2high'], avg: 0.493±0.005   |   [0.4975159687721789, 0.4861603974449965, 0.4950319375443577]
['reorder_shuffle', 'char_deduplicate'], avg: 0.493±0.005   |   [0.49680624556423003, 0.48580553584102204, 0.4950319375443577]
['reorder_shuffle'], avg: 0.625±0.042   |   [0.6710432931156849, 0.6330731014904187, 0.5702625975869411]
