In [69]:
import os
import glob
import pdb
import pandas as pd
import numpy as np
from IPython.display import display, HTML
from scipy import stats

# Build a pairwise comparison table between semantic changes for the
# non-pretrained classifier: each upper-triangle cell holds the relative gap
# in mean test accuracy, marked with a significance symbol from a paired t-test.

# Load every result CSV under ../result/ into a single frame.
all_files = glob.glob(os.path.join('../result/', '*.csv'))
all_df = pd.concat([pd.read_csv(x) for x in all_files])
is_change_apply_to_test = True

# Keep only the rows this table compares.
all_df = all_df[all_df['is_change_apply_to_test'] == is_change_apply_to_test]
all_df = all_df[all_df['char_freq_range'] == 0]
all_df = all_df[all_df['classifier_name'] == 'cn_roberta_no_pretrain']
all_df = all_df[~all_df['semantic_change'].isin({'rm_chars_in_freq', 'rm_chars_out_freq'})]

all_semantic_change = set(all_df['semantic_change'].values)

# The `np.object` alias was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin `object` selects the same dtype on every NumPy version.
table_relative_acc = np.zeros((len(all_semantic_change), len(all_semantic_change)), dtype=object)
# Allocated alongside the accuracy table; not populated in this cell.
table_p_value = np.zeros((len(all_semantic_change), len(all_semantic_change)), dtype=object)

row_semantic_changes = []
col_semantic_changes = []

# Group once; the frame is not modified inside the loops, so the same groups
# (in the same order) serve both the row and the column iteration.
semantic_change_groups = list(all_df.groupby('semantic_change'))

for i, (semantic_change1, semantic_change_df1) in enumerate(semantic_change_groups):
    row_semantic_changes.append(semantic_change1)
    test_acc1 = semantic_change_df1['test_acc'].values
    avg_acc1 = np.average(test_acc1)

    for j, (semantic_change2, semantic_change_df2) in enumerate(semantic_change_groups):
        # Only the upper triangle (row < column) is filled: the diagonal is a
        # self-comparison and the lower triangle would mirror an existing cell.
        if j <= i:
            continue

        test_acc2 = semantic_change_df2['test_acc'].values

        # Paired t-test: requires both groups to have the same number of runs.
        _t_stat, p_value = stats.ttest_rel(test_acc1, test_acc2)

        avg_acc2 = np.average(test_acc2)

        # The gap is expressed relative to the smaller of the two averages.
        if avg_acc1 - avg_acc2 > 0:
            denominator = avg_acc2
        else:
            denominator = avg_acc1
        avg_acc_gap = (avg_acc1 - avg_acc2) / denominator

        cell_value = f'{avg_acc_gap*100:.3f}%'

        # Significance markers: '‡' for p < 0.001, '†' for p < 0.05.
        if p_value < 0.001:
            cell_value += '‡'
        elif p_value < 0.05:
            cell_value += '†'
        table_relative_acc[i, j] = cell_value

# Label rows and columns with the group keys, in iteration order.
df = pd.DataFrame(table_relative_acc)
df.columns = row_semantic_changes
df.index = row_semantic_changes
display(HTML(df.to_html()))

    

Unnamed: 0,['None'],['char_deduplicate'],['likelihood_rank'],['reorder_freq_high2low'],['reorder_freq_low2high'],"['reorder_shuffle', 'char_deduplicate']",['reorder_shuffle']
['None'],0,-5.589%‡,17.242%‡,2.804%‡,2.780%‡,-5.502%‡,2.997%‡
['char_deduplicate'],0,0,23.795%‡,8.550%‡,8.525%‡,0.082%,8.753%‡
['likelihood_rank'],0,0,0,-14.045%‡,-14.071%‡,-23.693%‡,-13.831%‡
['reorder_freq_high2low'],0,0,0,0,-0.023%,-8.460%‡,0.187%
['reorder_freq_low2high'],0,0,0,0,0,-8.436%‡,0.211%
"['reorder_shuffle', 'char_deduplicate']",0,0,0,0,0,0,8.664%‡
['reorder_shuffle'],0,0,0,0,0,0,0


In [66]:
# pre-train vs non pre-train

import os
import glob
import pdb
import numpy as np
import collections

# Gather every result CSV under ../result/ into a single frame.
all_files = glob.glob(os.path.join('../result/', '*.csv'))
all_df = pd.concat([pd.read_csv(csv_path) for csv_path in all_files])
is_change_apply_to_test = True

# Keep rows matching the chosen test-set flag with char_freq_range == 0.
test_flag_mask = all_df['is_change_apply_to_test'] == is_change_apply_to_test
freq_range_mask = all_df['char_freq_range'] == 0
all_df = all_df[test_flag_mask & freq_range_mask]

# Split the remaining rows by classifier name.
pretrain_df = all_df[all_df['classifier_name'] == 'cn_roberta']
no_pretrain_df = all_df[all_df['classifier_name'] == 'cn_roberta_no_pretrain']

# For each split, map semantic_change -> list of its test accuracies.
pretrain_result = collections.defaultdict(list)
for change_name, group_df in pretrain_df.groupby('semantic_change'):
    pretrain_result[change_name] += list(group_df['test_acc'].values)

no_pretrain_result = collections.defaultdict(list)
for change_name, group_df in no_pretrain_df.groupby('semantic_change'):
    no_pretrain_result[change_name] += list(group_df['test_acc'].values)

print(f"pretrain_df: {pretrain_df.shape}")
print(f"no_pretrain_df: {no_pretrain_df.shape}")

# Print mean ± std and the raw accuracies per semantic change, one section per split.
for banner, result_by_change in (
    ("-------------------------------PRETRAIN RESULT-------------------------------", pretrain_result),
    ("-------------------------------NON PRETRAIN RESULT-------------------------------", no_pretrain_result),
):
    print(banner)
    for change_name, accs in result_by_change.items():
        print(f"{change_name}, avg: {np.average(accs):.3f}±{np.std(accs):.3f}   |   {accs}")

pretrain_df: (105, 13)
no_pretrain_df: (105, 13)
-------------------------------PRETRAIN RESULT-------------------------------
['None'], avg: 0.955±0.010   |   [0.9655784244144784, 0.9488999290276792, 0.93541518807665, 0.9446415897799858, 0.9506742370475516, 0.9517388218594748, 0.949609652235628, 0.9698367636621718, 0.9588360539389638, 0.9616749467707594, 0.9680624556422996, 0.9623846699787084, 0.9630943931866572, 0.9460610361958836, 0.9467707594038324]
['char_deduplicate'], avg: 0.934±0.010   |   [0.9336408800567778, 0.936834634492548, 0.9357700496806246, 0.9457061745919092, 0.9268985095812632, 0.9073811213626686, 0.9389638041163946, 0.9350603264726756, 0.9375443577004968, 0.9464158977998579, 0.9304471256210078, 0.9297374024130588, 0.9215755855216464, 0.9421575585521648, 0.939318665720369]
['likelihood_rank'], avg: 0.769±0.009   |   [0.769694819020582, 0.7579843860894251, 0.765791341376863, 0.7700496806245565, 0.7618878637331441, 0.7615330021291696, 0.7572746628814763, 0.7906316536550

In [68]:
# apply_to_test vs not apply_to_test

# (pre-trained 'cn_roberta' runs only)

import os
import glob
import pdb
import numpy as np
import collections

# Gather every result CSV under ../result/ into a single frame.
all_files = glob.glob(os.path.join('../result/', '*.csv'))
all_df = pd.concat([pd.read_csv(csv_path) for csv_path in all_files])
is_change_apply_to_test = True  # assigned but not read by the filters below

# Keep only 'cn_roberta' rows with char_freq_range == 0.
classifier_mask = all_df['classifier_name'] == 'cn_roberta'
freq_range_mask = all_df['char_freq_range'] == 0
all_df = all_df[classifier_mask & freq_range_mask]

# Split on whether the change was applied to the test set.
apply_totest_df = all_df[all_df['is_change_apply_to_test'] == True]
no_apply_totest_df = all_df[all_df['is_change_apply_to_test'] == False]

# For each split, map semantic_change -> list of its test accuracies.
apply_totest_result = collections.defaultdict(list)
for change_name, group_df in apply_totest_df.groupby('semantic_change'):
    apply_totest_result[change_name] += list(group_df['test_acc'].values)

no_apply_totest_result = collections.defaultdict(list)
for change_name, group_df in no_apply_totest_df.groupby('semantic_change'):
    no_apply_totest_result[change_name] += list(group_df['test_acc'].values)

print(f"apply_totest_df: {apply_totest_df.shape}")
print(f"no_apply_totest_df: {no_apply_totest_df.shape}")

# Print mean ± std and the raw accuracies per semantic change, one section per split.
for banner, result_by_change in (
    ("-------------------------------APPLY TO TEST RESULT-------------------------------", apply_totest_result),
    ("-------------------------------NON APPLY TO TEST RESULT-------------------------------", no_apply_totest_result),
):
    print(banner)
    for change_name, accs in result_by_change.items():
        print(f"{change_name}, avg: {np.average(accs):.3f}±{np.std(accs):.3f}   |   {accs}")

apply_totest_df: (105, 13)
no_apply_totest_df: (75, 13)
-------------------------------APPLY TO TEST RESULT-------------------------------
['None'], avg: 0.955±0.010   |   [0.9655784244144784, 0.9488999290276792, 0.93541518807665, 0.9446415897799858, 0.9506742370475516, 0.9517388218594748, 0.949609652235628, 0.9698367636621718, 0.9588360539389638, 0.9616749467707594, 0.9680624556422996, 0.9623846699787084, 0.9630943931866572, 0.9460610361958836, 0.9467707594038324]
['char_deduplicate'], avg: 0.934±0.010   |   [0.9336408800567778, 0.936834634492548, 0.9357700496806246, 0.9457061745919092, 0.9268985095812632, 0.9073811213626686, 0.9389638041163946, 0.9350603264726756, 0.9375443577004968, 0.9464158977998579, 0.9304471256210078, 0.9297374024130588, 0.9215755855216464, 0.9421575585521648, 0.939318665720369]
['likelihood_rank'], avg: 0.769±0.009   |   [0.769694819020582, 0.7579843860894251, 0.765791341376863, 0.7700496806245565, 0.7618878637331441, 0.7615330021291696, 0.7572746628814763, 0.7

In [None]:
# apply_to_test vs not apply_to_test

# pre-train & change applied to test only (NOT applied to train)

import os
import glob
import pdb
import numpy as np
import collections

# Gather every result CSV under ../result/ into a single frame.
all_files = glob.glob(os.path.join('../result/', '*.csv'))
all_df = pd.concat([pd.read_csv(csv_path) for csv_path in all_files])
is_change_apply_to_test = True  # assigned but not read by the filters below

# Keep only 'cn_roberta' rows with char_freq_range == 0.
classifier_mask = all_df['classifier_name'] == 'cn_roberta'
freq_range_mask = all_df['char_freq_range'] == 0
all_df = all_df[classifier_mask & freq_range_mask]

# Select runs where the change was applied to the test set but not to the train set.
test_changed_mask = all_df['is_change_apply_to_test'] == True
train_unchanged_mask = all_df['is_change_apply_to_train'] == False
apply_totest_df = all_df[test_changed_mask & train_unchanged_mask]

# Map semantic_change -> list of its test accuracies.
apply_totest_result = collections.defaultdict(list)
for change_name, group_df in apply_totest_df.groupby('semantic_change'):
    apply_totest_result[change_name] += list(group_df['test_acc'].values)

print(f"apply_totest_df: {apply_totest_df.shape}")

# Print mean ± std and the raw accuracies per semantic change.
print("-------------------------------APPLY TO TEST RESULT but NOT to Train result-------------------------------")
for change_name, accs in apply_totest_result.items():
    print(f"{change_name}, avg: {np.average(accs):.3f}±{np.std(accs):.3f}   |   {accs}")
