In [1]:
import os
from collections import defaultdict
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Shared font settings passed as `fontdict=` to the matplotlib axis labels.
FONTDICT = dict(
    family='serif',
    color='black',
    weight='normal',
    size=12,
)

In [2]:
def get_results(dataset, pairs):
    """Load the pickled per-run results of every model named in ``pairs``.

    Files are read from
    ``./results/<dataset>/<model>/<run>/<split>_results.pkl`` for ``split`` in
    ``('training', 'validation')``.

    Args:
        dataset: Name of the dataset directory under ``./results``.
        pairs: Collection of model-name tuples that will be compared.

    Returns:
        A ``(results, valid_pairs)`` tuple. ``results`` is a plain dict mapping
        each model name to a list with one ``{'training': dict,
        'validation': dict}`` entry per run, ordered by run directory name.
        ``valid_pairs`` is ``pairs`` itself when every model has at least one
        run, otherwise a tuple holding only the pairs whose models all do.

    Raises:
        FileNotFoundError: If a run directory lacks one of the split pickles.
    """
    splits = ('training', 'validation')
    # Sorted so the load order does not depend on set iteration order.
    models = sorted({model for pair in pairs for model in pair})

    results = {}
    for model in models:
        exp_dir = f'./results/{dataset}/{model}'
        if not os.path.isdir(exp_dir):
            continue

        runs = []
        # os.listdir returns entries in arbitrary order; sort for reproducibility.
        for run in sorted(os.listdir(exp_dir)):
            run_dir = f'{exp_dir}/{run}'
            # Skip stray files and hidden entries (.DS_Store, .ipynb_checkpoints, ...)
            # that are not experiment runs.
            if run.startswith('.') or not os.path.isdir(run_dir):
                continue
            loaded = {}
            for split in splits:
                # NOTE(security): pickle.load can execute arbitrary code; only
                # point this at result files produced by trusted training runs.
                with open(f'{run_dir}/{split}_results.pkl', 'rb') as f:
                    loaded[split] = dict(pickle.load(f))
            runs.append(loaded)

        # A model directory without any run is treated like a missing model,
        # so callers never receive a pair they cannot look up in `results`.
        if runs:
            results[model] = runs

    missing = [model for model in models if model not in results]
    if missing:
        valid_pairs = tuple(
            pair for pair in pairs
            if not any(model in pair for model in missing)
        )
    else:
        valid_pairs = pairs

    return results, valid_pairs

def loss(data, idxs, jump):
    """Average loss over consecutive windows of logged entries.

    For every ``idx`` in ``idxs`` the window ``[idx - jump, idx)`` is summed:
    the summed ``data['loss']`` is divided by the summed
    ``tp + tn + fp + fn`` counts of the same window.

    Args:
        data: Mapping with sequence values under ``'loss'``, ``'tp'``,
            ``'tn'``, ``'fp'`` and ``'fn'``.
            NOTE(review): presumably each ``'loss'`` entry is a loss summed
            over the samples of one entry, so the result is a per-sample
            mean; confirm against the training loop.
        idxs: Exclusive end positions of the windows.
        jump: Window length.

    Returns:
        List with one averaged loss value per element of ``idxs``.
    """
    def window_total(key, stop):
        # Sum of one logged series over the window ending (exclusively) at `stop`.
        return sum(data[key][stop - jump:stop])

    averaged = []
    for stop in idxs:
        summed_loss = window_total('loss', stop)
        n_counted = sum(window_total(key, stop) for key in ('tp', 'tn', 'fp', 'fn'))
        averaged.append(summed_loss / n_counted)
    return averaged

def accuracy(data, idxs, jump):
    """Accuracy over consecutive windows of logged confusion counts.

    For every ``idx`` in ``idxs`` the ``'tp'``, ``'tn'``, ``'fp'`` and
    ``'fn'`` series are summed over ``[idx - jump, idx)`` and combined as
    ``(tp + tn) / (tp + tn + fp + fn)``.

    Args:
        data: Mapping with sequence values under ``'tp'``, ``'tn'``,
            ``'fp'`` and ``'fn'``.
        idxs: Exclusive end positions of the windows.
        jump: Window length.

    Returns:
        List with one accuracy value per element of ``idxs``.
    """
    scores = []
    for stop in idxs:
        window = slice(stop - jump, stop)
        counts = {key: sum(data[key][window]) for key in ('tp', 'tn', 'fp', 'fn')}
        correct = counts['tp'] + counts['tn']
        total = correct + counts['fp'] + counts['fn']
        scores.append(correct / total)
    return scores

def f1_score(data, idxs, jump):
    """F1 score over consecutive windows of logged confusion counts.

    For every ``idx`` in ``idxs`` the ``'tp'``, ``'fp'`` and ``'fn'`` series
    are summed over ``[idx - jump, idx)`` and combined as
    ``2*tp / (2*tp + fp + fn)``.

    Args:
        data: Mapping with sequence values under ``'tp'``, ``'fp'`` and
            ``'fn'``.
        idxs: Exclusive end positions of the windows.
        jump: Window length.

    Returns:
        List with one F1 value per element of ``idxs``. A window with
        ``tp == fp == fn == 0`` (no positive labels and no positive
        predictions, or no entries at all) yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    ys = []
    for idx in idxs:
        window = slice(idx - jump, idx)
        tp = sum(data['tp'][window])
        fp = sum(data['fp'][window])
        fn = sum(data['fn'][window])
        denominator = tp + tp + fp + fn
        # F1 is undefined when the denominator is zero; report 0.0, the same
        # value scikit-learn returns by default in that situation.
        ys.append((tp + tp) / denominator if denominator else 0.0)
    return ys

# Maps the metric label (used for the y-axis label and, lower-cased, for the
# output file name in pairwise_exps) to the function that computes that
# metric over windows of logged entries. All functions share the signature
# (data, idxs, jump) -> list.
fn_map = {
    'Loss': loss,
    'Accuracy': accuracy,
    'F1-Score': f1_score
}

# Model pairs that are compared head-to-head. Every name must match a
# directory ./results/<dataset>/<name>, which is where get_results looks for
# the runs of that model; pairs with a missing model are dropped there.
pairs = (
    ('CT-BERT', 'Twitter-RoBERTa'), 
    ('CT-BERT-NT', 'Twitter-RoBERTa-NT'), 
    ('CT-BERT', 'CT-BERT-NT'), 
    ('Twitter-RoBERTa', 'Twitter-RoBERTa-NT'),
)

In [7]:
def pairwise_exps(dataset, results, pairs, split, metric, jump, sep='/'):
    """Plot one metric for each pair of models and save the figure as PNG.

    For every ``(k1, k2)`` in ``pairs`` a figure with one curve per model is
    written to
    ``./assets/<dataset><sep><k1>_<k2><sep><split><sep><metric lower-cased>.png``.

    Each curve is the metric averaged across all runs of the model. When runs
    differ in length, the curve is truncated to the shortest run.

    Args:
        dataset: Dataset name, used only to build the output path.
        results: Mapping from model name to a list of per-run dicts
            ``{split: data}`` as returned by ``get_results``. A single
            ``{split: data}`` dict per model is accepted as well.
        pairs: Iterable of ``(model_a, model_b)`` name tuples.
        split: Key selecting the data of a run (``'training'`` or
            ``'validation'``).
        metric: Key of ``fn_map``; also used as the y-axis label.
        jump: Number of logged entries aggregated into one plotted point.
        sep: Separator placed between the path components after
            ``./assets/<dataset>``.

    Raises:
        KeyError: If a model of a pair is missing from ``results`` or
            ``metric`` is not a key of ``fn_map``.
    """
    metric_fn = fn_map[metric]

    def mean_curve(model):
        # Returns (xs, ys) of `metric` for `model`, averaged over its runs.
        runs = results[model]
        if isinstance(runs, dict):
            # Legacy layout: a single run stored directly as {split: data}.
            runs = [runs]

        xs, curves = None, []
        for run in runs:
            data = run[split]
            n_entries = len(data['batch'])
            # Window end positions; the last window may be shorter than `jump`.
            idxs = list(range(jump, n_entries + jump, jump))
            run_xs = [data['batch'][min(idx, n_entries) - 1] for idx in idxs]
            curves.append(metric_fn(data, idxs, jump))
            if xs is None or len(run_xs) < len(xs):
                xs = run_xs

        # zip() stops at the shortest run, so only windows that every run
        # covers are averaged.
        ys = [sum(column) / len(column) for column in zip(*curves)]
        return ([] if xs is None else xs[:len(ys)]), ys

    for k1, k2 in pairs:

        plt.figure(figsize=(6, 4))
        plt.xlabel('Batches Trained', fontdict=FONTDICT, labelpad=12)
        plt.ylabel(metric, fontdict=FONTDICT, labelpad=12)
        # Loss gets scientific notation on both axes, other metrics only on x.
        sci_axis = 'both' if metric.lower() == 'loss' else 'x'
        plt.ticklabel_format(axis=sci_axis, style='sci', scilimits=(0, 0))

        # Each model gets its own x-values so curves of different length are
        # not evaluated on windows that do not exist for them.
        for model in (k1, k2):
            xs, ys = mean_curve(model)
            plt.plot(xs, ys, label=model)

        plt.legend()
        plt.grid()

        fn = f'./assets/{dataset}{sep}{k1}_{k2}{sep}{split}{sep}{metric.lower()}.png'
        os.makedirs(os.path.dirname(fn), exist_ok=True)
        plt.savefig(fn, bbox_inches='tight')
        # Close the figure so repeated calls do not accumulate open figures.
        plt.close()

In [12]:
# Render and save every metric plot for every valid model pair of one dataset.
dataset = 'aaai-constraint-covid'
jump = 50

results, valid_pairs = get_results(dataset, pairs)

# Training curves are aggregated over windows of `jump` logged entries;
# validation curves use every logged entry (window of 1).
for split, metric in [
    (split_name, metric_name)
    for split_name in ('training', 'validation')
    for metric_name in ('Loss', 'Accuracy', 'F1-Score')
]:
    pairwise_exps(
        dataset,
        results,
        valid_pairs,
        split,
        metric,
        1 if split != 'training' else jump,
        sep='/',
    )

In [3]:
# Summary table: best validation score per run, reported as mean +- std over
# the runs of each model, with one column per dataset variant.
datasets = (
    'aaai-constraint-covid',
    'aaai-constraint-covid-appended',
    'aaai-constraint-covid-filtered',
    'aaai-constraint-covid-filtered-appended',
)
index = sorted({model for pair in pairs for model in pair})
metric = 'Accuracy'

dataframe = defaultdict(list)

for dataset in datasets:
    # Shorten the common dataset prefix for the column header.
    col_name = dataset.replace('aaai-constraint-covid', 'acc')
    results, _ = get_results(dataset, pairs)
    for model in index:
        if model not in results:
            # No runs found for this model on this dataset.
            dataframe[col_name].append('-')
            continue

        scores = []
        for run in results[model]:
            data = run['validation']
            # Window of 1: evaluate the metric at every logged validation entry.
            idxs = list(range(1, len(data['batch']) + 1))
            ys = fn_map[metric](data, idxs, 1)
            scores.append(max(ys))

        mean_score, std_score = np.mean(scores), np.std(scores)
        dataframe[col_name].append(f'{mean_score:.3f} +- {std_score:.3f}')

pd.DataFrame(dataframe, index=index).style.set_properties(**{'text-align': 'right'})

Unnamed: 0,acc,acc-appended,acc-filtered,acc-filtered-appended
CT-BERT,0.981 +- 0.001,0.982 +- 0.001,0.970 +- 0.004,0.970 +- 0.002
CT-BERT-NT,0.977 +- 0.001,0.977 +- 0.002,0.962 +- 0.006,0.968 +- 0.002
Twitter-RoBERTa,0.973 +- 0.001,0.971 +- 0.001,0.957 +- 0.007,0.958 +- 0.005
Twitter-RoBERTa-NT,0.969 +- 0.001,0.969 +- 0.001,0.956 +- 0.005,0.955 +- 0.005
