In [1]:
import os
from collections import defaultdict
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Font settings passed as `fontdict` to the x/y axis labels drawn by pairwise_plot.
FONTDICT = {
    'family': 'serif', 
    'color': 'black', 
    'weight': 'normal', 
    'size': 12
}

In [5]:
def get_results(dataset, models):
    """Load the pickled per-run metrics of every model that has a results folder.

    Files are read from ``./results/{dataset}/{model}/{run}/{split}_results.pkl``
    for the ``training`` and ``validation`` splits.

    Args:
        dataset: Name of the dataset sub-folder inside ``./results``.
        models: Iterable of model names (hashable); duplicates are ignored.

    Returns:
        A ``(results, valid_models)`` pair. ``results`` maps each model name to
        a list with one entry per run, each entry being
        ``{'training': dict, 'validation': dict}``. ``valid_models`` is the
        tuple of requested models whose folder exists, in the order they were
        requested.

    Raises:
        FileNotFoundError: If a run folder lacks one of the two result files.
    """
    results = defaultdict(list)
    valid_models = list()
    # dict.fromkeys drops duplicates but, unlike set(), keeps the caller's order.
    for model in dict.fromkeys(models):
        exp_dir = f'./results/{dataset}/{model}'
        if not os.path.isdir(exp_dir):
            # No experiment folder for this model: just leave it out.
            continue
        valid_models.append(model)
        # Sorted so the order of runs is reproducible across file systems.
        for run in sorted(os.listdir(exp_dir)):
            run_dir = f'{exp_dir}/{run}'
            if not os.path.isdir(run_dir):
                # Stray files next to the run folders (e.g. .DS_Store) are not runs.
                continue
            run_results = dict()
            for split in ('training', 'validation'):
                fn = f'{run_dir}/{split}_results.pkl'
                with open(fn, 'rb') as f:
                    # NOTE: pickle can execute arbitrary code while loading;
                    # only point this at result files you produced yourself.
                    run_results[split] = dict(pickle.load(f))
            results[model].append(run_results)
    return dict(results), tuple(valid_models)

def loss(data, idxs, jump):
    """Windowed loss: summed ``data['loss']`` divided by the window's sample count.

    Args:
        data: Mapping with equally indexed ``'loss'``, ``'tp'``, ``'tn'``,
            ``'fp'`` and ``'fn'`` sequences.
        idxs: Exclusive end index of each window.
        jump: Window length; a window covers ``[idx - jump, idx)``.

    Returns:
        One value per entry of ``idxs``.
    """
    def window_total(key, end):
        # Sum of one logged series over the window that ends at `end`.
        return sum(data[key][end - jump:end])

    averaged = []
    for end in idxs:
        total_loss = window_total('loss', end)
        # tp + tn + fp + fn is the number of samples counted in the window.
        n_samples = sum(window_total(key, end) for key in ('tp', 'tn', 'fp', 'fn'))
        averaged.append(total_loss / n_samples)
    return averaged

def accuracy(data, idxs, jump):
    ys = list()
    for idx in idxs:
        tp, tn, fp, fn = (
            sum(data['tp'][idx-jump:idx]),
            sum(data['tn'][idx-jump:idx]),
            sum(data['fp'][idx-jump:idx]),
            sum(data['fn'][idx-jump:idx]),
        )
        acc = (tp+tn) / (tp+tn+fp+fn)
        ys.append(acc)
    return ys

def f1_score(data, idxs, jump):
    ys = list()
    for idx in idxs:
        tp, fp, fn = (
            sum(data['tp'][idx-jump:idx]),
            sum(data['fp'][idx-jump:idx]),
            sum(data['fn'][idx-jump:idx]),
        )
        f1 = (tp+tp) / (tp+tp+fp+fn)
        ys.append(f1)
    return ys

# Metric display name -> function computing it per window.
# The keys double as the `metric` argument of pairwise_plot, where they are
# used as the y-axis label and, lower-cased, as the output file name.
fn_map = {
    'Loss': loss,
    'Accuracy': accuracy,
    'F1-Score': f1_score
}

# Base model names; the summary-table cell also derives a '<name>-NT' variant of each.
# NOTE(review): what the '-NT' suffix stands for is not documented here — TODO confirm.
models = ('Twitter-RoBERTa', 'ALBERT', 'DistilBERT', 'CT-BERT')

# Model pairs drawn together on one figure by pairwise_plot.
pairs = (
    ('CT-BERT', 'Twitter-RoBERTa'), 
    ('CT-BERT-NT', 'Twitter-RoBERTa-NT'), 
    ('CT-BERT', 'CT-BERT-NT'), 
    ('Twitter-RoBERTa', 'Twitter-RoBERTa-NT'),
)

In [7]:
def _metric_curve(entry, split, metric, jump):
    """Return ``(xs, ys)`` for one model: windowed ``metric`` against batches trained.

    ``entry`` is either a single run (a dict keyed by split name) or a list of
    such runs, which is how ``get_results`` stores each model. Several runs are
    averaged point-wise over the windows that all of them have.
    """
    runs = [entry] if isinstance(entry, dict) else list(entry)
    if not runs:
        raise ValueError('no runs available to plot')

    curves = list()
    for run in runs:
        data = run[split]
        n_logged = len(data['batch'])
        # Window end indexes; the last window may be shorter than `jump`.
        idxs = list(range(jump, n_logged + jump, jump))
        # x position of a window = batch counter of its last logged entry.
        xs = [data['batch'][min(idx, n_logged) - 1] for idx in idxs]
        curves.append((xs, fn_map[metric](data, idxs, jump)))

    if len(curves) == 1:
        return curves[0]

    # Runs may differ in length; average only where every run has a value.
    n_points = min(len(values) for _, values in curves)
    xs = curves[0][0][:n_points]
    ys = [
        sum(values[i] for _, values in curves) / len(curves)
        for i in range(n_points)
    ]
    return xs, ys


def pairwise_plot(dataset, results, pairs, split, metric, jump, sep='/'):
    """Save one figure per model pair comparing a windowed metric.

    Args:
        dataset: Dataset name; used as the first component of the output path.
        results: Mapping model name -> run dict or list of run dicts, each run
            being ``{split: data}`` (the shape produced by ``get_results``).
            When a model has several runs, their curves are averaged.
        pairs: Iterable of ``(model_a, model_b)`` names to draw together.
        split: Key of the split to plot, e.g. ``'training'``.
        metric: Key of ``fn_map``; also the y-axis label.
        jump: Window length (in logged entries) the metric is aggregated over.
        sep: Separator placed between the components of the output file name.

    Each figure is written to
    ``./assets/{dataset}{sep}{k1}_{k2}{sep}{split}{sep}{metric.lower()}.png``.
    """
    for k1, k2 in pairs:
        fig, ax = plt.subplots(figsize=(6, 4))
        try:
            ax.set_xlabel('Batches Trained', fontdict=FONTDICT, labelpad=12)
            ax.set_ylabel(metric, fontdict=FONTDICT, labelpad=12)
            # Loss gets scientific notation on both axes, other metrics on x only.
            sci_axes = 'both' if metric.lower() == 'loss' else 'x'
            ax.ticklabel_format(axis=sci_axes, style='sci', scilimits=(0, 0))

            # Each model gets its own windows and x positions, so models whose
            # logs differ in length are still drawn correctly.
            for name in (k1, k2):
                xs, ys = _metric_curve(results[name], split, metric, jump)
                ax.plot(xs, ys, label=name)

            ax.legend()
            ax.grid(True)

            fn = f'./assets/{dataset}{sep}{k1}_{k2}{sep}{split}{sep}{metric.lower()}.png'
            os.makedirs(os.path.dirname(fn), exist_ok=True)
            fig.savefig(fn, bbox_inches='tight')
        finally:
            # Always release the figure, even if plotting or saving failed.
            plt.close(fig)

In [12]:
dataset = 'aaai-constraint-covid'

# get_results() looks up one results folder per *model name*, so flatten the
# pairs into their unique model names first (dict.fromkeys keeps first-seen order).
pair_models = tuple(dict.fromkeys(name for pair in pairs for name in pair))
results = get_results(dataset, pair_models)[0]

# Only plot the pairs for which both models actually have loaded results.
valid_pairs = tuple(pair for pair in pairs if all(name in results for name in pair))

# Training curves are aggregated over windows of 50 logged entries;
# validation curves use every logged entry.
for split, jump in (('training', 50), ('validation', 1)):
    for metric in ('Loss', 'Accuracy', 'F1-Score'):
        pairwise_plot(dataset, results, valid_pairs, split, metric, jump, sep='/')

In [9]:
# Datasets to summarise, one table column each (the "cleaned" variants are switched off).
datasets = (
    'aaai-constraint-covid',
    'aaai-constraint-covid-appended',
    # 'aaai-constraint-covid-cleaned',
    # 'aaai-constraint-covid-cleaned-appended'
)

# Row labels: every base model plus its '-NT' counterpart, in alphabetical order.
index = sorted([*models, *(f'{name}-NT' for name in models)])
split = 'validation'
metric = 'Accuracy'
jump = 1 if split != 'training' else 600

dataframe = defaultdict(list)

for dataset in datasets:
    col_name = dataset.replace('aaai-constraint-covid', 'acc')
    results, _ = get_results(dataset, index)
    for model in index:
        if model not in results:
            # No results loaded for this model on this dataset.
            dataframe[col_name].append('-')
            continue
        # Best windowed score of each run, then mean and std across the runs.
        scores = list()
        for run in results[model]:
            data = run[split]
            idxs = list(range(jump, len(data['batch'])+jump, jump))
            scores.append(max(fn_map[metric](data, idxs, jump)))
        dataframe[col_name].append(f'{np.mean(scores):.6f} +- {np.std(scores):.6f}')

pd.DataFrame(dataframe, index=index).style.set_properties(**{'text-align': 'right'})

Unnamed: 0,acc,acc-appended
ALBERT,0.966636 +- 0.002553,0.967944 +- 0.001661
ALBERT-NT,0.969346 +- 0.001951,0.969346 +- 0.003994
CT-BERT,0.980748 +- 0.000804,0.981589 +- 0.001466
CT-BERT-NT,0.976822 +- 0.000867,0.977009 +- 0.001969
DistilBERT,0.973832 +- 0.000782,0.972336 +- 0.001430
DistilBERT-NT,0.971121 +- 0.001947,0.970374 +- 0.001788
Twitter-RoBERTa,0.973364 +- 0.000887,0.971402 +- 0.000906
Twitter-RoBERTa-NT,0.969346 +- 0.000759,0.968972 +- 0.000962
