In [1]:
import os
from collections import defaultdict
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mlp

# Shared font settings for figure text: serif family, black, normal weight, size 12.
FONTDICT = dict(
    family='serif',
    color='black',
    weight='normal',
    size=12,
)

In [2]:
def get_results(dataset, models):
    """Load the pickled per-split results of every run of every model.

    For each model, looks in ``./results/{dataset}/{model}/`` for run
    sub-directories and, inside each run, for ``training_results.pkl``,
    ``validation_results.pkl`` and ``testing_results.pkl``.

    Args:
        dataset: Name of the dataset directory under ``./results``.
        models: Iterable of model directory names to look up.

    Returns:
        A ``(results, valid_models)`` pair.

        ``results`` maps each model whose experiment directory exists to a
        list with one dict per run, keyed by split name. Splits without a
        result file are absent from the run dict, and runs without any result
        file are dropped (so the list may be empty).

        ``valid_models`` is a tuple of the requested models whose experiment
        directory exists, de-duplicated and in the order they were given.
    """
    # De-duplicate while keeping the caller's order so the returned tuple is
    # deterministic (a plain set has no stable ordering).
    valid_models = list(dict.fromkeys(models))
    results = dict()

    # Iterate over a snapshot because missing models are removed on the fly.
    for model in tuple(valid_models):
        exp_dir = f'./results/{dataset}/{model}'
        if not os.path.isdir(exp_dir):
            # Nothing was saved for this model: drop it instead of letting
            # os.listdir raise FileNotFoundError.
            valid_models.remove(model)
            continue

        model_runs = results.setdefault(model, list())
        # Sorted so the order of runs does not depend on the filesystem.
        for run in sorted(os.listdir(exp_dir)):
            run_dir = f'{exp_dir}/{run}'
            if not os.path.isdir(run_dir):
                continue
            run_results = dict()
            for split in ('training', 'validation', 'testing'):
                fn = f'{run_dir}/{split}_results.pkl'
                if os.path.isfile(fn):
                    # NOTE: unpickling can execute arbitrary code; only load
                    # result files produced by trusted experiment runs.
                    with open(fn, 'rb') as f:
                        run_results[split] = dict(pickle.load(f))
            if run_results:
                model_runs.append(run_results)

    return results, tuple(valid_models)

def loss(data, idxs, jump):
    """Average the logged loss over windows of ``jump`` consecutive entries.

    Args:
        data: Mapping with sequences under 'loss', 'tp', 'tn', 'fp' and 'fn'.
        idxs: Exclusive end position of each window.
        jump: Window length; a window covers ``[idx - jump, idx)``.

    Returns:
        A list with, for every window, the summed loss divided by the summed
        confusion-matrix counts (tp + tn + fp + fn) of that window.
    """
    def window_total(key, end):
        # Sum of one logged series over the window ending (exclusively) at `end`.
        return sum(data[key][end - jump:end])

    averaged = []
    for end in idxs:
        total_loss = window_total('loss', end)
        true_pos = window_total('tp', end)
        true_neg = window_total('tn', end)
        false_pos = window_total('fp', end)
        false_neg = window_total('fn', end)
        averaged.append(total_loss / (true_pos + true_neg + false_pos + false_neg))
    return averaged

def accuracy(data, idxs, jump):
    """Compute accuracy over windows of ``jump`` consecutive entries.

    Args:
        data: Mapping with sequences under 'tp', 'tn', 'fp' and 'fn'.
        idxs: Exclusive end position of each window.
        jump: Window length; a window covers ``[idx - jump, idx)``.

    Returns:
        A list with ``(tp + tn) / (tp + tn + fp + fn)`` for every window,
        where each count is summed over the window.
    """
    def window_counts(end):
        # Confusion-matrix counts summed over the window ending at `end`.
        window = slice(end - jump, end)
        return [sum(data[key][window]) for key in ('tp', 'tn', 'fp', 'fn')]

    scores = []
    for end in idxs:
        tp, tn, fp, fn = window_counts(end)
        scores.append((tp + tn) / (tp + tn + fp + fn))
    return scores

def f1_score(data, idxs, jump):
    """Compute the F1-score over windows of ``jump`` consecutive entries.

    Args:
        data: Mapping with sequences under 'tp', 'fp' and 'fn'.
        idxs: Exclusive end position of each window.
        jump: Window length; a window covers ``[idx - jump, idx)``.

    Returns:
        A list with ``2*tp / (2*tp + fp + fn)`` for every window, where each
        count is summed over the window. A window in which tp, fp and fn are
        all zero yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    ys = list()
    for idx in idxs:
        window = slice(idx - jump, idx)
        tp = sum(data['tp'][window])
        fp = sum(data['fp'][window])
        fn = sum(data['fn'][window])
        denominator = tp + tp + fp + fn
        # F1 is undefined when the window holds no positives and no positive
        # predictions (e.g. a batch of only true negatives); report 0.0
        # rather than crashing the whole evaluation.
        f1 = (tp + tp) / denominator if denominator else 0.0
        ys.append(f1)
    return ys

# Metric display name -> function computing that metric over windows of logged results.
fn_map = {
    'Loss': loss,
    'Accuracy': accuracy,
    'F1-Score': f1_score,
}

# Names of the models whose results are analysed below.
models = [
    'ALBERT',
    'BERT',
    'BERTweet',
    'CT-BERT',
    'DistilBERT',
    'Longformer',
    'RoBERTa',
    'Twitter-RoBERTa',
    'XLM',
    'XLM-RoBERTa',
    'XLNet',
]

# Model pairs to compare against each other.
# NOTE(review): the '-NT' names do not appear in `models`; presumably they are
# separate experiment variants - confirm before using `pairs` with `get_results`.
pairs = (
    ('CT-BERT', 'Twitter-RoBERTa'),
    ('CT-BERT-NT', 'Twitter-RoBERTa-NT'),
    ('CT-BERT', 'CT-BERT-NT'),
    ('Twitter-RoBERTa', 'Twitter-RoBERTa-NT'),
)

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from model_classes import modelclass_map
from data_classes.preprocess import PreProcessor

preprocessor = PreProcessor()
# Save name (third line of each config file) -> `n_params` of the wrapped model.
parameters = dict()

for model_name in os.listdir('config/models'):
    if model_name.startswith('bart'):
        continue

    # The config file is read positionally: the first quoted string on line 1
    # is the Hugging Face path, line 2 is skipped, and the first quoted string
    # on line 3 is the save name. Close the file before the slow model loading.
    with open(f'config/models/{model_name}', 'r') as f:
        hf_path = f.readline().split("'")[1]
        f.readline()
        save_name = f.readline().split("'")[1]

    # str.rstrip('.yaml') strips any trailing run of the characters
    # '.', 'y', 'a', 'm', 'l' (e.g. 'roberta.yaml' -> 'robert'), so remove the
    # file extension explicitly instead.
    if model_name.endswith('.yaml'):
        config_name = model_name[:-len('.yaml')]
    else:
        config_name = model_name

    tokenizer = AutoTokenizer.from_pretrained(hf_path)
    # The embedding table must account for the preprocessor's extra tokens.
    tokenizer.add_tokens(preprocessor.TOKENS)
    base_model = AutoModelForSequenceClassification.from_pretrained(hf_path)
    model = modelclass_map(config_name)(
        base_model=base_model,
        emb_table_size=len(tokenizer),
        dense_size=18  # NOTE(review): magic number; meaning is defined in model_classes - confirm
    )
    parameters[save_name] = model.n_params

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.LayerNorm.weight', 'predictions.decoder.bias', 'predictions.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model 

In [6]:
# Print one line per model: left-aligned name and the first entry of its
# `n_params` expressed in millions.
# NOTE(review): presumably n_params[0] is the total parameter count - confirm in model_classes.
for save_name, n_params in parameters.items():
    in_millions = f'{n_params[0]/10**6:.3f}M'
    print(save_name.ljust(15, ' '), in_millions.rjust(8, " "))

ALBERT           11.686M
BERT            109.488M
BERTweet        134.905M
CT-BERT         335.149M
DistilBERT       66.959M
Longformer      148.665M
RoBERTa         124.651M
Twitter-RoBERTa 124.651M
XLM             667.103M
XLM-RoBERTa     278.049M
XLNet           117.314M


In [7]:
datasets = (
    'aaai-constraint-covid',
    'aaai-constraint-covid-appended',
)

index = sorted(models)
metric = 'Accuracy'
jump = 1

# Column name ('acc' / 'acc-appended') -> one (mean, std) pair of per-run test
# scores for every model, in `index` order.
dataframe = defaultdict(list)

for dataset in datasets:
    col_name = dataset.replace('aaai-constraint-covid', 'acc')
    results, _ = get_results(dataset, index)
    for model in index:
        scores = list()
        for run in results[model]:
            val_data = run['validation']
            # Exclusive window ends covering the whole validation log in steps of `jump`.
            idxs = list(range(jump, len(val_data['batch'])+jump, jump))
            val_metrics = fn_map[metric](val_data, idxs, jump)
            best_val_metric = max(val_metrics)
            test_data = run['testing']
            # The same window ends are reused for the test log.
            test_metrics = fn_map[metric](test_data, idxs, jump)
            # Model selection on validation: among the windows that reach the
            # best validation score, keep the highest test score (floored at 0.).
            best_test_metric = max(
                [0.] + [
                    test_metric
                    for val_metric, test_metric in zip(val_metrics, test_metrics)
                    if val_metric == best_val_metric
                ]
            )
            scores.append(best_test_metric)
        dataframe[col_name].append((np.mean(scores), np.std(scores)))

df = pd.DataFrame(
    dict(
        **{
            # Shown as error rate in percent: 100 * (1 - mean) with 100 * std.
            # The backslash is escaped: a bare '\p' is an invalid escape sequence.
            dataset: [f'{100*(1-mean):.3f} \\pm {100*std:.3f}' for (mean, std) in values]
            for dataset, values in dataframe.items()
        },
        # Gain of the appended dataset over the base one, as a percentage of
        # the base error (1 - base mean).
        improvement=[100*(y[0]-x[0])/(1-x[0]) for x, y in zip(dataframe['acc'], dataframe['acc-appended'])]
    ),
    index=index
)

df.style.set_properties(**{'text-align': 'right'})

Unnamed: 0,acc,acc-appended,improvement
ALBERT,3.374 \pm 0.157,2.991 \pm 0.107,11.357341
BERT,3.215 \pm 0.197,2.748 \pm 0.100,14.534884
BERTweet,2.869 \pm 0.063,2.607 \pm 0.130,9.120521
CT-BERT,2.111 \pm 0.128,1.815 \pm 0.078,14.02214
DistilBERT,3.084 \pm 0.170,2.776 \pm 0.063,10.0
Longformer,3.402 \pm 0.190,2.888 \pm 0.080,15.10989
RoBERTa,3.196 \pm 0.224,2.804 \pm 0.221,12.280702
Twitter-RoBERTa,2.944 \pm 0.042,2.720 \pm 0.108,7.619048
XLM,3.299 \pm 0.308,2.710 \pm 0.139,17.847025
XLM-RoBERTa,2.869 \pm 0.096,2.570 \pm 0.122,10.423453


In [8]:
from itertools import combinations

def perc_change(v1, v2):
    """Return the change from ``v2`` to ``v1`` as a percentage of ``v2``, rounded to 3 decimals."""
    change_in_percent = 100 * (v1 - v2) / v2
    return round(change_in_percent, 3)

# Section 1: pairs whose ranking flips once linguistic features are appended.
# Each stored pair (i, j) means: model j has the higher base accuracy, but
# model i has the higher accuracy on the appended dataset.
print('base model_j better than base model_i, but\n\tmodel_i with linguistic features better than model_j with linguistic features'.upper())
improvements = list()
for i, j in combinations(range(len(index)), 2):
    if (dataframe['acc'][i][0] < dataframe['acc'][j][0]) and (dataframe['acc-appended'][i][0] > dataframe['acc-appended'][j][0]):
        improvements.append((i, j))
    elif (dataframe['acc'][i][0] > dataframe['acc'][j][0]) and (dataframe['acc-appended'][i][0] < dataframe['acc-appended'][j][0]):
        improvements.append((j, i))
# Drop pairs where model i is the larger model. Filtering happens *before*
# numbering so the printed list is numbered 1, 2, 3, ... without gaps.
improvements = [
    (i, j) for i, j in improvements
    if not (parameters[index[i]][0] > parameters[index[j]][0])
]
for k, (i, j) in enumerate(improvements, 1):
    # Relative difference of the error rates (1 - accuracy): j vs. i on the
    # base dataset, then j vs. i on the appended dataset.
    print(f"{k}. {index[i]} > {index[j]}:",
        f"\n\t{perc_change(1-dataframe['acc'][j][0], 1-dataframe['acc'][i][0])}% ->",
        f"+{perc_change(1-dataframe['acc-appended'][j][0], 1-dataframe['acc-appended'][i][0])}%",
        f"\n\t{parameters[index[j]][0]/parameters[index[i]][0]:.2f} times smaller model"
    )

# Section 2: pairs where model i with linguistic features overtakes the *base*
# version of model j, although base model j beats base model i.
print('\nbase model_j better than base model_i, but\n\tmodel_i with linguistic features better than base model_j'.upper())
improvements = list()
for i, j in combinations(range(len(index)), 2):
    if dataframe['acc'][i][0] < dataframe['acc'][j][0] < dataframe['acc-appended'][i][0]:
        improvements.append((i, j))
    elif dataframe['acc'][j][0] < dataframe['acc'][i][0] < dataframe['acc-appended'][j][0]:
        improvements.append((j, i))
# Same size filter as above, again applied before numbering.
improvements = [
    (i, j) for i, j in improvements
    if not (parameters[index[i]][0] > parameters[index[j]][0])
]
for k, (i, j) in enumerate(improvements, 1):
    print(f"{k}. {index[i]} > {index[j]}:".ljust(34, " "),
        f"{perc_change(1-dataframe['acc'][j][0], 1-dataframe['acc'][i][0])}%".ljust(8, " "),
        f"-> +{perc_change(1-dataframe['acc'][j][0], 1-dataframe['acc-appended'][i][0])}%".ljust(11, " "),
        f"with model size ratio {parameters[index[j]][0]/parameters[index[i]][0]:.2f}"
    )

BASE MODEL_J BETTER THAN BASE MODEL_I, BUT
	MODEL_I WITH LINGUISTIC FEATURES BETTER THAN MODEL_J WITH LINGUISTIC FEATURES
2. ALBERT > XLNet: 
	-0.554% -> +3.438% 
	10.04 times smaller model
4. BERT > RoBERTa: 
	-0.581% -> +2.041% 
	1.14 times smaller model

BASE MODEL_J BETTER THAN BASE MODEL_I, BUT
	MODEL_I WITH LINGUISTIC FEATURES BETTER THAN BASE MODEL_J
1. ALBERT > BERT:                  -4.709%  -> +7.5%    with model size ratio 9.37
2. ALBERT > DistilBERT:            -8.587%  -> +3.125%  with model size ratio 5.73
4. ALBERT > RoBERTa:               -5.263%  -> +6.875%  with model size ratio 10.67
5. ALBERT > XLM:                   -2.216%  -> +10.313% with model size ratio 57.09
6. ALBERT > XLNet:                 -0.554%  -> +12.187% with model size ratio 10.04
7. BERT > BERTweet:                -10.756% -> +4.422%  with model size ratio 1.23
10. BERT > RoBERTa:                -0.581%  -> +16.327% with model size ratio 1.14
11. BERT > Twitter-RoBERTa:        -8.43%   -> +7.143%  

In [9]:
# test_accs[d][m] holds, for dataset d and model m (in `index` order), one
# selected test score per run: the best test score among the windows that
# reach the best validation score.
test_accs = list()

for dataset in datasets:
    results, _ = get_results(dataset, index)
    test_accs.append(list())
    for model in index:
        scores = list()
        for run in results[model]:
            val_data = run['validation']
            idxs = list(range(jump, len(val_data['batch'])+jump, jump))
            val_metrics = fn_map[metric](val_data, idxs, jump)
            best_val_metric = max(val_metrics)
            test_data = run['testing']
            test_metrics = fn_map[metric](test_data, idxs, jump)
            # Candidates are the test scores at windows tying for the best
            # validation score; 0. is the floor when there is no candidate.
            candidates = [
                test_metric
                for val_metric, test_metric in zip(val_metrics, test_metrics)
                if val_metric == best_val_metric
            ]
            best_test_metric = max([0.] + candidates)
            scores.append(best_test_metric)
        test_accs[-1].append(scores)

# Interleave the two datasets model by model, so each model's base scores are
# immediately followed by its appended-dataset scores.
bplot_data = [
    scores
    for per_model in zip(test_accs[0], test_accs[1])
    for scores in per_model
]

In [10]:
fig, axs = plt.subplots(1, 1, figsize=(20, 8))
# The scores in `bplot_data` are fractions in [0, 1]; convert them to percent
# so the values match the 'Accuracy (%)' axis label.
bplot = axs.boxplot(
    [[100 * score for score in scores] for scores in bplot_data],
    patch_artist=True
)
# Boxes alternate between the base dataset (blue) and the appended one (green).
colors = ['lightblue', 'lightgreen']
for i, patch in enumerate(bplot['boxes']):
    patch.set_facecolor(colors[i%2])
fontdict = {'family':'serif', 'color':'black', 'weight':'normal', 'size':14}
axs.set_ylabel('Accuracy (%)', fontdict=fontdict, labelpad=12)
# One tick per model, placed just right of its pair of boxes (boxes sit at
# 1..2*len(index)). The range is derived from the number of models instead of
# being hard-coded for exactly 11 of them.
plt.xticks(ticks=np.arange(2.5, 2*len(index) + 1, 2), labels=index, rotation=15, **fontdict)
# Shift the tick labels left (45/72 inch) so each name sits under its own pair
# of boxes while the grid line at the tick separates neighbouring pairs.
dx = -45/72; dy = 0/72
offset = mlp.transforms.ScaledTranslation(dx, dy, fig.dpi_scale_trans)
for label in axs.xaxis.get_majorticklabels():
    label.set_transform(label.get_transform() + offset)
axs.grid(which='major', color='#AAAAAA', linewidth=0.8)
acc = mlp.patches.Patch(color='lightblue', label='AAAI-Constraint')
acc_a = mlp.patches.Patch(color='lightgreen', label='AAAI-Constraint-Appended')
plt.legend(handles=[acc, acc_a], prop={'family':'serif', 'size': 14})
plt.show()

: 