### Load the test dataset

In [1]:
import matplotlib.pyplot as plt
import os

import dynasent_utils as utils

In [2]:
# Apply the project's matplotlib style sheet.
style_sheet = "dynasent.mplstyle"
plt.style.use(style_sheet)

In [3]:
# Build the path to the round-2 Dynabench test split and load it.
src_dirname = "dynasent-v1.1"
filename_template = os.path.join(
    src_dirname,
    "dynasent-v1.1-round02-dynabench-{}.jsonl",
)
test_filename = filename_template.format("test")

test = utils.load_dataset(test_filename)

# Show the first record so the available fields are visible.
test[0]

{'hit_ids': ['y21512', 'y21524'],
 'sentence': 'The art exhibit has a lot to offer.',
 'sentence_author': 'w262',
 'has_prompt': True,
 'prompt_data': {'indices_into_review_text': [242, 356],
  'review_rating': 5,
  'prompt_sentence': "They're currently under construction for a new exhibit, but there is still enough art to enjoy for around 2 hours.",
  'review_id': '0cJld_mdcScG6zZtoPEFTA'},
 'model_1_label': 'positive',
 'model_1_probs': {'negative': 0.010349077172577381,
  'positive': 0.8954706788063049,
  'neutral': 0.09418027848005295},
 'text_id': 'r2-0019256',
 'label_distribution': {'positive': ['w148', 'w358', 'w4', 'w423', 'w139'],
  'negative': [],
  'neutral': [],
  'mixed': []},
 'gold_label': 'positive'}

In [4]:
# Summarise how the test examples are distributed over the labels.
label_counts = utils.get_label_distribution(test, dist_labels=False)
label_counts

positive    240
neutral     240
negative    240
Total       720
Name: gold_label, dtype: int64

### Apply the models to the test dataset

In [6]:
import sentiment_features

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Reference labels that the two sentiment models are compared against.
# NOTE(review): this reads 'model_1_label', but each record also carries a
# 'gold_label' field -- confirm which of the two is the intended reference
# before interpreting the scores computed from `actual_labels`.
actual_labels = [x['model_1_label'] for x in test]


def _top_label(scores):
    """Return the key of ``scores`` that has the highest value.

    Exactly one label is returned: on a tie, the first maximal key in the
    dict's iteration order wins. Appending every tied key (or none, for an
    empty dict) would make the prediction lists longer or shorter than
    ``actual_labels`` and misalign them.

    Raises:
        ValueError: if ``scores`` is empty.
    """
    return max(scores, key=scores.get)


generated_labels_1 = []
generated_labels_2 = []
for x in test:
    text = x['sentence']

    # One predicted label per sentence and per model, so both lists stay
    # index-aligned with `actual_labels`.
    scores_1 = sentiment_features.get_sentiment_1(text)
    generated_labels_1.append(_top_label(scores_1))

    scores_2 = sentiment_features.get_sentiment_2(text)
    generated_labels_2.append(_top_label(scores_2))

In [10]:
from sklearn.metrics import f1_score

# Macro-averaged F1 of each model's predictions against `actual_labels`.
f1_1, f1_2 = (
    f1_score(actual_labels, predicted, average='macro')
    for predicted in (generated_labels_1, generated_labels_2)
)

print('F1 score for model 1 on test dataset:', f1_1)
print('F1 score for model 2 on test dataset:', f1_2)

F1 score for model 1 on test dataset: 0.581454636862018
F1 score for model 2 on test dataset: 0.44863894024128853
