In [1]:
from sklearn.metrics import f1_score
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments

import asag_system.constants as c
from asag_system.models import (
    DistilBertTripletTokenizer,
    MostFrequentBaseline,
    compute_metrics,
    label_map,
)
from asag_system.datasets import TripletClassificationDataset

In [2]:
# Fraction of the hub "train" split held out for validation, and the seed
# that keeps that hold-out reproducible across kernel restarts.
VAL_FRACTION = 0.2
SPLIT_SEED = 42

# Fetch the dataset from the Hugging Face Hub and carve a validation set out
# of its "train" split.
dataset = load_dataset("Atomi/semeval_2013_task_7_beetle_5way")
dev = dataset["train"]
split = dev.train_test_split(test_size=VAL_FRACTION, seed=SPLIT_SEED)
train, val = split["train"], split["test"]

# Display both partitions so their columns and row counts are visible.
train, val

(Dataset({
     features: ['question_id', 'question', 'question_qtype', 'question_module', 'question_stype', 'reference_answer', 'reference_answer_quality', 'student_answer', 'label_5way', 'test_set'],
     num_rows: 8536
 }),
 Dataset({
     features: ['question_id', 'question', 'question_qtype', 'question_module', 'question_stype', 'reference_answer', 'reference_answer_quality', 'student_answer', 'label_5way', 'test_set'],
     num_rows: 2134
 }))

In [3]:
# One tokenizer wrapper is shared by both dataset views and by the collator.
tokenizer = DistilBertTripletTokenizer()

# Wrap the train and validation partitions (in that order) for the Trainer.
train_dataset, val_dataset = (
    TripletClassificationDataset(partition, tokenizer) for partition in (train, val)
)

# The collator is given the wrapper's `.tokenizer` attribute
# (presumably the underlying Hugging Face tokenizer — TODO confirm).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer.tokenizer)

# Baseline under evaluation (by its name, a most-frequent-class predictor —
# verify against asag_system.models).
model = MostFrequentBaseline()



In [4]:
# Trainer artefacts are written under the project's data directory.
results_dir = c.DATA_DIR / "results"

training_args = TrainingArguments(
    output_dir=results_dir,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Collect the Trainer wiring in one place: the model, its data, the padding
# collator, and the project's metric function.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [5]:
# Run evaluation with the Trainer's configured eval dataset and show the
# resulting metrics dict as the cell output.
eval_metrics = trainer.evaluate()
eval_metrics

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 1.5097997188568115,
 'eval_macro_f1': 0.11326839099764865,
 'eval_accuracy': 0.3950328022492971,
 'eval_runtime': 2.6645,
 'eval_samples_per_second': 800.904,
 'eval_steps_per_second': 50.291}

### Sanity Check Metrics

Check that the macro F1 score and accuracy reported by `compute_metrics` match values computed independently from the validation labels.

In [6]:
# Rebuild the labels and the baseline's predictions by hand so the macro F1
# can be recomputed with scikit-learn, independently of the Trainer.
# `.assign` returns a new frame, so re-running this cell always starts from
# the untouched validation split.
val_df = val.to_pandas().assign(
    # Translate the raw 5-way label into the id used by the model via label_map.
    label=lambda frame: frame["label_5way"].apply(lambda x: label_map[x]),
    # Constant prediction: assumes class id 0 is what MostFrequentBaseline
    # predicts — TODO confirm against asag_system.models.
    baseline_pred=0,
)

macro_f1 = f1_score(val_df["label"], val_df["baseline_pred"], average="macro")
macro_f1

0.11326839099764865

In [8]:
# Accuracy is the share of validation rows whose label equals the constant
# baseline prediction.
is_correct = val_df["label"].eq(val_df["baseline_pred"])
accuracy = is_correct.mean()
accuracy

0.3950328022492971