In [1]:
import evaluate
import numpy as np
import sys
import os
from datasets import load_from_disk, disable_caching
from sklearn.metrics import f1_score
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed,
)
sys.path.append(os.path.abspath('../../modules'))
from experiment_1.RoBERTaEntity import RoBERTaEntity

In [2]:
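# One-off snippet that produced the fixed seeds used for the experiment runs.
# Kept for reference only; it is intentionally not executed so that runs stay reproducible.
# import random
# seeds = [random.randint(0, 10**9) for _ in range(5)]  # integer bound: randint rejects floats such as 1e9 on recent Python
# seeds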

In [2]:
# Switch off the `datasets` library's caching; this runs before any dataset is loaded or mapped.
# NOTE(review): presumably done so that map() results are recomputed on every run
# instead of being reused from an on-disk cache — confirm against the datasets docs.
disable_caching()

In [3]:
# Relation classes: the position of a name in this listing is its integer class id.
id2label = dict(
    enumerate(
        ["reject", "B_supplies_A", "A_supplies_B", "ambiguous", "ownership"]
    )
)
# Inverse lookup (class name -> class id), derived so the two maps cannot drift apart.
label2id = {name: class_id for class_id, name in id2label.items()}
# Number of target classes, taken from the mapping itself.
num_labels = len(id2label)
# Load the "f1" metric object from the `evaluate` library.
# NOTE(review): compute_metrics in this notebook calls sklearn's f1_score directly,
# so this object looks unused in the visible cells — confirm before removing it.
metric = evaluate.load("f1")

In [4]:
# Checkpoint that provides both the tokenizer and the pretrained weights.
model_name = "FacebookAI/roberta-base"

# Marker strings registered as additional special tokens of the tokenizer.
entity_markers = ["__NE_FROM__", "__NE_TO__", "__NE_OTHER__"]

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({"additional_special_tokens": entity_markers})

# Collator built on the extended tokenizer; handed to the Trainer for batching.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with truncation enabled.

    Args:
        examples: Mapping with a ``"text"`` entry (used with ``batched=True``
            in the dataset ``map`` call).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

def compute_metrics(eval_pred):
    """Compute micro, macro and per-class F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each row
            is the arg-max of ``logits`` over its last axis.

    Returns:
        dict with ``"f1_micro"``, ``"f1_macro"`` and one ``"f1_class_{i}"``
        entry for every class id ``i`` in ``range(logits.shape[-1])``.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    predictions = np.argmax(logits, axis=-1)

    # With average=None, f1_score only reports classes that occur in the labels
    # or the predictions. If a class is missing from both, the returned array is
    # shorter and positional indices no longer equal class ids. Pinning `labels`
    # to every class id keeps "f1_class_{i}" aligned with class i and keeps the
    # set of keys identical across evaluations.
    class_ids = list(range(logits.shape[-1]))
    f1_classwise = f1_score(labels, predictions, labels=class_ids, average=None)

    return {
        "f1_micro": f1_score(labels, predictions, average='micro'),
        "f1_macro": f1_score(labels, predictions, average='macro'),
        **{
            f"f1_class_{class_id}": score
            for class_id, score in zip(class_ids, f1_classwise)
        },
    }


def model_init():
    """Build a fresh ``RoBERTaEntity`` model from the pretrained checkpoint.

    Passed to the Trainer as ``model_init`` so that every run starts from a
    newly loaded model.

    Returns:
        The model, with its token embeddings resized to ``len(tokenizer)`` so
        the special tokens added to the tokenizer have embedding rows.
    """
    model = RoBERTaEntity.from_pretrained(
        model_name,
        # Use the shared constant instead of repeating the literal class count.
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )
    model.resize_token_embeddings(len(tokenizer))
    return model

In [6]:
# Load the saved dataset, keep only the masked text and the label, expose the
# masked text under the column name "text", then tokenize in batches.
ds = (
    load_from_disk("../../datasets/ManualDataset")
    .select_columns(["masked_text", "label"])
    .rename_column("masked_text", "text")
    .map(preprocess_function, batched=True)
)

Map:   0%|          | 0/2559 [00:00<?, ? examples/s]

Map:   0%|          | 0/418 [00:00<?, ? examples/s]

Map:   0%|          | 0/745 [00:00<?, ? examples/s]

In [8]:
def run_experiment(seed, output_root="logs/experiment_1_roberta"):
    """Fine-tune a freshly initialised model with one seed and predict on the test split.

    Args:
        seed: Integer passed to ``set_seed`` and used as both ``seed`` and
            ``data_seed`` of the training arguments.
        output_root: Directory under which this run's checkpoints are written,
            in a ``seed_<seed>`` subdirectory.

    Returns:
        The result of ``trainer.predict(ds["test"])``.
    """
    set_seed(seed)
    training_args = TrainingArguments(
        seed=seed,
        data_seed=seed,
        tf32=True,
        # One directory per seed: with a single shared directory, checkpoints
        # of successive runs have the same names, overwrite each other, and
        # stale checkpoints of an earlier seed take part in the
        # save_total_limit rotation of the current run.
        output_dir=f"{output_root}/seed_{seed}",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=10,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        warmup_ratio=0.1,
        # No metric_for_best_model is given, so the Trainer's default criterion
        # selects the "best" checkpoint.
        # NOTE(review): per the TrainingArguments docs that default is the
        # evaluation loss, not F1 — confirm this is the intended criterion.
        load_best_model_at_end=True,
        save_total_limit=1,
        report_to=[],
        save_only_model=True,
    )
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=ds["valid"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    test_results = trainer.predict(ds["test"])
    return test_results

In [9]:
# Fixed seeds, one complete training run per seed.
seeds = [358951513, 487384551, 132591638, 359790194, 380316365]

# Test-set prediction outputs, in the same order as `seeds`.
all_results = []
for seed in seeds:
    all_results.append(run_experiment(seed))

Some weights of RoBERTaEntity were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RoBERTaEntity were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1600 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.8244170546531677, 'eval_f1_micro': 0.6913875598086124, 'eval_f1_macro': 0.5863309995645378, 'eval_f1_class_0': 0.7031700288184438, 'eval_f1_class_1': 0.2926829268292683, 'eval_f1_class_2': 0.7735191637630662, 'eval_f1_class_3': 0.6461538461538462, 'eval_f1_class_4': 0.5161290322580645, 'eval_runtime': 0.3748, 'eval_samples_per_second': 1115.207, 'eval_steps_per_second': 72.035, 'epoch': 1.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.574273407459259, 'eval_f1_micro': 0.7870813397129187, 'eval_f1_macro': 0.7594667403650945, 'eval_f1_class_0': 0.7966101694915254, 'eval_f1_class_1': 0.6545454545454545, 'eval_f1_class_2': 0.8156862745098039, 'eval_f1_class_3': 0.7704918032786885, 'eval_f1_class_4': 0.76, 'eval_runtime': 0.3532, 'eval_samples_per_second': 1183.323, 'eval_steps_per_second': 76.435, 'epoch': 2.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5922480225563049, 'eval_f1_micro': 0.7894736842105263, 'eval_f1_macro': 0.7886024326231426, 'eval_f1_class_0': 0.762214983713355, 'eval_f1_class_1': 0.7246376811594203, 'eval_f1_class_2': 0.8206896551724138, 'eval_f1_class_3': 0.8062015503875969, 'eval_f1_class_4': 0.8292682926829268, 'eval_runtime': 0.3634, 'eval_samples_per_second': 1150.197, 'eval_steps_per_second': 74.295, 'epoch': 3.0}
{'loss': 0.8466, 'grad_norm': 14.116150856018066, 'learning_rate': 1.5277777777777777e-05, 'epoch': 3.12}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5383396148681641, 'eval_f1_micro': 0.8205741626794258, 'eval_f1_macro': 0.8013391684798974, 'eval_f1_class_0': 0.8454810495626822, 'eval_f1_class_1': 0.6984126984126984, 'eval_f1_class_2': 0.8339483394833949, 'eval_f1_class_3': 0.7652173913043478, 'eval_f1_class_4': 0.8636363636363636, 'eval_runtime': 0.3667, 'eval_samples_per_second': 1139.765, 'eval_steps_per_second': 73.621, 'epoch': 4.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5431382656097412, 'eval_f1_micro': 0.8157894736842105, 'eval_f1_macro': 0.7980779276429735, 'eval_f1_class_0': 0.830945558739255, 'eval_f1_class_1': 0.711864406779661, 'eval_f1_class_2': 0.84, 'eval_f1_class_3': 0.7703703703703704, 'eval_f1_class_4': 0.8372093023255814, 'eval_runtime': 0.388, 'eval_samples_per_second': 1077.192, 'eval_steps_per_second': 69.579, 'epoch': 5.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5742688179016113, 'eval_f1_micro': 0.8086124401913876, 'eval_f1_macro': 0.7880627047409732, 'eval_f1_class_0': 0.8099688473520249, 'eval_f1_class_1': 0.6875, 'eval_f1_class_2': 0.8436363636363636, 'eval_f1_class_3': 0.7906976744186046, 'eval_f1_class_4': 0.8085106382978723, 'eval_runtime': 0.3958, 'eval_samples_per_second': 1056.169, 'eval_steps_per_second': 68.221, 'epoch': 6.0}
{'loss': 0.2768, 'grad_norm': 6.920933246612549, 'learning_rate': 8.333333333333334e-06, 'epoch': 6.25}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5765441656112671, 'eval_f1_micro': 0.8181818181818182, 'eval_f1_macro': 0.8022037501753816, 'eval_f1_class_0': 0.8173374613003096, 'eval_f1_class_1': 0.71875, 'eval_f1_class_2': 0.8518518518518519, 'eval_f1_class_3': 0.7969924812030075, 'eval_f1_class_4': 0.8260869565217391, 'eval_runtime': 0.398, 'eval_samples_per_second': 1050.138, 'eval_steps_per_second': 67.832, 'epoch': 7.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.583548903465271, 'eval_f1_micro': 0.8301435406698564, 'eval_f1_macro': 0.8186604445614764, 'eval_f1_class_0': 0.8398791540785498, 'eval_f1_class_1': 0.7741935483870968, 'eval_f1_class_2': 0.8528301886792453, 'eval_f1_class_3': 0.7819548872180451, 'eval_f1_class_4': 0.8444444444444444, 'eval_runtime': 0.3551, 'eval_samples_per_second': 1177.278, 'eval_steps_per_second': 76.044, 'epoch': 8.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6087131500244141, 'eval_f1_micro': 0.8253588516746412, 'eval_f1_macro': 0.812520062184978, 'eval_f1_class_0': 0.8308605341246291, 'eval_f1_class_1': 0.7419354838709677, 'eval_f1_class_2': 0.8484848484848485, 'eval_f1_class_3': 0.796875, 'eval_f1_class_4': 0.8444444444444444, 'eval_runtime': 0.3554, 'eval_samples_per_second': 1176.042, 'eval_steps_per_second': 75.964, 'epoch': 9.0}
{'loss': 0.1183, 'grad_norm': 6.049500465393066, 'learning_rate': 1.3888888888888892e-06, 'epoch': 9.38}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6097435355186462, 'eval_f1_micro': 0.8301435406698564, 'eval_f1_macro': 0.8154753338434191, 'eval_f1_class_0': 0.8392857142857143, 'eval_f1_class_1': 0.7419354838709677, 'eval_f1_class_2': 0.8517110266159695, 'eval_f1_class_3': 0.8, 'eval_f1_class_4': 0.8444444444444444, 'eval_runtime': 0.3722, 'eval_samples_per_second': 1122.987, 'eval_steps_per_second': 72.537, 'epoch': 10.0}
{'train_runtime': 97.8219, 'train_samples_per_second': 261.598, 'train_steps_per_second': 16.356, 'train_loss': 0.3933098387718201, 'epoch': 10.0}


  0%|          | 0/47 [00:00<?, ?it/s]

Some weights of RoBERTaEntity were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RoBERTaEntity were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1600 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.8310199975967407, 'eval_f1_micro': 0.7081339712918661, 'eval_f1_macro': 0.655378347143053, 'eval_f1_class_0': 0.7226890756302521, 'eval_f1_class_1': 0.5882352941176471, 'eval_f1_class_2': 0.7538461538461538, 'eval_f1_class_3': 0.6666666666666666, 'eval_f1_class_4': 0.5454545454545454, 'eval_runtime': 0.3733, 'eval_samples_per_second': 1119.816, 'eval_steps_per_second': 72.333, 'epoch': 1.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5367431044578552, 'eval_f1_micro': 0.7918660287081339, 'eval_f1_macro': 0.7745186469150069, 'eval_f1_class_0': 0.7888198757763976, 'eval_f1_class_1': 0.6666666666666666, 'eval_f1_class_2': 0.8148148148148148, 'eval_f1_class_3': 0.8115942028985508, 'eval_f1_class_4': 0.7906976744186046, 'eval_runtime': 0.3452, 'eval_samples_per_second': 1210.918, 'eval_steps_per_second': 78.217, 'epoch': 2.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5027338862419128, 'eval_f1_micro': 0.8277511961722488, 'eval_f1_macro': 0.8023413722305197, 'eval_f1_class_0': 0.8387096774193549, 'eval_f1_class_1': 0.6666666666666666, 'eval_f1_class_2': 0.8401486988847584, 'eval_f1_class_3': 0.848, 'eval_f1_class_4': 0.8181818181818182, 'eval_runtime': 0.3451, 'eval_samples_per_second': 1211.312, 'eval_steps_per_second': 78.243, 'epoch': 3.0}
{'loss': 0.8361, 'grad_norm': 19.651443481445312, 'learning_rate': 1.5277777777777777e-05, 'epoch': 3.12}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.47393330931663513, 'eval_f1_micro': 0.8492822966507177, 'eval_f1_macro': 0.8225762680255857, 'eval_f1_class_0': 0.8514285714285714, 'eval_f1_class_1': 0.6896551724137931, 'eval_f1_class_2': 0.8821292775665399, 'eval_f1_class_3': 0.8524590163934426, 'eval_f1_class_4': 0.8372093023255814, 'eval_runtime': 0.3435, 'eval_samples_per_second': 1216.74, 'eval_steps_per_second': 78.593, 'epoch': 4.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5419448614120483, 'eval_f1_micro': 0.8133971291866029, 'eval_f1_macro': 0.7800852769454958, 'eval_f1_class_0': 0.8295454545454546, 'eval_f1_class_1': 0.6440677966101694, 'eval_f1_class_2': 0.8294573643410853, 'eval_f1_class_3': 0.828125, 'eval_f1_class_4': 0.7692307692307693, 'eval_runtime': 0.3476, 'eval_samples_per_second': 1202.41, 'eval_steps_per_second': 77.668, 'epoch': 5.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5060605406761169, 'eval_f1_micro': 0.8397129186602871, 'eval_f1_macro': 0.8177551809892236, 'eval_f1_class_0': 0.8389057750759878, 'eval_f1_class_1': 0.7272727272727273, 'eval_f1_class_2': 0.8654545454545455, 'eval_f1_class_3': 0.8571428571428571, 'eval_f1_class_4': 0.8, 'eval_runtime': 0.3573, 'eval_samples_per_second': 1169.982, 'eval_steps_per_second': 75.573, 'epoch': 6.0}
{'loss': 0.2694, 'grad_norm': 29.993202209472656, 'learning_rate': 8.333333333333334e-06, 'epoch': 6.25}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5430418848991394, 'eval_f1_micro': 0.8421052631578947, 'eval_f1_macro': 0.810291421917079, 'eval_f1_class_0': 0.8580060422960725, 'eval_f1_class_1': 0.7096774193548387, 'eval_f1_class_2': 0.8721804511278195, 'eval_f1_class_3': 0.8208955223880597, 'eval_f1_class_4': 0.7906976744186046, 'eval_runtime': 0.3507, 'eval_samples_per_second': 1191.786, 'eval_steps_per_second': 76.981, 'epoch': 7.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5820478796958923, 'eval_f1_micro': 0.8373205741626795, 'eval_f1_macro': 0.8117713415684975, 'eval_f1_class_0': 0.844574780058651, 'eval_f1_class_1': 0.6984126984126984, 'eval_f1_class_2': 0.8625954198473282, 'eval_f1_class_3': 0.84375, 'eval_f1_class_4': 0.8095238095238095, 'eval_runtime': 0.3563, 'eval_samples_per_second': 1173.317, 'eval_steps_per_second': 75.788, 'epoch': 8.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.589146614074707, 'eval_f1_micro': 0.8397129186602871, 'eval_f1_macro': 0.8155532145743878, 'eval_f1_class_0': 0.8477611940298507, 'eval_f1_class_1': 0.7384615384615385, 'eval_f1_class_2': 0.8636363636363636, 'eval_f1_class_3': 0.8372093023255814, 'eval_f1_class_4': 0.7906976744186046, 'eval_runtime': 0.3691, 'eval_samples_per_second': 1132.464, 'eval_steps_per_second': 73.15, 'epoch': 9.0}
{'loss': 0.1092, 'grad_norm': 9.144938468933105, 'learning_rate': 1.3888888888888892e-06, 'epoch': 9.38}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6146237254142761, 'eval_f1_micro': 0.8373205741626795, 'eval_f1_macro': 0.8169303345734334, 'eval_f1_class_0': 0.8470588235294118, 'eval_f1_class_1': 0.7384615384615385, 'eval_f1_class_2': 0.8549618320610687, 'eval_f1_class_3': 0.8346456692913385, 'eval_f1_class_4': 0.8095238095238095, 'eval_runtime': 0.3478, 'eval_samples_per_second': 1201.918, 'eval_steps_per_second': 77.636, 'epoch': 10.0}
{'train_runtime': 94.8108, 'train_samples_per_second': 269.906, 'train_steps_per_second': 16.876, 'train_loss': 0.38452482789754866, 'epoch': 10.0}


  0%|          | 0/47 [00:00<?, ?it/s]

Some weights of RoBERTaEntity were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RoBERTaEntity were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1600 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.9017341136932373, 'eval_f1_micro': 0.6626794258373205, 'eval_f1_macro': 0.5784416482066235, 'eval_f1_class_0': 0.683076923076923, 'eval_f1_class_1': 0.3950617283950617, 'eval_f1_class_2': 0.7397260273972602, 'eval_f1_class_3': 0.6605504587155964, 'eval_f1_class_4': 0.41379310344827586, 'eval_runtime': 0.375, 'eval_samples_per_second': 1114.526, 'eval_steps_per_second': 71.991, 'epoch': 1.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6250662207603455, 'eval_f1_micro': 0.7703349282296651, 'eval_f1_macro': 0.7273708394282556, 'eval_f1_class_0': 0.7842105263157895, 'eval_f1_class_1': 0.5714285714285714, 'eval_f1_class_2': 0.8016528925619835, 'eval_f1_class_3': 0.7768595041322314, 'eval_f1_class_4': 0.7027027027027027, 'eval_runtime': 0.3478, 'eval_samples_per_second': 1201.733, 'eval_steps_per_second': 77.624, 'epoch': 2.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6159560084342957, 'eval_f1_micro': 0.777511961722488, 'eval_f1_macro': 0.7639919561068986, 'eval_f1_class_0': 0.7628205128205128, 'eval_f1_class_1': 0.6172839506172839, 'eval_f1_class_2': 0.8218181818181818, 'eval_f1_class_3': 0.8103448275862069, 'eval_f1_class_4': 0.8076923076923077, 'eval_runtime': 0.3509, 'eval_samples_per_second': 1191.061, 'eval_steps_per_second': 76.935, 'epoch': 3.0}
{'loss': 0.8599, 'grad_norm': 21.98093032836914, 'learning_rate': 1.5277777777777777e-05, 'epoch': 3.12}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5697667598724365, 'eval_f1_micro': 0.8205741626794258, 'eval_f1_macro': 0.8045048481194559, 'eval_f1_class_0': 0.8136645962732919, 'eval_f1_class_1': 0.7384615384615385, 'eval_f1_class_2': 0.8475836431226765, 'eval_f1_class_3': 0.8321167883211679, 'eval_f1_class_4': 0.7906976744186046, 'eval_runtime': 0.3599, 'eval_samples_per_second': 1161.505, 'eval_steps_per_second': 75.025, 'epoch': 4.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5580534338951111, 'eval_f1_micro': 0.8253588516746412, 'eval_f1_macro': 0.7898622134778019, 'eval_f1_class_0': 0.8385269121813032, 'eval_f1_class_1': 0.7164179104477612, 'eval_f1_class_2': 0.8412698412698413, 'eval_f1_class_3': 0.8503937007874016, 'eval_f1_class_4': 0.7027027027027027, 'eval_runtime': 0.3472, 'eval_samples_per_second': 1203.986, 'eval_steps_per_second': 77.769, 'epoch': 5.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5713363885879517, 'eval_f1_micro': 0.8229665071770335, 'eval_f1_macro': 0.8080078527358481, 'eval_f1_class_0': 0.8328445747800587, 'eval_f1_class_1': 0.7058823529411765, 'eval_f1_class_2': 0.8333333333333334, 'eval_f1_class_3': 0.8346456692913385, 'eval_f1_class_4': 0.8333333333333334, 'eval_runtime': 0.373, 'eval_samples_per_second': 1120.653, 'eval_steps_per_second': 72.387, 'epoch': 6.0}
{'loss': 0.2848, 'grad_norm': 18.648193359375, 'learning_rate': 8.333333333333334e-06, 'epoch': 6.25}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5943169593811035, 'eval_f1_micro': 0.8277511961722488, 'eval_f1_macro': 0.8125613407346318, 'eval_f1_class_0': 0.825, 'eval_f1_class_1': 0.75, 'eval_f1_class_2': 0.8467153284671532, 'eval_f1_class_3': 0.8503937007874016, 'eval_f1_class_4': 0.7906976744186046, 'eval_runtime': 0.3549, 'eval_samples_per_second': 1177.705, 'eval_steps_per_second': 76.072, 'epoch': 7.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.606907308101654, 'eval_f1_micro': 0.8253588516746412, 'eval_f1_macro': 0.8080308816700722, 'eval_f1_class_0': 0.8328267477203647, 'eval_f1_class_1': 0.7397260273972602, 'eval_f1_class_2': 0.8396946564885496, 'eval_f1_class_3': 0.8372093023255814, 'eval_f1_class_4': 0.7906976744186046, 'eval_runtime': 0.3531, 'eval_samples_per_second': 1183.656, 'eval_steps_per_second': 76.456, 'epoch': 8.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6366732120513916, 'eval_f1_micro': 0.8325358851674641, 'eval_f1_macro': 0.8234451887861869, 'eval_f1_class_0': 0.8307692307692308, 'eval_f1_class_1': 0.7428571428571429, 'eval_f1_class_2': 0.846441947565543, 'eval_f1_class_3': 0.8527131782945736, 'eval_f1_class_4': 0.8444444444444444, 'eval_runtime': 0.364, 'eval_samples_per_second': 1148.424, 'eval_steps_per_second': 74.181, 'epoch': 9.0}
{'loss': 0.1302, 'grad_norm': 1.326338529586792, 'learning_rate': 1.3888888888888892e-06, 'epoch': 9.38}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6315638422966003, 'eval_f1_micro': 0.8373205741626795, 'eval_f1_macro': 0.8308965910488227, 'eval_f1_class_0': 0.8379204892966361, 'eval_f1_class_1': 0.7536231884057971, 'eval_f1_class_2': 0.849624060150376, 'eval_f1_class_3': 0.84375, 'eval_f1_class_4': 0.8695652173913043, 'eval_runtime': 0.3506, 'eval_samples_per_second': 1192.373, 'eval_steps_per_second': 77.019, 'epoch': 10.0}
{'train_runtime': 94.8092, 'train_samples_per_second': 269.91, 'train_steps_per_second': 16.876, 'train_loss': 0.4036247903108597, 'epoch': 10.0}


  0%|          | 0/47 [00:00<?, ?it/s]

Some weights of RoBERTaEntity were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RoBERTaEntity were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1600 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.82056725025177, 'eval_f1_micro': 0.7081339712918661, 'eval_f1_macro': 0.6771297649376516, 'eval_f1_class_0': 0.6997084548104956, 'eval_f1_class_1': 0.5625, 'eval_f1_class_2': 0.7725631768953068, 'eval_f1_class_3': 0.6666666666666666, 'eval_f1_class_4': 0.6842105263157895, 'eval_runtime': 0.3584, 'eval_samples_per_second': 1166.413, 'eval_steps_per_second': 75.342, 'epoch': 1.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5836171507835388, 'eval_f1_micro': 0.777511961722488, 'eval_f1_macro': 0.7345751324971246, 'eval_f1_class_0': 0.7942028985507247, 'eval_f1_class_1': 0.625, 'eval_f1_class_2': 0.8244274809160306, 'eval_f1_class_3': 0.75, 'eval_f1_class_4': 0.6792452830188679, 'eval_runtime': 0.3475, 'eval_samples_per_second': 1203.014, 'eval_steps_per_second': 77.707, 'epoch': 2.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5973231792449951, 'eval_f1_micro': 0.777511961722488, 'eval_f1_macro': 0.7594648662441986, 'eval_f1_class_0': 0.7588424437299035, 'eval_f1_class_1': 0.6461538461538462, 'eval_f1_class_2': 0.8294314381270903, 'eval_f1_class_3': 0.7543859649122807, 'eval_f1_class_4': 0.8085106382978723, 'eval_runtime': 0.3703, 'eval_samples_per_second': 1128.887, 'eval_steps_per_second': 72.919, 'epoch': 3.0}
{'loss': 0.8222, 'grad_norm': 20.330097198486328, 'learning_rate': 1.5277777777777777e-05, 'epoch': 3.12}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6219783425331116, 'eval_f1_micro': 0.7799043062200957, 'eval_f1_macro': 0.7588483950158368, 'eval_f1_class_0': 0.770764119601329, 'eval_f1_class_1': 0.6666666666666666, 'eval_f1_class_2': 0.8251748251748252, 'eval_f1_class_3': 0.768, 'eval_f1_class_4': 0.7636363636363637, 'eval_runtime': 0.3363, 'eval_samples_per_second': 1242.962, 'eval_steps_per_second': 80.287, 'epoch': 4.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5858204364776611, 'eval_f1_micro': 0.8038277511961722, 'eval_f1_macro': 0.7836680334705697, 'eval_f1_class_0': 0.8037974683544303, 'eval_f1_class_1': 0.6875, 'eval_f1_class_2': 0.8413284132841329, 'eval_f1_class_3': 0.7857142857142857, 'eval_f1_class_4': 0.8, 'eval_runtime': 0.3324, 'eval_samples_per_second': 1257.421, 'eval_steps_per_second': 81.221, 'epoch': 5.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6129098534584045, 'eval_f1_micro': 0.8110047846889952, 'eval_f1_macro': 0.7845101815940015, 'eval_f1_class_0': 0.8292682926829268, 'eval_f1_class_1': 0.6666666666666666, 'eval_f1_class_2': 0.8352490421455939, 'eval_f1_class_3': 0.7913669064748201, 'eval_f1_class_4': 0.8, 'eval_runtime': 0.334, 'eval_samples_per_second': 1251.347, 'eval_steps_per_second': 80.829, 'epoch': 6.0}
{'loss': 0.2781, 'grad_norm': 19.97541618347168, 'learning_rate': 8.333333333333334e-06, 'epoch': 6.25}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6263538002967834, 'eval_f1_micro': 0.8181818181818182, 'eval_f1_macro': 0.7898193640180553, 'eval_f1_class_0': 0.831858407079646, 'eval_f1_class_1': 0.6461538461538462, 'eval_f1_class_2': 0.849624060150376, 'eval_f1_class_3': 0.8032786885245902, 'eval_f1_class_4': 0.8181818181818182, 'eval_runtime': 0.3343, 'eval_samples_per_second': 1250.518, 'eval_steps_per_second': 80.775, 'epoch': 7.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6276966333389282, 'eval_f1_micro': 0.8157894736842105, 'eval_f1_macro': 0.7967827819244242, 'eval_f1_class_0': 0.8203592814371258, 'eval_f1_class_1': 0.6956521739130435, 'eval_f1_class_2': 0.846441947565543, 'eval_f1_class_3': 0.8032786885245902, 'eval_f1_class_4': 0.8181818181818182, 'eval_runtime': 0.3474, 'eval_samples_per_second': 1203.393, 'eval_steps_per_second': 77.731, 'epoch': 8.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6523991823196411, 'eval_f1_micro': 0.8229665071770335, 'eval_f1_macro': 0.810441553667309, 'eval_f1_class_0': 0.8288288288288288, 'eval_f1_class_1': 0.7246376811594203, 'eval_f1_class_2': 0.8389513108614233, 'eval_f1_class_3': 0.8225806451612904, 'eval_f1_class_4': 0.8372093023255814, 'eval_runtime': 0.3457, 'eval_samples_per_second': 1209.277, 'eval_steps_per_second': 78.111, 'epoch': 9.0}
{'loss': 0.114, 'grad_norm': 6.159750461578369, 'learning_rate': 1.3888888888888892e-06, 'epoch': 9.38}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6727039217948914, 'eval_f1_micro': 0.8301435406698564, 'eval_f1_macro': 0.8100492203463254, 'eval_f1_class_0': 0.844311377245509, 'eval_f1_class_1': 0.696969696969697, 'eval_f1_class_2': 0.8473282442748091, 'eval_f1_class_3': 0.8244274809160306, 'eval_f1_class_4': 0.8372093023255814, 'eval_runtime': 0.3239, 'eval_samples_per_second': 1290.535, 'eval_steps_per_second': 83.36, 'epoch': 10.0}
{'train_runtime': 92.1255, 'train_samples_per_second': 277.773, 'train_steps_per_second': 17.368, 'train_loss': 0.38340321093797686, 'epoch': 10.0}


  0%|          | 0/47 [00:00<?, ?it/s]

Some weights of RoBERTaEntity were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RoBERTaEntity were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1600 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.8703548908233643, 'eval_f1_micro': 0.6602870813397129, 'eval_f1_macro': 0.5904369945020351, 'eval_f1_class_0': 0.7012345679012346, 'eval_f1_class_1': 0.4090909090909091, 'eval_f1_class_2': 0.6581196581196581, 'eval_f1_class_3': 0.6504065040650406, 'eval_f1_class_4': 0.5333333333333333, 'eval_runtime': 0.3546, 'eval_samples_per_second': 1178.775, 'eval_steps_per_second': 76.141, 'epoch': 1.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5650731921195984, 'eval_f1_micro': 0.7799043062200957, 'eval_f1_macro': 0.7497281149971429, 'eval_f1_class_0': 0.7818696883852692, 'eval_f1_class_1': 0.6896551724137931, 'eval_f1_class_2': 0.7984189723320159, 'eval_f1_class_3': 0.8120300751879699, 'eval_f1_class_4': 0.6666666666666666, 'eval_runtime': 0.3306, 'eval_samples_per_second': 1264.494, 'eval_steps_per_second': 81.678, 'epoch': 2.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.544445812702179, 'eval_f1_micro': 0.7918660287081339, 'eval_f1_macro': 0.7618056314950836, 'eval_f1_class_0': 0.8109756097560976, 'eval_f1_class_1': 0.6666666666666666, 'eval_f1_class_2': 0.8185328185328186, 'eval_f1_class_3': 0.7737226277372263, 'eval_f1_class_4': 0.7391304347826086, 'eval_runtime': 0.3286, 'eval_samples_per_second': 1272.167, 'eval_steps_per_second': 82.173, 'epoch': 3.0}
{'loss': 0.8261, 'grad_norm': 18.217479705810547, 'learning_rate': 1.5277777777777777e-05, 'epoch': 3.12}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.585721492767334, 'eval_f1_micro': 0.8181818181818182, 'eval_f1_macro': 0.7988224712533097, 'eval_f1_class_0': 0.8165680473372781, 'eval_f1_class_1': 0.7, 'eval_f1_class_2': 0.8473282442748091, 'eval_f1_class_3': 0.8217054263565892, 'eval_f1_class_4': 0.8085106382978723, 'eval_runtime': 0.3396, 'eval_samples_per_second': 1230.786, 'eval_steps_per_second': 79.501, 'epoch': 4.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5684570074081421, 'eval_f1_micro': 0.8325358851674641, 'eval_f1_macro': 0.8039699050351847, 'eval_f1_class_0': 0.84593837535014, 'eval_f1_class_1': 0.6875, 'eval_f1_class_2': 0.8571428571428571, 'eval_f1_class_3': 0.8292682926829268, 'eval_f1_class_4': 0.8, 'eval_runtime': 0.3408, 'eval_samples_per_second': 1226.432, 'eval_steps_per_second': 79.219, 'epoch': 5.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5643013119697571, 'eval_f1_micro': 0.8253588516746412, 'eval_f1_macro': 0.809126676237437, 'eval_f1_class_0': 0.8303030303030303, 'eval_f1_class_1': 0.7096774193548387, 'eval_f1_class_2': 0.8487084870848709, 'eval_f1_class_3': 0.8125, 'eval_f1_class_4': 0.8444444444444444, 'eval_runtime': 0.3353, 'eval_samples_per_second': 1246.51, 'eval_steps_per_second': 80.516, 'epoch': 6.0}
{'loss': 0.2547, 'grad_norm': 14.743159294128418, 'learning_rate': 8.333333333333334e-06, 'epoch': 6.25}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.649755597114563, 'eval_f1_micro': 0.8205741626794258, 'eval_f1_macro': 0.8095172426491211, 'eval_f1_class_0': 0.8184615384615385, 'eval_f1_class_1': 0.7384615384615385, 'eval_f1_class_2': 0.8401486988847584, 'eval_f1_class_3': 0.8244274809160306, 'eval_f1_class_4': 0.8260869565217391, 'eval_runtime': 0.3389, 'eval_samples_per_second': 1233.287, 'eval_steps_per_second': 79.662, 'epoch': 7.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6689387559890747, 'eval_f1_micro': 0.8349282296650717, 'eval_f1_macro': 0.824638375888348, 'eval_f1_class_0': 0.8396501457725948, 'eval_f1_class_1': 0.7164179104477612, 'eval_f1_class_2': 0.8416988416988417, 'eval_f1_class_3': 0.8617886178861789, 'eval_f1_class_4': 0.8636363636363636, 'eval_runtime': 0.334, 'eval_samples_per_second': 1251.474, 'eval_steps_per_second': 80.837, 'epoch': 8.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6825006604194641, 'eval_f1_micro': 0.8349282296650717, 'eval_f1_macro': 0.8226165829108402, 'eval_f1_class_0': 0.8396501457725948, 'eval_f1_class_1': 0.7076923076923077, 'eval_f1_class_2': 0.8449612403100775, 'eval_f1_class_3': 0.8571428571428571, 'eval_f1_class_4': 0.8636363636363636, 'eval_runtime': 0.3283, 'eval_samples_per_second': 1273.188, 'eval_steps_per_second': 82.239, 'epoch': 9.0}
{'loss': 0.1137, 'grad_norm': 8.375455856323242, 'learning_rate': 1.3888888888888892e-06, 'epoch': 9.38}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6699915528297424, 'eval_f1_micro': 0.8349282296650717, 'eval_f1_macro': 0.824799089455659, 'eval_f1_class_0': 0.8377581120943953, 'eval_f1_class_1': 0.7272727272727273, 'eval_f1_class_2': 0.8473282442748091, 'eval_f1_class_3': 0.848, 'eval_f1_class_4': 0.8636363636363636, 'eval_runtime': 0.3362, 'eval_samples_per_second': 1243.374, 'eval_steps_per_second': 80.314, 'epoch': 10.0}
{'train_runtime': 90.2423, 'train_samples_per_second': 283.57, 'train_steps_per_second': 17.73, 'train_loss': 0.3779454255104065, 'epoch': 10.0}


  0%|          | 0/47 [00:00<?, ?it/s]

In [10]:
# Aggregate the per-seed test metrics: mean and standard deviation of every F1 score.
# The per-class metric names follow `num_labels` instead of a hard-coded class count.
metrics = ["test_f1_micro", "test_f1_macro"] + [f"test_f1_class_{i}" for i in range(num_labels)]

# Collect each run's metrics dict once instead of rebuilding the list per metric.
run_metrics = [run.metrics for run in all_results]

avg_results = {}
# The loop variable is `metric_name` so it does not overwrite the module-level
# `metric` object defined earlier in the notebook.
for metric_name in metrics:
    scores = [run_metric[metric_name] for run_metric in run_metrics]
    avg_results[metric_name] = {
        'mean': np.mean(scores),
        # np.std with its default ddof=0, i.e. the population standard deviation.
        'std': np.std(scores)
    }

# Print results
for metric_name in metrics:
    print(f"Average {metric_name}: {avg_results[metric_name]['mean']:.4f} ± {avg_results[metric_name]['std']:.4f}")

Average test_f1_micro: 0.8000 ± 0.0167
Average test_f1_macro: 0.7977 ± 0.0212
Average test_f1_class_0: 0.8011 ± 0.0215
Average test_f1_class_1: 0.8147 ± 0.0250
Average test_f1_class_2: 0.8293 ± 0.0149
Average test_f1_class_3: 0.6778 ± 0.0288
Average test_f1_class_4: 0.8655 ± 0.0403
