In [1]:
import evaluate
import numpy as np
from datasets import load_from_disk, disable_caching
from sklearn.metrics import f1_score
from transformers import (
    AutoTokenizer,
    RobertaForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed,
)

In [2]:
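# Presumably how the fixed seed list used further down was drawn (kept for reference, not executed).
# import random
# seeds = [random.randint(0, 10**9) for _ in range(5)]
# seeds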

In [3]:
# Turn off the `datasets` library's caching (disable_caching is imported from
# `datasets` in the import cell). Presumably this makes the `.map()` preprocessing
# further down recompute on every run instead of reloading cache files — confirm
# against the datasets documentation.
disable_caching()

In [4]:
# Class names indexed by their integer label id. This mapping is the single
# source from which the reverse mapping and the class count are derived, so
# the three values cannot drift apart.
id2label = dict(
    enumerate(["reject", "B_supplies_A", "A_supplies_B", "ambiguous", "ownership"])
)
# Reverse lookup: class name -> integer label id (same ordering as id2label).
label2id = {name: idx for idx, name in id2label.items()}
# Number of target classes.
num_labels = len(id2label)
# F1 metric object loaded through the `evaluate` library. Note that
# compute_metrics computes its scores with sklearn's f1_score and does not
# reference this object.
metric = evaluate.load("f1")

In [5]:
# Checkpoint shared by the tokenizer here and by model_init().
model_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Register the three placeholder markers as additional special tokens. This
# grows the vocabulary, which is why model_init() resizes the model's token
# embeddings to len(tokenizer).
entity_marker_tokens = ["__NE_FROM__", "__NE_TO__", "__NE_OTHER__"]
tokenizer.add_special_tokens({"additional_special_tokens": entity_marker_tokens})

# Collator built on the same tokenizer; it is handed to the Trainer in run_experiment().
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [6]:
def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: Mapping with a "text" entry (this function is applied through
            ``ds.map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text, with
        truncation enabled.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

def compute_metrics(eval_pred):
    """Compute micro, macro and per-class F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        dict with ``"f1_micro"``, ``"f1_macro"`` and one ``"f1_class_<id>"``
        entry per class id, i.e. one per entry along the last axis of the logits.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Every class id the classifier can emit. Without an explicit `labels=`,
    # f1_score(average=None) returns scores only for the classes that occur in
    # `labels` or `predictions`, so a missing class would shift every later
    # score onto the wrong "f1_class_<i>" key.
    class_ids = list(range(np.shape(logits)[-1]))

    f1_micro = f1_score(labels, predictions, average='micro')
    f1_macro = f1_score(labels, predictions, average='macro')
    # A class absent from both labels and predictions has an undefined F1;
    # sklearn then reports 0.0 (with a warning) under its default zero_division.
    f1_classwise = f1_score(labels, predictions, labels=class_ids, average=None)

    return {
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        **{
            f"f1_class_{class_id}": score
            for class_id, score in zip(class_ids, f1_classwise)
        },
    }


def model_init():
    """Build a fresh sequence classifier from the base checkpoint.

    Passed to the Trainer as ``model_init`` in run_experiment().

    Returns:
        A ``RobertaForSequenceClassification`` loaded from ``model_name`` with
        the label mappings attached and its token embeddings resized to the
        current tokenizer length.
    """
    model = RobertaForSequenceClassification.from_pretrained(
        model_name,
        # Use the shared constant instead of a second hard-coded 5 so the size
        # of the classification head cannot drift from the label configuration.
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )
    # The tokenizer was extended with additional special tokens, so the
    # embedding matrix must be resized to cover the new token ids.
    model.resize_token_embeddings(len(tokenizer))
    return model

In [7]:
# Load the dataset from disk, keep only the masked text and its label, expose
# the masked text under the "text" column that preprocess_function reads, and
# tokenize. The result is indexed by "train" / "valid" / "test" in run_experiment().
ds = (
    load_from_disk("../../datasets/ManualDataset")
    .select_columns(["masked_text", "label"])
    .rename_column("masked_text", "text")
    .map(preprocess_function, batched=True)
)

Map:   0%|          | 0/2559 [00:00<?, ? examples/s]

Map:   0%|          | 0/418 [00:00<?, ? examples/s]

Map:   0%|          | 0/745 [00:00<?, ? examples/s]

In [8]:
def run_experiment(seed):
    """Fine-tune a freshly initialised model with one seed and score it on the test split.

    Args:
        seed: Integer passed to ``set_seed`` and used as both ``seed`` and
            ``data_seed`` of the TrainingArguments.

    Returns:
        The result of ``trainer.predict(ds["test"])``. Its ``.metrics`` dict is
        what the aggregation cell reads (keys prefixed with ``test_``).
    """
    set_seed(seed)
    training_args = TrainingArguments(
        seed=seed,
        data_seed=seed,
        tf32=True,
        # One checkpoint directory per seed: with a shared directory, checkpoints
        # left behind by an earlier seed sit next to the current run's and are
        # picked up by the Trainer's checkpoint rotation (save_total_limit).
        output_dir=f"logs/experiment_1_roberta/seed_{seed}",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=10,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        warmup_ratio=0.1,
        # NOTE(review): no metric_for_best_model is set, so per the
        # TrainingArguments documentation the "best" checkpoint is chosen by
        # evaluation loss, not by any of the F1 scores — confirm this is intended.
        load_best_model_at_end=True,
        save_total_limit=1,
        report_to=[],
        save_only_model=True,
    )
    trainer = Trainer(
        # model_init (not model=) so every call starts from a newly built model.
        model_init=model_init,
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=ds["valid"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    test_results = trainer.predict(ds["test"])
    return test_results

In [9]:
# Fixed seeds: one complete fine-tuning run is performed per entry.
seeds = [87429541, 201431458, 584301348, 973765745, 499095391]
# Collects the value returned by run_experiment for each seed, in the order of `seeds`.
all_results = []

for seed in seeds:
    results = run_experiment(seed)
    all_results.append(results)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1600 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.1323809623718262, 'eval_f1_micro': 0.5933014354066986, 'eval_f1_macro': 0.40542614200648286, 'eval_f1_class_0': 0.6993318485523385, 'eval_f1_class_1': 0.0, 'eval_f1_class_2': 0.5294117647058824, 'eval_f1_class_3': 0.5483870967741935, 'eval_f1_class_4': 0.25, 'eval_runtime': 0.362, 'eval_samples_per_second': 1154.743, 'eval_steps_per_second': 74.589, 'epoch': 1.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7653756141662598, 'eval_f1_micro': 0.6961722488038278, 'eval_f1_macro': 0.6622161646643512, 'eval_f1_class_0': 0.7098445595854922, 'eval_f1_class_1': 0.5357142857142857, 'eval_f1_class_2': 0.7232142857142857, 'eval_f1_class_3': 0.6923076923076923, 'eval_f1_class_4': 0.65, 'eval_runtime': 0.3524, 'eval_samples_per_second': 1186.052, 'eval_steps_per_second': 76.611, 'epoch': 2.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6623486876487732, 'eval_f1_micro': 0.7320574162679426, 'eval_f1_macro': 0.6942753535374669, 'eval_f1_class_0': 0.7239263803680982, 'eval_f1_class_1': 0.5806451612903226, 'eval_f1_class_2': 0.7859922178988327, 'eval_f1_class_3': 0.7466666666666667, 'eval_f1_class_4': 0.6341463414634146, 'eval_runtime': 0.3435, 'eval_samples_per_second': 1216.838, 'eval_steps_per_second': 78.6, 'epoch': 3.0}
{'loss': 1.0568, 'grad_norm': 9.288031578063965, 'learning_rate': 1.5277777777777777e-05, 'epoch': 3.12}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5792719721794128, 'eval_f1_micro': 0.7799043062200957, 'eval_f1_macro': 0.7642552458146241, 'eval_f1_class_0': 0.7820895522388059, 'eval_f1_class_1': 0.5945945945945946, 'eval_f1_class_2': 0.8188976377952756, 'eval_f1_class_3': 0.78125, 'eval_f1_class_4': 0.8444444444444444, 'eval_runtime': 0.3552, 'eval_samples_per_second': 1176.836, 'eval_steps_per_second': 76.016, 'epoch': 4.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5719582438468933, 'eval_f1_micro': 0.784688995215311, 'eval_f1_macro': 0.7684347453446277, 'eval_f1_class_0': 0.7774647887323943, 'eval_f1_class_1': 0.6451612903225806, 'eval_f1_class_2': 0.833976833976834, 'eval_f1_class_3': 0.7563025210084033, 'eval_f1_class_4': 0.8292682926829268, 'eval_runtime': 0.3416, 'eval_samples_per_second': 1223.54, 'eval_steps_per_second': 79.032, 'epoch': 5.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5715755820274353, 'eval_f1_micro': 0.784688995215311, 'eval_f1_macro': 0.7596319316828509, 'eval_f1_class_0': 0.7724550898203593, 'eval_f1_class_1': 0.6268656716417911, 'eval_f1_class_2': 0.8603773584905661, 'eval_f1_class_3': 0.7384615384615385, 'eval_f1_class_4': 0.8, 'eval_runtime': 0.3383, 'eval_samples_per_second': 1235.698, 'eval_steps_per_second': 79.818, 'epoch': 6.0}
{'loss': 0.461, 'grad_norm': 18.63897705078125, 'learning_rate': 8.333333333333334e-06, 'epoch': 6.25}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5812296271324158, 'eval_f1_micro': 0.7918660287081339, 'eval_f1_macro': 0.7726710541930777, 'eval_f1_class_0': 0.7917888563049853, 'eval_f1_class_1': 0.6666666666666666, 'eval_f1_class_2': 0.8515625, 'eval_f1_class_3': 0.732824427480916, 'eval_f1_class_4': 0.8205128205128205, 'eval_runtime': 0.361, 'eval_samples_per_second': 1157.912, 'eval_steps_per_second': 74.793, 'epoch': 7.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6120543479919434, 'eval_f1_micro': 0.7966507177033493, 'eval_f1_macro': 0.7713458741089754, 'eval_f1_class_0': 0.797583081570997, 'eval_f1_class_1': 0.6388888888888888, 'eval_f1_class_2': 0.8669201520912547, 'eval_f1_class_3': 0.732824427480916, 'eval_f1_class_4': 0.8205128205128205, 'eval_runtime': 0.3538, 'eval_samples_per_second': 1181.577, 'eval_steps_per_second': 76.322, 'epoch': 8.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6495063304901123, 'eval_f1_micro': 0.7870813397129187, 'eval_f1_macro': 0.7674492262581087, 'eval_f1_class_0': 0.7865853658536586, 'eval_f1_class_1': 0.6268656716417911, 'eval_f1_class_2': 0.8436363636363636, 'eval_f1_class_3': 0.7301587301587301, 'eval_f1_class_4': 0.85, 'eval_runtime': 0.3427, 'eval_samples_per_second': 1219.662, 'eval_steps_per_second': 78.782, 'epoch': 9.0}
{'loss': 0.2745, 'grad_norm': 26.30982208251953, 'learning_rate': 1.3888888888888892e-06, 'epoch': 9.38}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6619570851325989, 'eval_f1_micro': 0.7966507177033493, 'eval_f1_macro': 0.783958489396924, 'eval_f1_class_0': 0.7875, 'eval_f1_class_1': 0.6666666666666666, 'eval_f1_class_2': 0.8487084870848709, 'eval_f1_class_3': 0.7669172932330827, 'eval_f1_class_4': 0.85, 'eval_runtime': 0.3582, 'eval_samples_per_second': 1167.059, 'eval_steps_per_second': 75.384, 'epoch': 10.0}
{'train_runtime': 92.9508, 'train_samples_per_second': 275.307, 'train_steps_per_second': 17.213, 'train_loss': 0.5734248125553131, 'epoch': 10.0}


  0%|          | 0/47 [00:00<?, ?it/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1600 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.1549533605575562, 'eval_f1_micro': 0.6148325358851675, 'eval_f1_macro': 0.4340420702863238, 'eval_f1_class_0': 0.7048458149779736, 'eval_f1_class_1': 0.0, 'eval_f1_class_2': 0.5909090909090909, 'eval_f1_class_3': 0.5544554455445545, 'eval_f1_class_4': 0.32, 'eval_runtime': 0.3539, 'eval_samples_per_second': 1181.174, 'eval_steps_per_second': 76.296, 'epoch': 1.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.8513965606689453, 'eval_f1_micro': 0.6411483253588517, 'eval_f1_macro': 0.6171513336543969, 'eval_f1_class_0': 0.6428571428571429, 'eval_f1_class_1': 0.5263157894736842, 'eval_f1_class_2': 0.6891385767790262, 'eval_f1_class_3': 0.5891472868217055, 'eval_f1_class_4': 0.6382978723404256, 'eval_runtime': 0.3474, 'eval_samples_per_second': 1203.308, 'eval_steps_per_second': 77.726, 'epoch': 2.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7396491169929504, 'eval_f1_micro': 0.6985645933014354, 'eval_f1_macro': 0.6941764367822876, 'eval_f1_class_0': 0.6559485530546624, 'eval_f1_class_1': 0.5714285714285714, 'eval_f1_class_2': 0.734982332155477, 'eval_f1_class_3': 0.78125, 'eval_f1_class_4': 0.7272727272727273, 'eval_runtime': 0.3507, 'eval_samples_per_second': 1191.799, 'eval_steps_per_second': 76.982, 'epoch': 3.0}
{'loss': 1.1138, 'grad_norm': 36.989070892333984, 'learning_rate': 1.5277777777777777e-05, 'epoch': 3.12}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7495226263999939, 'eval_f1_micro': 0.7248803827751196, 'eval_f1_macro': 0.7227904203021405, 'eval_f1_class_0': 0.6779661016949152, 'eval_f1_class_1': 0.6285714285714286, 'eval_f1_class_2': 0.7636363636363637, 'eval_f1_class_3': 0.7801418439716312, 'eval_f1_class_4': 0.7636363636363637, 'eval_runtime': 0.3592, 'eval_samples_per_second': 1163.652, 'eval_steps_per_second': 75.164, 'epoch': 4.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6419551968574524, 'eval_f1_micro': 0.7535885167464115, 'eval_f1_macro': 0.7495215671541177, 'eval_f1_class_0': 0.7239263803680982, 'eval_f1_class_1': 0.6571428571428571, 'eval_f1_class_2': 0.7878787878787878, 'eval_f1_class_3': 0.8031496062992126, 'eval_f1_class_4': 0.7755102040816326, 'eval_runtime': 0.3499, 'eval_samples_per_second': 1194.561, 'eval_steps_per_second': 77.161, 'epoch': 5.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6533801555633545, 'eval_f1_micro': 0.7511961722488039, 'eval_f1_macro': 0.7366985421586982, 'eval_f1_class_0': 0.7306501547987616, 'eval_f1_class_1': 0.6133333333333333, 'eval_f1_class_2': 0.7878787878787878, 'eval_f1_class_3': 0.8125, 'eval_f1_class_4': 0.7391304347826086, 'eval_runtime': 0.3405, 'eval_samples_per_second': 1227.78, 'eval_steps_per_second': 79.306, 'epoch': 6.0}
{'loss': 0.5226, 'grad_norm': 8.770017623901367, 'learning_rate': 8.333333333333334e-06, 'epoch': 6.25}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6581401228904724, 'eval_f1_micro': 0.7559808612440191, 'eval_f1_macro': 0.7450544449329686, 'eval_f1_class_0': 0.7266881028938906, 'eval_f1_class_1': 0.6216216216216216, 'eval_f1_class_2': 0.7956204379562044, 'eval_f1_class_3': 0.8153846153846154, 'eval_f1_class_4': 0.7659574468085106, 'eval_runtime': 0.3444, 'eval_samples_per_second': 1213.627, 'eval_steps_per_second': 78.392, 'epoch': 7.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6612743735313416, 'eval_f1_micro': 0.7942583732057417, 'eval_f1_macro': 0.7696955142247031, 'eval_f1_class_0': 0.8, 'eval_f1_class_1': 0.6666666666666666, 'eval_f1_class_2': 0.8284518828451883, 'eval_f1_class_3': 0.7874015748031497, 'eval_f1_class_4': 0.7659574468085106, 'eval_runtime': 0.3531, 'eval_samples_per_second': 1183.932, 'eval_steps_per_second': 76.474, 'epoch': 8.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6424331068992615, 'eval_f1_micro': 0.784688995215311, 'eval_f1_macro': 0.7709584735433823, 'eval_f1_class_0': 0.7784431137724551, 'eval_f1_class_1': 0.6571428571428571, 'eval_f1_class_2': 0.8045977011494253, 'eval_f1_class_3': 0.832, 'eval_f1_class_4': 0.782608695652174, 'eval_runtime': 0.3573, 'eval_samples_per_second': 1170.027, 'eval_steps_per_second': 75.576, 'epoch': 9.0}
{'loss': 0.318, 'grad_norm': 27.25768280029297, 'learning_rate': 1.3888888888888892e-06, 'epoch': 9.38}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.65519779920578, 'eval_f1_micro': 0.8038277511961722, 'eval_f1_macro': 0.7826871526642694, 'eval_f1_class_0': 0.8070175438596491, 'eval_f1_class_1': 0.6571428571428571, 'eval_f1_class_2': 0.8253968253968254, 'eval_f1_class_3': 0.8412698412698413, 'eval_f1_class_4': 0.782608695652174, 'eval_runtime': 0.3487, 'eval_samples_per_second': 1198.66, 'eval_steps_per_second': 77.425, 'epoch': 10.0}
{'train_runtime': 93.3224, 'train_samples_per_second': 274.211, 'train_steps_per_second': 17.145, 'train_loss': 0.6280732011795044, 'epoch': 10.0}


  0%|          | 0/47 [00:00<?, ?it/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1600 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.042799472808838, 'eval_f1_micro': 0.6100478468899522, 'eval_f1_macro': 0.42275716423611476, 'eval_f1_class_0': 0.7130434782608696, 'eval_f1_class_1': 0.0, 'eval_f1_class_2': 0.5570776255707762, 'eval_f1_class_3': 0.5473684210526316, 'eval_f1_class_4': 0.2962962962962963, 'eval_runtime': 0.3767, 'eval_samples_per_second': 1109.665, 'eval_steps_per_second': 71.677, 'epoch': 1.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7686070203781128, 'eval_f1_micro': 0.6770334928229665, 'eval_f1_macro': 0.6559597361268472, 'eval_f1_class_0': 0.6627218934911243, 'eval_f1_class_1': 0.5454545454545454, 'eval_f1_class_2': 0.732824427480916, 'eval_f1_class_3': 0.6721311475409836, 'eval_f1_class_4': 0.6666666666666666, 'eval_runtime': 0.3601, 'eval_samples_per_second': 1160.882, 'eval_steps_per_second': 74.985, 'epoch': 2.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6225017309188843, 'eval_f1_micro': 0.7727272727272727, 'eval_f1_macro': 0.744980149571439, 'eval_f1_class_0': 0.783068783068783, 'eval_f1_class_1': 0.576271186440678, 'eval_f1_class_2': 0.8185654008438819, 'eval_f1_class_3': 0.7288135593220338, 'eval_f1_class_4': 0.8181818181818182, 'eval_runtime': 0.3557, 'eval_samples_per_second': 1175.166, 'eval_steps_per_second': 75.908, 'epoch': 3.0}
{'loss': 1.0383, 'grad_norm': 20.70261573791504, 'learning_rate': 1.5277777777777777e-05, 'epoch': 3.12}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5966075658798218, 'eval_f1_micro': 0.7894736842105263, 'eval_f1_macro': 0.7651044144361687, 'eval_f1_class_0': 0.7977207977207977, 'eval_f1_class_1': 0.5714285714285714, 'eval_f1_class_2': 0.8249027237354085, 'eval_f1_class_3': 0.7619047619047619, 'eval_f1_class_4': 0.8695652173913043, 'eval_runtime': 0.3661, 'eval_samples_per_second': 1141.713, 'eval_steps_per_second': 73.747, 'epoch': 4.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.510362982749939, 'eval_f1_micro': 0.8181818181818182, 'eval_f1_macro': 0.7774805340379111, 'eval_f1_class_0': 0.8351648351648352, 'eval_f1_class_1': 0.6071428571428571, 'eval_f1_class_2': 0.8650793650793651, 'eval_f1_class_3': 0.7704918032786885, 'eval_f1_class_4': 0.8095238095238095, 'eval_runtime': 0.3553, 'eval_samples_per_second': 1176.447, 'eval_steps_per_second': 75.991, 'epoch': 5.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.549788236618042, 'eval_f1_micro': 0.8133971291866029, 'eval_f1_macro': 0.7819867861005464, 'eval_f1_class_0': 0.8370786516853933, 'eval_f1_class_1': 0.6666666666666666, 'eval_f1_class_2': 0.831275720164609, 'eval_f1_class_3': 0.8130081300813008, 'eval_f1_class_4': 0.7619047619047619, 'eval_runtime': 0.3735, 'eval_samples_per_second': 1119.266, 'eval_steps_per_second': 72.297, 'epoch': 6.0}
{'loss': 0.4256, 'grad_norm': 23.1600341796875, 'learning_rate': 8.333333333333334e-06, 'epoch': 6.25}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5039359927177429, 'eval_f1_micro': 0.8086124401913876, 'eval_f1_macro': 0.7821133729084174, 'eval_f1_class_0': 0.8059701492537313, 'eval_f1_class_1': 0.7096774193548387, 'eval_f1_class_2': 0.8475836431226765, 'eval_f1_class_3': 0.8031496062992126, 'eval_f1_class_4': 0.7441860465116279, 'eval_runtime': 0.4002, 'eval_samples_per_second': 1044.592, 'eval_steps_per_second': 67.474, 'epoch': 7.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5344513654708862, 'eval_f1_micro': 0.8229665071770335, 'eval_f1_macro': 0.806221749645099, 'eval_f1_class_0': 0.8220858895705522, 'eval_f1_class_1': 0.7246376811594203, 'eval_f1_class_2': 0.8614232209737828, 'eval_f1_class_3': 0.796875, 'eval_f1_class_4': 0.8260869565217391, 'eval_runtime': 0.3679, 'eval_samples_per_second': 1136.063, 'eval_steps_per_second': 73.382, 'epoch': 8.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5771923065185547, 'eval_f1_micro': 0.8301435406698564, 'eval_f1_macro': 0.8090181207224303, 'eval_f1_class_0': 0.8470588235294118, 'eval_f1_class_1': 0.7096774193548387, 'eval_f1_class_2': 0.8560311284046692, 'eval_f1_class_3': 0.7878787878787878, 'eval_f1_class_4': 0.8444444444444444, 'eval_runtime': 0.3789, 'eval_samples_per_second': 1103.147, 'eval_steps_per_second': 71.256, 'epoch': 9.0}
{'loss': 0.2096, 'grad_norm': 2.2531754970550537, 'learning_rate': 1.3888888888888892e-06, 'epoch': 9.38}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5451768636703491, 'eval_f1_micro': 0.8373205741626795, 'eval_f1_macro': 0.8173210157421972, 'eval_f1_class_0': 0.8436578171091446, 'eval_f1_class_1': 0.71875, 'eval_f1_class_2': 0.8702290076335878, 'eval_f1_class_3': 0.8095238095238095, 'eval_f1_class_4': 0.8444444444444444, 'eval_runtime': 0.3771, 'eval_samples_per_second': 1108.5, 'eval_steps_per_second': 71.602, 'epoch': 10.0}
{'train_runtime': 95.8832, 'train_samples_per_second': 266.887, 'train_steps_per_second': 16.687, 'train_loss': 0.5328967887163162, 'epoch': 10.0}


  0%|          | 0/47 [00:00<?, ?it/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1600 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.2430005073547363, 'eval_f1_micro': 0.5430622009569378, 'eval_f1_macro': 0.26771891761130623, 'eval_f1_class_0': 0.6836027713625866, 'eval_f1_class_1': 0.0, 'eval_f1_class_2': 0.5319148936170213, 'eval_f1_class_3': 0.12307692307692308, 'eval_f1_class_4': 0.0, 'eval_runtime': 0.3784, 'eval_samples_per_second': 1104.78, 'eval_steps_per_second': 71.361, 'epoch': 1.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.8579276204109192, 'eval_f1_micro': 0.6220095693779905, 'eval_f1_macro': 0.5784970030086798, 'eval_f1_class_0': 0.573208722741433, 'eval_f1_class_1': 0.4090909090909091, 'eval_f1_class_2': 0.7016393442622951, 'eval_f1_class_3': 0.6504065040650406, 'eval_f1_class_4': 0.5581395348837209, 'eval_runtime': 0.3759, 'eval_samples_per_second': 1112.055, 'eval_steps_per_second': 71.831, 'epoch': 2.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7627629041671753, 'eval_f1_micro': 0.6794258373205742, 'eval_f1_macro': 0.6584748442056264, 'eval_f1_class_0': 0.6981627296587927, 'eval_f1_class_1': 0.5352112676056338, 'eval_f1_class_2': 0.6880733944954128, 'eval_f1_class_3': 0.688, 'eval_f1_class_4': 0.6829268292682927, 'eval_runtime': 0.3669, 'eval_samples_per_second': 1139.372, 'eval_steps_per_second': 73.596, 'epoch': 3.0}
{'loss': 1.1323, 'grad_norm': 9.541669845581055, 'learning_rate': 1.5277777777777777e-05, 'epoch': 3.12}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7099922299385071, 'eval_f1_micro': 0.7177033492822966, 'eval_f1_macro': 0.7002939188846863, 'eval_f1_class_0': 0.6881028938906752, 'eval_f1_class_1': 0.5542168674698795, 'eval_f1_class_2': 0.7799227799227799, 'eval_f1_class_3': 0.7681159420289855, 'eval_f1_class_4': 0.7111111111111111, 'eval_runtime': 0.379, 'eval_samples_per_second': 1102.861, 'eval_steps_per_second': 71.237, 'epoch': 4.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5697808861732483, 'eval_f1_micro': 0.7966507177033493, 'eval_f1_macro': 0.760659146035972, 'eval_f1_class_0': 0.8032786885245902, 'eval_f1_class_1': 0.6268656716417911, 'eval_f1_class_2': 0.8605577689243028, 'eval_f1_class_3': 0.7433628318584071, 'eval_f1_class_4': 0.7692307692307693, 'eval_runtime': 0.3712, 'eval_samples_per_second': 1126.161, 'eval_steps_per_second': 72.742, 'epoch': 5.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.574973464012146, 'eval_f1_micro': 0.7822966507177034, 'eval_f1_macro': 0.7594294804064736, 'eval_f1_class_0': 0.774390243902439, 'eval_f1_class_1': 0.5882352941176471, 'eval_f1_class_2': 0.8452830188679246, 'eval_f1_class_3': 0.7559055118110236, 'eval_f1_class_4': 0.8333333333333334, 'eval_runtime': 0.3735, 'eval_samples_per_second': 1119.166, 'eval_steps_per_second': 72.291, 'epoch': 6.0}
{'loss': 0.5127, 'grad_norm': 19.202306747436523, 'learning_rate': 8.333333333333334e-06, 'epoch': 6.25}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6630871295928955, 'eval_f1_micro': 0.7703349282296651, 'eval_f1_macro': 0.7413263339310145, 'eval_f1_class_0': 0.749185667752443, 'eval_f1_class_1': 0.6268656716417911, 'eval_f1_class_2': 0.8389513108614233, 'eval_f1_class_3': 0.7746478873239436, 'eval_f1_class_4': 0.7169811320754716, 'eval_runtime': 0.369, 'eval_samples_per_second': 1132.877, 'eval_steps_per_second': 73.176, 'epoch': 7.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5823187828063965, 'eval_f1_micro': 0.8038277511961722, 'eval_f1_macro': 0.7790451669993874, 'eval_f1_class_0': 0.8071216617210683, 'eval_f1_class_1': 0.6865671641791045, 'eval_f1_class_2': 0.8482490272373541, 'eval_f1_class_3': 0.7777777777777778, 'eval_f1_class_4': 0.7755102040816326, 'eval_runtime': 0.5375, 'eval_samples_per_second': 777.655, 'eval_steps_per_second': 50.231, 'epoch': 8.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5785344839096069, 'eval_f1_micro': 0.8133971291866029, 'eval_f1_macro': 0.7879171767726558, 'eval_f1_class_0': 0.8165680473372781, 'eval_f1_class_1': 0.6666666666666666, 'eval_f1_class_2': 0.8571428571428571, 'eval_f1_class_3': 0.7906976744186046, 'eval_f1_class_4': 0.8085106382978723, 'eval_runtime': 0.4076, 'eval_samples_per_second': 1025.55, 'eval_steps_per_second': 66.244, 'epoch': 9.0}
{'loss': 0.2872, 'grad_norm': 16.534610748291016, 'learning_rate': 1.3888888888888892e-06, 'epoch': 9.38}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5981032848358154, 'eval_f1_micro': 0.80622009569378, 'eval_f1_macro': 0.7836887296461764, 'eval_f1_class_0': 0.8048048048048048, 'eval_f1_class_1': 0.6666666666666666, 'eval_f1_class_2': 0.8538461538461538, 'eval_f1_class_3': 0.7846153846153846, 'eval_f1_class_4': 0.8085106382978723, 'eval_runtime': 0.369, 'eval_samples_per_second': 1132.862, 'eval_steps_per_second': 73.175, 'epoch': 10.0}
{'train_runtime': 96.3339, 'train_samples_per_second': 265.639, 'train_steps_per_second': 16.609, 'train_loss': 0.617281277179718, 'epoch': 10.0}


  0%|          | 0/47 [00:00<?, ?it/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1600 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.148702621459961, 'eval_f1_micro': 0.5741626794258373, 'eval_f1_macro': 0.3751688106926199, 'eval_f1_class_0': 0.6615776081424937, 'eval_f1_class_1': 0.0, 'eval_f1_class_2': 0.6098360655737705, 'eval_f1_class_3': 0.35443037974683544, 'eval_f1_class_4': 0.25, 'eval_runtime': 0.3917, 'eval_samples_per_second': 1067.163, 'eval_steps_per_second': 68.932, 'epoch': 1.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.8342894315719604, 'eval_f1_micro': 0.65311004784689, 'eval_f1_macro': 0.6364655451841338, 'eval_f1_class_0': 0.6325301204819277, 'eval_f1_class_1': 0.5671641791044776, 'eval_f1_class_2': 0.7084870848708487, 'eval_f1_class_3': 0.64, 'eval_f1_class_4': 0.6341463414634146, 'eval_runtime': 0.3663, 'eval_samples_per_second': 1141.199, 'eval_steps_per_second': 73.714, 'epoch': 2.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7216622233390808, 'eval_f1_micro': 0.6985645933014354, 'eval_f1_macro': 0.6944075650870262, 'eval_f1_class_0': 0.7016574585635359, 'eval_f1_class_1': 0.6172839506172839, 'eval_f1_class_2': 0.7125506072874493, 'eval_f1_class_3': 0.7037037037037037, 'eval_f1_class_4': 0.7368421052631579, 'eval_runtime': 0.3903, 'eval_samples_per_second': 1070.947, 'eval_steps_per_second': 69.176, 'epoch': 3.0}
{'loss': 1.126, 'grad_norm': 25.61093521118164, 'learning_rate': 1.5277777777777777e-05, 'epoch': 3.12}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6541088819503784, 'eval_f1_micro': 0.7535885167464115, 'eval_f1_macro': 0.7264644883741462, 'eval_f1_class_0': 0.7736389684813754, 'eval_f1_class_1': 0.5428571428571428, 'eval_f1_class_2': 0.7815126050420168, 'eval_f1_class_3': 0.75, 'eval_f1_class_4': 0.7843137254901961, 'eval_runtime': 0.4032, 'eval_samples_per_second': 1036.795, 'eval_steps_per_second': 66.97, 'epoch': 4.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6023580431938171, 'eval_f1_micro': 0.7894736842105263, 'eval_f1_macro': 0.7652464037102258, 'eval_f1_class_0': 0.8034188034188035, 'eval_f1_class_1': 0.5846153846153846, 'eval_f1_class_2': 0.8215767634854771, 'eval_f1_class_3': 0.7794117647058824, 'eval_f1_class_4': 0.8372093023255814, 'eval_runtime': 0.3963, 'eval_samples_per_second': 1054.692, 'eval_steps_per_second': 68.126, 'epoch': 5.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.534509003162384, 'eval_f1_micro': 0.8110047846889952, 'eval_f1_macro': 0.7724644174909766, 'eval_f1_class_0': 0.8106508875739645, 'eval_f1_class_1': 0.6376811594202898, 'eval_f1_class_2': 0.8740740740740741, 'eval_f1_class_3': 0.7899159663865546, 'eval_f1_class_4': 0.75, 'eval_runtime': 0.393, 'eval_samples_per_second': 1063.618, 'eval_steps_per_second': 68.703, 'epoch': 6.0}
{'loss': 0.5315, 'grad_norm': 19.291519165039062, 'learning_rate': 8.333333333333334e-06, 'epoch': 6.25}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5317503809928894, 'eval_f1_micro': 0.8133971291866029, 'eval_f1_macro': 0.7831168997835605, 'eval_f1_class_0': 0.8059701492537313, 'eval_f1_class_1': 0.631578947368421, 'eval_f1_class_2': 0.8715953307392996, 'eval_f1_class_3': 0.8372093023255814, 'eval_f1_class_4': 0.7692307692307693, 'eval_runtime': 0.465, 'eval_samples_per_second': 898.835, 'eval_steps_per_second': 58.059, 'epoch': 7.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5373445153236389, 'eval_f1_micro': 0.8301435406698564, 'eval_f1_macro': 0.8039082651991816, 'eval_f1_class_0': 0.8203592814371258, 'eval_f1_class_1': 0.7058823529411765, 'eval_f1_class_2': 0.896551724137931, 'eval_f1_class_3': 0.7967479674796748, 'eval_f1_class_4': 0.8, 'eval_runtime': 0.3949, 'eval_samples_per_second': 1058.409, 'eval_steps_per_second': 68.366, 'epoch': 8.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.550137996673584, 'eval_f1_micro': 0.8277511961722488, 'eval_f1_macro': 0.8010323384851574, 'eval_f1_class_0': 0.8308605341246291, 'eval_f1_class_1': 0.6857142857142857, 'eval_f1_class_2': 0.8725868725868726, 'eval_f1_class_3': 0.816, 'eval_f1_class_4': 0.8, 'eval_runtime': 0.5883, 'eval_samples_per_second': 710.481, 'eval_steps_per_second': 45.892, 'epoch': 9.0}
{'loss': 0.307, 'grad_norm': 25.95094108581543, 'learning_rate': 1.3888888888888892e-06, 'epoch': 9.38}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5589311122894287, 'eval_f1_micro': 0.8277511961722488, 'eval_f1_macro': 0.8014969087540427, 'eval_f1_class_0': 0.8294117647058824, 'eval_f1_class_1': 0.6956521739130435, 'eval_f1_class_2': 0.875968992248062, 'eval_f1_class_3': 0.8064516129032258, 'eval_f1_class_4': 0.8, 'eval_runtime': 0.3957, 'eval_samples_per_second': 1056.274, 'eval_steps_per_second': 68.228, 'epoch': 10.0}
{'train_runtime': 99.5721, 'train_samples_per_second': 257.0, 'train_steps_per_second': 16.069, 'train_loss': 0.629484829902649, 'epoch': 10.0}


  0%|          | 0/47 [00:00<?, ?it/s]

In [10]:
# Aggregate the test-set F1 scores over all seeded runs: mean and standard
# deviation per metric.
# The per-class keys follow num_labels instead of a hard-coded 5.
metrics = ["test_f1_micro", "test_f1_macro"] + [
    f"test_f1_class_{i}" for i in range(num_labels)
]

# One metrics dict per run, taken from each stored prediction result.
run_metrics = [result.metrics for result in all_results]

avg_results = {}
# The loop variable is deliberately not called `metric`: that name is already
# bound at module level to the object returned by evaluate.load("f1").
for metric_name in metrics:
    scores = [per_run[metric_name] for per_run in run_metrics]
    avg_results[metric_name] = {
        'mean': np.mean(scores),
        # np.std uses ddof=0 by default, i.e. the population standard deviation
        # over the runs.
        'std': np.std(scores)
    }

# Print results
for metric_name in metrics:
    print(f"Average {metric_name}: {avg_results[metric_name]['mean']:.4f} ± {avg_results[metric_name]['std']:.4f}")

Average test_f1_micro: 0.7557 ± 0.0318
Average test_f1_macro: 0.7458 ± 0.0331
Average test_f1_class_0: 0.7687 ± 0.0411
Average test_f1_class_1: 0.7384 ± 0.0281
Average test_f1_class_2: 0.7892 ± 0.0379
Average test_f1_class_3: 0.6426 ± 0.0286
Average test_f1_class_4: 0.7902 ± 0.0552
