In [1]:
import evaluate
import numpy as np
from datasets import load_from_disk, disable_caching
from sklearn.metrics import f1_score
from transformers import (
    AutoTokenizer,
    BertForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed,
)

In [2]:
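# One-off snippet for drawing random integer seeds (presumably how the fixed
# seed list used for the experiment runs was obtained; kept for reference only).
# The upper bound is written as the integer 10**9 because random.randint
# expects integer arguments (a float such as 1e9 is rejected by newer Pythons).
# import random
# seeds = [random.randint(0, 10**9) for _ in range(5)]
# seeds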

In [3]:
# Turn off the `datasets` library cache for this session (see the `datasets`
# documentation for `disable_caching`), so the `map` preprocessing further down
# is recomputed instead of being reloaded from an earlier run's cache files.
disable_caching()

In [4]:
# Class id -> label name for the classifier head. This mapping is the single
# source from which the inverse mapping and the class count are derived.
id2label = {
    0: "reject",
    1: "B_supplies_A",
    2: "A_supplies_B",
    3: "ambiguous",
    4: "ownership",
}
# Label name -> class id (exact inverse of id2label, same ordering).
label2id = {name: class_id for class_id, name in id2label.items()}
# Number of target classes (5).
num_labels = len(id2label)
# F1 metric object from the `evaluate` library.
metric = evaluate.load("f1")

In [5]:
# Pretrained checkpoint used for both the tokenizer and the classifier weights.
model_name = "google-bert/bert-base-uncased"  # uncased variant
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Register the placeholder markers as additional special tokens. They look like
# named-entity placeholders used in the masked input text (TODO confirm against
# the dataset). The vocabulary grows, which is why model_init() resizes the
# model's token embeddings to len(tokenizer).
tokenizer.add_special_tokens(
    dict(additional_special_tokens=["__NE_FROM__", "__NE_TO__", "__NE_OTHER__"])
)
# Collator that pads the tokenized examples of each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [6]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with truncation enabled.

    No padding is requested here; the module-level ``data_collator`` is a
    ``DataCollatorWithPadding`` built from the same tokenizer.

    Args:
        examples: Mapping with a ``"text"`` entry (this function is passed to
            ``Dataset.map`` with ``batched=True``).

    Returns:
        The output of calling the module-level ``tokenizer`` on
        ``examples["text"]``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

def compute_metrics(eval_pred):
    """Compute micro, macro and per-class F1 from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the argmax
            of ``logits`` over the last axis.

    Returns:
        Dict with ``"f1_micro"``, ``"f1_macro"`` and one ``"f1_class_<id>"``
        entry for every class id in ``id2label``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Pin the per-class computation to the full, known label set. Without
    # `labels=`, sklearn only scores classes that occur in the labels or the
    # predictions, so a class missing from both would shift the positional
    # index of every later class (and drop a key that the aggregation step
    # expects to find for each run).
    class_ids = sorted(id2label)
    per_class = f1_score(labels, predictions, labels=class_ids, average=None)

    scores = {
        "f1_micro": f1_score(labels, predictions, average="micro"),
        "f1_macro": f1_score(labels, predictions, average="macro"),
    }
    scores.update(
        {f"f1_class_{class_id}": score for class_id, score in zip(class_ids, per_class)}
    )
    return scores


def model_init():
    """Build a fresh BERT sequence classifier from the ``model_name`` checkpoint.

    Passed to ``Trainer`` as ``model_init`` so that every run starts from a
    newly loaded model.

    Returns:
        A ``BertForSequenceClassification`` configured with the module-level
        label maps, with its token embeddings resized to the tokenizer length.
    """
    model = BertForSequenceClassification.from_pretrained(
        model_name,
        # Use the shared constant rather than a literal so the head size cannot
        # drift from the label maps.
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )
    # The tokenizer was extended with extra special tokens, so the embedding
    # matrix has to be resized to match its length.
    model.resize_token_embeddings(len(tokenizer))
    return model

In [8]:
# Load the saved dataset, keep only the masked text and its label, expose the
# masked text under the column name "text" (the name preprocess_function
# reads), then tokenize every split in batches.
ds = (
    load_from_disk("../../datasets/ManualDataset")
    .select_columns(["masked_text", "label"])
    .rename_column("masked_text", "text")
    .map(preprocess_function, batched=True)
)

Map:   0%|          | 0/2559 [00:00<?, ? examples/s]

Map:   0%|          | 0/418 [00:00<?, ? examples/s]

Map:   0%|          | 0/745 [00:00<?, ? examples/s]

In [9]:
def run_experiment(seed, output_dir="logs/experiment_1_bert"):
    """Fine-tune a fresh model with the given seed and evaluate it on the test split.

    Each seed writes its checkpoints to its own ``<output_dir>/seed_<seed>``
    directory. Sharing one directory across seeds makes a later run overwrite
    and rotate against the checkpoints left behind by an earlier run.

    Args:
        seed: Integer seed applied via ``set_seed`` and passed to
            ``TrainingArguments`` as both ``seed`` and ``data_seed``.
        output_dir: Base directory under which the per-seed checkpoint
            directory is created.

    Returns:
        The result of ``trainer.predict(ds["test"])``; its ``.metrics`` dict
        holds the test-set scores.
    """
    set_seed(seed)
    training_args = TrainingArguments(
        seed=seed,
        data_seed=seed,
        tf32=True,
        # One checkpoint directory per seed so runs never touch each other's files.
        output_dir=f"{output_dir}/seed_{seed}",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=10,
        weight_decay=0.01,
        # Evaluate and checkpoint once per epoch; the two strategies must match
        # for load_best_model_at_end to work.
        eval_strategy="epoch",
        save_strategy="epoch",
        warmup_ratio=0.1,
        # No metric_for_best_model is set, so per the Transformers docs the
        # "best" checkpoint is the one with the lowest evaluation loss.
        load_best_model_at_end=True,
        save_total_limit=1,
        report_to=[],  # no external logging integrations
        save_only_model=True,
    )
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=ds["valid"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    # Runs after train(), which has reloaded the best checkpoint into the model.
    test_results = trainer.predict(ds["test"])
    return test_results

In [10]:
# Fixed seeds: one independent fine-tuning run is performed per seed.
seeds = [947706532, 219089540, 567388559, 220752674, 837810645]

# Test-set prediction outputs, one per run, in seed order. Appending inside the
# loop keeps the finished runs available even if a later run fails.
all_results = []
for seed in seeds:
    all_results.append(run_experiment(seed))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1600 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.0666015148162842, 'eval_f1_micro': 0.6339712918660287, 'eval_f1_macro': 0.4096190608483982, 'eval_f1_class_0': 0.7333333333333333, 'eval_f1_class_1': 0.0, 'eval_f1_class_2': 0.6837060702875399, 'eval_f1_class_3': 0.45714285714285713, 'eval_f1_class_4': 0.17391304347826086, 'eval_runtime': 0.3688, 'eval_samples_per_second': 1133.484, 'eval_steps_per_second': 73.215, 'epoch': 1.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.667560338973999, 'eval_f1_micro': 0.7727272727272727, 'eval_f1_macro': 0.7369441444203652, 'eval_f1_class_0': 0.8072289156626506, 'eval_f1_class_1': 0.6578947368421053, 'eval_f1_class_2': 0.7838827838827839, 'eval_f1_class_3': 0.75, 'eval_f1_class_4': 0.6857142857142857, 'eval_runtime': 0.3434, 'eval_samples_per_second': 1217.088, 'eval_steps_per_second': 78.616, 'epoch': 2.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5893464684486389, 'eval_f1_micro': 0.8014354066985646, 'eval_f1_macro': 0.7646923374744435, 'eval_f1_class_0': 0.838150289017341, 'eval_f1_class_1': 0.6923076923076923, 'eval_f1_class_2': 0.8076923076923077, 'eval_f1_class_3': 0.782608695652174, 'eval_f1_class_4': 0.7027027027027027, 'eval_runtime': 0.3715, 'eval_samples_per_second': 1125.169, 'eval_steps_per_second': 72.678, 'epoch': 3.0}
{'loss': 0.9646, 'grad_norm': 10.84957504272461, 'learning_rate': 1.5277777777777777e-05, 'epoch': 3.12}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5483032464981079, 'eval_f1_micro': 0.8133971291866029, 'eval_f1_macro': 0.7909658235228478, 'eval_f1_class_0': 0.8504398826979472, 'eval_f1_class_1': 0.7647058823529411, 'eval_f1_class_2': 0.8134328358208955, 'eval_f1_class_3': 0.7457627118644068, 'eval_f1_class_4': 0.7804878048780488, 'eval_runtime': 0.4201, 'eval_samples_per_second': 994.947, 'eval_steps_per_second': 64.267, 'epoch': 4.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6313953995704651, 'eval_f1_micro': 0.8181818181818182, 'eval_f1_macro': 0.7947723638814507, 'eval_f1_class_0': 0.8449848024316109, 'eval_f1_class_1': 0.704225352112676, 'eval_f1_class_2': 0.8365019011406845, 'eval_f1_class_3': 0.7786259541984732, 'eval_f1_class_4': 0.8095238095238095, 'eval_runtime': 0.3269, 'eval_samples_per_second': 1278.812, 'eval_steps_per_second': 82.603, 'epoch': 5.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6978457570075989, 'eval_f1_micro': 0.8205741626794258, 'eval_f1_macro': 0.7861172519237035, 'eval_f1_class_0': 0.8563049853372434, 'eval_f1_class_1': 0.7384615384615385, 'eval_f1_class_2': 0.8307692307692308, 'eval_f1_class_3': 0.7777777777777778, 'eval_f1_class_4': 0.7272727272727273, 'eval_runtime': 0.3246, 'eval_samples_per_second': 1287.8, 'eval_steps_per_second': 83.183, 'epoch': 6.0}
{'loss': 0.2627, 'grad_norm': 10.324856758117676, 'learning_rate': 8.333333333333334e-06, 'epoch': 6.25}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7331691384315491, 'eval_f1_micro': 0.8229665071770335, 'eval_f1_macro': 0.793906034232488, 'eval_f1_class_0': 0.852760736196319, 'eval_f1_class_1': 0.704225352112676, 'eval_f1_class_2': 0.8444444444444444, 'eval_f1_class_3': 0.7786259541984732, 'eval_f1_class_4': 0.7894736842105263, 'eval_runtime': 0.3425, 'eval_samples_per_second': 1220.493, 'eval_steps_per_second': 78.836, 'epoch': 7.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.8165865540504456, 'eval_f1_micro': 0.8373205741626795, 'eval_f1_macro': 0.8192385051504913, 'eval_f1_class_0': 0.8613569321533924, 'eval_f1_class_1': 0.7575757575757576, 'eval_f1_class_2': 0.8449612403100775, 'eval_f1_class_3': 0.803030303030303, 'eval_f1_class_4': 0.8292682926829268, 'eval_runtime': 0.3404, 'eval_samples_per_second': 1228.109, 'eval_steps_per_second': 79.328, 'epoch': 8.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.8252251148223877, 'eval_f1_micro': 0.8301435406698564, 'eval_f1_macro': 0.8069159506633163, 'eval_f1_class_0': 0.8579881656804734, 'eval_f1_class_1': 0.7123287671232876, 'eval_f1_class_2': 0.84375, 'eval_f1_class_3': 0.8, 'eval_f1_class_4': 0.8205128205128205, 'eval_runtime': 0.354, 'eval_samples_per_second': 1180.842, 'eval_steps_per_second': 76.274, 'epoch': 9.0}
{'loss': 0.097, 'grad_norm': 1.0146762132644653, 'learning_rate': 1.3888888888888892e-06, 'epoch': 9.38}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.826194167137146, 'eval_f1_micro': 0.8325358851674641, 'eval_f1_macro': 0.8109810566179881, 'eval_f1_class_0': 0.8597014925373134, 'eval_f1_class_1': 0.7297297297297297, 'eval_f1_class_2': 0.8449612403100775, 'eval_f1_class_3': 0.8, 'eval_f1_class_4': 0.8205128205128205, 'eval_runtime': 0.3194, 'eval_samples_per_second': 1308.86, 'eval_steps_per_second': 84.544, 'epoch': 10.0}
{'train_runtime': 84.9319, 'train_samples_per_second': 301.3, 'train_steps_per_second': 18.839, 'train_loss': 0.41783868610858915, 'epoch': 10.0}


  0%|          | 0/47 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1600 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.201750636100769, 'eval_f1_micro': 0.5741626794258373, 'eval_f1_macro': 0.31122590727556865, 'eval_f1_class_0': 0.690744920993228, 'eval_f1_class_1': 0.0, 'eval_f1_class_2': 0.6153846153846154, 'eval_f1_class_3': 0.0, 'eval_f1_class_4': 0.25, 'eval_runtime': 0.3387, 'eval_samples_per_second': 1234.162, 'eval_steps_per_second': 79.719, 'epoch': 1.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7004815340042114, 'eval_f1_micro': 0.7703349282296651, 'eval_f1_macro': 0.7059150864842261, 'eval_f1_class_0': 0.8117647058823529, 'eval_f1_class_1': 0.5660377358490566, 'eval_f1_class_2': 0.8014440433212996, 'eval_f1_class_3': 0.71875, 'eval_f1_class_4': 0.631578947368421, 'eval_runtime': 0.3892, 'eval_samples_per_second': 1074.035, 'eval_steps_per_second': 69.375, 'epoch': 2.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6477596759796143, 'eval_f1_micro': 0.7488038277511961, 'eval_f1_macro': 0.7090212113765784, 'eval_f1_class_0': 0.7821522309711286, 'eval_f1_class_1': 0.6268656716417911, 'eval_f1_class_2': 0.7603305785123967, 'eval_f1_class_3': 0.7090909090909091, 'eval_f1_class_4': 0.6666666666666666, 'eval_runtime': 0.3235, 'eval_samples_per_second': 1291.936, 'eval_steps_per_second': 83.45, 'epoch': 3.0}
{'loss': 1.0195, 'grad_norm': 8.861555099487305, 'learning_rate': 1.5277777777777777e-05, 'epoch': 3.12}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6447877883911133, 'eval_f1_micro': 0.784688995215311, 'eval_f1_macro': 0.7552009091974986, 'eval_f1_class_0': 0.8246153846153846, 'eval_f1_class_1': 0.6086956521739131, 'eval_f1_class_2': 0.7985074626865671, 'eval_f1_class_3': 0.7441860465116279, 'eval_f1_class_4': 0.8, 'eval_runtime': 0.3464, 'eval_samples_per_second': 1206.733, 'eval_steps_per_second': 77.947, 'epoch': 4.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6933609843254089, 'eval_f1_micro': 0.7822966507177034, 'eval_f1_macro': 0.7584378747832986, 'eval_f1_class_0': 0.8073394495412844, 'eval_f1_class_1': 0.6172839506172839, 'eval_f1_class_2': 0.8156862745098039, 'eval_f1_class_3': 0.7518796992481203, 'eval_f1_class_4': 0.8, 'eval_runtime': 0.3466, 'eval_samples_per_second': 1206.037, 'eval_steps_per_second': 77.902, 'epoch': 5.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7045490145683289, 'eval_f1_micro': 0.80622009569378, 'eval_f1_macro': 0.7901114488348531, 'eval_f1_class_0': 0.8267477203647416, 'eval_f1_class_1': 0.6666666666666666, 'eval_f1_class_2': 0.8222222222222222, 'eval_f1_class_3': 0.7777777777777778, 'eval_f1_class_4': 0.8571428571428571, 'eval_runtime': 0.3462, 'eval_samples_per_second': 1207.457, 'eval_steps_per_second': 77.994, 'epoch': 6.0}
{'loss': 0.296, 'grad_norm': 6.627720832824707, 'learning_rate': 8.333333333333334e-06, 'epoch': 6.25}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.776097297668457, 'eval_f1_micro': 0.8014354066985646, 'eval_f1_macro': 0.7852179641014336, 'eval_f1_class_0': 0.8121212121212121, 'eval_f1_class_1': 0.684931506849315, 'eval_f1_class_2': 0.8239700374531835, 'eval_f1_class_3': 0.7868852459016393, 'eval_f1_class_4': 0.8181818181818182, 'eval_runtime': 0.3323, 'eval_samples_per_second': 1258.022, 'eval_steps_per_second': 81.26, 'epoch': 7.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7929094433784485, 'eval_f1_micro': 0.8157894736842105, 'eval_f1_macro': 0.799060869899517, 'eval_f1_class_0': 0.838150289017341, 'eval_f1_class_1': 0.704225352112676, 'eval_f1_class_2': 0.8253968253968254, 'eval_f1_class_3': 0.7903225806451613, 'eval_f1_class_4': 0.8372093023255814, 'eval_runtime': 0.3406, 'eval_samples_per_second': 1227.09, 'eval_steps_per_second': 79.262, 'epoch': 8.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.8940326571464539, 'eval_f1_micro': 0.7751196172248804, 'eval_f1_macro': 0.7587153967136601, 'eval_f1_class_0': 0.7756410256410257, 'eval_f1_class_1': 0.6756756756756757, 'eval_f1_class_2': 0.8104089219330854, 'eval_f1_class_3': 0.7591240875912408, 'eval_f1_class_4': 0.7727272727272727, 'eval_runtime': 0.3162, 'eval_samples_per_second': 1322.126, 'eval_steps_per_second': 85.4, 'epoch': 9.0}
{'loss': 0.1273, 'grad_norm': 8.503562927246094, 'learning_rate': 1.3888888888888892e-06, 'epoch': 9.38}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.8407047986984253, 'eval_f1_micro': 0.8133971291866029, 'eval_f1_macro': 0.7963750661702285, 'eval_f1_class_0': 0.8220858895705522, 'eval_f1_class_1': 0.7123287671232876, 'eval_f1_class_2': 0.8314606741573034, 'eval_f1_class_3': 0.816, 'eval_f1_class_4': 0.8, 'eval_runtime': 0.3403, 'eval_samples_per_second': 1228.255, 'eval_steps_per_second': 79.337, 'epoch': 10.0}
{'train_runtime': 84.4092, 'train_samples_per_second': 303.166, 'train_steps_per_second': 18.955, 'train_loss': 0.4563434076309204, 'epoch': 10.0}


  0%|          | 0/47 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1600 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.131316065788269, 'eval_f1_micro': 0.6028708133971292, 'eval_f1_macro': 0.37823152967629303, 'eval_f1_class_0': 0.7240356083086054, 'eval_f1_class_1': 0.0, 'eval_f1_class_2': 0.639344262295082, 'eval_f1_class_3': 0.2777777777777778, 'eval_f1_class_4': 0.25, 'eval_runtime': 0.3413, 'eval_samples_per_second': 1224.8, 'eval_steps_per_second': 79.114, 'epoch': 1.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7159392237663269, 'eval_f1_micro': 0.7751196172248804, 'eval_f1_macro': 0.727269929067047, 'eval_f1_class_0': 0.8181818181818182, 'eval_f1_class_1': 0.5517241379310345, 'eval_f1_class_2': 0.7984790874524715, 'eval_f1_class_3': 0.7079646017699115, 'eval_f1_class_4': 0.76, 'eval_runtime': 0.3329, 'eval_samples_per_second': 1255.47, 'eval_steps_per_second': 81.095, 'epoch': 2.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6161025762557983, 'eval_f1_micro': 0.7822966507177034, 'eval_f1_macro': 0.7590286484961117, 'eval_f1_class_0': 0.8082595870206489, 'eval_f1_class_1': 0.6666666666666666, 'eval_f1_class_2': 0.7862595419847328, 'eval_f1_class_3': 0.768, 'eval_f1_class_4': 0.7659574468085106, 'eval_runtime': 0.3354, 'eval_samples_per_second': 1246.18, 'eval_steps_per_second': 80.495, 'epoch': 3.0}
{'loss': 0.9974, 'grad_norm': 9.679635047912598, 'learning_rate': 1.5277777777777777e-05, 'epoch': 3.12}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.5632702112197876, 'eval_f1_micro': 0.8110047846889952, 'eval_f1_macro': 0.7825797889347333, 'eval_f1_class_0': 0.8367952522255193, 'eval_f1_class_1': 0.676056338028169, 'eval_f1_class_2': 0.8212927756653993, 'eval_f1_class_3': 0.8095238095238095, 'eval_f1_class_4': 0.7692307692307693, 'eval_runtime': 0.3182, 'eval_samples_per_second': 1313.822, 'eval_steps_per_second': 84.864, 'epoch': 4.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6238910555839539, 'eval_f1_micro': 0.8157894736842105, 'eval_f1_macro': 0.7847405912049148, 'eval_f1_class_0': 0.8387096774193549, 'eval_f1_class_1': 0.6865671641791045, 'eval_f1_class_2': 0.8416988416988417, 'eval_f1_class_3': 0.784, 'eval_f1_class_4': 0.7727272727272727, 'eval_runtime': 0.3238, 'eval_samples_per_second': 1290.763, 'eval_steps_per_second': 83.375, 'epoch': 5.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7015565037727356, 'eval_f1_micro': 0.8157894736842105, 'eval_f1_macro': 0.808995210646817, 'eval_f1_class_0': 0.8271604938271605, 'eval_f1_class_1': 0.7297297297297297, 'eval_f1_class_2': 0.8191881918819188, 'eval_f1_class_3': 0.8188976377952756, 'eval_f1_class_4': 0.85, 'eval_runtime': 0.3135, 'eval_samples_per_second': 1333.429, 'eval_steps_per_second': 86.131, 'epoch': 6.0}
{'loss': 0.2879, 'grad_norm': 3.69888973236084, 'learning_rate': 8.333333333333334e-06, 'epoch': 6.25}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7483857870101929, 'eval_f1_micro': 0.8301435406698564, 'eval_f1_macro': 0.8170454718945457, 'eval_f1_class_0': 0.8486646884272997, 'eval_f1_class_1': 0.75, 'eval_f1_class_2': 0.842911877394636, 'eval_f1_class_3': 0.7936507936507936, 'eval_f1_class_4': 0.85, 'eval_runtime': 0.3347, 'eval_samples_per_second': 1248.992, 'eval_steps_per_second': 80.677, 'epoch': 7.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.8294371962547302, 'eval_f1_micro': 0.8229665071770335, 'eval_f1_macro': 0.7971554901022326, 'eval_f1_class_0': 0.8513119533527697, 'eval_f1_class_1': 0.7352941176470589, 'eval_f1_class_2': 0.8404669260700389, 'eval_f1_class_3': 0.7692307692307693, 'eval_f1_class_4': 0.7894736842105263, 'eval_runtime': 0.3218, 'eval_samples_per_second': 1298.927, 'eval_steps_per_second': 83.902, 'epoch': 8.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.861266553401947, 'eval_f1_micro': 0.8229665071770335, 'eval_f1_macro': 0.8026983704560637, 'eval_f1_class_0': 0.8433734939759037, 'eval_f1_class_1': 0.7397260273972602, 'eval_f1_class_2': 0.8396946564885496, 'eval_f1_class_3': 0.7906976744186046, 'eval_f1_class_4': 0.8, 'eval_runtime': 0.3267, 'eval_samples_per_second': 1279.643, 'eval_steps_per_second': 82.656, 'epoch': 9.0}
{'loss': 0.0971, 'grad_norm': 0.13399654626846313, 'learning_rate': 1.3888888888888892e-06, 'epoch': 9.38}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.8864611983299255, 'eval_f1_micro': 0.8181818181818182, 'eval_f1_macro': 0.7984397663391812, 'eval_f1_class_0': 0.8353658536585366, 'eval_f1_class_1': 0.7323943661971831, 'eval_f1_class_2': 0.837037037037037, 'eval_f1_class_3': 0.7874015748031497, 'eval_f1_class_4': 0.8, 'eval_runtime': 0.3355, 'eval_samples_per_second': 1245.995, 'eval_steps_per_second': 80.483, 'epoch': 10.0}
{'train_runtime': 82.8077, 'train_samples_per_second': 309.029, 'train_steps_per_second': 19.322, 'train_loss': 0.436208678483963, 'epoch': 10.0}


  0%|          | 0/47 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1600 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.0871391296386719, 'eval_f1_micro': 0.645933014354067, 'eval_f1_macro': 0.3945339090678181, 'eval_f1_class_0': 0.7296587926509186, 'eval_f1_class_1': 0.0, 'eval_f1_class_2': 0.7096774193548387, 'eval_f1_class_3': 0.5333333333333333, 'eval_f1_class_4': 0.0, 'eval_runtime': 0.3412, 'eval_samples_per_second': 1224.957, 'eval_steps_per_second': 79.124, 'epoch': 1.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7410889863967896, 'eval_f1_micro': 0.7464114832535885, 'eval_f1_macro': 0.6564091772294793, 'eval_f1_class_0': 0.7922437673130194, 'eval_f1_class_1': 0.5, 'eval_f1_class_2': 0.7956989247311828, 'eval_f1_class_3': 0.6486486486486487, 'eval_f1_class_4': 0.5454545454545454, 'eval_runtime': 0.3189, 'eval_samples_per_second': 1310.688, 'eval_steps_per_second': 84.662, 'epoch': 2.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6099250316619873, 'eval_f1_micro': 0.7966507177033493, 'eval_f1_macro': 0.7533500502988343, 'eval_f1_class_0': 0.8197674418604651, 'eval_f1_class_1': 0.6875, 'eval_f1_class_2': 0.8294573643410853, 'eval_f1_class_3': 0.7633587786259542, 'eval_f1_class_4': 0.6666666666666666, 'eval_runtime': 0.3233, 'eval_samples_per_second': 1292.803, 'eval_steps_per_second': 83.506, 'epoch': 3.0}
{'loss': 0.9958, 'grad_norm': 11.748149871826172, 'learning_rate': 1.5277777777777777e-05, 'epoch': 3.12}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6106935143470764, 'eval_f1_micro': 0.8014354066985646, 'eval_f1_macro': 0.7666654074652405, 'eval_f1_class_0': 0.8189910979228486, 'eval_f1_class_1': 0.684931506849315, 'eval_f1_class_2': 0.8262548262548263, 'eval_f1_class_3': 0.8031496062992126, 'eval_f1_class_4': 0.7, 'eval_runtime': 0.3189, 'eval_samples_per_second': 1310.929, 'eval_steps_per_second': 84.677, 'epoch': 4.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6339086294174194, 'eval_f1_micro': 0.8086124401913876, 'eval_f1_macro': 0.7771055214921386, 'eval_f1_class_0': 0.8242424242424242, 'eval_f1_class_1': 0.7142857142857143, 'eval_f1_class_2': 0.8327137546468402, 'eval_f1_class_3': 0.8, 'eval_f1_class_4': 0.7142857142857143, 'eval_runtime': 0.3367, 'eval_samples_per_second': 1241.427, 'eval_steps_per_second': 80.188, 'epoch': 5.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6851766705513, 'eval_f1_micro': 0.8133971291866029, 'eval_f1_macro': 0.7855867353934747, 'eval_f1_class_0': 0.8357348703170029, 'eval_f1_class_1': 0.704225352112676, 'eval_f1_class_2': 0.8292682926829268, 'eval_f1_class_3': 0.8031496062992126, 'eval_f1_class_4': 0.7555555555555555, 'eval_runtime': 0.3141, 'eval_samples_per_second': 1330.95, 'eval_steps_per_second': 85.97, 'epoch': 6.0}
{'loss': 0.2801, 'grad_norm': 15.883919715881348, 'learning_rate': 8.333333333333334e-06, 'epoch': 6.25}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7323986291885376, 'eval_f1_micro': 0.8301435406698564, 'eval_f1_macro': 0.7996917494477052, 'eval_f1_class_0': 0.8486646884272997, 'eval_f1_class_1': 0.75, 'eval_f1_class_2': 0.8507462686567164, 'eval_f1_class_3': 0.8099173553719008, 'eval_f1_class_4': 0.7391304347826086, 'eval_runtime': 0.3292, 'eval_samples_per_second': 1269.684, 'eval_steps_per_second': 82.013, 'epoch': 7.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.8114827871322632, 'eval_f1_micro': 0.8277511961722488, 'eval_f1_macro': 0.7997299906477208, 'eval_f1_class_0': 0.8505747126436781, 'eval_f1_class_1': 0.75, 'eval_f1_class_2': 0.84251968503937, 'eval_f1_class_3': 0.8, 'eval_f1_class_4': 0.7555555555555555, 'eval_runtime': 0.3242, 'eval_samples_per_second': 1289.329, 'eval_steps_per_second': 83.282, 'epoch': 8.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.8202599287033081, 'eval_f1_micro': 0.8181818181818182, 'eval_f1_macro': 0.7884046876467635, 'eval_f1_class_0': 0.8411764705882353, 'eval_f1_class_1': 0.7384615384615385, 'eval_f1_class_2': 0.8294573643410853, 'eval_f1_class_3': 0.8095238095238095, 'eval_f1_class_4': 0.723404255319149, 'eval_runtime': 0.3264, 'eval_samples_per_second': 1280.684, 'eval_steps_per_second': 82.724, 'epoch': 9.0}
{'loss': 0.1001, 'grad_norm': 7.692174434661865, 'learning_rate': 1.3888888888888892e-06, 'epoch': 9.38}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.8553587794303894, 'eval_f1_micro': 0.8205741626794258, 'eval_f1_macro': 0.7891981490379238, 'eval_f1_class_0': 0.8373493975903614, 'eval_f1_class_1': 0.75, 'eval_f1_class_2': 0.844106463878327, 'eval_f1_class_3': 0.8062015503875969, 'eval_f1_class_4': 0.7083333333333334, 'eval_runtime': 0.3066, 'eval_samples_per_second': 1363.486, 'eval_steps_per_second': 88.072, 'epoch': 10.0}
{'train_runtime': 83.115, 'train_samples_per_second': 307.887, 'train_steps_per_second': 19.25, 'train_loss': 0.4331916159391403, 'epoch': 10.0}


  0%|          | 0/47 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1600 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.0623990297317505, 'eval_f1_micro': 0.6244019138755981, 'eval_f1_macro': 0.3150378787878788, 'eval_f1_class_0': 0.78125, 'eval_f1_class_1': 0.0, 'eval_f1_class_2': 0.6424242424242425, 'eval_f1_class_3': 0.15151515151515152, 'eval_f1_class_4': 0.0, 'eval_runtime': 0.3143, 'eval_samples_per_second': 1329.871, 'eval_steps_per_second': 85.901, 'epoch': 1.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7352484464645386, 'eval_f1_micro': 0.7679425837320574, 'eval_f1_macro': 0.6797040089598053, 'eval_f1_class_0': 0.8148148148148148, 'eval_f1_class_1': 0.6268656716417911, 'eval_f1_class_2': 0.7938931297709924, 'eval_f1_class_3': 0.734375, 'eval_f1_class_4': 0.42857142857142855, 'eval_runtime': 0.311, 'eval_samples_per_second': 1344.098, 'eval_steps_per_second': 86.82, 'epoch': 2.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.590287446975708, 'eval_f1_micro': 0.7918660287081339, 'eval_f1_macro': 0.7469269699364395, 'eval_f1_class_0': 0.8252148997134671, 'eval_f1_class_1': 0.6875, 'eval_f1_class_2': 0.8076923076923077, 'eval_f1_class_3': 0.7642276422764228, 'eval_f1_class_4': 0.65, 'eval_runtime': 0.3076, 'eval_samples_per_second': 1358.981, 'eval_steps_per_second': 87.781, 'epoch': 3.0}
{'loss': 0.9959, 'grad_norm': 12.375228881835938, 'learning_rate': 1.5277777777777777e-05, 'epoch': 3.12}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.673326849937439, 'eval_f1_micro': 0.784688995215311, 'eval_f1_macro': 0.7490351823458734, 'eval_f1_class_0': 0.8096676737160121, 'eval_f1_class_1': 0.6865671641791045, 'eval_f1_class_2': 0.8199233716475096, 'eval_f1_class_3': 0.7313432835820896, 'eval_f1_class_4': 0.6976744186046512, 'eval_runtime': 0.3159, 'eval_samples_per_second': 1323.255, 'eval_steps_per_second': 85.473, 'epoch': 4.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6448952555656433, 'eval_f1_micro': 0.7990430622009569, 'eval_f1_macro': 0.7534853230682638, 'eval_f1_class_0': 0.8246153846153846, 'eval_f1_class_1': 0.6756756756756757, 'eval_f1_class_2': 0.8158844765342961, 'eval_f1_class_3': 0.819672131147541, 'eval_f1_class_4': 0.631578947368421, 'eval_runtime': 0.3559, 'eval_samples_per_second': 1174.357, 'eval_steps_per_second': 75.856, 'epoch': 5.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7007774710655212, 'eval_f1_micro': 0.8157894736842105, 'eval_f1_macro': 0.7747874248513635, 'eval_f1_class_0': 0.8385093167701864, 'eval_f1_class_1': 0.7352941176470589, 'eval_f1_class_2': 0.8455882352941176, 'eval_f1_class_3': 0.7878787878787878, 'eval_f1_class_4': 0.6666666666666666, 'eval_runtime': 0.3105, 'eval_samples_per_second': 1346.212, 'eval_steps_per_second': 86.956, 'epoch': 6.0}
{'loss': 0.2942, 'grad_norm': 2.2172648906707764, 'learning_rate': 8.333333333333334e-06, 'epoch': 6.25}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7494006156921387, 'eval_f1_micro': 0.8133971291866029, 'eval_f1_macro': 0.780980218732015, 'eval_f1_class_0': 0.8338368580060423, 'eval_f1_class_1': 0.7575757575757576, 'eval_f1_class_2': 0.8396946564885496, 'eval_f1_class_3': 0.7761194029850746, 'eval_f1_class_4': 0.6976744186046512, 'eval_runtime': 0.3085, 'eval_samples_per_second': 1355.0, 'eval_steps_per_second': 87.524, 'epoch': 7.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.779068648815155, 'eval_f1_micro': 0.8301435406698564, 'eval_f1_macro': 0.8057769782401032, 'eval_f1_class_0': 0.8452380952380952, 'eval_f1_class_1': 0.782608695652174, 'eval_f1_class_2': 0.8473282442748091, 'eval_f1_class_3': 0.8095238095238095, 'eval_f1_class_4': 0.7441860465116279, 'eval_runtime': 0.3109, 'eval_samples_per_second': 1344.508, 'eval_steps_per_second': 86.846, 'epoch': 8.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.8165282011032104, 'eval_f1_micro': 0.8277511961722488, 'eval_f1_macro': 0.7997075908542233, 'eval_f1_class_0': 0.8414634146341463, 'eval_f1_class_1': 0.7878787878787878, 'eval_f1_class_2': 0.8487084870848709, 'eval_f1_class_3': 0.8062015503875969, 'eval_f1_class_4': 0.7142857142857143, 'eval_runtime': 0.3027, 'eval_samples_per_second': 1381.04, 'eval_steps_per_second': 89.206, 'epoch': 9.0}
{'loss': 0.1095, 'grad_norm': 0.9014012813568115, 'learning_rate': 1.3888888888888892e-06, 'epoch': 9.38}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.8534265756607056, 'eval_f1_micro': 0.8205741626794258, 'eval_f1_macro': 0.7937412014796928, 'eval_f1_class_0': 0.8373493975903614, 'eval_f1_class_1': 0.78125, 'eval_f1_class_2': 0.835820895522388, 'eval_f1_class_3': 0.8, 'eval_f1_class_4': 0.7142857142857143, 'eval_runtime': 0.3114, 'eval_samples_per_second': 1342.13, 'eval_steps_per_second': 86.693, 'epoch': 10.0}
{'train_runtime': 83.0713, 'train_samples_per_second': 308.049, 'train_steps_per_second': 19.261, 'train_loss': 0.44194120734930037, 'epoch': 10.0}


  0%|          | 0/47 [00:00<?, ?it/s]

In [11]:
# Calculate mean and std of the test-set F1 scores across all seeded runs.
# The per-class metric names follow num_labels instead of a hard-coded count.
metrics = ["test_f1_micro", "test_f1_macro"] + [
    f"test_f1_class_{i}" for i in range(num_labels)
]

avg_results = {}
# The loop variable is `metric_name` (not `metric`) so that it does not rebind
# the module-level `metric` object created earlier in the notebook.
for metric_name in metrics:
    scores = [run.metrics[metric_name] for run in all_results]
    avg_results[metric_name] = {
        'mean': np.mean(scores),
        # NumPy's default (ddof=0), i.e. the population standard deviation.
        'std': np.std(scores)
    }

# Print results
for metric_name in metrics:
    stats = avg_results[metric_name]
    print(f"Average {metric_name}: {stats['mean']:.4f} ± {stats['std']:.4f}")

Average test_f1_micro: 0.7713 ± 0.0087
Average test_f1_macro: 0.7384 ± 0.0157
Average test_f1_class_0: 0.8289 ± 0.0058
Average test_f1_class_1: 0.7044 ± 0.0346
Average test_f1_class_2: 0.7935 ± 0.0088
Average test_f1_class_3: 0.6558 ± 0.0142
Average test_f1_class_4: 0.7091 ± 0.0527
