# **Few shot Fine Tuning on MNLI**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/DL_project/llm_finetuning/notebooks/

/content/drive/MyDrive/DL_project/llm_finetuning/notebooks


In [3]:
!pip install -q transformers accelerate bitsandbytes datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25h

## **Experimental setup**

**Fewshot setup :** Each n example uses 10 different sets of training to avoid bias

N ranges {2, 16, 32, 64, 128}

In [4]:
# load dataset
from datasets import load_dataset, ClassLabel
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import torch

# for reproducibility
np.random.seed(42)

torch.manual_seed(42)

if torch.cuda.is_available():
  torch.cuda.manual_seed_all(42)

data = load_dataset("glue", "mnli")

#Below function is taken from: https://github.com/uds-lsv/llmft/blob/main/notebooks/majority_baseline.ipynb
def binarize_mnli(dataset, remove_neutral=True):
    if remove_neutral:
        # neutral class has label 1
        dataset = dataset.filter(lambda example: example["label"] != 1)

    # change labels of contradiction examples from 2 to 1
    def change_label(example):
        # convert labels 2 into labels 1. this merges the neutral and contradiction class
        example["label"] = 1 if example["label"] == 2 else example["label"]
        return example

    # change labels
    dataset = dataset.map(change_label)

    # change features to reflect the new labels
    features = dataset["train"].features.copy()
    features["label"] = ClassLabel(num_classes=2, names=['entailment', 'contradiction'], id=None)
    dataset = dataset.cast(features)  # overwrite old features

    return dataset

data = binarize_mnli(data, remove_neutral=True)


#function for computing accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

# Prepare training data and define training config
#!mkdir offload_folder

from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, AdamW, AutoConfig

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

# Change num_labels to 2 and drop-out hyperparam = 0.1
config = AutoConfig.from_pretrained("facebook/opt-125m", num_labels=2, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1)


def manipulate_inputs(batch):
    # Add pattern and verbalizer
    encoding = tokenizer([f'{premise} Question: {hypothesis} Yes or No?' for premise, hypothesis in zip(batch["premise"], batch["hypothesis"])], truncation=True, padding="max_length", max_length=128)
    batch["input_ids"] = encoding["input_ids"]  # Already a list
    batch["attention_mask"] = encoding["attention_mask"]  # Already a list
    return batch

data = data.map(manipulate_inputs, batched=True)

# Few-shot setup
n_values = [2, 16, 32, 64, 128]  # number of examples for each class

# Create a DataFrame to store the results
results_df = pd.DataFrame(columns=["n", "run", "in_domain_accuracy", "out_of_domain_accuracy"])


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for n in n_values:
    for run in range(10):  # repeat 10 times for each n

        # Create a copy of the dataset
        data_copy = data.copy()

        # Select n random examples for each class from the original data
        indices_yes = np.where(np.array(data["train"]["label"]) == 0)[0]
        indices_no = np.where(np.array(data["train"]["label"]) == 1)[0]
        indices_yes = np.random.choice(indices_yes, n, replace=False)
        indices_no = np.random.choice(indices_no, n, replace=False)
        indices = np.concatenate([indices_yes, indices_no])

        # Select the examples for the new training set
        train_dataset = data["train"].select(indices)

        # Re-initialize the model at the start of each training cycle
        model = AutoModelForSequenceClassification.from_pretrained("facebook/opt-125m", config=config)
        model.to(device)  # Move the model to the device (gpu if available)

        # Training config
        total_steps = (len(train_dataset) // 32) * 40 # Total steps = (#samples/batch size) * epochs

        training_args = TrainingArguments(
            output_dir="./offload_folder",
            overwrite_output_dir=True,
            num_train_epochs=40,
            per_device_train_batch_size=32,
            learning_rate=1e-5,
            weight_decay=0.0,
            save_steps=10_000,
            save_total_limit=2,
            warmup_steps=int(0.1 * total_steps), # Warmup ratio = 10% of total steps
        )

        # Define the trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            compute_metrics=compute_metrics,
        )

        # Train the model
        trainer.train()

        # Evaluate in-domain performance
        print(f"Evaluating in-domain performance for n={n}...")
        eval_results = trainer.evaluate(eval_dataset=data["validation_matched"])

        # Store the in-domain accuracy
        in_domain_accuracy = eval_results["eval_accuracy"]

        # Print the in-domain evaluation results
        for key, value in eval_results.items():
            print(f"In-domain {key}: {value}")

        # Evaluate out-of-domain performance
        print(f"Evaluating out-of-domain performance for n={n}...")
        eval_results = trainer.evaluate(eval_dataset=data["validation_mismatched"])

        # Store the out-of-domain accuracy
        out_of_domain_accuracy = eval_results["eval_accuracy"]

        # Print the out-of-domain evaluation results
        for key, value in eval_results.items():
            print(f"Out-of-domain {key}: {value}")

        # Add the results to the DataFrame
        new_row = pd.DataFrame({
            "n": [n],
            "run": [run],
            "in_domain_accuracy": [in_domain_accuracy],
            "out_of_domain_accuracy": [out_of_domain_accuracy]
        })
        results_df = pd.concat([results_df, new_row], ignore_index=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

Filter:   0%|          | 0/392702 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9815 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9832 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9796 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9847 [00:00<?, ? examples/s]

Map:   0%|          | 0/261802 [00:00<?, ? examples/s]

Map:   0%|          | 0/6692 [00:00<?, ? examples/s]

Map:   0%|          | 0/6703 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/261802 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6692 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6703 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9796 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9847 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Map:   0%|          | 0/261802 [00:00<?, ? examples/s]

Map:   0%|          | 0/6692 [00:00<?, ? examples/s]

Map:   0%|          | 0/6703 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 2.0749220848083496
In-domain eval_accuracy: 0.4931261207411835
In-domain eval_runtime: 43.4145
In-domain eval_samples_per_second: 154.142
In-domain eval_steps_per_second: 19.279
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 2.0219690799713135
Out-of-domain eval_accuracy: 0.4970908548411159
Out-of-domain eval_runtime: 46.9921
Out-of-domain eval_samples_per_second: 142.641
Out-of-domain eval_steps_per_second: 17.833
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.0014705657958984
In-domain eval_accuracy: 0.5219665271966527
In-domain eval_runtime: 45.5882
In-domain eval_samples_per_second: 146.792
In-domain eval_steps_per_second: 18.36
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.9507314562797546
Out-of-domain eval_accuracy: 0.5299119797105774
Out-of-domain eval_runtime: 46.7736
Out-of-domain eval_samples_per_second: 143.307
Out-of-domain eval_steps_per_second: 17.916
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.0014705657958984
In-domain eval_accuracy: 0.5219665271966527
In-domain eval_runtime: 45.8775
In-domain eval_samples_per_second: 145.867
In-domain eval_steps_per_second: 18.244
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.9507314562797546
Out-of-domain eval_accuracy: 0.5299119797105774
Out-of-domain eval_runtime: 46.7318
Out-of-domain eval_samples_per_second: 143.436
Out-of-domain eval_steps_per_second: 17.932
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.0014705657958984
In-domain eval_accuracy: 0.5219665271966527
In-domain eval_runtime: 45.9947
In-domain eval_samples_per_second: 145.495
In-domain eval_steps_per_second: 18.198
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.9507314562797546
Out-of-domain eval_accuracy: 0.5299119797105774
Out-of-domain eval_runtime: 46.7807
Out-of-domain eval_samples_per_second: 143.285
Out-of-domain eval_steps_per_second: 17.913
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.0014705657958984
In-domain eval_accuracy: 0.5219665271966527
In-domain eval_runtime: 46.2375
In-domain eval_samples_per_second: 144.731
In-domain eval_steps_per_second: 18.102
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.9507314562797546
Out-of-domain eval_accuracy: 0.5299119797105774
Out-of-domain eval_runtime: 46.8638
Out-of-domain eval_samples_per_second: 143.031
Out-of-domain eval_steps_per_second: 17.882
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.0014705657958984
In-domain eval_accuracy: 0.5219665271966527
In-domain eval_runtime: 46.1412
In-domain eval_samples_per_second: 145.033
In-domain eval_steps_per_second: 18.14
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.9507314562797546
Out-of-domain eval_accuracy: 0.5299119797105774
Out-of-domain eval_runtime: 46.872
Out-of-domain eval_samples_per_second: 143.007
Out-of-domain eval_steps_per_second: 17.878
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.0014705657958984
In-domain eval_accuracy: 0.5219665271966527
In-domain eval_runtime: 46.119
In-domain eval_samples_per_second: 145.103
In-domain eval_steps_per_second: 18.149
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.9507314562797546
Out-of-domain eval_accuracy: 0.5299119797105774
Out-of-domain eval_runtime: 46.9902
Out-of-domain eval_samples_per_second: 142.647
Out-of-domain eval_steps_per_second: 17.834
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.0014705657958984
In-domain eval_accuracy: 0.5219665271966527
In-domain eval_runtime: 46.192
In-domain eval_samples_per_second: 144.874
In-domain eval_steps_per_second: 18.12
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.9507314562797546
Out-of-domain eval_accuracy: 0.5299119797105774
Out-of-domain eval_runtime: 46.8825
Out-of-domain eval_samples_per_second: 142.975
Out-of-domain eval_steps_per_second: 17.874
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.0014705657958984
In-domain eval_accuracy: 0.5219665271966527
In-domain eval_runtime: 46.1165
In-domain eval_samples_per_second: 145.111
In-domain eval_steps_per_second: 18.15
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.9507314562797546
Out-of-domain eval_accuracy: 0.5299119797105774
Out-of-domain eval_runtime: 46.815
Out-of-domain eval_samples_per_second: 143.18
Out-of-domain eval_steps_per_second: 17.9
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.0014705657958984
In-domain eval_accuracy: 0.5219665271966527
In-domain eval_runtime: 46.1444
In-domain eval_samples_per_second: 145.023
In-domain eval_steps_per_second: 18.139
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.9507314562797546
Out-of-domain eval_accuracy: 0.5299119797105774
Out-of-domain eval_runtime: 46.8155
Out-of-domain eval_samples_per_second: 143.179
Out-of-domain eval_steps_per_second: 17.9
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 46.106
In-domain eval_samples_per_second: 145.144
In-domain eval_steps_per_second: 18.154
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 1.2005068063735962
Out-of-domain eval_accuracy: 0.5757123675965985
Out-of-domain eval_runtime: 46.7559
Out-of-domain eval_samples_per_second: 143.362
Out-of-domain eval_steps_per_second: 17.923
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 46.252
In-domain eval_samples_per_second: 144.686
In-domain eval_steps_per_second: 18.097
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 1.2005068063735962
Out-of-domain eval_accuracy: 0.5757123675965985
Out-of-domain eval_runtime: 46.855
Out-of-domain eval_samples_per_second: 143.058
Out-of-domain eval_steps_per_second: 17.885
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 46.1871
In-domain eval_samples_per_second: 144.889
In-domain eval_steps_per_second: 18.122
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 1.2005068063735962
Out-of-domain eval_accuracy: 0.5757123675965985
Out-of-domain eval_runtime: 46.8447
Out-of-domain eval_samples_per_second: 143.09
Out-of-domain eval_steps_per_second: 17.889
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 46.1653
In-domain eval_samples_per_second: 144.957
In-domain eval_steps_per_second: 18.131
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 1.2005068063735962
Out-of-domain eval_accuracy: 0.5757123675965985
Out-of-domain eval_runtime: 46.8994
Out-of-domain eval_samples_per_second: 142.923
Out-of-domain eval_steps_per_second: 17.868
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 46.1709
In-domain eval_samples_per_second: 144.94
In-domain eval_steps_per_second: 18.128
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 1.2005068063735962
Out-of-domain eval_accuracy: 0.5757123675965985
Out-of-domain eval_runtime: 46.8797
Out-of-domain eval_samples_per_second: 142.983
Out-of-domain eval_steps_per_second: 17.876
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 46.1256
In-domain eval_samples_per_second: 145.082
In-domain eval_steps_per_second: 18.146
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 1.2005068063735962
Out-of-domain eval_accuracy: 0.5757123675965985
Out-of-domain eval_runtime: 46.9122
Out-of-domain eval_samples_per_second: 142.884
Out-of-domain eval_steps_per_second: 17.863
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 46.2068
In-domain eval_samples_per_second: 144.827
In-domain eval_steps_per_second: 18.114
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 1.2005068063735962
Out-of-domain eval_accuracy: 0.5757123675965985
Out-of-domain eval_runtime: 46.8208
Out-of-domain eval_samples_per_second: 143.163
Out-of-domain eval_steps_per_second: 17.898
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 46.2283
In-domain eval_samples_per_second: 144.76
In-domain eval_steps_per_second: 18.106
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 1.2005068063735962
Out-of-domain eval_accuracy: 0.5757123675965985
Out-of-domain eval_runtime: 46.7296
Out-of-domain eval_samples_per_second: 143.442
Out-of-domain eval_steps_per_second: 17.933
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 46.1133
In-domain eval_samples_per_second: 145.121
In-domain eval_steps_per_second: 18.151
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 1.2005068063735962
Out-of-domain eval_accuracy: 0.5757123675965985
Out-of-domain eval_runtime: 46.8683
Out-of-domain eval_samples_per_second: 143.018
Out-of-domain eval_steps_per_second: 17.88
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 46.1552
In-domain eval_samples_per_second: 144.989
In-domain eval_steps_per_second: 18.134
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 1.2005068063735962
Out-of-domain eval_accuracy: 0.5757123675965985
Out-of-domain eval_runtime: 46.9056
Out-of-domain eval_samples_per_second: 142.904
Out-of-domain eval_steps_per_second: 17.866
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.2655954360961914
In-domain eval_accuracy: 0.5603705917513448
In-domain eval_runtime: 46.1685
In-domain eval_samples_per_second: 144.947
In-domain eval_steps_per_second: 18.129
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.205007314682007
Out-of-domain eval_accuracy: 0.5794420408772192
Out-of-domain eval_runtime: 46.6761
Out-of-domain eval_samples_per_second: 143.607
Out-of-domain eval_steps_per_second: 17.954
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.3906240463256836
In-domain eval_accuracy: 0.5756126718469815
In-domain eval_runtime: 46.2232
In-domain eval_samples_per_second: 144.776
In-domain eval_steps_per_second: 18.108
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.3802855014801025
Out-of-domain eval_accuracy: 0.5891391914068328
Out-of-domain eval_runtime: 47.0356
Out-of-domain eval_samples_per_second: 142.509
Out-of-domain eval_steps_per_second: 17.816
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.3906240463256836
In-domain eval_accuracy: 0.5756126718469815
In-domain eval_runtime: 46.1597
In-domain eval_samples_per_second: 144.975
In-domain eval_steps_per_second: 18.133
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.3802855014801025
Out-of-domain eval_accuracy: 0.5891391914068328
Out-of-domain eval_runtime: 46.8172
Out-of-domain eval_samples_per_second: 143.174
Out-of-domain eval_steps_per_second: 17.899
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.3906240463256836
In-domain eval_accuracy: 0.5756126718469815
In-domain eval_runtime: 46.2342
In-domain eval_samples_per_second: 144.741
In-domain eval_steps_per_second: 18.103
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.3802855014801025
Out-of-domain eval_accuracy: 0.5891391914068328
Out-of-domain eval_runtime: 46.9055
Out-of-domain eval_samples_per_second: 142.904
Out-of-domain eval_steps_per_second: 17.866
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.3906240463256836
In-domain eval_accuracy: 0.5756126718469815
In-domain eval_runtime: 46.094
In-domain eval_samples_per_second: 145.182
In-domain eval_steps_per_second: 18.159
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.3802855014801025
Out-of-domain eval_accuracy: 0.5891391914068328
Out-of-domain eval_runtime: 46.9774
Out-of-domain eval_samples_per_second: 142.686
Out-of-domain eval_steps_per_second: 17.838
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.3906240463256836
In-domain eval_accuracy: 0.5756126718469815
In-domain eval_runtime: 46.0588
In-domain eval_samples_per_second: 145.292
In-domain eval_steps_per_second: 18.172
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.3802855014801025
Out-of-domain eval_accuracy: 0.5891391914068328
Out-of-domain eval_runtime: 46.807
Out-of-domain eval_samples_per_second: 143.205
Out-of-domain eval_steps_per_second: 17.903
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.3906240463256836
In-domain eval_accuracy: 0.5756126718469815
In-domain eval_runtime: 46.1609
In-domain eval_samples_per_second: 144.971
In-domain eval_steps_per_second: 18.132
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.3802855014801025
Out-of-domain eval_accuracy: 0.5891391914068328
Out-of-domain eval_runtime: 46.7736
Out-of-domain eval_samples_per_second: 143.307
Out-of-domain eval_steps_per_second: 17.916
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.3906240463256836
In-domain eval_accuracy: 0.5756126718469815
In-domain eval_runtime: 46.1901
In-domain eval_samples_per_second: 144.879
In-domain eval_steps_per_second: 18.121
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.3802855014801025
Out-of-domain eval_accuracy: 0.5891391914068328
Out-of-domain eval_runtime: 46.9886
Out-of-domain eval_samples_per_second: 142.652
Out-of-domain eval_steps_per_second: 17.834
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.3906240463256836
In-domain eval_accuracy: 0.5756126718469815
In-domain eval_runtime: 46.1972
In-domain eval_samples_per_second: 144.857
In-domain eval_steps_per_second: 18.118
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.3802855014801025
Out-of-domain eval_accuracy: 0.5891391914068328
Out-of-domain eval_runtime: 46.8044
Out-of-domain eval_samples_per_second: 143.213
Out-of-domain eval_steps_per_second: 17.904
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.3906240463256836
In-domain eval_accuracy: 0.5756126718469815
In-domain eval_runtime: 46.2373
In-domain eval_samples_per_second: 144.732
In-domain eval_steps_per_second: 18.102
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.3802855014801025
Out-of-domain eval_accuracy: 0.5891391914068328
Out-of-domain eval_runtime: 47.1195
Out-of-domain eval_samples_per_second: 142.255
Out-of-domain eval_steps_per_second: 17.785
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.9880398511886597
In-domain eval_accuracy: 0.618350268977884
In-domain eval_runtime: 46.3394
In-domain eval_samples_per_second: 144.413
In-domain eval_steps_per_second: 18.062
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 2.0417709350585938
Out-of-domain eval_accuracy: 0.6350887662240787
Out-of-domain eval_runtime: 47.0268
Out-of-domain eval_samples_per_second: 142.536
Out-of-domain eval_steps_per_second: 17.82
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2689998149871826
In-domain eval_accuracy: 0.6170053795576809
In-domain eval_runtime: 46.0997
In-domain eval_samples_per_second: 145.163
In-domain eval_steps_per_second: 18.156
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 2.108855962753296
Out-of-domain eval_accuracy: 0.6434432343726689
Out-of-domain eval_runtime: 46.7985
Out-of-domain eval_samples_per_second: 143.231
Out-of-domain eval_steps_per_second: 17.907
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2689998149871826
In-domain eval_accuracy: 0.6170053795576809
In-domain eval_runtime: 46.0526
In-domain eval_samples_per_second: 145.312
In-domain eval_steps_per_second: 18.175
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 2.108855962753296
Out-of-domain eval_accuracy: 0.6434432343726689
Out-of-domain eval_runtime: 46.7626
Out-of-domain eval_samples_per_second: 143.341
Out-of-domain eval_steps_per_second: 17.92
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2689998149871826
In-domain eval_accuracy: 0.6170053795576809
In-domain eval_runtime: 46.249
In-domain eval_samples_per_second: 144.695
In-domain eval_steps_per_second: 18.098
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 2.108855962753296
Out-of-domain eval_accuracy: 0.6434432343726689
Out-of-domain eval_runtime: 46.9844
Out-of-domain eval_samples_per_second: 142.664
Out-of-domain eval_steps_per_second: 17.836
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2689998149871826
In-domain eval_accuracy: 0.6170053795576809
In-domain eval_runtime: 46.1954
In-domain eval_samples_per_second: 144.863
In-domain eval_steps_per_second: 18.119
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 2.108855962753296
Out-of-domain eval_accuracy: 0.6434432343726689
Out-of-domain eval_runtime: 47.0119
Out-of-domain eval_samples_per_second: 142.581
Out-of-domain eval_steps_per_second: 17.825
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2689998149871826
In-domain eval_accuracy: 0.6170053795576809
In-domain eval_runtime: 46.3113
In-domain eval_samples_per_second: 144.5
In-domain eval_steps_per_second: 18.073
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 2.108855962753296
Out-of-domain eval_accuracy: 0.6434432343726689
Out-of-domain eval_runtime: 47.1027
Out-of-domain eval_samples_per_second: 142.306
Out-of-domain eval_steps_per_second: 17.791
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2689998149871826
In-domain eval_accuracy: 0.6170053795576809
In-domain eval_runtime: 46.0793
In-domain eval_samples_per_second: 145.228
In-domain eval_steps_per_second: 18.164
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 2.108855962753296
Out-of-domain eval_accuracy: 0.6434432343726689
Out-of-domain eval_runtime: 46.9415
Out-of-domain eval_samples_per_second: 142.795
Out-of-domain eval_steps_per_second: 17.852
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2689998149871826
In-domain eval_accuracy: 0.6170053795576809
In-domain eval_runtime: 46.1346
In-domain eval_samples_per_second: 145.054
In-domain eval_steps_per_second: 18.143
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 2.108855962753296
Out-of-domain eval_accuracy: 0.6434432343726689
Out-of-domain eval_runtime: 46.8188
Out-of-domain eval_samples_per_second: 143.169
Out-of-domain eval_steps_per_second: 17.899
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2689998149871826
In-domain eval_accuracy: 0.6170053795576809
In-domain eval_runtime: 46.2622
In-domain eval_samples_per_second: 144.654
In-domain eval_steps_per_second: 18.093
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 2.108855962753296
Out-of-domain eval_accuracy: 0.6434432343726689
Out-of-domain eval_runtime: 46.986
Out-of-domain eval_samples_per_second: 142.66
Out-of-domain eval_steps_per_second: 17.835
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2689998149871826
In-domain eval_accuracy: 0.6170053795576809
In-domain eval_runtime: 46.0148
In-domain eval_samples_per_second: 145.432
In-domain eval_steps_per_second: 18.19
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 2.108855962753296
Out-of-domain eval_accuracy: 0.6434432343726689
Out-of-domain eval_runtime: 47.1113
Out-of-domain eval_samples_per_second: 142.28
Out-of-domain eval_steps_per_second: 17.788
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.4922378063201904
In-domain eval_accuracy: 0.6742378959952182
In-domain eval_runtime: 46.2127
In-domain eval_samples_per_second: 144.809
In-domain eval_steps_per_second: 18.112
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.413851499557495
Out-of-domain eval_accuracy: 0.6949127256452334
Out-of-domain eval_runtime: 47.0327
Out-of-domain eval_samples_per_second: 142.518
Out-of-domain eval_steps_per_second: 17.817
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.5386970043182373
In-domain eval_accuracy: 0.6392707710699342
In-domain eval_runtime: 46.2438
In-domain eval_samples_per_second: 144.711
In-domain eval_steps_per_second: 18.1
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.435373306274414
Out-of-domain eval_accuracy: 0.6567208712516783
Out-of-domain eval_runtime: 47.1137
Out-of-domain eval_samples_per_second: 142.273
Out-of-domain eval_steps_per_second: 17.787
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.5386970043182373
In-domain eval_accuracy: 0.6392707710699342
In-domain eval_runtime: 46.1451
In-domain eval_samples_per_second: 145.021
In-domain eval_steps_per_second: 18.138
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.435373306274414
Out-of-domain eval_accuracy: 0.6567208712516783
Out-of-domain eval_runtime: 46.8542
Out-of-domain eval_samples_per_second: 143.061
Out-of-domain eval_steps_per_second: 17.885
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.5386970043182373
In-domain eval_accuracy: 0.6392707710699342
In-domain eval_runtime: 46.1811
In-domain eval_samples_per_second: 144.908
In-domain eval_steps_per_second: 18.124
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.435373306274414
Out-of-domain eval_accuracy: 0.6567208712516783
Out-of-domain eval_runtime: 46.9152
Out-of-domain eval_samples_per_second: 142.875
Out-of-domain eval_steps_per_second: 17.862
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.5386970043182373
In-domain eval_accuracy: 0.6392707710699342
In-domain eval_runtime: 46.2318
In-domain eval_samples_per_second: 144.749
In-domain eval_steps_per_second: 18.104
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.435373306274414
Out-of-domain eval_accuracy: 0.6567208712516783
Out-of-domain eval_runtime: 46.8824
Out-of-domain eval_samples_per_second: 142.975
Out-of-domain eval_steps_per_second: 17.875
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.5386970043182373
In-domain eval_accuracy: 0.6392707710699342
In-domain eval_runtime: 46.1369
In-domain eval_samples_per_second: 145.046
In-domain eval_steps_per_second: 18.142
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.435373306274414
Out-of-domain eval_accuracy: 0.6567208712516783
Out-of-domain eval_runtime: 46.8947
Out-of-domain eval_samples_per_second: 142.937
Out-of-domain eval_steps_per_second: 17.87
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.5386970043182373
In-domain eval_accuracy: 0.6392707710699342
In-domain eval_runtime: 46.1657
In-domain eval_samples_per_second: 144.956
In-domain eval_steps_per_second: 18.13
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.435373306274414
Out-of-domain eval_accuracy: 0.6567208712516783
Out-of-domain eval_runtime: 46.9567
Out-of-domain eval_samples_per_second: 142.749
Out-of-domain eval_steps_per_second: 17.846
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.5386970043182373
In-domain eval_accuracy: 0.6392707710699342
In-domain eval_runtime: 46.0301
In-domain eval_samples_per_second: 145.383
In-domain eval_steps_per_second: 18.184
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.435373306274414
Out-of-domain eval_accuracy: 0.6567208712516783
Out-of-domain eval_runtime: 46.7824
Out-of-domain eval_samples_per_second: 143.28
Out-of-domain eval_steps_per_second: 17.913
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.5386970043182373
In-domain eval_accuracy: 0.6392707710699342
In-domain eval_runtime: 46.0334
In-domain eval_samples_per_second: 145.373
In-domain eval_steps_per_second: 18.182
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.435373306274414
Out-of-domain eval_accuracy: 0.6567208712516783
Out-of-domain eval_runtime: 46.9369
Out-of-domain eval_samples_per_second: 142.809
Out-of-domain eval_steps_per_second: 17.854
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.5386970043182373
In-domain eval_accuracy: 0.6392707710699342
In-domain eval_runtime: 46.132
In-domain eval_samples_per_second: 145.062
In-domain eval_steps_per_second: 18.144
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.435373306274414
Out-of-domain eval_accuracy: 0.6567208712516783
Out-of-domain eval_runtime: 46.8881
Out-of-domain eval_samples_per_second: 142.957
Out-of-domain eval_steps_per_second: 17.872
Out-of-domain epoch: 40.0


In [5]:
# Group the results by 'n' and compute the average and maximum performance for each group
grouped_results = results_df.groupby('n')

average_in_domain_accuracy = grouped_results['in_domain_accuracy'].mean()
maximum_in_domain_accuracy = grouped_results['in_domain_accuracy'].max()

average_out_of_domain_accuracy = grouped_results['out_of_domain_accuracy'].mean()
maximum_out_of_domain_accuracy = grouped_results['out_of_domain_accuracy'].max()

# Print the results for each 'n'
for n in n_values:
    print(f"For n={n}:")
    print(f"Average in-domain accuracy: {average_in_domain_accuracy[n]}")
    print(f"Maximum in-domain accuracy: {maximum_in_domain_accuracy[n]}")
    print(f"Average out-of-domain accuracy: {average_out_of_domain_accuracy[n]}")
    print(f"Maximum out-of-domain accuracy: {maximum_out_of_domain_accuracy[n]}")
    print("\n")

# Compute and print the overall average and maximum performance
print("Overall:")
print(f"Average in-domain accuracy: {results_df['in_domain_accuracy'].mean()}")
print(f"Maximum in-domain accuracy: {results_df['in_domain_accuracy'].max()}")
print(f"Average out-of-domain accuracy: {results_df['out_of_domain_accuracy'].mean()}")
print(f"Maximum out-of-domain accuracy: {results_df['out_of_domain_accuracy'].max()}")


For n=2:
Average in-domain accuracy: 0.5190824865511058
Maximum in-domain accuracy: 0.5219665271966527
Average out-of-domain accuracy: 0.5266298672236311
Maximum out-of-domain accuracy: 0.5299119797105774


For n=16:
Average in-domain accuracy: 0.5566347878063359
Maximum in-domain accuracy: 0.5566347878063359
Average out-of-domain accuracy: 0.5757123675965985
Maximum out-of-domain accuracy: 0.5757123675965985


For n=32:
Average in-domain accuracy: 0.5740884638374179
Maximum in-domain accuracy: 0.5756126718469815
Average out-of-domain accuracy: 0.5881694763538714
Maximum out-of-domain accuracy: 0.5891391914068328


For n=64:
Average in-domain accuracy: 0.6171398684997011
Maximum in-domain accuracy: 0.618350268977884
Average out-of-domain accuracy: 0.6426077875578099
Maximum out-of-domain accuracy: 0.6434432343726689


For n=128:
Average in-domain accuracy: 0.6427674835624626
Maximum in-domain accuracy: 0.6742378959952182
Average out-of-domain accuracy: 0.6605400566910339
Maximum out-of

In [6]:
# Save the DataFrame to a CSV file
results_df.to_csv("../Results/pbft_mnli_baseline_results.csv", index=False)