# **Few shot Fine Tuning on MNLI**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/DL_project/llm_finetuning/notebooks/

/content/drive/MyDrive/DL_project/llm_finetuning/notebooks


In [None]:
!pip install -q transformers accelerate bitsandbytes datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25h

## **Experimental setup**

**Fewshot setup :** Each n example uses 10 different sets of training to avoid bias

N ranges {2, 16, 32, 64, 128}

In [None]:
# load dataset
from datasets import load_dataset, ClassLabel
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import torch

# for reproducibility
np.random.seed(42)

torch.manual_seed(42)

if torch.cuda.is_available():
  torch.cuda.manual_seed_all(42)

data = load_dataset("glue", "mnli")
hans_data = load_dataset("hans")
#print(hans_data.keys())

#Below function is taken from: https://github.com/uds-lsv/llmft/blob/main/notebooks/majority_baseline.ipynb
def binarize_mnli(dataset, remove_neutral=True):
    if remove_neutral:
        # neutral class has label 1
        dataset = dataset.filter(lambda example: example["label"] != 1)

    # change labels of contradiction examples from 2 to 1
    def change_label(example):
        # convert labels 2 into labels 1. this merges the neutral and contradiction class
        example["label"] = 1 if example["label"] == 2 else example["label"]
        return example

    # change labels
    dataset = dataset.map(change_label)

    # change features to reflect the new labels
    features = dataset["train"].features.copy()
    features["label"] = ClassLabel(num_classes=2, names=['entailment', 'contradiction'], id=None)
    dataset = dataset.cast(features)  # overwrite old features

    return dataset

data = binarize_mnli(data, remove_neutral=True)

#function for computing accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

# Prepare training data and define training config
#!mkdir offload_folder

from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, AdamW, AutoConfig

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

# Change num_labels to 2 and drop-out hyperparam = 0.1
config = AutoConfig.from_pretrained("facebook/opt-125m", num_labels=2, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1)


def manipulate_inputs(batch):
    # Add pattern and verbalizer
    encoding = tokenizer([f'{premise} Question: {hypothesis} Yes or No?' for premise, hypothesis in zip(batch["premise"], batch["hypothesis"])], truncation=True, padding="max_length", max_length=128)
    batch["input_ids"] = encoding["input_ids"]  # Already a list
    batch["attention_mask"] = encoding["attention_mask"]  # Already a list
    return batch

data = data.map(manipulate_inputs, batched=True)
hans_data = hans_data.map(manipulate_inputs, batched=True)

# Few-shot setup
n_values = [2, 16, 32, 64, 128]  # number of examples for each class

# Create a DataFrame to store the results
results_df = pd.DataFrame(columns=["n", "run", "in_domain_accuracy", "out_of_domain_accuracy"])


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for n in n_values:
    for run in range(10):  # repeat 10 times for each n

        # Create a copy of the dataset
        data_copy = data.copy()

        # Select n random examples for each class from the original data
        indices_yes = np.where(np.array(data["train"]["label"]) == 0)[0]
        indices_no = np.where(np.array(data["train"]["label"]) == 1)[0]
        indices_yes = np.random.choice(indices_yes, n, replace=False)
        indices_no = np.random.choice(indices_no, n, replace=False)
        indices = np.concatenate([indices_yes, indices_no])

        # Select the examples for the new training set
        train_dataset = data["train"].select(indices)

        # Re-initialize the model at the start of each training cycle
        model = AutoModelForSequenceClassification.from_pretrained("facebook/opt-125m", config=config)
        model.to(device)  # Move the model to the device (gpu if available)

        # Training config
        total_steps = (len(train_dataset) // 32) * 40 # Total steps = (#samples/batch size) * epochs

        training_args = TrainingArguments(
            output_dir="./offload_folder",
            overwrite_output_dir=True,
            num_train_epochs=40,
            per_device_train_batch_size=32,
            learning_rate=1e-5,
            weight_decay=0.0,
            save_steps=10_000,
            save_total_limit=2,
            warmup_steps=int(0.1 * total_steps), # Warmup ratio = 10% of total steps
        )

        # Define the trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            compute_metrics=compute_metrics,
        )

        # Train the model
        trainer.train()

        # Evaluate in-domain performance
        print(f"Evaluating in-domain performance for n={n}...")
        eval_results = trainer.evaluate(eval_dataset=data["validation_matched"])

        # Store the in-domain accuracy
        in_domain_accuracy = eval_results["eval_accuracy"]

        # Print the in-domain evaluation results
        for key, value in eval_results.items():
            print(f"In-domain {key}: {value}")

        # Evaluate out-of-domain performance using HANS dataset
        print(f"Evaluating out-of-domain performance for n={n}...")
        eval_results = trainer.evaluate(eval_dataset=hans_data["validation"])

        # Store the out-of-domain accuracy
        out_of_domain_accuracy = eval_results["eval_accuracy"]

        # Print the out-of-domain evaluation results
        for key, value in eval_results.items():
            print(f"Out-of-domain {key}: {value}")

        # Add the results to the DataFrame
        new_row = pd.DataFrame({
            "n": [n],
            "run": [run],
            "in_domain_accuracy": [in_domain_accuracy],
            "out_of_domain_accuracy": [out_of_domain_accuracy]
        })
        results_df = pd.concat([results_df, new_row], ignore_index=True)


Map:   0%|          | 0/6703 [00:00<?, ? examples/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 2.0749220848083496
In-domain eval_accuracy: 0.4931261207411835
In-domain eval_runtime: 43.7079
In-domain eval_samples_per_second: 153.107
In-domain eval_steps_per_second: 19.15
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 1.7965325117111206
Out-of-domain eval_accuracy: 0.5012666666666666
Out-of-domain eval_runtime: 201.998
Out-of-domain eval_samples_per_second: 148.516
Out-of-domain eval_steps_per_second: 18.565
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.0014705657958984
In-domain eval_accuracy: 0.5219665271966527
In-domain eval_runtime: 44.656
In-domain eval_samples_per_second: 149.857
In-domain eval_steps_per_second: 18.743
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.7827005386352539
Out-of-domain eval_accuracy: 0.4927
Out-of-domain eval_runtime: 201.8755
Out-of-domain eval_samples_per_second: 148.606
Out-of-domain eval_steps_per_second: 18.576
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.0014705657958984
In-domain eval_accuracy: 0.5219665271966527
In-domain eval_runtime: 44.6923
In-domain eval_samples_per_second: 149.735
In-domain eval_steps_per_second: 18.728
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.7827005386352539
Out-of-domain eval_accuracy: 0.4927
Out-of-domain eval_runtime: 201.7595
Out-of-domain eval_samples_per_second: 148.692
Out-of-domain eval_steps_per_second: 18.586
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.0014705657958984
In-domain eval_accuracy: 0.5219665271966527
In-domain eval_runtime: 44.7097
In-domain eval_samples_per_second: 149.677
In-domain eval_steps_per_second: 18.721
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.7827005386352539
Out-of-domain eval_accuracy: 0.4927
Out-of-domain eval_runtime: 201.8927
Out-of-domain eval_samples_per_second: 148.594
Out-of-domain eval_steps_per_second: 18.574
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.0014705657958984
In-domain eval_accuracy: 0.5219665271966527
In-domain eval_runtime: 44.6532
In-domain eval_samples_per_second: 149.866
In-domain eval_steps_per_second: 18.744
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.7827005386352539
Out-of-domain eval_accuracy: 0.4927
Out-of-domain eval_runtime: 201.9264
Out-of-domain eval_samples_per_second: 148.569
Out-of-domain eval_steps_per_second: 18.571
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.0014705657958984
In-domain eval_accuracy: 0.5219665271966527
In-domain eval_runtime: 44.7104
In-domain eval_samples_per_second: 149.674
In-domain eval_steps_per_second: 18.72
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.7827005386352539
Out-of-domain eval_accuracy: 0.4927
Out-of-domain eval_runtime: 202.1127
Out-of-domain eval_samples_per_second: 148.432
Out-of-domain eval_steps_per_second: 18.554
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.0014705657958984
In-domain eval_accuracy: 0.5219665271966527
In-domain eval_runtime: 44.6755
In-domain eval_samples_per_second: 149.791
In-domain eval_steps_per_second: 18.735
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.7827005386352539
Out-of-domain eval_accuracy: 0.4927
Out-of-domain eval_runtime: 202.0983
Out-of-domain eval_samples_per_second: 148.443
Out-of-domain eval_steps_per_second: 18.555
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.0014705657958984
In-domain eval_accuracy: 0.5219665271966527
In-domain eval_runtime: 44.6706
In-domain eval_samples_per_second: 149.808
In-domain eval_steps_per_second: 18.737
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.7827005386352539
Out-of-domain eval_accuracy: 0.4927
Out-of-domain eval_runtime: 201.7465
Out-of-domain eval_samples_per_second: 148.701
Out-of-domain eval_steps_per_second: 18.588
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.0014705657958984
In-domain eval_accuracy: 0.5219665271966527
In-domain eval_runtime: 44.6915
In-domain eval_samples_per_second: 149.738
In-domain eval_steps_per_second: 18.728
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.7827005386352539
Out-of-domain eval_accuracy: 0.4927
Out-of-domain eval_runtime: 201.7876
Out-of-domain eval_samples_per_second: 148.671
Out-of-domain eval_steps_per_second: 18.584
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.0014705657958984
In-domain eval_accuracy: 0.5219665271966527
In-domain eval_runtime: 44.6726
In-domain eval_samples_per_second: 149.801
In-domain eval_steps_per_second: 18.736
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.7827005386352539
Out-of-domain eval_accuracy: 0.4927
Out-of-domain eval_runtime: 201.8173
Out-of-domain eval_samples_per_second: 148.649
Out-of-domain eval_steps_per_second: 18.581
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 44.7232
In-domain eval_samples_per_second: 149.631
In-domain eval_steps_per_second: 18.715
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.322904348373413
Out-of-domain eval_accuracy: 0.4991333333333333
Out-of-domain eval_runtime: 202.1088
Out-of-domain eval_samples_per_second: 148.435
Out-of-domain eval_steps_per_second: 18.554
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 44.7539
In-domain eval_samples_per_second: 149.529
In-domain eval_steps_per_second: 18.702
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.322904348373413
Out-of-domain eval_accuracy: 0.4991333333333333
Out-of-domain eval_runtime: 202.0759
Out-of-domain eval_samples_per_second: 148.459
Out-of-domain eval_steps_per_second: 18.557
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 44.7227
In-domain eval_samples_per_second: 149.633
In-domain eval_steps_per_second: 18.715
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.322904348373413
Out-of-domain eval_accuracy: 0.4991333333333333
Out-of-domain eval_runtime: 202.1806
Out-of-domain eval_samples_per_second: 148.382
Out-of-domain eval_steps_per_second: 18.548
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 44.7713
In-domain eval_samples_per_second: 149.471
In-domain eval_steps_per_second: 18.695
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.322904348373413
Out-of-domain eval_accuracy: 0.4991333333333333
Out-of-domain eval_runtime: 202.1369
Out-of-domain eval_samples_per_second: 148.414
Out-of-domain eval_steps_per_second: 18.552
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 44.723
In-domain eval_samples_per_second: 149.632
In-domain eval_steps_per_second: 18.715
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.322904348373413
Out-of-domain eval_accuracy: 0.4991333333333333
Out-of-domain eval_runtime: 201.9646
Out-of-domain eval_samples_per_second: 148.541
Out-of-domain eval_steps_per_second: 18.568
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 44.7369
In-domain eval_samples_per_second: 149.586
In-domain eval_steps_per_second: 18.709
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.322904348373413
Out-of-domain eval_accuracy: 0.4991333333333333
Out-of-domain eval_runtime: 201.9936
Out-of-domain eval_samples_per_second: 148.52
Out-of-domain eval_steps_per_second: 18.565
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 44.7206
In-domain eval_samples_per_second: 149.64
In-domain eval_steps_per_second: 18.716
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.322904348373413
Out-of-domain eval_accuracy: 0.4991333333333333
Out-of-domain eval_runtime: 202.1486
Out-of-domain eval_samples_per_second: 148.406
Out-of-domain eval_steps_per_second: 18.551
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 44.7407
In-domain eval_samples_per_second: 149.573
In-domain eval_steps_per_second: 18.708
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.322904348373413
Out-of-domain eval_accuracy: 0.4991333333333333
Out-of-domain eval_runtime: 202.0677
Out-of-domain eval_samples_per_second: 148.465
Out-of-domain eval_steps_per_second: 18.558
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 44.7456
In-domain eval_samples_per_second: 149.557
In-domain eval_steps_per_second: 18.706
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.322904348373413
Out-of-domain eval_accuracy: 0.4991333333333333
Out-of-domain eval_runtime: 201.7779
Out-of-domain eval_samples_per_second: 148.678
Out-of-domain eval_steps_per_second: 18.585
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.1177140474319458
In-domain eval_accuracy: 0.5566347878063359
In-domain eval_runtime: 44.7691
In-domain eval_samples_per_second: 149.478
In-domain eval_steps_per_second: 18.696
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.322904348373413
Out-of-domain eval_accuracy: 0.4991333333333333
Out-of-domain eval_runtime: 201.9848
Out-of-domain eval_samples_per_second: 148.526
Out-of-domain eval_steps_per_second: 18.566
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.2655954360961914
In-domain eval_accuracy: 0.5603705917513448
In-domain eval_runtime: 44.72
In-domain eval_samples_per_second: 149.642
In-domain eval_steps_per_second: 18.716
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 4.422755241394043
Out-of-domain eval_accuracy: 0.499
Out-of-domain eval_runtime: 202.0686
Out-of-domain eval_samples_per_second: 148.464
Out-of-domain eval_steps_per_second: 18.558
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.3906240463256836
In-domain eval_accuracy: 0.5756126718469815
In-domain eval_runtime: 44.6917
In-domain eval_samples_per_second: 149.737
In-domain eval_steps_per_second: 18.728
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 4.525134563446045
Out-of-domain eval_accuracy: 0.5000333333333333
Out-of-domain eval_runtime: 201.9796
Out-of-domain eval_samples_per_second: 148.53
Out-of-domain eval_steps_per_second: 18.566
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.3906240463256836
In-domain eval_accuracy: 0.5756126718469815
In-domain eval_runtime: 44.6775
In-domain eval_samples_per_second: 149.785
In-domain eval_steps_per_second: 18.734
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 4.525134563446045
Out-of-domain eval_accuracy: 0.5000333333333333
Out-of-domain eval_runtime: 202.1557
Out-of-domain eval_samples_per_second: 148.4
Out-of-domain eval_steps_per_second: 18.55
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.3906240463256836
In-domain eval_accuracy: 0.5756126718469815
In-domain eval_runtime: 44.7113
In-domain eval_samples_per_second: 149.671
In-domain eval_steps_per_second: 18.72
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 4.525134563446045
Out-of-domain eval_accuracy: 0.5000333333333333
Out-of-domain eval_runtime: 202.1746
Out-of-domain eval_samples_per_second: 148.387
Out-of-domain eval_steps_per_second: 18.548
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.3906240463256836
In-domain eval_accuracy: 0.5756126718469815
In-domain eval_runtime: 44.7726
In-domain eval_samples_per_second: 149.466
In-domain eval_steps_per_second: 18.694
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 4.525134563446045
Out-of-domain eval_accuracy: 0.5000333333333333
Out-of-domain eval_runtime: 201.9316
Out-of-domain eval_samples_per_second: 148.565
Out-of-domain eval_steps_per_second: 18.571
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.3906240463256836
In-domain eval_accuracy: 0.5756126718469815
In-domain eval_runtime: 44.7159
In-domain eval_samples_per_second: 149.656
In-domain eval_steps_per_second: 18.718
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 4.525134563446045
Out-of-domain eval_accuracy: 0.5000333333333333
Out-of-domain eval_runtime: 201.7547
Out-of-domain eval_samples_per_second: 148.695
Out-of-domain eval_steps_per_second: 18.587
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.3906240463256836
In-domain eval_accuracy: 0.5756126718469815
In-domain eval_runtime: 44.6611
In-domain eval_samples_per_second: 149.84
In-domain eval_steps_per_second: 18.741
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 4.525134563446045
Out-of-domain eval_accuracy: 0.5000333333333333
Out-of-domain eval_runtime: 202.0822
Out-of-domain eval_samples_per_second: 148.454
Out-of-domain eval_steps_per_second: 18.557
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.3906240463256836
In-domain eval_accuracy: 0.5756126718469815
In-domain eval_runtime: 44.8122
In-domain eval_samples_per_second: 149.334
In-domain eval_steps_per_second: 18.678
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 4.525134563446045
Out-of-domain eval_accuracy: 0.5000333333333333
Out-of-domain eval_runtime: 202.2646
Out-of-domain eval_samples_per_second: 148.321
Out-of-domain eval_steps_per_second: 18.54
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.3906240463256836
In-domain eval_accuracy: 0.5756126718469815
In-domain eval_runtime: 44.7409
In-domain eval_samples_per_second: 149.572
In-domain eval_steps_per_second: 18.708
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 4.525134563446045
Out-of-domain eval_accuracy: 0.5000333333333333
Out-of-domain eval_runtime: 201.875
Out-of-domain eval_samples_per_second: 148.607
Out-of-domain eval_steps_per_second: 18.576
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.3906240463256836
In-domain eval_accuracy: 0.5756126718469815
In-domain eval_runtime: 44.6875
In-domain eval_samples_per_second: 149.751
In-domain eval_steps_per_second: 18.73
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 4.525134563446045
Out-of-domain eval_accuracy: 0.5000333333333333
Out-of-domain eval_runtime: 202.202
Out-of-domain eval_samples_per_second: 148.366
Out-of-domain eval_steps_per_second: 18.546
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.9880398511886597
In-domain eval_accuracy: 0.618350268977884
In-domain eval_runtime: 44.6988
In-domain eval_samples_per_second: 149.713
In-domain eval_steps_per_second: 18.725
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.546670913696289
Out-of-domain eval_accuracy: 0.4996333333333333
Out-of-domain eval_runtime: 201.9361
Out-of-domain eval_samples_per_second: 148.562
Out-of-domain eval_steps_per_second: 18.57
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2689998149871826
In-domain eval_accuracy: 0.6170053795576809
In-domain eval_runtime: 44.6546
In-domain eval_samples_per_second: 149.861
In-domain eval_steps_per_second: 18.744
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 5.408353805541992
Out-of-domain eval_accuracy: 0.5
Out-of-domain eval_runtime: 202.1504
Out-of-domain eval_samples_per_second: 148.404
Out-of-domain eval_steps_per_second: 18.551
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2689998149871826
In-domain eval_accuracy: 0.6170053795576809
In-domain eval_runtime: 44.6635
In-domain eval_samples_per_second: 149.831
In-domain eval_steps_per_second: 18.74
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 5.408353805541992
Out-of-domain eval_accuracy: 0.5
Out-of-domain eval_runtime: 202.0759
Out-of-domain eval_samples_per_second: 148.459
Out-of-domain eval_steps_per_second: 18.557
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2689998149871826
In-domain eval_accuracy: 0.6170053795576809
In-domain eval_runtime: 44.7261
In-domain eval_samples_per_second: 149.622
In-domain eval_steps_per_second: 18.714
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 5.408353805541992
Out-of-domain eval_accuracy: 0.5
Out-of-domain eval_runtime: 201.9591
Out-of-domain eval_samples_per_second: 148.545
Out-of-domain eval_steps_per_second: 18.568
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2689998149871826
In-domain eval_accuracy: 0.6170053795576809
In-domain eval_runtime: 44.6834
In-domain eval_samples_per_second: 149.765
In-domain eval_steps_per_second: 18.732
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 5.408353805541992
Out-of-domain eval_accuracy: 0.5
Out-of-domain eval_runtime: 202.133
Out-of-domain eval_samples_per_second: 148.417
Out-of-domain eval_steps_per_second: 18.552
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2689998149871826
In-domain eval_accuracy: 0.6170053795576809
In-domain eval_runtime: 44.7108
In-domain eval_samples_per_second: 149.673
In-domain eval_steps_per_second: 18.72
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 5.408353805541992
Out-of-domain eval_accuracy: 0.5
Out-of-domain eval_runtime: 201.9154
Out-of-domain eval_samples_per_second: 148.577
Out-of-domain eval_steps_per_second: 18.572
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2689998149871826
In-domain eval_accuracy: 0.6170053795576809
In-domain eval_runtime: 44.6554
In-domain eval_samples_per_second: 149.859
In-domain eval_steps_per_second: 18.744
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 5.408353805541992
Out-of-domain eval_accuracy: 0.5
Out-of-domain eval_runtime: 202.0687
Out-of-domain eval_samples_per_second: 148.464
Out-of-domain eval_steps_per_second: 18.558
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2689998149871826
In-domain eval_accuracy: 0.6170053795576809
In-domain eval_runtime: 44.6736
In-domain eval_samples_per_second: 149.798
In-domain eval_steps_per_second: 18.736
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 5.408353805541992
Out-of-domain eval_accuracy: 0.5
Out-of-domain eval_runtime: 201.9305
Out-of-domain eval_samples_per_second: 148.566
Out-of-domain eval_steps_per_second: 18.571
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2689998149871826
In-domain eval_accuracy: 0.6170053795576809
In-domain eval_runtime: 44.6513
In-domain eval_samples_per_second: 149.873
In-domain eval_steps_per_second: 18.745
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 5.408353805541992
Out-of-domain eval_accuracy: 0.5
Out-of-domain eval_runtime: 202.0526
Out-of-domain eval_samples_per_second: 148.476
Out-of-domain eval_steps_per_second: 18.56
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2689998149871826
In-domain eval_accuracy: 0.6170053795576809
In-domain eval_runtime: 44.6665
In-domain eval_samples_per_second: 149.821
In-domain eval_steps_per_second: 18.739
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 5.408353805541992
Out-of-domain eval_accuracy: 0.5
Out-of-domain eval_runtime: 202.0512
Out-of-domain eval_samples_per_second: 148.477
Out-of-domain eval_steps_per_second: 18.56
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.4922378063201904
In-domain eval_accuracy: 0.6742378959952182
In-domain eval_runtime: 44.6657
In-domain eval_samples_per_second: 149.824
In-domain eval_steps_per_second: 18.739
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 6.620100498199463
Out-of-domain eval_accuracy: 0.5
Out-of-domain eval_runtime: 202.1877
Out-of-domain eval_samples_per_second: 148.377
Out-of-domain eval_steps_per_second: 18.547
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.5386970043182373
In-domain eval_accuracy: 0.6392707710699342
In-domain eval_runtime: 44.7464
In-domain eval_samples_per_second: 149.554
In-domain eval_steps_per_second: 18.705
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 5.317244052886963
Out-of-domain eval_accuracy: 0.5012666666666666
Out-of-domain eval_runtime: 201.9943
Out-of-domain eval_samples_per_second: 148.519
Out-of-domain eval_steps_per_second: 18.565
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.5386970043182373
In-domain eval_accuracy: 0.6392707710699342
In-domain eval_runtime: 44.7355
In-domain eval_samples_per_second: 149.591
In-domain eval_steps_per_second: 18.71
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 5.317244052886963
Out-of-domain eval_accuracy: 0.5012666666666666
Out-of-domain eval_runtime: 202.1992
Out-of-domain eval_samples_per_second: 148.369
Out-of-domain eval_steps_per_second: 18.546
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.5386970043182373
In-domain eval_accuracy: 0.6392707710699342
In-domain eval_runtime: 44.7394
In-domain eval_samples_per_second: 149.577
In-domain eval_steps_per_second: 18.708
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 5.317244052886963
Out-of-domain eval_accuracy: 0.5012666666666666
Out-of-domain eval_runtime: 202.0355
Out-of-domain eval_samples_per_second: 148.489
Out-of-domain eval_steps_per_second: 18.561
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.5386970043182373
In-domain eval_accuracy: 0.6392707710699342
In-domain eval_runtime: 44.6259
In-domain eval_samples_per_second: 149.958
In-domain eval_steps_per_second: 18.756
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 5.317244052886963
Out-of-domain eval_accuracy: 0.5012666666666666
Out-of-domain eval_runtime: 202.0047
Out-of-domain eval_samples_per_second: 148.511
Out-of-domain eval_steps_per_second: 18.564
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.5386970043182373
In-domain eval_accuracy: 0.6392707710699342
In-domain eval_runtime: 44.6371
In-domain eval_samples_per_second: 149.92
In-domain eval_steps_per_second: 18.751
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 5.317244052886963
Out-of-domain eval_accuracy: 0.5012666666666666
Out-of-domain eval_runtime: 202.0722
Out-of-domain eval_samples_per_second: 148.462
Out-of-domain eval_steps_per_second: 18.558
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.5386970043182373
In-domain eval_accuracy: 0.6392707710699342
In-domain eval_runtime: 44.7085
In-domain eval_samples_per_second: 149.681
In-domain eval_steps_per_second: 18.721
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 5.317244052886963
Out-of-domain eval_accuracy: 0.5012666666666666
Out-of-domain eval_runtime: 202.1344
Out-of-domain eval_samples_per_second: 148.416
Out-of-domain eval_steps_per_second: 18.552
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.5386970043182373
In-domain eval_accuracy: 0.6392707710699342
In-domain eval_runtime: 44.7151
In-domain eval_samples_per_second: 149.659
In-domain eval_steps_per_second: 18.719
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 5.317244052886963
Out-of-domain eval_accuracy: 0.5012666666666666
Out-of-domain eval_runtime: 202.2978
Out-of-domain eval_samples_per_second: 148.296
Out-of-domain eval_steps_per_second: 18.537
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.5386970043182373
In-domain eval_accuracy: 0.6392707710699342
In-domain eval_runtime: 44.7248
In-domain eval_samples_per_second: 149.626
In-domain eval_steps_per_second: 18.714
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 5.317244052886963
Out-of-domain eval_accuracy: 0.5012666666666666
Out-of-domain eval_runtime: 202.0892
Out-of-domain eval_samples_per_second: 148.449
Out-of-domain eval_steps_per_second: 18.556
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.5386970043182373
In-domain eval_accuracy: 0.6392707710699342
In-domain eval_runtime: 44.7026
In-domain eval_samples_per_second: 149.701
In-domain eval_steps_per_second: 18.724
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 5.317244052886963
Out-of-domain eval_accuracy: 0.5012666666666666
Out-of-domain eval_runtime: 202.3504
Out-of-domain eval_samples_per_second: 148.258
Out-of-domain eval_steps_per_second: 18.532
Out-of-domain epoch: 40.0


In [None]:
# Group the results by 'n' and compute the average and maximum performance for each group
grouped_results = results_df.groupby('n')

average_in_domain_accuracy = grouped_results['in_domain_accuracy'].mean()
maximum_in_domain_accuracy = grouped_results['in_domain_accuracy'].max()

average_out_of_domain_accuracy = grouped_results['out_of_domain_accuracy'].mean()
maximum_out_of_domain_accuracy = grouped_results['out_of_domain_accuracy'].max()

# Print the results for each 'n'
for n in n_values:
    print(f"For n={n}:")
    print(f"Average in-domain accuracy: {average_in_domain_accuracy[n]}")
    print(f"Maximum in-domain accuracy: {maximum_in_domain_accuracy[n]}")
    print(f"Average out-of-domain accuracy: {average_out_of_domain_accuracy[n]}")
    print(f"Maximum out-of-domain accuracy: {maximum_out_of_domain_accuracy[n]}")
    print("\n")

# Compute and print the overall average and maximum performance
print("Overall:")
print(f"Average in-domain accuracy: {results_df['in_domain_accuracy'].mean()}")
print(f"Maximum in-domain accuracy: {results_df['in_domain_accuracy'].max()}")
print(f"Average out-of-domain accuracy: {results_df['out_of_domain_accuracy'].mean()}")
print(f"Maximum out-of-domain accuracy: {results_df['out_of_domain_accuracy'].max()}")


For n=2:
Average in-domain accuracy: 0.5190824865511058
Maximum in-domain accuracy: 0.5219665271966527
Average out-of-domain accuracy: 0.49355666666666664
Maximum out-of-domain accuracy: 0.5012666666666666


For n=16:
Average in-domain accuracy: 0.5566347878063359
Maximum in-domain accuracy: 0.5566347878063359
Average out-of-domain accuracy: 0.4991333333333333
Maximum out-of-domain accuracy: 0.4991333333333333


For n=32:
Average in-domain accuracy: 0.5740884638374179
Maximum in-domain accuracy: 0.5756126718469815
Average out-of-domain accuracy: 0.49993
Maximum out-of-domain accuracy: 0.5000333333333333


For n=64:
Average in-domain accuracy: 0.6171398684997011
Maximum in-domain accuracy: 0.618350268977884
Average out-of-domain accuracy: 0.49996333333333337
Maximum out-of-domain accuracy: 0.5


For n=128:
Average in-domain accuracy: 0.6427674835624626
Maximum in-domain accuracy: 0.6742378959952182
Average out-of-domain accuracy: 0.50114
Maximum out-of-domain accuracy: 0.501266666666666

In [None]:
# Save the DataFrame to a CSV file
results_df.to_csv("../Results/pbft_mnli_baseline_hansOOD.csv", index=False)