# **Few shot Fine Tuning on MNLI**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/MyDrive/DL_project/llm_finetuning/notebooks/

/content/drive/MyDrive/DL_project/llm_finetuning/notebooks


In [3]:
!pip install -q transformers accelerate bitsandbytes datasets

## **Experimental setup**

**Fewshot setup :** Each n example uses 10 different sets of training to avoid bias

N ranges {2, 16, 32, 64, 128}

In [4]:
# load dataset
from datasets import load_dataset, ClassLabel
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import torch

# for reproducibility
np.random.seed(42)

torch.manual_seed(42)

if torch.cuda.is_available():
  torch.cuda.manual_seed_all(42)

data = load_dataset("glue", "mnli")

#Below function is taken from: https://github.com/uds-lsv/llmft/blob/main/notebooks/majority_baseline.ipynb
def binarize_mnli(dataset, remove_neutral=True):
    if remove_neutral:
        # neutral class has label 1
        dataset = dataset.filter(lambda example: example["label"] != 1)

    # change labels of contradiction examples from 2 to 1
    def change_label(example):
        # convert labels 2 into labels 1. this merges the neutral and contradiction class
        example["label"] = 1 if example["label"] == 2 else example["label"]
        return example

    # change labels
    dataset = dataset.map(change_label)

    # change features to reflect the new labels
    features = dataset["train"].features.copy()
    features["label"] = ClassLabel(num_classes=2, names=['entailment', 'contradiction'], id=None)
    dataset = dataset.cast(features)  # overwrite old features

    return dataset

data = binarize_mnli(data, remove_neutral=True)


#function for computing accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

# Prepare training data and define training config
#!mkdir offload_folder

from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, AdamW, AutoConfig

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

# Change num_labels to 2 and drop-out hyperparam = 0.1
config = AutoConfig.from_pretrained("facebook/opt-125m", num_labels=2, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1)


def manipulate_inputs(batch):
    # Add pattern and verbalizer
    encoding = tokenizer(batch["premise"], batch["hypothesis"],
                         truncation=True, padding="max_length", max_length=128)
    batch["input_ids"] = encoding["input_ids"]  # Already a list
    batch["attention_mask"] = encoding["attention_mask"]  # Already a list
    return batch

data = data.map(manipulate_inputs, batched=True)

# Few-shot setup
n_values = [2, 16, 32, 64, 128]  # number of examples for each class

# Create a DataFrame to store the results
results_df = pd.DataFrame(columns=["n", "run", "in_domain_accuracy", "out_of_domain_accuracy"])


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for n in n_values:
    for run in range(10):  # repeat 10 times for each n

        # Create a copy of the dataset
        data_copy = data.copy()

        # Select n random examples for each class from the original data
        indices_yes = np.where(np.array(data["train"]["label"]) == 0)[0]
        indices_no = np.where(np.array(data["train"]["label"]) == 1)[0]
        indices_yes = np.random.choice(indices_yes, n, replace=False)
        indices_no = np.random.choice(indices_no, n, replace=False)
        indices = np.concatenate([indices_yes, indices_no])

        # Select the examples for the new training set
        train_dataset = data["train"].select(indices)

        # Re-initialize the model at the start of each training cycle
        model = AutoModelForSequenceClassification.from_pretrained("facebook/opt-125m", config=config)
        model.to(device)  # Move the model to the device (gpu if available)

        # Training config
        total_steps = (len(train_dataset) // 32) * 40 # Total steps = (#samples/batch size) * epochs

        training_args = TrainingArguments(
            output_dir="./offload_folder",
            overwrite_output_dir=True,
            num_train_epochs=40,
            per_device_train_batch_size=32,
            learning_rate=1e-5,
            weight_decay=0.0,
            save_steps=10_000,
            save_total_limit=2,
            warmup_steps=int(0.1 * total_steps), # Warmup ratio = 10% of total steps
        )

        # Define the trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            compute_metrics=compute_metrics,
        )

        # Train the model
        trainer.train()

        # Evaluate in-domain performance
        print(f"Evaluating in-domain performance for n={n}...")
        eval_results = trainer.evaluate(eval_dataset=data["validation_matched"])

        # Store the in-domain accuracy
        in_domain_accuracy = eval_results["eval_accuracy"]

        # Print the in-domain evaluation results
        for key, value in eval_results.items():
            print(f"In-domain {key}: {value}")

        # Evaluate out-of-domain performance
        print(f"Evaluating out-of-domain performance for n={n}...")
        eval_results = trainer.evaluate(eval_dataset=data["validation_mismatched"])

        # Store the out-of-domain accuracy
        out_of_domain_accuracy = eval_results["eval_accuracy"]

        # Print the out-of-domain evaluation results
        for key, value in eval_results.items():
            print(f"Out-of-domain {key}: {value}")

        # Add the results to the DataFrame
        new_row = pd.DataFrame({
            "n": [n],
            "run": [run],
            "in_domain_accuracy": [in_domain_accuracy],
            "out_of_domain_accuracy": [out_of_domain_accuracy]
        })
        results_df = pd.concat([results_df, new_row], ignore_index=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

Filter:   0%|          | 0/392702 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9815 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9832 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9796 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9847 [00:00<?, ? examples/s]

Map:   0%|          | 0/261802 [00:00<?, ? examples/s]

Map:   0%|          | 0/6692 [00:00<?, ? examples/s]

Map:   0%|          | 0/6703 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/261802 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6692 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6703 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9796 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9847 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Map:   0%|          | 0/261802 [00:00<?, ? examples/s]

Map:   0%|          | 0/6692 [00:00<?, ? examples/s]

Map:   0%|          | 0/6703 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.3966827392578125
In-domain eval_accuracy: 0.49222952779438134
In-domain eval_runtime: 74.5082
In-domain eval_samples_per_second: 89.816
In-domain eval_steps_per_second: 11.234
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 1.3621662855148315
Out-of-domain eval_accuracy: 0.49500223780396835
Out-of-domain eval_runtime: 86.8924
Out-of-domain eval_samples_per_second: 77.141
Out-of-domain eval_steps_per_second: 9.644
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.7628675103187561
In-domain eval_accuracy: 0.551255230125523
In-domain eval_runtime: 82.6139
In-domain eval_samples_per_second: 81.003
In-domain eval_steps_per_second: 10.131
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.7241039276123047
Out-of-domain eval_accuracy: 0.5725794420408772
Out-of-domain eval_runtime: 83.4639
Out-of-domain eval_samples_per_second: 80.31
Out-of-domain eval_steps_per_second: 10.04
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.7628675103187561
In-domain eval_accuracy: 0.551255230125523
In-domain eval_runtime: 90.7994
In-domain eval_samples_per_second: 73.701
In-domain eval_steps_per_second: 9.218
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.7241039276123047
Out-of-domain eval_accuracy: 0.5725794420408772
Out-of-domain eval_runtime: 82.6296
Out-of-domain eval_samples_per_second: 81.121
Out-of-domain eval_steps_per_second: 10.142
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.7628675103187561
In-domain eval_accuracy: 0.551255230125523
In-domain eval_runtime: 85.0492
In-domain eval_samples_per_second: 78.684
In-domain eval_steps_per_second: 9.841
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.7241039276123047
Out-of-domain eval_accuracy: 0.5725794420408772
Out-of-domain eval_runtime: 87.3131
Out-of-domain eval_samples_per_second: 76.77
Out-of-domain eval_steps_per_second: 9.598
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.7628675103187561
In-domain eval_accuracy: 0.551255230125523
In-domain eval_runtime: 82.3415
In-domain eval_samples_per_second: 81.271
In-domain eval_steps_per_second: 10.165
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.7241039276123047
Out-of-domain eval_accuracy: 0.5725794420408772
Out-of-domain eval_runtime: 83.828
Out-of-domain eval_samples_per_second: 79.961
Out-of-domain eval_steps_per_second: 9.997
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.7628675103187561
In-domain eval_accuracy: 0.551255230125523
In-domain eval_runtime: 85.4233
In-domain eval_samples_per_second: 78.339
In-domain eval_steps_per_second: 9.798
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.7241039276123047
Out-of-domain eval_accuracy: 0.5725794420408772
Out-of-domain eval_runtime: 87.8166
Out-of-domain eval_samples_per_second: 76.33
Out-of-domain eval_steps_per_second: 9.543
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.7628675103187561
In-domain eval_accuracy: 0.551255230125523
In-domain eval_runtime: 91.5287
In-domain eval_samples_per_second: 73.114
In-domain eval_steps_per_second: 9.145
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.7241039276123047
Out-of-domain eval_accuracy: 0.5725794420408772
Out-of-domain eval_runtime: 83.6234
Out-of-domain eval_samples_per_second: 80.157
Out-of-domain eval_steps_per_second: 10.021
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.7628675103187561
In-domain eval_accuracy: 0.551255230125523
In-domain eval_runtime: 84.8847
In-domain eval_samples_per_second: 78.836
In-domain eval_steps_per_second: 9.86
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.7241039276123047
Out-of-domain eval_accuracy: 0.5725794420408772
Out-of-domain eval_runtime: 87.1475
Out-of-domain eval_samples_per_second: 76.916
Out-of-domain eval_steps_per_second: 9.616
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.7628675103187561
In-domain eval_accuracy: 0.551255230125523
In-domain eval_runtime: 85.6851
In-domain eval_samples_per_second: 78.1
In-domain eval_steps_per_second: 9.768
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.7241039276123047
Out-of-domain eval_accuracy: 0.5725794420408772
Out-of-domain eval_runtime: 85.1466
Out-of-domain eval_samples_per_second: 78.723
Out-of-domain eval_steps_per_second: 9.842
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.7628675103187561
In-domain eval_accuracy: 0.551255230125523
In-domain eval_runtime: 82.8241
In-domain eval_samples_per_second: 80.798
In-domain eval_steps_per_second: 10.106
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 0.7241039276123047
Out-of-domain eval_accuracy: 0.5725794420408772
Out-of-domain eval_runtime: 83.3744
Out-of-domain eval_samples_per_second: 80.396
Out-of-domain eval_steps_per_second: 10.051
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8344621658325195
In-domain eval_accuracy: 0.5932456664674238
In-domain eval_runtime: 88.3455
In-domain eval_samples_per_second: 75.748
In-domain eval_steps_per_second: 9.474
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 0.8020033240318298
Out-of-domain eval_accuracy: 0.6033119498731911
Out-of-domain eval_runtime: 86.3293
Out-of-domain eval_samples_per_second: 77.645
Out-of-domain eval_steps_per_second: 9.707
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8344621658325195
In-domain eval_accuracy: 0.5932456664674238
In-domain eval_runtime: 86.9963
In-domain eval_samples_per_second: 76.923
In-domain eval_steps_per_second: 9.621
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 0.8020033240318298
Out-of-domain eval_accuracy: 0.6033119498731911
Out-of-domain eval_runtime: 85.597
Out-of-domain eval_samples_per_second: 78.309
Out-of-domain eval_steps_per_second: 9.79
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8344621658325195
In-domain eval_accuracy: 0.5932456664674238
In-domain eval_runtime: 86.2288
In-domain eval_samples_per_second: 77.608
In-domain eval_steps_per_second: 9.707
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 0.8020033240318298
Out-of-domain eval_accuracy: 0.6033119498731911
Out-of-domain eval_runtime: 86.074
Out-of-domain eval_samples_per_second: 77.875
Out-of-domain eval_steps_per_second: 9.736
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8344621658325195
In-domain eval_accuracy: 0.5932456664674238
In-domain eval_runtime: 89.4051
In-domain eval_samples_per_second: 74.85
In-domain eval_steps_per_second: 9.362
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 0.8020033240318298
Out-of-domain eval_accuracy: 0.6033119498731911
Out-of-domain eval_runtime: 84.794
Out-of-domain eval_samples_per_second: 79.05
Out-of-domain eval_steps_per_second: 9.883
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8344621658325195
In-domain eval_accuracy: 0.5932456664674238
In-domain eval_runtime: 85.3024
In-domain eval_samples_per_second: 78.45
In-domain eval_steps_per_second: 9.812
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 0.8020033240318298
Out-of-domain eval_accuracy: 0.6033119498731911
Out-of-domain eval_runtime: 85.1142
Out-of-domain eval_samples_per_second: 78.753
Out-of-domain eval_steps_per_second: 9.846
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8344621658325195
In-domain eval_accuracy: 0.5932456664674238
In-domain eval_runtime: 85.6699
In-domain eval_samples_per_second: 78.114
In-domain eval_steps_per_second: 9.77
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 0.8020033240318298
Out-of-domain eval_accuracy: 0.6033119498731911
Out-of-domain eval_runtime: 85.7626
Out-of-domain eval_samples_per_second: 78.158
Out-of-domain eval_steps_per_second: 9.771
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8344621658325195
In-domain eval_accuracy: 0.5932456664674238
In-domain eval_runtime: 86.047
In-domain eval_samples_per_second: 77.771
In-domain eval_steps_per_second: 9.727
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 0.8020033240318298
Out-of-domain eval_accuracy: 0.6033119498731911
Out-of-domain eval_runtime: 85.7493
Out-of-domain eval_samples_per_second: 78.17
Out-of-domain eval_steps_per_second: 9.773
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8344621658325195
In-domain eval_accuracy: 0.5932456664674238
In-domain eval_runtime: 86.6669
In-domain eval_samples_per_second: 77.215
In-domain eval_steps_per_second: 9.658
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 0.8020033240318298
Out-of-domain eval_accuracy: 0.6033119498731911
Out-of-domain eval_runtime: 85.6481
Out-of-domain eval_samples_per_second: 78.262
Out-of-domain eval_steps_per_second: 9.784
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8344621658325195
In-domain eval_accuracy: 0.5932456664674238
In-domain eval_runtime: 82.2393
In-domain eval_samples_per_second: 81.372
In-domain eval_steps_per_second: 10.178
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 0.8020033240318298
Out-of-domain eval_accuracy: 0.6033119498731911
Out-of-domain eval_runtime: 88.3931
Out-of-domain eval_samples_per_second: 75.832
Out-of-domain eval_steps_per_second: 9.48
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8344621658325195
In-domain eval_accuracy: 0.5932456664674238
In-domain eval_runtime: 85.3587
In-domain eval_samples_per_second: 78.399
In-domain eval_steps_per_second: 9.806
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 0.8020033240318298
Out-of-domain eval_accuracy: 0.6033119498731911
Out-of-domain eval_runtime: 88.3092
Out-of-domain eval_samples_per_second: 75.904
Out-of-domain eval_steps_per_second: 9.489
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.9722184538841248
In-domain eval_accuracy: 0.6243275552898984
In-domain eval_runtime: 89.4979
In-domain eval_samples_per_second: 74.773
In-domain eval_steps_per_second: 9.352
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 0.9252219200134277
Out-of-domain eval_accuracy: 0.6356855139489781
Out-of-domain eval_runtime: 86.9637
Out-of-domain eval_samples_per_second: 77.078
Out-of-domain eval_steps_per_second: 9.636
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 1.0862207412719727
In-domain eval_accuracy: 0.6325463239689181
In-domain eval_runtime: 88.3424
In-domain eval_samples_per_second: 75.751
In-domain eval_steps_per_second: 9.475
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 1.0551012754440308
Out-of-domain eval_accuracy: 0.6468745337908399
Out-of-domain eval_runtime: 88.893
Out-of-domain eval_samples_per_second: 75.405
Out-of-domain eval_steps_per_second: 9.427
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 1.0862207412719727
In-domain eval_accuracy: 0.6325463239689181
In-domain eval_runtime: 84.0618
In-domain eval_samples_per_second: 79.608
In-domain eval_steps_per_second: 9.957
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 1.0551012754440308
Out-of-domain eval_accuracy: 0.6468745337908399
Out-of-domain eval_runtime: 87.834
Out-of-domain eval_samples_per_second: 76.314
Out-of-domain eval_steps_per_second: 9.541
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 1.0862207412719727
In-domain eval_accuracy: 0.6325463239689181
In-domain eval_runtime: 88.3272
In-domain eval_samples_per_second: 75.764
In-domain eval_steps_per_second: 9.476
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 1.0551012754440308
Out-of-domain eval_accuracy: 0.6468745337908399
Out-of-domain eval_runtime: 89.7641
Out-of-domain eval_samples_per_second: 74.674
Out-of-domain eval_steps_per_second: 9.336
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 1.0862207412719727
In-domain eval_accuracy: 0.6325463239689181
In-domain eval_runtime: 89.5803
In-domain eval_samples_per_second: 74.704
In-domain eval_steps_per_second: 9.344
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 1.0551012754440308
Out-of-domain eval_accuracy: 0.6468745337908399
Out-of-domain eval_runtime: 88.1633
Out-of-domain eval_samples_per_second: 76.029
Out-of-domain eval_steps_per_second: 9.505
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 1.0862207412719727
In-domain eval_accuracy: 0.6325463239689181
In-domain eval_runtime: 80.9538
In-domain eval_samples_per_second: 82.664
In-domain eval_steps_per_second: 10.339
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 1.0551012754440308
Out-of-domain eval_accuracy: 0.6468745337908399
Out-of-domain eval_runtime: 88.3945
Out-of-domain eval_samples_per_second: 75.83
Out-of-domain eval_steps_per_second: 9.48
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 1.0862207412719727
In-domain eval_accuracy: 0.6325463239689181
In-domain eval_runtime: 92.7697
In-domain eval_samples_per_second: 72.136
In-domain eval_steps_per_second: 9.022
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 1.0551012754440308
Out-of-domain eval_accuracy: 0.6468745337908399
Out-of-domain eval_runtime: 87.7327
Out-of-domain eval_samples_per_second: 76.403
Out-of-domain eval_steps_per_second: 9.552
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 1.0862207412719727
In-domain eval_accuracy: 0.6325463239689181
In-domain eval_runtime: 87.8381
In-domain eval_samples_per_second: 76.186
In-domain eval_steps_per_second: 9.529
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 1.0551012754440308
Out-of-domain eval_accuracy: 0.6468745337908399
Out-of-domain eval_runtime: 90.9512
Out-of-domain eval_samples_per_second: 73.699
Out-of-domain eval_steps_per_second: 9.214
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 1.0862207412719727
In-domain eval_accuracy: 0.6325463239689181
In-domain eval_runtime: 89.6964
In-domain eval_samples_per_second: 74.607
In-domain eval_steps_per_second: 9.331
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 1.0551012754440308
Out-of-domain eval_accuracy: 0.6468745337908399
Out-of-domain eval_runtime: 88.9401
Out-of-domain eval_samples_per_second: 75.365
Out-of-domain eval_steps_per_second: 9.422
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 1.0862207412719727
In-domain eval_accuracy: 0.6325463239689181
In-domain eval_runtime: 81.2097
In-domain eval_samples_per_second: 82.404
In-domain eval_steps_per_second: 10.307
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 1.0551012754440308
Out-of-domain eval_accuracy: 0.6468745337908399
Out-of-domain eval_runtime: 87.2041
Out-of-domain eval_samples_per_second: 76.866
Out-of-domain eval_steps_per_second: 9.61
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.3385143280029297
In-domain eval_accuracy: 0.682904961147639
In-domain eval_runtime: 91.095
In-domain eval_samples_per_second: 73.462
In-domain eval_steps_per_second: 9.188
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 1.3066551685333252
Out-of-domain eval_accuracy: 0.7020736983440251
Out-of-domain eval_runtime: 89.4201
Out-of-domain eval_samples_per_second: 74.961
Out-of-domain eval_steps_per_second: 9.371
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.4338853359222412
In-domain eval_accuracy: 0.6445008965929468
In-domain eval_runtime: 89.8104
In-domain eval_samples_per_second: 74.513
In-domain eval_steps_per_second: 9.32
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 1.3283575773239136
Out-of-domain eval_accuracy: 0.6634342831567954
Out-of-domain eval_runtime: 89.8973
Out-of-domain eval_samples_per_second: 74.563
Out-of-domain eval_steps_per_second: 9.322
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.4338853359222412
In-domain eval_accuracy: 0.6445008965929468
In-domain eval_runtime: 90.5615
In-domain eval_samples_per_second: 73.895
In-domain eval_steps_per_second: 9.242
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 1.3283575773239136
Out-of-domain eval_accuracy: 0.6634342831567954
Out-of-domain eval_runtime: 91.5002
Out-of-domain eval_samples_per_second: 73.257
Out-of-domain eval_steps_per_second: 9.158
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.4338853359222412
In-domain eval_accuracy: 0.6445008965929468
In-domain eval_runtime: 89.0145
In-domain eval_samples_per_second: 75.179
In-domain eval_steps_per_second: 9.403
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 1.3283575773239136
Out-of-domain eval_accuracy: 0.6634342831567954
Out-of-domain eval_runtime: 91.5553
Out-of-domain eval_samples_per_second: 73.213
Out-of-domain eval_steps_per_second: 9.153
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.4338853359222412
In-domain eval_accuracy: 0.6445008965929468
In-domain eval_runtime: 88.9505
In-domain eval_samples_per_second: 75.233
In-domain eval_steps_per_second: 9.41
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 1.3283575773239136
Out-of-domain eval_accuracy: 0.6634342831567954
Out-of-domain eval_runtime: 90.2184
Out-of-domain eval_samples_per_second: 74.298
Out-of-domain eval_steps_per_second: 9.289
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.4338853359222412
In-domain eval_accuracy: 0.6445008965929468
In-domain eval_runtime: 93.3286
In-domain eval_samples_per_second: 71.704
In-domain eval_steps_per_second: 8.968
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 1.3283575773239136
Out-of-domain eval_accuracy: 0.6634342831567954
Out-of-domain eval_runtime: 91.7153
Out-of-domain eval_samples_per_second: 73.085
Out-of-domain eval_steps_per_second: 9.137
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.4338853359222412
In-domain eval_accuracy: 0.6445008965929468
In-domain eval_runtime: 91.1276
In-domain eval_samples_per_second: 73.436
In-domain eval_steps_per_second: 9.185
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 1.3283575773239136
Out-of-domain eval_accuracy: 0.6634342831567954
Out-of-domain eval_runtime: 94.2201
Out-of-domain eval_samples_per_second: 71.142
Out-of-domain eval_steps_per_second: 8.894
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.4338853359222412
In-domain eval_accuracy: 0.6445008965929468
In-domain eval_runtime: 92.675
In-domain eval_samples_per_second: 72.209
In-domain eval_steps_per_second: 9.032
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 1.3283575773239136
Out-of-domain eval_accuracy: 0.6634342831567954
Out-of-domain eval_runtime: 88.2187
Out-of-domain eval_samples_per_second: 75.982
Out-of-domain eval_steps_per_second: 9.499
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.4338853359222412
In-domain eval_accuracy: 0.6445008965929468
In-domain eval_runtime: 91.5857
In-domain eval_samples_per_second: 73.068
In-domain eval_steps_per_second: 9.139
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 1.3283575773239136
Out-of-domain eval_accuracy: 0.6634342831567954
Out-of-domain eval_runtime: 92.8208
Out-of-domain eval_samples_per_second: 72.214
Out-of-domain eval_steps_per_second: 9.028
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.4338853359222412
In-domain eval_accuracy: 0.6445008965929468
In-domain eval_runtime: 92.6318
In-domain eval_samples_per_second: 72.243
In-domain eval_steps_per_second: 9.036
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 1.3283575773239136
Out-of-domain eval_accuracy: 0.6634342831567954
Out-of-domain eval_runtime: 95.3012
Out-of-domain eval_samples_per_second: 70.335
Out-of-domain eval_steps_per_second: 8.793
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.8567631244659424
In-domain eval_accuracy: 0.6876867901972504
In-domain eval_runtime: 87.933
In-domain eval_samples_per_second: 76.103
In-domain eval_steps_per_second: 9.519
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 1.7403194904327393
Out-of-domain eval_accuracy: 0.7077428017305684
Out-of-domain eval_runtime: 94.6513
Out-of-domain eval_samples_per_second: 70.818
Out-of-domain eval_steps_per_second: 8.854
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.6755025386810303
In-domain eval_accuracy: 0.6842498505678422
In-domain eval_runtime: 93.2293
In-domain eval_samples_per_second: 71.78
In-domain eval_steps_per_second: 8.978
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 1.4830271005630493
Out-of-domain eval_accuracy: 0.7153513352230345
Out-of-domain eval_runtime: 91.259
Out-of-domain eval_samples_per_second: 73.45
Out-of-domain eval_steps_per_second: 9.183
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.6755025386810303
In-domain eval_accuracy: 0.6842498505678422
In-domain eval_runtime: 91.9568
In-domain eval_samples_per_second: 72.773
In-domain eval_steps_per_second: 9.102
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 1.4830271005630493
Out-of-domain eval_accuracy: 0.7153513352230345
Out-of-domain eval_runtime: 92.0803
Out-of-domain eval_samples_per_second: 72.795
Out-of-domain eval_steps_per_second: 9.101
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.6755025386810303
In-domain eval_accuracy: 0.6842498505678422
In-domain eval_runtime: 93.1274
In-domain eval_samples_per_second: 71.859
In-domain eval_steps_per_second: 8.988
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 1.4830271005630493
Out-of-domain eval_accuracy: 0.7153513352230345
Out-of-domain eval_runtime: 94.62
Out-of-domain eval_samples_per_second: 70.841
Out-of-domain eval_steps_per_second: 8.856
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.6755025386810303
In-domain eval_accuracy: 0.6842498505678422
In-domain eval_runtime: 90.8253
In-domain eval_samples_per_second: 73.68
In-domain eval_steps_per_second: 9.215
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 1.4830271005630493
Out-of-domain eval_accuracy: 0.7153513352230345
Out-of-domain eval_runtime: 93.0824
Out-of-domain eval_samples_per_second: 72.011
Out-of-domain eval_steps_per_second: 9.003
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.6755025386810303
In-domain eval_accuracy: 0.6842498505678422
In-domain eval_runtime: 96.9414
In-domain eval_samples_per_second: 69.031
In-domain eval_steps_per_second: 8.634
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 1.4830271005630493
Out-of-domain eval_accuracy: 0.7153513352230345
Out-of-domain eval_runtime: 94.3704
Out-of-domain eval_samples_per_second: 71.029
Out-of-domain eval_steps_per_second: 8.88
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.6755025386810303
In-domain eval_accuracy: 0.6842498505678422
In-domain eval_runtime: 92.6803
In-domain eval_samples_per_second: 72.205
In-domain eval_steps_per_second: 9.031
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 1.4830271005630493
Out-of-domain eval_accuracy: 0.7153513352230345
Out-of-domain eval_runtime: 95.0608
Out-of-domain eval_samples_per_second: 70.513
Out-of-domain eval_steps_per_second: 8.815
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.6755025386810303
In-domain eval_accuracy: 0.6842498505678422
In-domain eval_runtime: 97.2872
In-domain eval_samples_per_second: 68.786
In-domain eval_steps_per_second: 8.603
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 1.4830271005630493
Out-of-domain eval_accuracy: 0.7153513352230345
Out-of-domain eval_runtime: 92.2442
Out-of-domain eval_samples_per_second: 72.666
Out-of-domain eval_steps_per_second: 9.085
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.6755025386810303
In-domain eval_accuracy: 0.6842498505678422
In-domain eval_runtime: 93.0805
In-domain eval_samples_per_second: 71.895
In-domain eval_steps_per_second: 8.992
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 1.4830271005630493
Out-of-domain eval_accuracy: 0.7153513352230345
Out-of-domain eval_runtime: 93.6535
Out-of-domain eval_samples_per_second: 71.572
Out-of-domain eval_steps_per_second: 8.948
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.6755025386810303
In-domain eval_accuracy: 0.6842498505678422
In-domain eval_runtime: 92.6919
In-domain eval_samples_per_second: 72.196
In-domain eval_steps_per_second: 9.03
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 1.4830271005630493
Out-of-domain eval_accuracy: 0.7153513352230345
Out-of-domain eval_runtime: 95.6996
Out-of-domain eval_samples_per_second: 70.042
Out-of-domain eval_steps_per_second: 8.757
Out-of-domain epoch: 40.0


In [5]:
# Group the results by 'n' and compute the average and maximum performance for each group
grouped_results = results_df.groupby('n')

average_in_domain_accuracy = grouped_results['in_domain_accuracy'].mean()
maximum_in_domain_accuracy = grouped_results['in_domain_accuracy'].max()

average_out_of_domain_accuracy = grouped_results['out_of_domain_accuracy'].mean()
maximum_out_of_domain_accuracy = grouped_results['out_of_domain_accuracy'].max()

# Print the results for each 'n'
for n in n_values:
    print(f"For n={n}:")
    print(f"Average in-domain accuracy: {average_in_domain_accuracy[n]}")
    print(f"Maximum in-domain accuracy: {maximum_in_domain_accuracy[n]}")
    print(f"Average out-of-domain accuracy: {average_out_of_domain_accuracy[n]}")
    print(f"Maximum out-of-domain accuracy: {maximum_out_of_domain_accuracy[n]}")
    print("\n")

# Compute and print the overall average and maximum performance
print("Overall:")
print(f"Average in-domain accuracy: {results_df['in_domain_accuracy'].mean()}")
print(f"Maximum in-domain accuracy: {results_df['in_domain_accuracy'].max()}")
print(f"Average out-of-domain accuracy: {results_df['out_of_domain_accuracy'].mean()}")
print(f"Maximum out-of-domain accuracy: {results_df['out_of_domain_accuracy'].max()}")


For n=2:
Average in-domain accuracy: 0.5453526598924088
Maximum in-domain accuracy: 0.551255230125523
Average out-of-domain accuracy: 0.5648217216171864
Maximum out-of-domain accuracy: 0.5725794420408772


For n=16:
Average in-domain accuracy: 0.5932456664674238
Maximum in-domain accuracy: 0.5932456664674238
Average out-of-domain accuracy: 0.6033119498731911
Maximum out-of-domain accuracy: 0.6033119498731911


For n=32:
Average in-domain accuracy: 0.6317244471010162
Maximum in-domain accuracy: 0.6325463239689181
Average out-of-domain accuracy: 0.6457556318066537
Maximum out-of-domain accuracy: 0.6468745337908399


For n=64:
Average in-domain accuracy: 0.6483413030484161
Maximum in-domain accuracy: 0.682904961147639
Average out-of-domain accuracy: 0.6672982246755184
Maximum out-of-domain accuracy: 0.7020736983440251


For n=128:
Average in-domain accuracy: 0.6845935445307829
Maximum in-domain accuracy: 0.6876867901972504
Average out-of-domain accuracy: 0.7145904818737879
Maximum out-of-

In [7]:
!ls

offload_folder


In [8]:
results_df.to_csv("./vanilla_mnli_baseline_results.csv", index=False)

In [9]:
results_df

Unnamed: 0,n,run,in_domain_accuracy,out_of_domain_accuracy
0,2,0,0.49223,0.495002
1,2,1,0.551255,0.572579
2,2,2,0.551255,0.572579
3,2,3,0.551255,0.572579
4,2,4,0.551255,0.572579
5,2,5,0.551255,0.572579
6,2,6,0.551255,0.572579
7,2,7,0.551255,0.572579
8,2,8,0.551255,0.572579
9,2,9,0.551255,0.572579
