# Few shot Fine Tuning on Cola Data Set - Baseline

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd '/content/drive/MyDrive/LLM/llm_finetuning/notebooks'
!ls

/content/drive/MyDrive/LLM/llm_finetuning/notebooks
pbft_cola_baseline.ipynb	      results				vanilla_cola_baseline.ipynb
pbft_mnli_baseline.ipynb	      vanilla_cola_adaptive.ipynb	vanilla_mnli_baseline.ipynb
pre_trained_opt_with_inference.ipynb  vanilla_cola_baseline_350M.ipynb


In [4]:
curr_filename = "vanilla_cola_adaptive"

In [5]:
!pip install -q transformers accelerate bitsandbytes datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
# Version 2 04/19/2024

from datasets import load_dataset, ClassLabel
from sklearn.metrics import accuracy_score
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, AdamW, AutoConfig
import numpy as np
import pandas as pd
import torch

# Set seed, load COLA dataset

torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

in_domain_data = load_dataset("glue", "cola")


# Define model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

config = AutoConfig.from_pretrained("facebook/opt-125m", num_labels=2, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1)
model = AutoModelForSequenceClassification.from_pretrained("facebook/opt-125m", config=config)


# Freeze Layers
for name, param in model.named_parameters():
    parts = name.split('.')
    # Find the part that is a digit and represents the layer index
    for part in parts:
        if part.isdigit():  # Check if the part is a digit
            layer_index = int(part)
            if layer_index < 10:  # Freeze all except the last two layers
                param.requires_grad = False
            break  # Once you find the index and process it, break out of the loop




# Function to load and parse out-of-domain COLA dataset
'''
\cite: https://github.com/uds-lsv/llmft/blob/main/task_utils.py
'''
def load_cola_ood_dataset(path, label=None, cache_dir=None):
    data_files = {"validation": path}
    dataset = load_dataset("csv", data_files=data_files, sep="\t", column_names=[
                           'code', 'label', 'annotation', 'sentence'], cache_dir=cache_dir)
    dataset = dataset["validation"]

    # cola-ood comes without indices, so we add them
    indices = list(range(len(dataset)))
    dataset = dataset.add_column(name="idx", column=indices)

    subset = "cola-ood"

    if label is not None:  # filter dataset based on label
        dataset = dataset.filter(
            lambda example: example["label"] == label)
        subset = f"{subset}-{'acceptable' if label == 1 else 'unacceptable'}"

    return dataset, subset


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}


def manipulate_inputs_for_cola_with_prompt(inputs):
    # Add pattern/prompts
    #inputs = tokenizer(["Yes or No?" + sentence for sentence in inputs["sentence"]], truncation=True, padding="max_length", max_length=128)
    inputs = tokenizer(inputs["sentence"], truncation=True, padding="max_length", max_length=128)

    return inputs

# augment in-domain-data
in_domain_data = in_domain_data.map(manipulate_inputs_for_cola_with_prompt, batched=True)

# augment out of domain data
eval_ood_data, _ = load_cola_ood_dataset(path='../datafiles/dev.tsv')
eval_ood_data = eval_ood_data.map(manipulate_inputs_for_cola_with_prompt, batched=True)


# Define parameters for training experiments (per reference paper)

few_shot_sample_size = [2, 16, 32, 64, 128]  # number of examples for each class
num_epochs = 40
batch_size = 32
learning_rate = 1e-5
weight_decay = 0.
warmup_ratio = 0.1
num_runs = 10
optimizer = AdamW(model.parameters(), lr=learning_rate) # AdamW optimizer



results_df = pd.DataFrame(columns=["n", "run", "in_domain_accuracy", "out_of_domain_accuracy"])

for n in few_shot_sample_size:
    for run_idx in range(num_runs):  # repeat 10 times for each n
        # re-iniialize model for each run
        model = AutoModelForSequenceClassification.from_pretrained("facebook/opt-125m", config=config)
        optimizer = AdamW(model.parameters(), lr=learning_rate)

        # Freeze Layers

        for name, param in model.named_parameters():
            parts = name.split('.')
            # Find the part that is a digit and represents the layer index
            for part in parts:
                if part.isdigit():  # Check if the part is a digit
                    layer_index = int(part)
                    if layer_index < 10:  # Freeze all except the last two layers
                         param.requires_grad = False
                    break  # Once you find the index and process it, break out of the loop





        # Select n random examples for each class from the original data
        indices_yes = np.where(np.array(in_domain_data["train"]["label"]) == 0)[0]
        indices_no = np.where(np.array(in_domain_data["train"]["label"]) == 1)[0]
        indices_yes = np.random.choice(indices_yes, n, replace=False)
        indices_no = np.random.choice(indices_no, n, replace=False)
        indices = np.concatenate([indices_yes, indices_no])

        # Select the examples for the new training set
        train_dataset = in_domain_data["train"].select(indices)

        # Define training config
        # Total steps = (#samples/batch size) * epochs
        total_steps = (len(train_dataset) // batch_size) * num_epochs

        training_args = TrainingArguments(
            output_dir = "./results",
            overwrite_output_dir = True,
            num_train_epochs = num_epochs,
            per_device_train_batch_size = batch_size,
            learning_rate = learning_rate,
            weight_decay = weight_decay,
            save_steps = 10_000,
            save_total_limit = 2,
            warmup_steps = int(warmup_ratio * total_steps),
        )

        # Define the trainer
        trainer = Trainer(
            model = model,
            args=training_args,
            train_dataset = train_dataset,
            compute_metrics = compute_metrics,
            optimizers=(optimizer, None),
        )

        # Train the model
        trainer.train()

        # Evaluate in-domain performance
        print(f"Evaluating in-domain performance for n={n}...")
        eval_results = trainer.evaluate(eval_dataset=in_domain_data["validation"])

        # Store the in-domain accuracy
        in_domain_accuracy = eval_results["eval_accuracy"]

        # Print the in-domain evaluation results
        for key, value in eval_results.items():
            print(f"In-domain {key}: {value}")

        # Evaluate out-of-domain performance
        print(f"Evaluating out-of-domain performance for n={n}...")
        eval_results = trainer.evaluate(eval_dataset=eval_ood_data)

        # Store the out-of-domain accuracy
        out_of_domain_accuracy = eval_results["eval_accuracy"]

        # Print the out-of-domain evaluation results
        for key, value in eval_results.items():
            print(f"Out-of-domain {key}: {value}")

        # Add the results to the DataFrame
        new_row = pd.DataFrame({
            "n": [n],
            "run": [run_idx],
            "in_domain_accuracy": [in_domain_accuracy],
            "out_of_domain_accuracy": [out_of_domain_accuracy]
        })
        results_df = pd.concat([results_df, new_row], ignore_index=True)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/251k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/516 [00:00<?, ? examples/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.8646273016929626
In-domain eval_accuracy: 0.3835091083413231
In-domain eval_runtime: 6.4759
In-domain eval_samples_per_second: 161.058
In-domain eval_steps_per_second: 20.229
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8413917422294617
Out-of-domain eval_accuracy: 0.4011627906976744
Out-of-domain eval_runtime: 3.2724
Out-of-domain eval_samples_per_second: 157.684
Out-of-domain eval_steps_per_second: 19.863
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.727360725402832
In-domain eval_accuracy: 0.49952061361457334
In-domain eval_runtime: 6.5906
In-domain eval_samples_per_second: 158.256
In-domain eval_steps_per_second: 19.877
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7110629677772522
Out-of-domain eval_accuracy: 0.5406976744186046
Out-of-domain eval_runtime: 3.3486
Out-of-domain eval_samples_per_second: 154.096
Out-of-domain eval_steps_per_second: 19.411
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.727360725402832
In-domain eval_accuracy: 0.49952061361457334
In-domain eval_runtime: 6.7496
In-domain eval_samples_per_second: 154.527
In-domain eval_steps_per_second: 19.408
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7110629677772522
Out-of-domain eval_accuracy: 0.5406976744186046
Out-of-domain eval_runtime: 3.4177
Out-of-domain eval_samples_per_second: 150.979
Out-of-domain eval_steps_per_second: 19.019
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.727360725402832
In-domain eval_accuracy: 0.49952061361457334
In-domain eval_runtime: 6.8954
In-domain eval_samples_per_second: 151.259
In-domain eval_steps_per_second: 18.998
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7110629677772522
Out-of-domain eval_accuracy: 0.5406976744186046
Out-of-domain eval_runtime: 3.4969
Out-of-domain eval_samples_per_second: 147.558
Out-of-domain eval_steps_per_second: 18.588
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.727360725402832
In-domain eval_accuracy: 0.49952061361457334
In-domain eval_runtime: 7.0172
In-domain eval_samples_per_second: 148.635
In-domain eval_steps_per_second: 18.668
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7110629677772522
Out-of-domain eval_accuracy: 0.5406976744186046
Out-of-domain eval_runtime: 3.5805
Out-of-domain eval_samples_per_second: 144.114
Out-of-domain eval_steps_per_second: 18.154
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.727360725402832
In-domain eval_accuracy: 0.49952061361457334
In-domain eval_runtime: 7.0893
In-domain eval_samples_per_second: 147.123
In-domain eval_steps_per_second: 18.479
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7110629677772522
Out-of-domain eval_accuracy: 0.5406976744186046
Out-of-domain eval_runtime: 3.5289
Out-of-domain eval_samples_per_second: 146.22
Out-of-domain eval_steps_per_second: 18.419
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.727360725402832
In-domain eval_accuracy: 0.49952061361457334
In-domain eval_runtime: 6.9636
In-domain eval_samples_per_second: 149.778
In-domain eval_steps_per_second: 18.812
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7110629677772522
Out-of-domain eval_accuracy: 0.5406976744186046
Out-of-domain eval_runtime: 3.4748
Out-of-domain eval_samples_per_second: 148.497
Out-of-domain eval_steps_per_second: 18.706
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.727360725402832
In-domain eval_accuracy: 0.49952061361457334
In-domain eval_runtime: 6.8967
In-domain eval_samples_per_second: 151.233
In-domain eval_steps_per_second: 18.995
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7110629677772522
Out-of-domain eval_accuracy: 0.5406976744186046
Out-of-domain eval_runtime: 3.4587
Out-of-domain eval_samples_per_second: 149.187
Out-of-domain eval_steps_per_second: 18.793
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.727360725402832
In-domain eval_accuracy: 0.49952061361457334
In-domain eval_runtime: 6.8509
In-domain eval_samples_per_second: 152.243
In-domain eval_steps_per_second: 19.122
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7110629677772522
Out-of-domain eval_accuracy: 0.5406976744186046
Out-of-domain eval_runtime: 3.4583
Out-of-domain eval_samples_per_second: 149.205
Out-of-domain eval_steps_per_second: 18.795
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.727360725402832
In-domain eval_accuracy: 0.49952061361457334
In-domain eval_runtime: 6.8939
In-domain eval_samples_per_second: 151.293
In-domain eval_steps_per_second: 19.002
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7110629677772522
Out-of-domain eval_accuracy: 0.5406976744186046
Out-of-domain eval_runtime: 3.4952
Out-of-domain eval_samples_per_second: 147.633
Out-of-domain eval_steps_per_second: 18.597
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.7067421674728394
In-domain eval_accuracy: 0.5302013422818792
In-domain eval_runtime: 7.0016
In-domain eval_samples_per_second: 148.966
In-domain eval_steps_per_second: 18.71
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6959239840507507
Out-of-domain eval_accuracy: 0.560077519379845
Out-of-domain eval_runtime: 3.5236
Out-of-domain eval_samples_per_second: 146.442
Out-of-domain eval_steps_per_second: 18.447
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.7067421674728394
In-domain eval_accuracy: 0.5302013422818792
In-domain eval_runtime: 6.9511
In-domain eval_samples_per_second: 150.048
In-domain eval_steps_per_second: 18.846
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6959239840507507
Out-of-domain eval_accuracy: 0.560077519379845
Out-of-domain eval_runtime: 3.4984
Out-of-domain eval_samples_per_second: 147.498
Out-of-domain eval_steps_per_second: 18.58
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.7067421674728394
In-domain eval_accuracy: 0.5302013422818792
In-domain eval_runtime: 6.9256
In-domain eval_samples_per_second: 150.6
In-domain eval_steps_per_second: 18.915
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6959239840507507
Out-of-domain eval_accuracy: 0.560077519379845
Out-of-domain eval_runtime: 3.4882
Out-of-domain eval_samples_per_second: 147.926
Out-of-domain eval_steps_per_second: 18.634
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.7067421674728394
In-domain eval_accuracy: 0.5302013422818792
In-domain eval_runtime: 6.9248
In-domain eval_samples_per_second: 150.618
In-domain eval_steps_per_second: 18.918
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6959239840507507
Out-of-domain eval_accuracy: 0.560077519379845
Out-of-domain eval_runtime: 3.4745
Out-of-domain eval_samples_per_second: 148.509
Out-of-domain eval_steps_per_second: 18.708
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.7067421674728394
In-domain eval_accuracy: 0.5302013422818792
In-domain eval_runtime: 6.9304
In-domain eval_samples_per_second: 150.495
In-domain eval_steps_per_second: 18.902
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6959239840507507
Out-of-domain eval_accuracy: 0.560077519379845
Out-of-domain eval_runtime: 3.4927
Out-of-domain eval_samples_per_second: 147.735
Out-of-domain eval_steps_per_second: 18.61
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.7067421674728394
In-domain eval_accuracy: 0.5302013422818792
In-domain eval_runtime: 6.9287
In-domain eval_samples_per_second: 150.534
In-domain eval_steps_per_second: 18.907
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6959239840507507
Out-of-domain eval_accuracy: 0.560077519379845
Out-of-domain eval_runtime: 3.4696
Out-of-domain eval_samples_per_second: 148.722
Out-of-domain eval_steps_per_second: 18.734
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.7067421674728394
In-domain eval_accuracy: 0.5302013422818792
In-domain eval_runtime: 6.9235
In-domain eval_samples_per_second: 150.647
In-domain eval_steps_per_second: 18.921
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6959239840507507
Out-of-domain eval_accuracy: 0.560077519379845
Out-of-domain eval_runtime: 3.4948
Out-of-domain eval_samples_per_second: 147.648
Out-of-domain eval_steps_per_second: 18.599
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.7067421674728394
In-domain eval_accuracy: 0.5302013422818792
In-domain eval_runtime: 6.949
In-domain eval_samples_per_second: 150.093
In-domain eval_steps_per_second: 18.852
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6959239840507507
Out-of-domain eval_accuracy: 0.560077519379845
Out-of-domain eval_runtime: 3.5078
Out-of-domain eval_samples_per_second: 147.102
Out-of-domain eval_steps_per_second: 18.53
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.7067421674728394
In-domain eval_accuracy: 0.5302013422818792
In-domain eval_runtime: 6.9644
In-domain eval_samples_per_second: 149.761
In-domain eval_steps_per_second: 18.81
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6959239840507507
Out-of-domain eval_accuracy: 0.560077519379845
Out-of-domain eval_runtime: 3.5183
Out-of-domain eval_samples_per_second: 146.663
Out-of-domain eval_steps_per_second: 18.475
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.7067421674728394
In-domain eval_accuracy: 0.5302013422818792
In-domain eval_runtime: 6.9777
In-domain eval_samples_per_second: 149.476
In-domain eval_steps_per_second: 18.774
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6959239840507507
Out-of-domain eval_accuracy: 0.560077519379845
Out-of-domain eval_runtime: 3.5153
Out-of-domain eval_samples_per_second: 146.788
Out-of-domain eval_steps_per_second: 18.491
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.6934338212013245
In-domain eval_accuracy: 0.5522531160115053
In-domain eval_runtime: 6.9502
In-domain eval_samples_per_second: 150.068
In-domain eval_steps_per_second: 18.848
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6805055141448975
Out-of-domain eval_accuracy: 0.5949612403100775
Out-of-domain eval_runtime: 3.4729
Out-of-domain eval_samples_per_second: 148.58
Out-of-domain eval_steps_per_second: 18.716
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.7187260985374451
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 6.9464
In-domain eval_samples_per_second: 150.149
In-domain eval_steps_per_second: 18.859
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7078965902328491
Out-of-domain eval_accuracy: 0.5290697674418605
Out-of-domain eval_runtime: 3.4911
Out-of-domain eval_samples_per_second: 147.803
Out-of-domain eval_steps_per_second: 18.619
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.7187260985374451
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 6.9452
In-domain eval_samples_per_second: 150.175
In-domain eval_steps_per_second: 18.862
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7078965902328491
Out-of-domain eval_accuracy: 0.5290697674418605
Out-of-domain eval_runtime: 3.4622
Out-of-domain eval_samples_per_second: 149.039
Out-of-domain eval_steps_per_second: 18.774
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.7187260985374451
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 6.9557
In-domain eval_samples_per_second: 149.95
In-domain eval_steps_per_second: 18.834
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7078965902328491
Out-of-domain eval_accuracy: 0.5290697674418605
Out-of-domain eval_runtime: 3.4692
Out-of-domain eval_samples_per_second: 148.738
Out-of-domain eval_steps_per_second: 18.736
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.7187260985374451
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 6.9404
In-domain eval_samples_per_second: 150.279
In-domain eval_steps_per_second: 18.875
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7078965902328491
Out-of-domain eval_accuracy: 0.5290697674418605
Out-of-domain eval_runtime: 3.4716
Out-of-domain eval_samples_per_second: 148.636
Out-of-domain eval_steps_per_second: 18.723
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.7187260985374451
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 6.9281
In-domain eval_samples_per_second: 150.547
In-domain eval_steps_per_second: 18.909
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7078965902328491
Out-of-domain eval_accuracy: 0.5290697674418605
Out-of-domain eval_runtime: 3.4909
Out-of-domain eval_samples_per_second: 147.815
Out-of-domain eval_steps_per_second: 18.62
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.7187260985374451
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 6.9343
In-domain eval_samples_per_second: 150.412
In-domain eval_steps_per_second: 18.892
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7078965902328491
Out-of-domain eval_accuracy: 0.5290697674418605
Out-of-domain eval_runtime: 3.4898
Out-of-domain eval_samples_per_second: 147.861
Out-of-domain eval_steps_per_second: 18.626
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.7187260985374451
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 6.934
In-domain eval_samples_per_second: 150.419
In-domain eval_steps_per_second: 18.893
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7078965902328491
Out-of-domain eval_accuracy: 0.5290697674418605
Out-of-domain eval_runtime: 3.4967
Out-of-domain eval_samples_per_second: 147.566
Out-of-domain eval_steps_per_second: 18.589
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.7187260985374451
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 6.9194
In-domain eval_samples_per_second: 150.736
In-domain eval_steps_per_second: 18.932
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7078965902328491
Out-of-domain eval_accuracy: 0.5290697674418605
Out-of-domain eval_runtime: 3.4858
Out-of-domain eval_samples_per_second: 148.03
Out-of-domain eval_steps_per_second: 18.647
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.7187260985374451
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 6.9278
In-domain eval_samples_per_second: 150.554
In-domain eval_steps_per_second: 18.909
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7078965902328491
Out-of-domain eval_accuracy: 0.5290697674418605
Out-of-domain eval_runtime: 3.4985
Out-of-domain eval_samples_per_second: 147.491
Out-of-domain eval_steps_per_second: 18.579
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 0.7570077776908875
In-domain eval_accuracy: 0.5091083413231065
In-domain eval_runtime: 6.9364
In-domain eval_samples_per_second: 150.367
In-domain eval_steps_per_second: 18.886
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7534480690956116
Out-of-domain eval_accuracy: 0.49806201550387597
Out-of-domain eval_runtime: 3.4876
Out-of-domain eval_samples_per_second: 147.954
Out-of-domain eval_steps_per_second: 18.638
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 0.7031458616256714
In-domain eval_accuracy: 0.5618408437200384
In-domain eval_runtime: 6.9288
In-domain eval_samples_per_second: 150.531
In-domain eval_steps_per_second: 18.907
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6854815483093262
Out-of-domain eval_accuracy: 0.5775193798449613
Out-of-domain eval_runtime: 3.4964
Out-of-domain eval_samples_per_second: 147.579
Out-of-domain eval_steps_per_second: 18.59
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 0.7031458616256714
In-domain eval_accuracy: 0.5618408437200384
In-domain eval_runtime: 6.9517
In-domain eval_samples_per_second: 150.035
In-domain eval_steps_per_second: 18.844
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6854815483093262
Out-of-domain eval_accuracy: 0.5775193798449613
Out-of-domain eval_runtime: 3.4708
Out-of-domain eval_samples_per_second: 148.667
Out-of-domain eval_steps_per_second: 18.727
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 0.7031458616256714
In-domain eval_accuracy: 0.5618408437200384
In-domain eval_runtime: 6.9109
In-domain eval_samples_per_second: 150.922
In-domain eval_steps_per_second: 18.956
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6854815483093262
Out-of-domain eval_accuracy: 0.5775193798449613
Out-of-domain eval_runtime: 3.4888
Out-of-domain eval_samples_per_second: 147.901
Out-of-domain eval_steps_per_second: 18.631
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 0.7031458616256714
In-domain eval_accuracy: 0.5618408437200384
In-domain eval_runtime: 6.9223
In-domain eval_samples_per_second: 150.672
In-domain eval_steps_per_second: 18.924
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6854815483093262
Out-of-domain eval_accuracy: 0.5775193798449613
Out-of-domain eval_runtime: 3.4927
Out-of-domain eval_samples_per_second: 147.737
Out-of-domain eval_steps_per_second: 18.61
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 0.7031458616256714
In-domain eval_accuracy: 0.5618408437200384
In-domain eval_runtime: 6.9776
In-domain eval_samples_per_second: 149.478
In-domain eval_steps_per_second: 18.774
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6854815483093262
Out-of-domain eval_accuracy: 0.5775193798449613
Out-of-domain eval_runtime: 3.508
Out-of-domain eval_samples_per_second: 147.091
Out-of-domain eval_steps_per_second: 18.529
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 0.7031458616256714
In-domain eval_accuracy: 0.5618408437200384
In-domain eval_runtime: 6.9224
In-domain eval_samples_per_second: 150.67
In-domain eval_steps_per_second: 18.924
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6854815483093262
Out-of-domain eval_accuracy: 0.5775193798449613
Out-of-domain eval_runtime: 3.4989
Out-of-domain eval_samples_per_second: 147.474
Out-of-domain eval_steps_per_second: 18.577
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 0.7031458616256714
In-domain eval_accuracy: 0.5618408437200384
In-domain eval_runtime: 6.9722
In-domain eval_samples_per_second: 149.595
In-domain eval_steps_per_second: 18.789
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6854815483093262
Out-of-domain eval_accuracy: 0.5775193798449613
Out-of-domain eval_runtime: 3.5187
Out-of-domain eval_samples_per_second: 146.646
Out-of-domain eval_steps_per_second: 18.473
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 0.7031458616256714
In-domain eval_accuracy: 0.5618408437200384
In-domain eval_runtime: 6.9407
In-domain eval_samples_per_second: 150.273
In-domain eval_steps_per_second: 18.874
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6854815483093262
Out-of-domain eval_accuracy: 0.5775193798449613
Out-of-domain eval_runtime: 3.4967
Out-of-domain eval_samples_per_second: 147.569
Out-of-domain eval_steps_per_second: 18.589
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 0.7031458616256714
In-domain eval_accuracy: 0.5618408437200384
In-domain eval_runtime: 6.9418
In-domain eval_samples_per_second: 150.25
In-domain eval_steps_per_second: 18.871
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6854815483093262
Out-of-domain eval_accuracy: 0.5775193798449613
Out-of-domain eval_runtime: 3.4864
Out-of-domain eval_samples_per_second: 148.004
Out-of-domain eval_steps_per_second: 18.644
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 0.7339271903038025
In-domain eval_accuracy: 0.5838926174496645
In-domain eval_runtime: 7.0086
In-domain eval_samples_per_second: 148.818
In-domain eval_steps_per_second: 18.691
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.6999451518058777
Out-of-domain eval_accuracy: 0.5968992248062015
Out-of-domain eval_runtime: 3.5446
Out-of-domain eval_samples_per_second: 145.575
Out-of-domain eval_steps_per_second: 18.338
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 0.7503437399864197
In-domain eval_accuracy: 0.5714285714285714
In-domain eval_runtime: 6.9932
In-domain eval_samples_per_second: 149.146
In-domain eval_steps_per_second: 18.733
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.735813319683075
Out-of-domain eval_accuracy: 0.5833333333333334
Out-of-domain eval_runtime: 3.5506
Out-of-domain eval_samples_per_second: 145.327
Out-of-domain eval_steps_per_second: 18.307
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 0.7503437399864197
In-domain eval_accuracy: 0.5714285714285714
In-domain eval_runtime: 6.983
In-domain eval_samples_per_second: 149.362
In-domain eval_steps_per_second: 18.76
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.735813319683075
Out-of-domain eval_accuracy: 0.5833333333333334
Out-of-domain eval_runtime: 3.5144
Out-of-domain eval_samples_per_second: 146.825
Out-of-domain eval_steps_per_second: 18.495
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 0.7503437399864197
In-domain eval_accuracy: 0.5714285714285714
In-domain eval_runtime: 6.9271
In-domain eval_samples_per_second: 150.568
In-domain eval_steps_per_second: 18.911
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.735813319683075
Out-of-domain eval_accuracy: 0.5833333333333334
Out-of-domain eval_runtime: 3.474
Out-of-domain eval_samples_per_second: 148.534
Out-of-domain eval_steps_per_second: 18.711
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 0.7503437399864197
In-domain eval_accuracy: 0.5714285714285714
In-domain eval_runtime: 7.0064
In-domain eval_samples_per_second: 148.864
In-domain eval_steps_per_second: 18.697
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.735813319683075
Out-of-domain eval_accuracy: 0.5833333333333334
Out-of-domain eval_runtime: 3.5373
Out-of-domain eval_samples_per_second: 145.875
Out-of-domain eval_steps_per_second: 18.376
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 0.7503437399864197
In-domain eval_accuracy: 0.5714285714285714
In-domain eval_runtime: 6.9974
In-domain eval_samples_per_second: 149.055
In-domain eval_steps_per_second: 18.721
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.735813319683075
Out-of-domain eval_accuracy: 0.5833333333333334
Out-of-domain eval_runtime: 3.5417
Out-of-domain eval_samples_per_second: 145.693
Out-of-domain eval_steps_per_second: 18.353
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 0.7503437399864197
In-domain eval_accuracy: 0.5714285714285714
In-domain eval_runtime: 6.9979
In-domain eval_samples_per_second: 149.044
In-domain eval_steps_per_second: 18.72
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.735813319683075
Out-of-domain eval_accuracy: 0.5833333333333334
Out-of-domain eval_runtime: 3.5251
Out-of-domain eval_samples_per_second: 146.377
Out-of-domain eval_steps_per_second: 18.439
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 0.7503437399864197
In-domain eval_accuracy: 0.5714285714285714
In-domain eval_runtime: 7.0087
In-domain eval_samples_per_second: 148.815
In-domain eval_steps_per_second: 18.691
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.735813319683075
Out-of-domain eval_accuracy: 0.5833333333333334
Out-of-domain eval_runtime: 3.542
Out-of-domain eval_samples_per_second: 145.679
Out-of-domain eval_steps_per_second: 18.351
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 0.7503437399864197
In-domain eval_accuracy: 0.5714285714285714
In-domain eval_runtime: 6.9927
In-domain eval_samples_per_second: 149.156
In-domain eval_steps_per_second: 18.734
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.735813319683075
Out-of-domain eval_accuracy: 0.5833333333333334
Out-of-domain eval_runtime: 3.5279
Out-of-domain eval_samples_per_second: 146.261
Out-of-domain eval_steps_per_second: 18.424
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 0.7503437399864197
In-domain eval_accuracy: 0.5714285714285714
In-domain eval_runtime: 7.0033
In-domain eval_samples_per_second: 148.929
In-domain eval_steps_per_second: 18.705
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 0.735813319683075
Out-of-domain eval_accuracy: 0.5833333333333334
Out-of-domain eval_runtime: 3.5007
Out-of-domain eval_samples_per_second: 147.397
Out-of-domain eval_steps_per_second: 18.567
Out-of-domain epoch: 40.0


In [7]:
# Group the results by 'n' and compute the average and maximum performance for each group
grouped_results = results_df.groupby('n')

average_in_domain_accuracy = grouped_results['in_domain_accuracy'].mean()
maximum_in_domain_accuracy = grouped_results['in_domain_accuracy'].max()

average_out_of_domain_accuracy = grouped_results['out_of_domain_accuracy'].mean()
maximum_out_of_domain_accuracy = grouped_results['out_of_domain_accuracy'].max()

# Print the results for each 'n'
for n in few_shot_sample_size:
    print(f"For n={n}:")
    print(f"Average in-domain accuracy: {average_in_domain_accuracy[n]}")
    print(f"Maximum in-domain accuracy: {maximum_in_domain_accuracy[n]}")
    print(f"Average out-of-domain accuracy: {average_out_of_domain_accuracy[n]}")
    print(f"Maximum out-of-domain accuracy: {maximum_out_of_domain_accuracy[n]}")
    print("\n")

# Compute and print the overall average and maximum performance
print("Overall:")
print(f"Average in-domain accuracy: {results_df['in_domain_accuracy'].mean()}")
print(f"Maximum in-domain accuracy: {results_df['in_domain_accuracy'].max()}")
print(f"Average out-of-domain accuracy: {results_df['out_of_domain_accuracy'].mean()}")
print(f"Maximum out-of-domain accuracy: {results_df['out_of_domain_accuracy'].max()}")


For n=2:
Average in-domain accuracy: 0.4879194630872483
Maximum in-domain accuracy: 0.49952061361457334
Average out-of-domain accuracy: 0.5267441860465116
Maximum out-of-domain accuracy: 0.5406976744186046


For n=16:
Average in-domain accuracy: 0.5302013422818792
Maximum in-domain accuracy: 0.5302013422818792
Average out-of-domain accuracy: 0.560077519379845
Maximum out-of-domain accuracy: 0.560077519379845


For n=32:
Average in-domain accuracy: 0.5298178331735379
Maximum in-domain accuracy: 0.5522531160115053
Average out-of-domain accuracy: 0.5356589147286822
Maximum out-of-domain accuracy: 0.5949612403100775


For n=64:
Average in-domain accuracy: 0.5565675934803452
Maximum in-domain accuracy: 0.5618408437200384
Average out-of-domain accuracy: 0.5695736434108527
Maximum out-of-domain accuracy: 0.5775193798449613


For n=128:
Average in-domain accuracy: 0.5726749760306806
Maximum in-domain accuracy: 0.5838926174496645
Average out-of-domain accuracy: 0.5846899224806201
Maximum out-of

In [8]:
# Save the DataFrame to a CSV file
curr_filename = "vanilla_cola_adaptive_125M_2Layer"
results_df.to_csv(f'../Results/{curr_filename}.csv', sep = ',', index=False)

In [None]:
# disconnect runtime
from google.colab import runtime
runtime.unassign()