# Few shot Fine Tuning on Cola Data Set - Baseline

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd '/content/drive/MyDrive/LLM/llm_finetuning/notebooks'
!ls

/content/drive/MyDrive/LLM/llm_finetuning/notebooks
pbft_cola_baseline.ipynb  pre_trained_opt_with_inference.ipynb	vanilla_cola_baseline.ipynb
pbft_mnli_baseline.ipynb  results				vanilla_mnli_baseline.ipynb


In [3]:
curr_filename = "vanilla_cola_baseline"

In [4]:
!pip install -q transformers accelerate bitsandbytes datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
# Version 2 04/19/2024

from datasets import load_dataset, ClassLabel
from sklearn.metrics import accuracy_score
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, AdamW, AutoConfig
import numpy as np
import pandas as pd
import torch

# Set seed, load COLA dataset

torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

in_domain_data = load_dataset("glue", "cola")


# Define model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

config = AutoConfig.from_pretrained("facebook/opt-125m", num_labels=2, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1)
model = AutoModelForSequenceClassification.from_pretrained("facebook/opt-125m", config=config)

# Function to load and parse out-of-domain COLA dataset
'''
\cite: https://github.com/uds-lsv/llmft/blob/main/task_utils.py
'''
def load_cola_ood_dataset(path, label=None, cache_dir=None):
    data_files = {"validation": path}
    dataset = load_dataset("csv", data_files=data_files, sep="\t", column_names=[
                           'code', 'label', 'annotation', 'sentence'], cache_dir=cache_dir)
    dataset = dataset["validation"]

    # cola-ood comes without indices, so we add them
    indices = list(range(len(dataset)))
    dataset = dataset.add_column(name="idx", column=indices)

    subset = "cola-ood"

    if label is not None:  # filter dataset based on label
        dataset = dataset.filter(
            lambda example: example["label"] == label)
        subset = f"{subset}-{'acceptable' if label == 1 else 'unacceptable'}"

    return dataset, subset


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}


def manipulate_inputs_for_cola_with_prompt(inputs):
    # Add pattern/prompts
    #inputs = tokenizer(["Yes or No?" + sentence for sentence in inputs["sentence"]], truncation=True, padding="max_length", max_length=128)
    inputs = tokenizer(inputs["sentence"], truncation=True, padding="max_length", max_length=128)

    return inputs

# augment in-domain-data
in_domain_data = in_domain_data.map(manipulate_inputs_for_cola_with_prompt, batched=True)

# augment out of domain data
eval_ood_data, _ = load_cola_ood_dataset(path='../datafiles/dev.tsv')
eval_ood_data = eval_ood_data.map(manipulate_inputs_for_cola_with_prompt, batched=True)


# Define parameters for training experiments (per reference paper)

few_shot_sample_size = [2, 16, 32, 64, 128]  # number of examples for each class
num_epochs = 40
batch_size = 32
learning_rate = 1e-5
weight_decay = 0.
warmup_ratio = 0.1
num_runs = 10
optimizer = AdamW(model.parameters(), lr=learning_rate) # AdamW optimizer



results_df = pd.DataFrame(columns=["n", "run", "in_domain_accuracy", "out_of_domain_accuracy"])

for n in few_shot_sample_size:
    for run_idx in range(num_runs):  # repeat 10 times for each n
        # re-iniialize model for each run
        model = AutoModelForSequenceClassification.from_pretrained("facebook/opt-125m", config=config)
        optimizer = AdamW(model.parameters(), lr=learning_rate)

        # Select n random examples for each class from the original data
        indices_yes = np.where(np.array(in_domain_data["train"]["label"]) == 0)[0]
        indices_no = np.where(np.array(in_domain_data["train"]["label"]) == 1)[0]
        indices_yes = np.random.choice(indices_yes, n, replace=False)
        indices_no = np.random.choice(indices_no, n, replace=False)
        indices = np.concatenate([indices_yes, indices_no])

        # Select the examples for the new training set
        train_dataset = in_domain_data["train"].select(indices)

        # Define training config
        # Total steps = (#samples/batch size) * epochs
        total_steps = (len(train_dataset) // batch_size) * num_epochs

        training_args = TrainingArguments(
            output_dir = "./results",
            overwrite_output_dir = True,
            num_train_epochs = num_epochs,
            per_device_train_batch_size = batch_size,
            learning_rate = learning_rate,
            weight_decay = weight_decay,
            save_steps = 10_000,
            save_total_limit = 2,
            warmup_steps = int(warmup_ratio * total_steps),
        )

        # Define the trainer
        trainer = Trainer(
            model = model,
            args=training_args,
            train_dataset = train_dataset,
            compute_metrics = compute_metrics,
            optimizers=(optimizer, None),
        )

        # Train the model
        trainer.train()

        # Evaluate in-domain performance
        print(f"Evaluating in-domain performance for n={n}...")
        eval_results = trainer.evaluate(eval_dataset=in_domain_data["validation"])

        # Store the in-domain accuracy
        in_domain_accuracy = eval_results["eval_accuracy"]

        # Print the in-domain evaluation results
        for key, value in eval_results.items():
            print(f"In-domain {key}: {value}")

        # Evaluate out-of-domain performance
        print(f"Evaluating out-of-domain performance for n={n}...")
        eval_results = trainer.evaluate(eval_dataset=eval_ood_data)

        # Store the out-of-domain accuracy
        out_of_domain_accuracy = eval_results["eval_accuracy"]

        # Print the out-of-domain evaluation results
        for key, value in eval_results.items():
            print(f"Out-of-domain {key}: {value}")

        # Add the results to the DataFrame
        new_row = pd.DataFrame({
            "n": [n],
            "run": [run_idx],
            "in_domain_accuracy": [in_domain_accuracy],
            "out_of_domain_accuracy": [out_of_domain_accuracy]
        })
        results_df = pd.concat([results_df, new_row], ignore_index=True)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/251k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/516 [00:00<?, ? examples/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.7734178304672241
In-domain eval_accuracy: 0.6097794822627037
In-domain eval_runtime: 6.7335
In-domain eval_samples_per_second: 154.898
In-domain eval_steps_per_second: 19.455
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.7730079889297485
Out-of-domain eval_accuracy: 0.6124031007751938
Out-of-domain eval_runtime: 3.4078
Out-of-domain eval_samples_per_second: 151.417
Out-of-domain eval_steps_per_second: 19.074
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.8257512450218201
In-domain eval_accuracy: 0.5225311601150527
In-domain eval_runtime: 6.9108
In-domain eval_samples_per_second: 150.922
In-domain eval_steps_per_second: 18.956
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8057412505149841
Out-of-domain eval_accuracy: 0.5465116279069767
Out-of-domain eval_runtime: 3.5046
Out-of-domain eval_samples_per_second: 147.234
Out-of-domain eval_steps_per_second: 18.547
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.8257512450218201
In-domain eval_accuracy: 0.5225311601150527
In-domain eval_runtime: 7.1244
In-domain eval_samples_per_second: 146.397
In-domain eval_steps_per_second: 18.387
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8057412505149841
Out-of-domain eval_accuracy: 0.5465116279069767
Out-of-domain eval_runtime: 3.5992
Out-of-domain eval_samples_per_second: 143.364
Out-of-domain eval_steps_per_second: 18.059
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.8257512450218201
In-domain eval_accuracy: 0.5225311601150527
In-domain eval_runtime: 7.3561
In-domain eval_samples_per_second: 141.787
In-domain eval_steps_per_second: 17.808
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8057412505149841
Out-of-domain eval_accuracy: 0.5465116279069767
Out-of-domain eval_runtime: 3.7334
Out-of-domain eval_samples_per_second: 138.211
Out-of-domain eval_steps_per_second: 17.41
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.8257512450218201
In-domain eval_accuracy: 0.5225311601150527
In-domain eval_runtime: 7.4905
In-domain eval_samples_per_second: 139.243
In-domain eval_steps_per_second: 17.489
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8057412505149841
Out-of-domain eval_accuracy: 0.5465116279069767
Out-of-domain eval_runtime: 3.6989
Out-of-domain eval_samples_per_second: 139.5
Out-of-domain eval_steps_per_second: 17.573
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.8257512450218201
In-domain eval_accuracy: 0.5225311601150527
In-domain eval_runtime: 7.2959
In-domain eval_samples_per_second: 142.957
In-domain eval_steps_per_second: 17.955
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8057412505149841
Out-of-domain eval_accuracy: 0.5465116279069767
Out-of-domain eval_runtime: 3.6367
Out-of-domain eval_samples_per_second: 141.889
Out-of-domain eval_steps_per_second: 17.874
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.8257512450218201
In-domain eval_accuracy: 0.5225311601150527
In-domain eval_runtime: 7.1941
In-domain eval_samples_per_second: 144.98
In-domain eval_steps_per_second: 18.209
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8057412505149841
Out-of-domain eval_accuracy: 0.5465116279069767
Out-of-domain eval_runtime: 3.6118
Out-of-domain eval_samples_per_second: 142.864
Out-of-domain eval_steps_per_second: 17.996
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.8257512450218201
In-domain eval_accuracy: 0.5225311601150527
In-domain eval_runtime: 7.207
In-domain eval_samples_per_second: 144.719
In-domain eval_steps_per_second: 18.177
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8057412505149841
Out-of-domain eval_accuracy: 0.5465116279069767
Out-of-domain eval_runtime: 3.6222
Out-of-domain eval_samples_per_second: 142.453
Out-of-domain eval_steps_per_second: 17.945
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.8257512450218201
In-domain eval_accuracy: 0.5225311601150527
In-domain eval_runtime: 7.2606
In-domain eval_samples_per_second: 143.652
In-domain eval_steps_per_second: 18.043
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8057412505149841
Out-of-domain eval_accuracy: 0.5465116279069767
Out-of-domain eval_runtime: 3.6431
Out-of-domain eval_samples_per_second: 141.637
Out-of-domain eval_steps_per_second: 17.842
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 0.8257512450218201
In-domain eval_accuracy: 0.5225311601150527
In-domain eval_runtime: 7.3046
In-domain eval_samples_per_second: 142.787
In-domain eval_steps_per_second: 17.934
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8057412505149841
Out-of-domain eval_accuracy: 0.5465116279069767
Out-of-domain eval_runtime: 3.6684
Out-of-domain eval_samples_per_second: 140.663
Out-of-domain eval_steps_per_second: 17.719
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8661156892776489
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 7.2767
In-domain eval_samples_per_second: 143.334
In-domain eval_steps_per_second: 18.003
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8563407063484192
Out-of-domain eval_accuracy: 0.5310077519379846
Out-of-domain eval_runtime: 3.6426
Out-of-domain eval_samples_per_second: 141.656
Out-of-domain eval_steps_per_second: 17.844
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8661156892776489
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 7.2695
In-domain eval_samples_per_second: 143.477
In-domain eval_steps_per_second: 18.021
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8563407063484192
Out-of-domain eval_accuracy: 0.5310077519379846
Out-of-domain eval_runtime: 3.6326
Out-of-domain eval_samples_per_second: 142.046
Out-of-domain eval_steps_per_second: 17.893
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8661156892776489
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 7.2727
In-domain eval_samples_per_second: 143.412
In-domain eval_steps_per_second: 18.012
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8563407063484192
Out-of-domain eval_accuracy: 0.5310077519379846
Out-of-domain eval_runtime: 3.6409
Out-of-domain eval_samples_per_second: 141.722
Out-of-domain eval_steps_per_second: 17.853
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8661156892776489
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 7.2929
In-domain eval_samples_per_second: 143.015
In-domain eval_steps_per_second: 17.963
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8563407063484192
Out-of-domain eval_accuracy: 0.5310077519379846
Out-of-domain eval_runtime: 3.6601
Out-of-domain eval_samples_per_second: 140.98
Out-of-domain eval_steps_per_second: 17.759
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8661156892776489
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 7.2949
In-domain eval_samples_per_second: 142.977
In-domain eval_steps_per_second: 17.958
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8563407063484192
Out-of-domain eval_accuracy: 0.5310077519379846
Out-of-domain eval_runtime: 3.6587
Out-of-domain eval_samples_per_second: 141.032
Out-of-domain eval_steps_per_second: 17.766
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8661156892776489
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 7.2933
In-domain eval_samples_per_second: 143.007
In-domain eval_steps_per_second: 17.962
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8563407063484192
Out-of-domain eval_accuracy: 0.5310077519379846
Out-of-domain eval_runtime: 3.6746
Out-of-domain eval_samples_per_second: 140.425
Out-of-domain eval_steps_per_second: 17.689
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8661156892776489
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 7.2874
In-domain eval_samples_per_second: 143.123
In-domain eval_steps_per_second: 17.976
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8563407063484192
Out-of-domain eval_accuracy: 0.5310077519379846
Out-of-domain eval_runtime: 3.6648
Out-of-domain eval_samples_per_second: 140.798
Out-of-domain eval_steps_per_second: 17.736
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8661156892776489
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 7.2887
In-domain eval_samples_per_second: 143.098
In-domain eval_steps_per_second: 17.973
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8563407063484192
Out-of-domain eval_accuracy: 0.5310077519379846
Out-of-domain eval_runtime: 3.6469
Out-of-domain eval_samples_per_second: 141.488
Out-of-domain eval_steps_per_second: 17.823
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8661156892776489
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 7.2806
In-domain eval_samples_per_second: 143.257
In-domain eval_steps_per_second: 17.993
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8563407063484192
Out-of-domain eval_accuracy: 0.5310077519379846
Out-of-domain eval_runtime: 3.6329
Out-of-domain eval_samples_per_second: 142.035
Out-of-domain eval_steps_per_second: 17.892
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8661156892776489
In-domain eval_accuracy: 0.5273250239693192
In-domain eval_runtime: 7.2913
In-domain eval_samples_per_second: 143.047
In-domain eval_steps_per_second: 17.967
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.8563407063484192
Out-of-domain eval_accuracy: 0.5310077519379846
Out-of-domain eval_runtime: 3.6316
Out-of-domain eval_samples_per_second: 142.087
Out-of-domain eval_steps_per_second: 17.899
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 1.0942018032073975
In-domain eval_accuracy: 0.573346116970278
In-domain eval_runtime: 7.3152
In-domain eval_samples_per_second: 142.579
In-domain eval_steps_per_second: 17.908
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 1.0560842752456665
Out-of-domain eval_accuracy: 0.5852713178294574
Out-of-domain eval_runtime: 3.6762
Out-of-domain eval_samples_per_second: 140.363
Out-of-domain eval_steps_per_second: 17.681
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.9714317917823792
In-domain eval_accuracy: 0.5618408437200384
In-domain eval_runtime: 7.2838
In-domain eval_samples_per_second: 143.194
In-domain eval_steps_per_second: 17.985
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.9183535575866699
Out-of-domain eval_accuracy: 0.5484496124031008
Out-of-domain eval_runtime: 3.645
Out-of-domain eval_samples_per_second: 141.564
Out-of-domain eval_steps_per_second: 17.833
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.9714317917823792
In-domain eval_accuracy: 0.5618408437200384
In-domain eval_runtime: 7.2971
In-domain eval_samples_per_second: 142.933
In-domain eval_steps_per_second: 17.952
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.9183535575866699
Out-of-domain eval_accuracy: 0.5484496124031008
Out-of-domain eval_runtime: 3.6535
Out-of-domain eval_samples_per_second: 141.233
Out-of-domain eval_steps_per_second: 17.791
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.9714317917823792
In-domain eval_accuracy: 0.5618408437200384
In-domain eval_runtime: 7.3052
In-domain eval_samples_per_second: 142.776
In-domain eval_steps_per_second: 17.933
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.9183535575866699
Out-of-domain eval_accuracy: 0.5484496124031008
Out-of-domain eval_runtime: 3.6589
Out-of-domain eval_samples_per_second: 141.026
Out-of-domain eval_steps_per_second: 17.765
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.9714317917823792
In-domain eval_accuracy: 0.5618408437200384
In-domain eval_runtime: 7.2893
In-domain eval_samples_per_second: 143.086
In-domain eval_steps_per_second: 17.971
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.9183535575866699
Out-of-domain eval_accuracy: 0.5484496124031008
Out-of-domain eval_runtime: 3.6685
Out-of-domain eval_samples_per_second: 140.656
Out-of-domain eval_steps_per_second: 17.718
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.9714317917823792
In-domain eval_accuracy: 0.5618408437200384
In-domain eval_runtime: 7.2897
In-domain eval_samples_per_second: 143.078
In-domain eval_steps_per_second: 17.97
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.9183535575866699
Out-of-domain eval_accuracy: 0.5484496124031008
Out-of-domain eval_runtime: 3.6566
Out-of-domain eval_samples_per_second: 141.116
Out-of-domain eval_steps_per_second: 17.776
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.9714317917823792
In-domain eval_accuracy: 0.5618408437200384
In-domain eval_runtime: 7.2639
In-domain eval_samples_per_second: 143.586
In-domain eval_steps_per_second: 18.034
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.9183535575866699
Out-of-domain eval_accuracy: 0.5484496124031008
Out-of-domain eval_runtime: 3.6551
Out-of-domain eval_samples_per_second: 141.171
Out-of-domain eval_steps_per_second: 17.783
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.9714317917823792
In-domain eval_accuracy: 0.5618408437200384
In-domain eval_runtime: 7.2708
In-domain eval_samples_per_second: 143.45
In-domain eval_steps_per_second: 18.017
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.9183535575866699
Out-of-domain eval_accuracy: 0.5484496124031008
Out-of-domain eval_runtime: 3.6596
Out-of-domain eval_samples_per_second: 140.999
Out-of-domain eval_steps_per_second: 17.761
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.9714317917823792
In-domain eval_accuracy: 0.5618408437200384
In-domain eval_runtime: 7.3009
In-domain eval_samples_per_second: 142.858
In-domain eval_steps_per_second: 17.943
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.9183535575866699
Out-of-domain eval_accuracy: 0.5484496124031008
Out-of-domain eval_runtime: 3.6358
Out-of-domain eval_samples_per_second: 141.92
Out-of-domain eval_steps_per_second: 17.878
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.9714317917823792
In-domain eval_accuracy: 0.5618408437200384
In-domain eval_runtime: 7.2888
In-domain eval_samples_per_second: 143.095
In-domain eval_steps_per_second: 17.973
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 0.9183535575866699
Out-of-domain eval_accuracy: 0.5484496124031008
Out-of-domain eval_runtime: 3.636
Out-of-domain eval_samples_per_second: 141.912
Out-of-domain eval_steps_per_second: 17.877
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.4793576002120972
In-domain eval_accuracy: 0.5944391179290508
In-domain eval_runtime: 7.2881
In-domain eval_samples_per_second: 143.11
In-domain eval_steps_per_second: 17.974
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 1.3956272602081299
Out-of-domain eval_accuracy: 0.5910852713178295
Out-of-domain eval_runtime: 3.6396
Out-of-domain eval_samples_per_second: 141.772
Out-of-domain eval_steps_per_second: 17.859
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.94387686252594
In-domain eval_accuracy: 0.588686481303931
In-domain eval_runtime: 7.2791
In-domain eval_samples_per_second: 143.288
In-domain eval_steps_per_second: 17.997
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 1.8603342771530151
Out-of-domain eval_accuracy: 0.5717054263565892
Out-of-domain eval_runtime: 3.6481
Out-of-domain eval_samples_per_second: 141.444
Out-of-domain eval_steps_per_second: 17.818
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.94387686252594
In-domain eval_accuracy: 0.588686481303931
In-domain eval_runtime: 7.286
In-domain eval_samples_per_second: 143.151
In-domain eval_steps_per_second: 17.98
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 1.8603342771530151
Out-of-domain eval_accuracy: 0.5717054263565892
Out-of-domain eval_runtime: 3.6633
Out-of-domain eval_samples_per_second: 140.855
Out-of-domain eval_steps_per_second: 17.743
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.94387686252594
In-domain eval_accuracy: 0.588686481303931
In-domain eval_runtime: 7.2965
In-domain eval_samples_per_second: 142.945
In-domain eval_steps_per_second: 17.954
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 1.8603342771530151
Out-of-domain eval_accuracy: 0.5717054263565892
Out-of-domain eval_runtime: 3.6613
Out-of-domain eval_samples_per_second: 140.934
Out-of-domain eval_steps_per_second: 17.753
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.94387686252594
In-domain eval_accuracy: 0.588686481303931
In-domain eval_runtime: 7.2903
In-domain eval_samples_per_second: 143.066
In-domain eval_steps_per_second: 17.969
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 1.8603342771530151
Out-of-domain eval_accuracy: 0.5717054263565892
Out-of-domain eval_runtime: 3.6644
Out-of-domain eval_samples_per_second: 140.815
Out-of-domain eval_steps_per_second: 17.738
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.94387686252594
In-domain eval_accuracy: 0.588686481303931
In-domain eval_runtime: 7.268
In-domain eval_samples_per_second: 143.506
In-domain eval_steps_per_second: 18.024
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 1.8603342771530151
Out-of-domain eval_accuracy: 0.5717054263565892
Out-of-domain eval_runtime: 3.631
Out-of-domain eval_samples_per_second: 142.109
Out-of-domain eval_steps_per_second: 17.901
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.94387686252594
In-domain eval_accuracy: 0.588686481303931
In-domain eval_runtime: 7.3336
In-domain eval_samples_per_second: 142.222
In-domain eval_steps_per_second: 17.863
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 1.8603342771530151
Out-of-domain eval_accuracy: 0.5717054263565892
Out-of-domain eval_runtime: 3.6683
Out-of-domain eval_samples_per_second: 140.664
Out-of-domain eval_steps_per_second: 17.719
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.94387686252594
In-domain eval_accuracy: 0.588686481303931
In-domain eval_runtime: 7.3255
In-domain eval_samples_per_second: 142.38
In-domain eval_steps_per_second: 17.883
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 1.8603342771530151
Out-of-domain eval_accuracy: 0.5717054263565892
Out-of-domain eval_runtime: 3.7022
Out-of-domain eval_samples_per_second: 139.376
Out-of-domain eval_steps_per_second: 17.557
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.94387686252594
In-domain eval_accuracy: 0.588686481303931
In-domain eval_runtime: 7.2871
In-domain eval_samples_per_second: 143.13
In-domain eval_steps_per_second: 17.977
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 1.8603342771530151
Out-of-domain eval_accuracy: 0.5717054263565892
Out-of-domain eval_runtime: 3.658
Out-of-domain eval_samples_per_second: 141.061
Out-of-domain eval_steps_per_second: 17.769
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.94387686252594
In-domain eval_accuracy: 0.588686481303931
In-domain eval_runtime: 7.3353
In-domain eval_samples_per_second: 142.189
In-domain eval_steps_per_second: 17.859
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 1.8603342771530151
Out-of-domain eval_accuracy: 0.5717054263565892
Out-of-domain eval_runtime: 3.6809
Out-of-domain eval_samples_per_second: 140.182
Out-of-domain eval_steps_per_second: 17.659
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.445613145828247
In-domain eval_accuracy: 0.6308724832214765
In-domain eval_runtime: 7.333
In-domain eval_samples_per_second: 142.234
In-domain eval_steps_per_second: 17.864
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 2.3669300079345703
Out-of-domain eval_accuracy: 0.624031007751938
Out-of-domain eval_runtime: 3.69
Out-of-domain eval_samples_per_second: 139.836
Out-of-domain eval_steps_per_second: 17.615
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.298306941986084
In-domain eval_accuracy: 0.6279961649089166
In-domain eval_runtime: 7.3165
In-domain eval_samples_per_second: 142.555
In-domain eval_steps_per_second: 17.905
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 2.285783052444458
Out-of-domain eval_accuracy: 0.6201550387596899
Out-of-domain eval_runtime: 3.672
Out-of-domain eval_samples_per_second: 140.523
Out-of-domain eval_steps_per_second: 17.701
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.298306941986084
In-domain eval_accuracy: 0.6279961649089166
In-domain eval_runtime: 7.3207
In-domain eval_samples_per_second: 142.472
In-domain eval_steps_per_second: 17.894
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 2.285783052444458
Out-of-domain eval_accuracy: 0.6201550387596899
Out-of-domain eval_runtime: 3.6765
Out-of-domain eval_samples_per_second: 140.35
Out-of-domain eval_steps_per_second: 17.68
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.298306941986084
In-domain eval_accuracy: 0.6279961649089166
In-domain eval_runtime: 7.3075
In-domain eval_samples_per_second: 142.729
In-domain eval_steps_per_second: 17.927
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 2.285783052444458
Out-of-domain eval_accuracy: 0.6201550387596899
Out-of-domain eval_runtime: 3.6666
Out-of-domain eval_samples_per_second: 140.729
Out-of-domain eval_steps_per_second: 17.727
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.298306941986084
In-domain eval_accuracy: 0.6279961649089166
In-domain eval_runtime: 7.3224
In-domain eval_samples_per_second: 142.439
In-domain eval_steps_per_second: 17.89
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 2.285783052444458
Out-of-domain eval_accuracy: 0.6201550387596899
Out-of-domain eval_runtime: 3.6617
Out-of-domain eval_samples_per_second: 140.918
Out-of-domain eval_steps_per_second: 17.751
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.298306941986084
In-domain eval_accuracy: 0.6279961649089166
In-domain eval_runtime: 7.2905
In-domain eval_samples_per_second: 143.064
In-domain eval_steps_per_second: 17.969
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 2.285783052444458
Out-of-domain eval_accuracy: 0.6201550387596899
Out-of-domain eval_runtime: 3.6543
Out-of-domain eval_samples_per_second: 141.203
Out-of-domain eval_steps_per_second: 17.787
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.298306941986084
In-domain eval_accuracy: 0.6279961649089166
In-domain eval_runtime: 7.3189
In-domain eval_samples_per_second: 142.508
In-domain eval_steps_per_second: 17.899
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 2.285783052444458
Out-of-domain eval_accuracy: 0.6201550387596899
Out-of-domain eval_runtime: 3.6783
Out-of-domain eval_samples_per_second: 140.281
Out-of-domain eval_steps_per_second: 17.671
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.298306941986084
In-domain eval_accuracy: 0.6279961649089166
In-domain eval_runtime: 7.2728
In-domain eval_samples_per_second: 143.411
In-domain eval_steps_per_second: 18.012
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 2.285783052444458
Out-of-domain eval_accuracy: 0.6201550387596899
Out-of-domain eval_runtime: 3.6602
Out-of-domain eval_samples_per_second: 140.976
Out-of-domain eval_steps_per_second: 17.759
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.298306941986084
In-domain eval_accuracy: 0.6279961649089166
In-domain eval_runtime: 7.2719
In-domain eval_samples_per_second: 143.429
In-domain eval_steps_per_second: 18.015
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Out-of-domain eval_loss: 2.285783052444458
Out-of-domain eval_accuracy: 0.6201550387596899
Out-of-domain eval_runtime: 3.647
Out-of-domain eval_samples_per_second: 141.486
Out-of-domain eval_steps_per_second: 17.823
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 2.298306941986084
In-domain eval_accuracy: 0.6279961649089166
In-domain eval_runtime: 7.2979
In-domain eval_samples_per_second: 142.918
In-domain eval_steps_per_second: 17.95
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.285783052444458
Out-of-domain eval_accuracy: 0.6201550387596899
Out-of-domain eval_runtime: 3.6493
Out-of-domain eval_samples_per_second: 141.398
Out-of-domain eval_steps_per_second: 17.812
Out-of-domain epoch: 40.0


In [6]:
# Group the results by 'n' and compute the average and maximum performance for each group
grouped_results = results_df.groupby('n')

average_in_domain_accuracy = grouped_results['in_domain_accuracy'].mean()
maximum_in_domain_accuracy = grouped_results['in_domain_accuracy'].max()

average_out_of_domain_accuracy = grouped_results['out_of_domain_accuracy'].mean()
maximum_out_of_domain_accuracy = grouped_results['out_of_domain_accuracy'].max()

# Print the results for each 'n'
for n in few_shot_sample_size:
    print(f"For n={n}:")
    print(f"Average in-domain accuracy: {average_in_domain_accuracy[n]}")
    print(f"Maximum in-domain accuracy: {maximum_in_domain_accuracy[n]}")
    print(f"Average out-of-domain accuracy: {average_out_of_domain_accuracy[n]}")
    print(f"Maximum out-of-domain accuracy: {maximum_out_of_domain_accuracy[n]}")
    print("\n")

# Compute and print the overall average and maximum performance
print("Overall:")
print(f"Average in-domain accuracy: {results_df['in_domain_accuracy'].mean()}")
print(f"Maximum in-domain accuracy: {results_df['in_domain_accuracy'].max()}")
print(f"Average out-of-domain accuracy: {results_df['out_of_domain_accuracy'].mean()}")
print(f"Maximum out-of-domain accuracy: {results_df['out_of_domain_accuracy'].max()}")


For n=2:
Average in-domain accuracy: 0.5312559923298179
Maximum in-domain accuracy: 0.6097794822627037
Average out-of-domain accuracy: 0.5531007751937984
Maximum out-of-domain accuracy: 0.6124031007751938


For n=16:
Average in-domain accuracy: 0.5273250239693192
Maximum in-domain accuracy: 0.5273250239693192
Average out-of-domain accuracy: 0.5310077519379846
Maximum out-of-domain accuracy: 0.5310077519379846


For n=32:
Average in-domain accuracy: 0.5629913710450624
Maximum in-domain accuracy: 0.573346116970278
Average out-of-domain accuracy: 0.5521317829457364
Maximum out-of-domain accuracy: 0.5852713178294574


For n=64:
Average in-domain accuracy: 0.589261744966443
Maximum in-domain accuracy: 0.5944391179290508
Average out-of-domain accuracy: 0.5736434108527132
Maximum out-of-domain accuracy: 0.5910852713178295


For n=128:
Average in-domain accuracy: 0.6282837967401727
Maximum in-domain accuracy: 0.6308724832214765
Average out-of-domain accuracy: 0.6205426356589147
Maximum out-of-

In [7]:
# Save the DataFrame to a CSV file
curr_filename = "vanilla_cola_baseline_125M"
results_df.to_csv(f'../Results/{curr_filename}.csv', sep = ',', index=False)

In [None]:
# disconnect runtime
from google.colab import runtime
runtime.unassign()