# **Few shot Fine Tuning on MNLI**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/DL_project/llm_finetuning/notebooks/

/content/drive/MyDrive/DL_project/llm_finetuning/notebooks


In [None]:
!pip install -q transformers accelerate bitsandbytes datasets

## **Experimental setup**

**Fewshot setup :** Each n example uses 10 different sets of training to avoid bias

N ranges {2, 16, 32, 64, 128}

In [None]:
# load dataset
from datasets import load_dataset, ClassLabel
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import torch

# for reproducibility
np.random.seed(42)

torch.manual_seed(42)

if torch.cuda.is_available():
  torch.cuda.manual_seed_all(42)

data = load_dataset("glue", "mnli")
hans_data = load_dataset("hans")
#print(hans_data.keys())

#Below function is taken from: https://github.com/uds-lsv/llmft/blob/main/notebooks/majority_baseline.ipynb
def binarize_mnli(dataset, remove_neutral=True):
    if remove_neutral:
        # neutral class has label 1
        dataset = dataset.filter(lambda example: example["label"] != 1)

    # change labels of contradiction examples from 2 to 1
    def change_label(example):
        # convert labels 2 into labels 1. this merges the neutral and contradiction class
        example["label"] = 1 if example["label"] == 2 else example["label"]
        return example

    # change labels
    dataset = dataset.map(change_label)

    # change features to reflect the new labels
    features = dataset["train"].features.copy()
    features["label"] = ClassLabel(num_classes=2, names=['entailment', 'contradiction'], id=None)
    dataset = dataset.cast(features)  # overwrite old features

    return dataset

data = binarize_mnli(data, remove_neutral=True)

#function for computing accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

# Prepare training data and define training config
#!mkdir offload_folder

from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, AdamW, AutoConfig

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

# Change num_labels to 2 and drop-out hyperparam = 0.1
config = AutoConfig.from_pretrained("facebook/opt-350m", num_labels=2, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1)


def manipulate_inputs(batch):
    # Add pattern and verbalizer
    encoding = tokenizer([f'{premise} Question: {hypothesis} Yes or No?' for premise, hypothesis in zip(batch["premise"], batch["hypothesis"])], truncation=True, padding="max_length", max_length=128)
    batch["input_ids"] = encoding["input_ids"]  # Already a list
    batch["attention_mask"] = encoding["attention_mask"]  # Already a list
    return batch

data = data.map(manipulate_inputs, batched=True)
hans_data = hans_data.map(manipulate_inputs, batched=True)

# Few-shot setup
n_values = [2, 16, 32, 64, 128]  # number of examples for each class

# Create a DataFrame to store the results
results_df = pd.DataFrame(columns=["n", "run", "in_domain_accuracy", "out_of_domain_accuracy"])


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for n in n_values:
    for run in range(10):  # repeat 10 times for each n

        # Create a copy of the dataset
        data_copy = data.copy()

        # Select n random examples for each class from the original data
        indices_yes = np.where(np.array(data["train"]["label"]) == 0)[0]
        indices_no = np.where(np.array(data["train"]["label"]) == 1)[0]
        indices_yes = np.random.choice(indices_yes, n, replace=False)
        indices_no = np.random.choice(indices_no, n, replace=False)
        indices = np.concatenate([indices_yes, indices_no])

        # Select the examples for the new training set
        train_dataset = data["train"].select(indices)

        # Re-initialize the model at the start of each training cycle
        model = AutoModelForSequenceClassification.from_pretrained("facebook/opt-350m", config=config)
        model.to(device)  # Move the model to the device (gpu if available)

        # Training config
        total_steps = (len(train_dataset) // 32) * 40 # Total steps = (#samples/batch size) * epochs

        training_args = TrainingArguments(
            output_dir="./offload_folder",
            overwrite_output_dir=True,
            num_train_epochs=40,
            per_device_train_batch_size=32,
            learning_rate=1e-5,
            weight_decay=0.0,
            save_steps=10_000,
            save_total_limit=2,
            warmup_steps=int(0.1 * total_steps), # Warmup ratio = 10% of total steps
        )

        # Define the trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            compute_metrics=compute_metrics,
        )

        # Train the model
        trainer.train()

        # Evaluate in-domain performance
        print(f"Evaluating in-domain performance for n={n}...")
        eval_results = trainer.evaluate(eval_dataset=data["validation_matched"])

        # Store the in-domain accuracy
        in_domain_accuracy = eval_results["eval_accuracy"]

        # Print the in-domain evaluation results
        for key, value in eval_results.items():
            print(f"In-domain {key}: {value}")

        # Evaluate out-of-domain performance using HANS dataset
        print(f"Evaluating out-of-domain performance for n={n}...")
        eval_results = trainer.evaluate(eval_dataset=hans_data["validation"])

        # Store the out-of-domain accuracy
        out_of_domain_accuracy = eval_results["eval_accuracy"]

        # Print the out-of-domain evaluation results
        for key, value in eval_results.items():
            print(f"Out-of-domain {key}: {value}")

        # Add the results to the DataFrame
        new_row = pd.DataFrame({
            "n": [n],
            "run": [run],
            "in_domain_accuracy": [in_domain_accuracy],
            "out_of_domain_accuracy": [out_of_domain_accuracy]
        })
        results_df = pd.concat([results_df, new_row], ignore_index=True)


Map:   0%|          | 0/6692 [00:00<?, ? examples/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.9756306409835815
In-domain eval_accuracy: 0.5312313209802749
In-domain eval_runtime: 155.4924
In-domain eval_samples_per_second: 43.037
In-domain eval_steps_per_second: 5.383
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 3.2422854900360107
Out-of-domain eval_accuracy: 0.5000333333333333
Out-of-domain eval_runtime: 682.1642
Out-of-domain eval_samples_per_second: 43.978
Out-of-domain eval_steps_per_second: 5.497
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 2.033245325088501
In-domain eval_accuracy: 0.5156903765690377
In-domain eval_runtime: 155.0631
In-domain eval_samples_per_second: 43.157
In-domain eval_steps_per_second: 5.398
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 1.3191864490509033
Out-of-domain eval_accuracy: 0.49496666666666667
Out-of-domain eval_runtime: 682.4437
Out-of-domain eval_samples_per_second: 43.96
Out-of-domain eval_steps_per_second: 5.495
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 2.033245325088501
In-domain eval_accuracy: 0.5156903765690377
In-domain eval_runtime: 155.338
In-domain eval_samples_per_second: 43.08
In-domain eval_steps_per_second: 5.388
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 1.3191864490509033
Out-of-domain eval_accuracy: 0.49496666666666667
Out-of-domain eval_runtime: 683.2365
Out-of-domain eval_samples_per_second: 43.909
Out-of-domain eval_steps_per_second: 5.489
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 2.033245325088501
In-domain eval_accuracy: 0.5156903765690377
In-domain eval_runtime: 154.5597
In-domain eval_samples_per_second: 43.297
In-domain eval_steps_per_second: 5.415
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 1.3191864490509033
Out-of-domain eval_accuracy: 0.49496666666666667
Out-of-domain eval_runtime: 681.7487
Out-of-domain eval_samples_per_second: 44.004
Out-of-domain eval_steps_per_second: 5.501
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 2.033245325088501
In-domain eval_accuracy: 0.5156903765690377
In-domain eval_runtime: 155.0635
In-domain eval_samples_per_second: 43.157
In-domain eval_steps_per_second: 5.398
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 1.3191864490509033
Out-of-domain eval_accuracy: 0.49496666666666667
Out-of-domain eval_runtime: 681.8916
Out-of-domain eval_samples_per_second: 43.995
Out-of-domain eval_steps_per_second: 5.499
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 2.033245325088501
In-domain eval_accuracy: 0.5156903765690377
In-domain eval_runtime: 154.8985
In-domain eval_samples_per_second: 43.202
In-domain eval_steps_per_second: 5.404
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 1.3191864490509033
Out-of-domain eval_accuracy: 0.49496666666666667
Out-of-domain eval_runtime: 681.5403
Out-of-domain eval_samples_per_second: 44.018
Out-of-domain eval_steps_per_second: 5.502
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 2.033245325088501
In-domain eval_accuracy: 0.5156903765690377
In-domain eval_runtime: 154.4273
In-domain eval_samples_per_second: 43.334
In-domain eval_steps_per_second: 5.42
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 1.3191864490509033
Out-of-domain eval_accuracy: 0.49496666666666667
Out-of-domain eval_runtime: 681.3772
Out-of-domain eval_samples_per_second: 44.028
Out-of-domain eval_steps_per_second: 5.504
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 2.033245325088501
In-domain eval_accuracy: 0.5156903765690377
In-domain eval_runtime: 154.542
In-domain eval_samples_per_second: 43.302
In-domain eval_steps_per_second: 5.416
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 1.3191864490509033
Out-of-domain eval_accuracy: 0.49496666666666667
Out-of-domain eval_runtime: 681.7692
Out-of-domain eval_samples_per_second: 44.003
Out-of-domain eval_steps_per_second: 5.5
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 2.033245325088501
In-domain eval_accuracy: 0.5156903765690377
In-domain eval_runtime: 154.5532
In-domain eval_samples_per_second: 43.299
In-domain eval_steps_per_second: 5.416
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 1.3191864490509033
Out-of-domain eval_accuracy: 0.49496666666666667
Out-of-domain eval_runtime: 681.5787
Out-of-domain eval_samples_per_second: 44.015
Out-of-domain eval_steps_per_second: 5.502
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 2.033245325088501
In-domain eval_accuracy: 0.5156903765690377
In-domain eval_runtime: 154.7481
In-domain eval_samples_per_second: 43.244
In-domain eval_steps_per_second: 5.409
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 1.3191864490509033
Out-of-domain eval_accuracy: 0.49496666666666667
Out-of-domain eval_runtime: 681.9866
Out-of-domain eval_samples_per_second: 43.989
Out-of-domain eval_steps_per_second: 5.499
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.8936339616775513
In-domain eval_accuracy: 0.5499103407053197
In-domain eval_runtime: 154.6043
In-domain eval_samples_per_second: 43.285
In-domain eval_steps_per_second: 5.414
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.900935649871826
Out-of-domain eval_accuracy: 0.4902666666666667
Out-of-domain eval_runtime: 681.8735
Out-of-domain eval_samples_per_second: 43.996
Out-of-domain eval_steps_per_second: 5.5
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.8936339616775513
In-domain eval_accuracy: 0.5499103407053197
In-domain eval_runtime: 154.6713
In-domain eval_samples_per_second: 43.266
In-domain eval_steps_per_second: 5.411
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.900935649871826
Out-of-domain eval_accuracy: 0.4902666666666667
Out-of-domain eval_runtime: 682.3067
Out-of-domain eval_samples_per_second: 43.968
Out-of-domain eval_steps_per_second: 5.496
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.8936339616775513
In-domain eval_accuracy: 0.5499103407053197
In-domain eval_runtime: 155.3803
In-domain eval_samples_per_second: 43.069
In-domain eval_steps_per_second: 5.387
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.900935649871826
Out-of-domain eval_accuracy: 0.4902666666666667
Out-of-domain eval_runtime: 681.9484
Out-of-domain eval_samples_per_second: 43.992
Out-of-domain eval_steps_per_second: 5.499
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.8936339616775513
In-domain eval_accuracy: 0.5499103407053197
In-domain eval_runtime: 154.5849
In-domain eval_samples_per_second: 43.29
In-domain eval_steps_per_second: 5.414
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.900935649871826
Out-of-domain eval_accuracy: 0.4902666666666667
Out-of-domain eval_runtime: 682.4266
Out-of-domain eval_samples_per_second: 43.961
Out-of-domain eval_steps_per_second: 5.495
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.8936339616775513
In-domain eval_accuracy: 0.5499103407053197
In-domain eval_runtime: 154.7335
In-domain eval_samples_per_second: 43.249
In-domain eval_steps_per_second: 5.409
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.900935649871826
Out-of-domain eval_accuracy: 0.4902666666666667
Out-of-domain eval_runtime: 682.7206
Out-of-domain eval_samples_per_second: 43.942
Out-of-domain eval_steps_per_second: 5.493
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.8936339616775513
In-domain eval_accuracy: 0.5499103407053197
In-domain eval_runtime: 154.3571
In-domain eval_samples_per_second: 43.354
In-domain eval_steps_per_second: 5.422
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.900935649871826
Out-of-domain eval_accuracy: 0.4902666666666667
Out-of-domain eval_runtime: 682.012
Out-of-domain eval_samples_per_second: 43.987
Out-of-domain eval_steps_per_second: 5.498
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.8936339616775513
In-domain eval_accuracy: 0.5499103407053197
In-domain eval_runtime: 154.7828
In-domain eval_samples_per_second: 43.235
In-domain eval_steps_per_second: 5.408
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.900935649871826
Out-of-domain eval_accuracy: 0.4902666666666667
Out-of-domain eval_runtime: 682.2772
Out-of-domain eval_samples_per_second: 43.97
Out-of-domain eval_steps_per_second: 5.496
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.8936339616775513
In-domain eval_accuracy: 0.5499103407053197
In-domain eval_runtime: 154.8069
In-domain eval_samples_per_second: 43.228
In-domain eval_steps_per_second: 5.407
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.900935649871826
Out-of-domain eval_accuracy: 0.4902666666666667
Out-of-domain eval_runtime: 682.6333
Out-of-domain eval_samples_per_second: 43.947
Out-of-domain eval_steps_per_second: 5.493
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.8936339616775513
In-domain eval_accuracy: 0.5499103407053197
In-domain eval_runtime: 155.34
In-domain eval_samples_per_second: 43.08
In-domain eval_steps_per_second: 5.388
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.900935649871826
Out-of-domain eval_accuracy: 0.4902666666666667
Out-of-domain eval_runtime: 682.4967
Out-of-domain eval_samples_per_second: 43.956
Out-of-domain eval_steps_per_second: 5.495
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 1.8936339616775513
In-domain eval_accuracy: 0.5499103407053197
In-domain eval_runtime: 154.6476
In-domain eval_samples_per_second: 43.273
In-domain eval_steps_per_second: 5.412
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.900935649871826
Out-of-domain eval_accuracy: 0.4902666666666667
Out-of-domain eval_runtime: 682.9327
Out-of-domain eval_samples_per_second: 43.928
Out-of-domain eval_steps_per_second: 5.491
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.9510600566864014
In-domain eval_accuracy: 0.5431858936043037
In-domain eval_runtime: 155.3952
In-domain eval_samples_per_second: 43.064
In-domain eval_steps_per_second: 5.386
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 4.548200607299805
Out-of-domain eval_accuracy: 0.49846666666666667
Out-of-domain eval_runtime: 682.9569
Out-of-domain eval_samples_per_second: 43.927
Out-of-domain eval_steps_per_second: 5.491
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.279772996902466
In-domain eval_accuracy: 0.5364614465032875
In-domain eval_runtime: 155.1028
In-domain eval_samples_per_second: 43.146
In-domain eval_steps_per_second: 5.396
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.0191891193389893
Out-of-domain eval_accuracy: 0.47756666666666664
Out-of-domain eval_runtime: 682.1235
Out-of-domain eval_samples_per_second: 43.98
Out-of-domain eval_steps_per_second: 5.498
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.279772996902466
In-domain eval_accuracy: 0.5364614465032875
In-domain eval_runtime: 155.1284
In-domain eval_samples_per_second: 43.138
In-domain eval_steps_per_second: 5.396
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.0191891193389893
Out-of-domain eval_accuracy: 0.47756666666666664
Out-of-domain eval_runtime: 682.2721
Out-of-domain eval_samples_per_second: 43.971
Out-of-domain eval_steps_per_second: 5.496
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.279772996902466
In-domain eval_accuracy: 0.5364614465032875
In-domain eval_runtime: 155.1283
In-domain eval_samples_per_second: 43.138
In-domain eval_steps_per_second: 5.396
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.0191891193389893
Out-of-domain eval_accuracy: 0.47756666666666664
Out-of-domain eval_runtime: 683.1157
Out-of-domain eval_samples_per_second: 43.916
Out-of-domain eval_steps_per_second: 5.49
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.279772996902466
In-domain eval_accuracy: 0.5364614465032875
In-domain eval_runtime: 154.7812
In-domain eval_samples_per_second: 43.235
In-domain eval_steps_per_second: 5.408
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.0191891193389893
Out-of-domain eval_accuracy: 0.47756666666666664
Out-of-domain eval_runtime: 683.4119
Out-of-domain eval_samples_per_second: 43.897
Out-of-domain eval_steps_per_second: 5.487
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.279772996902466
In-domain eval_accuracy: 0.5364614465032875
In-domain eval_runtime: 154.7039
In-domain eval_samples_per_second: 43.257
In-domain eval_steps_per_second: 5.41
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.0191891193389893
Out-of-domain eval_accuracy: 0.47756666666666664
Out-of-domain eval_runtime: 684.7292
Out-of-domain eval_samples_per_second: 43.813
Out-of-domain eval_steps_per_second: 5.477
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.279772996902466
In-domain eval_accuracy: 0.5364614465032875
In-domain eval_runtime: 155.6208
In-domain eval_samples_per_second: 43.002
In-domain eval_steps_per_second: 5.378
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.0191891193389893
Out-of-domain eval_accuracy: 0.47756666666666664
Out-of-domain eval_runtime: 683.5059
Out-of-domain eval_samples_per_second: 43.891
Out-of-domain eval_steps_per_second: 5.486
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.279772996902466
In-domain eval_accuracy: 0.5364614465032875
In-domain eval_runtime: 155.5141
In-domain eval_samples_per_second: 43.031
In-domain eval_steps_per_second: 5.382
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.0191891193389893
Out-of-domain eval_accuracy: 0.47756666666666664
Out-of-domain eval_runtime: 683.1362
Out-of-domain eval_samples_per_second: 43.915
Out-of-domain eval_steps_per_second: 5.489
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.279772996902466
In-domain eval_accuracy: 0.5364614465032875
In-domain eval_runtime: 155.1958
In-domain eval_samples_per_second: 43.12
In-domain eval_steps_per_second: 5.393
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.0191891193389893
Out-of-domain eval_accuracy: 0.47756666666666664
Out-of-domain eval_runtime: 682.4112
Out-of-domain eval_samples_per_second: 43.962
Out-of-domain eval_steps_per_second: 5.495
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 2.279772996902466
In-domain eval_accuracy: 0.5364614465032875
In-domain eval_runtime: 154.7253
In-domain eval_samples_per_second: 43.251
In-domain eval_steps_per_second: 5.41
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.0191891193389893
Out-of-domain eval_accuracy: 0.47756666666666664
Out-of-domain eval_runtime: 683.8276
Out-of-domain eval_samples_per_second: 43.871
Out-of-domain eval_steps_per_second: 5.484
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.2420523166656494
In-domain eval_accuracy: 0.5712791392707711
In-domain eval_runtime: 154.563
In-domain eval_samples_per_second: 43.296
In-domain eval_steps_per_second: 5.415
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 2.8924472332000732
Out-of-domain eval_accuracy: 0.48923333333333335
Out-of-domain eval_runtime: 682.1373
Out-of-domain eval_samples_per_second: 43.979
Out-of-domain eval_steps_per_second: 5.497
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.890877366065979
In-domain eval_accuracy: 0.5877166766288106
In-domain eval_runtime: 154.8198
In-domain eval_samples_per_second: 43.224
In-domain eval_steps_per_second: 5.406
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.306594371795654
Out-of-domain eval_accuracy: 0.5009333333333333
Out-of-domain eval_runtime: 682.423
Out-of-domain eval_samples_per_second: 43.961
Out-of-domain eval_steps_per_second: 5.495
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.890877366065979
In-domain eval_accuracy: 0.5877166766288106
In-domain eval_runtime: 154.5199
In-domain eval_samples_per_second: 43.308
In-domain eval_steps_per_second: 5.417
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.306594371795654
Out-of-domain eval_accuracy: 0.5009333333333333
Out-of-domain eval_runtime: 681.8909
Out-of-domain eval_samples_per_second: 43.995
Out-of-domain eval_steps_per_second: 5.499
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.890877366065979
In-domain eval_accuracy: 0.5877166766288106
In-domain eval_runtime: 154.4928
In-domain eval_samples_per_second: 43.316
In-domain eval_steps_per_second: 5.418
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.306594371795654
Out-of-domain eval_accuracy: 0.5009333333333333
Out-of-domain eval_runtime: 682.5959
Out-of-domain eval_samples_per_second: 43.95
Out-of-domain eval_steps_per_second: 5.494
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.890877366065979
In-domain eval_accuracy: 0.5877166766288106
In-domain eval_runtime: 154.661
In-domain eval_samples_per_second: 43.269
In-domain eval_steps_per_second: 5.412
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.306594371795654
Out-of-domain eval_accuracy: 0.5009333333333333
Out-of-domain eval_runtime: 682.1533
Out-of-domain eval_samples_per_second: 43.978
Out-of-domain eval_steps_per_second: 5.497
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.890877366065979
In-domain eval_accuracy: 0.5877166766288106
In-domain eval_runtime: 154.3462
In-domain eval_samples_per_second: 43.357
In-domain eval_steps_per_second: 5.423
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.306594371795654
Out-of-domain eval_accuracy: 0.5009333333333333
Out-of-domain eval_runtime: 682.1847
Out-of-domain eval_samples_per_second: 43.976
Out-of-domain eval_steps_per_second: 5.497
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.890877366065979
In-domain eval_accuracy: 0.5877166766288106
In-domain eval_runtime: 154.7868
In-domain eval_samples_per_second: 43.234
In-domain eval_steps_per_second: 5.407
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.306594371795654
Out-of-domain eval_accuracy: 0.5009333333333333
Out-of-domain eval_runtime: 682.616
Out-of-domain eval_samples_per_second: 43.949
Out-of-domain eval_steps_per_second: 5.494
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.890877366065979
In-domain eval_accuracy: 0.5877166766288106
In-domain eval_runtime: 154.7348
In-domain eval_samples_per_second: 43.248
In-domain eval_steps_per_second: 5.409
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.306594371795654
Out-of-domain eval_accuracy: 0.5009333333333333
Out-of-domain eval_runtime: 681.9536
Out-of-domain eval_samples_per_second: 43.991
Out-of-domain eval_steps_per_second: 5.499
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.890877366065979
In-domain eval_accuracy: 0.5877166766288106
In-domain eval_runtime: 154.4777
In-domain eval_samples_per_second: 43.32
In-domain eval_steps_per_second: 5.418
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.306594371795654
Out-of-domain eval_accuracy: 0.5009333333333333
Out-of-domain eval_runtime: 681.8047
Out-of-domain eval_samples_per_second: 44.001
Out-of-domain eval_steps_per_second: 5.5
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.890877366065979
In-domain eval_accuracy: 0.5877166766288106
In-domain eval_runtime: 154.5634
In-domain eval_samples_per_second: 43.296
In-domain eval_steps_per_second: 5.415
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.306594371795654
Out-of-domain eval_accuracy: 0.5009333333333333
Out-of-domain eval_runtime: 682.085
Out-of-domain eval_samples_per_second: 43.983
Out-of-domain eval_steps_per_second: 5.498
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.9489855766296387
In-domain eval_accuracy: 0.673789599521817
In-domain eval_runtime: 154.6812
In-domain eval_samples_per_second: 43.263
In-domain eval_steps_per_second: 5.411
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...


Out-of-domain eval_loss: 7.001240253448486
Out-of-domain eval_accuracy: 0.5
Out-of-domain eval_runtime: 682.0984
Out-of-domain eval_samples_per_second: 43.982
Out-of-domain eval_steps_per_second: 5.498
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.8812224864959717
In-domain eval_accuracy: 0.6217872086072923
In-domain eval_runtime: 154.4766
In-domain eval_samples_per_second: 43.32
In-domain eval_steps_per_second: 5.418
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.2424116134643555
Out-of-domain eval_accuracy: 0.5152333333333333
Out-of-domain eval_runtime: 681.4517
Out-of-domain eval_samples_per_second: 44.024
Out-of-domain eval_steps_per_second: 5.503
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.8812224864959717
In-domain eval_accuracy: 0.6217872086072923
In-domain eval_runtime: 154.3934
In-domain eval_samples_per_second: 43.344
In-domain eval_steps_per_second: 5.421
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.2424116134643555
Out-of-domain eval_accuracy: 0.5152333333333333
Out-of-domain eval_runtime: 681.5086
Out-of-domain eval_samples_per_second: 44.02
Out-of-domain eval_steps_per_second: 5.502
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.8812224864959717
In-domain eval_accuracy: 0.6217872086072923
In-domain eval_runtime: 154.8622
In-domain eval_samples_per_second: 43.213
In-domain eval_steps_per_second: 5.405
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.2424116134643555
Out-of-domain eval_accuracy: 0.5152333333333333
Out-of-domain eval_runtime: 681.8339
Out-of-domain eval_samples_per_second: 43.999
Out-of-domain eval_steps_per_second: 5.5
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.8812224864959717
In-domain eval_accuracy: 0.6217872086072923
In-domain eval_runtime: 154.7807
In-domain eval_samples_per_second: 43.235
In-domain eval_steps_per_second: 5.408
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.2424116134643555
Out-of-domain eval_accuracy: 0.5152333333333333
Out-of-domain eval_runtime: 681.7576
Out-of-domain eval_samples_per_second: 44.004
Out-of-domain eval_steps_per_second: 5.5
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.8812224864959717
In-domain eval_accuracy: 0.6217872086072923
In-domain eval_runtime: 155.3474
In-domain eval_samples_per_second: 43.078
In-domain eval_steps_per_second: 5.388
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.2424116134643555
Out-of-domain eval_accuracy: 0.5152333333333333
Out-of-domain eval_runtime: 681.6299
Out-of-domain eval_samples_per_second: 44.012
Out-of-domain eval_steps_per_second: 5.502
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.8812224864959717
In-domain eval_accuracy: 0.6217872086072923
In-domain eval_runtime: 155.0657
In-domain eval_samples_per_second: 43.156
In-domain eval_steps_per_second: 5.398
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.2424116134643555
Out-of-domain eval_accuracy: 0.5152333333333333
Out-of-domain eval_runtime: 681.641
Out-of-domain eval_samples_per_second: 44.011
Out-of-domain eval_steps_per_second: 5.501
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.8812224864959717
In-domain eval_accuracy: 0.6217872086072923
In-domain eval_runtime: 154.8629
In-domain eval_samples_per_second: 43.212
In-domain eval_steps_per_second: 5.405
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.2424116134643555
Out-of-domain eval_accuracy: 0.5152333333333333
Out-of-domain eval_runtime: 682.0091
Out-of-domain eval_samples_per_second: 43.988
Out-of-domain eval_steps_per_second: 5.498
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.8812224864959717
In-domain eval_accuracy: 0.6217872086072923
In-domain eval_runtime: 154.2187
In-domain eval_samples_per_second: 43.393
In-domain eval_steps_per_second: 5.427
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.2424116134643555
Out-of-domain eval_accuracy: 0.5152333333333333
Out-of-domain eval_runtime: 681.5412
Out-of-domain eval_samples_per_second: 44.018
Out-of-domain eval_steps_per_second: 5.502
Out-of-domain epoch: 40.0


ReadTimeout: HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)

In [None]:
# Group the results by 'n' and compute the average and maximum performance for each group
grouped_results = results_df.groupby('n')

average_in_domain_accuracy = grouped_results['in_domain_accuracy'].mean()
maximum_in_domain_accuracy = grouped_results['in_domain_accuracy'].max()

average_out_of_domain_accuracy = grouped_results['out_of_domain_accuracy'].mean()
maximum_out_of_domain_accuracy = grouped_results['out_of_domain_accuracy'].max()

# Print the results for each 'n'
for n in n_values:
    print(f"For n={n}:")
    print(f"Average in-domain accuracy: {average_in_domain_accuracy[n]}")
    print(f"Maximum in-domain accuracy: {maximum_in_domain_accuracy[n]}")
    print(f"Average out-of-domain accuracy: {average_out_of_domain_accuracy[n]}")
    print(f"Maximum out-of-domain accuracy: {maximum_out_of_domain_accuracy[n]}")
    print("\n")

# Compute and print the overall average and maximum performance
print("Overall:")
print(f"Average in-domain accuracy: {results_df['in_domain_accuracy'].mean()}")
print(f"Maximum in-domain accuracy: {results_df['in_domain_accuracy'].max()}")
print(f"Average out-of-domain accuracy: {results_df['out_of_domain_accuracy'].mean()}")
print(f"Maximum out-of-domain accuracy: {results_df['out_of_domain_accuracy'].max()}")

In [None]:
# Save the DataFrame to a CSV file
results_df.to_csv("../Results/pbft_mnli_baseline_hansOOD_350m.csv", index=False)