# **Few shot Fine Tuning on MNLI**

In [None]:
!pip install -q transformers accelerate bitsandbytes datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25h

## **Experimental setup**

**Fewshot setup :** Each n example uses 10 different sets of training to avoid bias

N ranges {2, 16, 32, 64, 128}

In [None]:
# load dataset
from datasets import load_dataset, ClassLabel
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

np.random.seed(42)  # for reproducibility

data = load_dataset("glue", "mnli")

def binarize_mnli(dataset, remove_neutral=True):
    if remove_neutral:
        # neutral class has label 1
        dataset = dataset.filter(lambda example: example["label"] != 1)

    # change labels of contradiction examples from 2 to 1
    def change_label(example):
        # convert labels 2 into labels 1. this merges the neutral and contradiction class
        example["label"] = 1 if example["label"] == 2 else example["label"]
        return example

    # change labels
    dataset = dataset.map(change_label)

    # change features to reflect the new labels
    features = dataset["train"].features.copy()
    features["label"] = ClassLabel(num_classes=2, names=['entailment', 'contradiction'], id=None)
    dataset = dataset.cast(features)  # overwrite old features

    return dataset

data = binarize_mnli(data, remove_neutral=True)


#function for computing accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

# Prepare training data and define training config
!mkdir offload_folder

from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, AdamW, AutoConfig

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

# Change num_labels to 2 and drop-out hyperparam = 0.1
config = AutoConfig.from_pretrained("facebook/opt-125m", num_labels=2, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1)
model = AutoModelForSequenceClassification.from_pretrained("facebook/opt-125m", config=config)


def manipulate_inputs(batch):
    # Add pattern and verbalizer
    encoding = tokenizer([f'{premise} Question: {hypothesis} Yes or No?' for premise, hypothesis in zip(batch["premise"], batch["hypothesis"])], truncation=True, padding="max_length", max_length=128)
    batch["input_ids"] = encoding["input_ids"]  # Already a list
    batch["attention_mask"] = encoding["attention_mask"]  # Already a list
    return batch

data = data.map(manipulate_inputs, batched=True)

# Few-shot setup
n_values = [2, 16, 32, 64, 128]  # number of examples for each class

# Create a DataFrame to store the results
results_df = pd.DataFrame(columns=["n", "run", "in_domain_accuracy", "out_of_domain_accuracy"])

for n in n_values:
    for run in range(10):  # repeat 10 times for each n
        # Create a copy of the dataset
        data_copy = data.copy()

        # Select n random examples for each class from the original data
        indices_yes = np.where(np.array(data["train"]["label"]) == 0)[0]
        indices_no = np.where(np.array(data["train"]["label"]) == 1)[0]
        indices_yes = np.random.choice(indices_yes, n, replace=False)
        indices_no = np.random.choice(indices_no, n, replace=False)
        indices = np.concatenate([indices_yes, indices_no])

        # Select the examples for the new training set
        train_dataset = data["train"].select(indices)

        # Training config
        total_steps = (len(train_dataset) // 32) * 40 # Total steps = (#samples/batch size) * epochs

        training_args = TrainingArguments(
            output_dir="./results",
            overwrite_output_dir=True,
            num_train_epochs=40,
            per_device_train_batch_size=32,
            learning_rate=1e-5,
            weight_decay=0.0,
            save_steps=10_000,
            save_total_limit=2,
            warmup_steps=int(0.1 * total_steps), # Warmup ratio = 10% of total steps
        )

        # Define the trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            compute_metrics=compute_metrics,
        )

        # Train the model
        trainer.train()

        # Evaluate in-domain performance
        print(f"Evaluating in-domain performance for n={n}...")
        #trainer.args.eval_dataset = data["validation_matched"]  # in-domain evaluation
        #eval_results = trainer.evaluate()
        eval_results = trainer.evaluate(eval_dataset=data["validation_matched"])

        # Store the in-domain accuracy
        in_domain_accuracy = eval_results["eval_accuracy"]

        # Print the in-domain evaluation results
        for key, value in eval_results.items():
            print(f"In-domain {key}: {value}")

        # Evaluate out-of-domain performance
        print(f"Evaluating out-of-domain performance for n={n}...")
        #trainer.args.eval_dataset = data["validation_mismatched"]  # out-of-domain evaluation
        #eval_results = trainer.evaluate()
        eval_results = trainer.evaluate(eval_dataset=data["validation_mismatched"])

        # Store the out-of-domain accuracy
        out_of_domain_accuracy = eval_results["eval_accuracy"]

        # Print the out-of-domain evaluation results
        for key, value in eval_results.items():
            print(f"Out-of-domain {key}: {value}")

        # Add the results to the DataFrame
        new_row = pd.DataFrame({
            "n": [n],
            "run": [run],
            "in_domain_accuracy": [in_domain_accuracy],
            "out_of_domain_accuracy": [out_of_domain_accuracy]
        })
        results_df = pd.concat([results_df, new_row], ignore_index=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

Filter:   0%|          | 0/392702 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9815 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9832 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9796 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9847 [00:00<?, ? examples/s]

Map:   0%|          | 0/261802 [00:00<?, ? examples/s]

Map:   0%|          | 0/6692 [00:00<?, ? examples/s]

Map:   0%|          | 0/6703 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/261802 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6692 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6703 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9796 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9847 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/261802 [00:00<?, ? examples/s]

Map:   0%|          | 0/6692 [00:00<?, ? examples/s]

Map:   0%|          | 0/6703 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.7653623819351196
In-domain eval_accuracy: 0.5055289898386133
In-domain eval_runtime: 42.6982
In-domain eval_samples_per_second: 156.728
In-domain eval_steps_per_second: 19.603
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 1.6404510736465454
Out-of-domain eval_accuracy: 0.5135014172758466
Out-of-domain eval_runtime: 45.3308
Out-of-domain eval_samples_per_second: 147.869
Out-of-domain eval_steps_per_second: 18.486
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 2.9110989570617676
In-domain eval_accuracy: 0.5173341303048417
In-domain eval_runtime: 45.8589
In-domain eval_samples_per_second: 145.926
In-domain eval_steps_per_second: 18.252
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 2.753601551055908
Out-of-domain eval_accuracy: 0.530210353573027
Out-of-domain eval_runtime: 46.1338
Out-of-domain eval_samples_per_second: 145.295
Out-of-domain eval_steps_per_second: 18.165
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 3.3234763145446777
In-domain eval_accuracy: 0.511655708308428
In-domain eval_runtime: 45.4442
In-domain eval_samples_per_second: 147.258
In-domain eval_steps_per_second: 18.418
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 3.128319501876831
Out-of-domain eval_accuracy: 0.5249888109801581
Out-of-domain eval_runtime: 46.3528
Out-of-domain eval_samples_per_second: 144.608
Out-of-domain eval_steps_per_second: 18.079
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 3.454296588897705
In-domain eval_accuracy: 0.5131500298864315
In-domain eval_runtime: 45.6176
In-domain eval_samples_per_second: 146.698
In-domain eval_steps_per_second: 18.348
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 3.258916139602661
Out-of-domain eval_accuracy: 0.5218558854244368
Out-of-domain eval_runtime: 46.3636
Out-of-domain eval_samples_per_second: 144.575
Out-of-domain eval_steps_per_second: 18.075
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 3.529592514038086
In-domain eval_accuracy: 0.5134488942020323
In-domain eval_runtime: 45.6113
In-domain eval_samples_per_second: 146.718
In-domain eval_steps_per_second: 18.351
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 3.3495237827301025
Out-of-domain eval_accuracy: 0.5284201103983291
Out-of-domain eval_runtime: 46.3377
Out-of-domain eval_samples_per_second: 144.655
Out-of-domain eval_steps_per_second: 18.085
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 3.523299217224121
In-domain eval_accuracy: 0.5165869695158398
In-domain eval_runtime: 45.5659
In-domain eval_samples_per_second: 146.864
In-domain eval_steps_per_second: 18.369
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 3.3462867736816406
Out-of-domain eval_accuracy: 0.5290168581232284
Out-of-domain eval_runtime: 46.319
Out-of-domain eval_samples_per_second: 144.714
Out-of-domain eval_steps_per_second: 18.092
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 3.510589361190796
In-domain eval_accuracy: 0.5164375373580394
In-domain eval_runtime: 45.523
In-domain eval_samples_per_second: 147.003
In-domain eval_steps_per_second: 18.386
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 3.3351306915283203
Out-of-domain eval_accuracy: 0.5294644189169029
Out-of-domain eval_runtime: 46.4096
Out-of-domain eval_samples_per_second: 144.431
Out-of-domain eval_steps_per_second: 18.057
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 3.4669859409332275
In-domain eval_accuracy: 0.5164375373580394
In-domain eval_runtime: 45.5555
In-domain eval_samples_per_second: 146.898
In-domain eval_steps_per_second: 18.373
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 3.2855420112609863
Out-of-domain eval_accuracy: 0.5285692973295539
Out-of-domain eval_runtime: 46.4939
Out-of-domain eval_samples_per_second: 144.17
Out-of-domain eval_steps_per_second: 18.024
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 3.449528694152832
In-domain eval_accuracy: 0.5170352659892409
In-domain eval_runtime: 45.5376
In-domain eval_samples_per_second: 146.955
In-domain eval_steps_per_second: 18.38
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 3.2643444538116455
Out-of-domain eval_accuracy: 0.5281217365358795
Out-of-domain eval_runtime: 46.5676
Out-of-domain eval_samples_per_second: 143.941
Out-of-domain eval_steps_per_second: 17.995
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 3.4367425441741943
In-domain eval_accuracy: 0.5198744769874477
In-domain eval_runtime: 45.5963
In-domain eval_samples_per_second: 146.766
In-domain eval_steps_per_second: 18.357
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 3.2538881301879883
Out-of-domain eval_accuracy: 0.5317022228852752
Out-of-domain eval_runtime: 46.5187
Out-of-domain eval_samples_per_second: 144.093
Out-of-domain eval_steps_per_second: 18.014
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 2.216885566711426
In-domain eval_accuracy: 0.5827854154213987
In-domain eval_runtime: 45.6662
In-domain eval_samples_per_second: 146.542
In-domain eval_steps_per_second: 18.329
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 2.154782295227051
Out-of-domain eval_accuracy: 0.6006265851111443
Out-of-domain eval_runtime: 46.6081
Out-of-domain eval_samples_per_second: 143.816
Out-of-domain eval_steps_per_second: 17.98
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 4.08625602722168
In-domain eval_accuracy: 0.5811416616855948
In-domain eval_runtime: 45.6299
In-domain eval_samples_per_second: 146.658
In-domain eval_steps_per_second: 18.343
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 4.17327880859375
Out-of-domain eval_accuracy: 0.5892883783380576
Out-of-domain eval_runtime: 46.5086
Out-of-domain eval_samples_per_second: 144.124
Out-of-domain eval_steps_per_second: 18.018
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 5.178555011749268
In-domain eval_accuracy: 0.5786013150029886
In-domain eval_runtime: 45.5141
In-domain eval_samples_per_second: 147.031
In-domain eval_steps_per_second: 18.39
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 5.157390117645264
Out-of-domain eval_accuracy: 0.5866030135760107
Out-of-domain eval_runtime: 46.3577
Out-of-domain eval_samples_per_second: 144.593
Out-of-domain eval_steps_per_second: 18.077
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 5.303643703460693
In-domain eval_accuracy: 0.5790496114763897
In-domain eval_runtime: 45.6008
In-domain eval_samples_per_second: 146.752
In-domain eval_steps_per_second: 18.355
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 5.352570533752441
Out-of-domain eval_accuracy: 0.58719976130091
Out-of-domain eval_runtime: 46.4259
Out-of-domain eval_samples_per_second: 144.381
Out-of-domain eval_steps_per_second: 18.05
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 5.291120529174805
In-domain eval_accuracy: 0.5797967722653915
In-domain eval_runtime: 45.58
In-domain eval_samples_per_second: 146.819
In-domain eval_steps_per_second: 18.363
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 5.357293605804443
Out-of-domain eval_accuracy: 0.5857078919886618
Out-of-domain eval_runtime: 46.5026
Out-of-domain eval_samples_per_second: 144.142
Out-of-domain eval_steps_per_second: 18.02
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 5.291633129119873
In-domain eval_accuracy: 0.5784518828451883
In-domain eval_runtime: 45.5559
In-domain eval_samples_per_second: 146.897
In-domain eval_steps_per_second: 18.373
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 5.3678202629089355
Out-of-domain eval_accuracy: 0.5854095181262121
Out-of-domain eval_runtime: 46.4055
Out-of-domain eval_samples_per_second: 144.444
Out-of-domain eval_steps_per_second: 18.058
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 5.33250093460083
In-domain eval_accuracy: 0.5759115361625822
In-domain eval_runtime: 45.5669
In-domain eval_samples_per_second: 146.861
In-domain eval_steps_per_second: 18.369
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 5.415322303771973
Out-of-domain eval_accuracy: 0.5836192749515142
Out-of-domain eval_runtime: 46.5029
Out-of-domain eval_samples_per_second: 144.142
Out-of-domain eval_steps_per_second: 18.02
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 5.366899013519287
In-domain eval_accuracy: 0.574566646742379
In-domain eval_runtime: 45.6581
In-domain eval_samples_per_second: 146.568
In-domain eval_steps_per_second: 18.332
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 5.453085422515869
Out-of-domain eval_accuracy: 0.5833209010890646
Out-of-domain eval_runtime: 46.4512
Out-of-domain eval_samples_per_second: 144.302
Out-of-domain eval_steps_per_second: 18.04
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 5.397993087768555
In-domain eval_accuracy: 0.5730723251643753
In-domain eval_runtime: 45.6563
In-domain eval_samples_per_second: 146.574
In-domain eval_steps_per_second: 18.333
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 5.487315654754639
Out-of-domain eval_accuracy: 0.583022527226615
Out-of-domain eval_runtime: 46.4003
Out-of-domain eval_samples_per_second: 144.46
Out-of-domain eval_steps_per_second: 18.06
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 5.4393792152404785
In-domain eval_accuracy: 0.5733711894799761
In-domain eval_runtime: 45.5747
In-domain eval_samples_per_second: 146.836
In-domain eval_steps_per_second: 18.365
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 5.534802436828613
Out-of-domain eval_accuracy: 0.582127405639266
Out-of-domain eval_runtime: 46.4246
Out-of-domain eval_samples_per_second: 144.385
Out-of-domain eval_steps_per_second: 18.051
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 3.5838334560394287
In-domain eval_accuracy: 0.576658696951584
In-domain eval_runtime: 45.5582
In-domain eval_samples_per_second: 146.889
In-domain eval_steps_per_second: 18.372
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 3.2570409774780273
Out-of-domain eval_accuracy: 0.5883932567507086
Out-of-domain eval_runtime: 46.4587
Out-of-domain eval_samples_per_second: 144.279
Out-of-domain eval_steps_per_second: 18.038
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 6.436973571777344
In-domain eval_accuracy: 0.5721757322175732
In-domain eval_runtime: 45.5541
In-domain eval_samples_per_second: 146.902
In-domain eval_steps_per_second: 18.374
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 6.082960605621338
Out-of-domain eval_accuracy: 0.5845143965388632
Out-of-domain eval_runtime: 46.4253
Out-of-domain eval_samples_per_second: 144.382
Out-of-domain eval_steps_per_second: 18.05
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 6.571469783782959
In-domain eval_accuracy: 0.568738792588165
In-domain eval_runtime: 45.6199
In-domain eval_samples_per_second: 146.69
In-domain eval_steps_per_second: 18.347
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 6.259609699249268
Out-of-domain eval_accuracy: 0.5861554527823363
Out-of-domain eval_runtime: 46.4542
Out-of-domain eval_samples_per_second: 144.293
Out-of-domain eval_steps_per_second: 18.039
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 6.514957427978516
In-domain eval_accuracy: 0.5700836820083682
In-domain eval_runtime: 45.5395
In-domain eval_samples_per_second: 146.95
In-domain eval_steps_per_second: 18.38
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 6.2570576667785645
Out-of-domain eval_accuracy: 0.5813814709831419
Out-of-domain eval_runtime: 46.4422
Out-of-domain eval_samples_per_second: 144.33
Out-of-domain eval_steps_per_second: 18.044
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 6.423370361328125
In-domain eval_accuracy: 0.5690376569037657
In-domain eval_runtime: 45.5952
In-domain eval_samples_per_second: 146.77
In-domain eval_steps_per_second: 18.357
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 6.186568737030029
Out-of-domain eval_accuracy: 0.5810830971206923
Out-of-domain eval_runtime: 46.4055
Out-of-domain eval_samples_per_second: 144.444
Out-of-domain eval_steps_per_second: 18.058
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 6.397609710693359
In-domain eval_accuracy: 0.5697848176927675
In-domain eval_runtime: 45.4897
In-domain eval_samples_per_second: 147.11
In-domain eval_steps_per_second: 18.4
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 6.163900375366211
Out-of-domain eval_accuracy: 0.5812322840519171
Out-of-domain eval_runtime: 46.33
Out-of-domain eval_samples_per_second: 144.679
Out-of-domain eval_steps_per_second: 18.088
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 6.393612861633301
In-domain eval_accuracy: 0.5679916317991632
In-domain eval_runtime: 45.5014
In-domain eval_samples_per_second: 147.072
In-domain eval_steps_per_second: 18.395
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 6.150180339813232
Out-of-domain eval_accuracy: 0.5798896016708937
Out-of-domain eval_runtime: 46.251
Out-of-domain eval_samples_per_second: 144.927
Out-of-domain eval_steps_per_second: 18.119
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 6.396651744842529
In-domain eval_accuracy: 0.5676927674835625
In-domain eval_runtime: 45.3772
In-domain eval_samples_per_second: 147.475
In-domain eval_steps_per_second: 18.445
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 6.147037506103516
Out-of-domain eval_accuracy: 0.579591227808444
Out-of-domain eval_runtime: 46.3059
Out-of-domain eval_samples_per_second: 144.755
Out-of-domain eval_steps_per_second: 18.097
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 6.449057102203369
In-domain eval_accuracy: 0.567095038852361
In-domain eval_runtime: 45.4802
In-domain eval_samples_per_second: 147.141
In-domain eval_steps_per_second: 18.404
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 6.193150520324707
Out-of-domain eval_accuracy: 0.5785469192898702
Out-of-domain eval_runtime: 46.3336
Out-of-domain eval_samples_per_second: 144.668
Out-of-domain eval_steps_per_second: 18.086
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 6.472430229187012
In-domain eval_accuracy: 0.5673939031679618
In-domain eval_runtime: 45.5064
In-domain eval_samples_per_second: 147.056
In-domain eval_steps_per_second: 18.393
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 6.210341453552246
Out-of-domain eval_accuracy: 0.5778009846337461
Out-of-domain eval_runtime: 46.3655
Out-of-domain eval_samples_per_second: 144.569
Out-of-domain eval_steps_per_second: 18.074
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 2.268444538116455
In-domain eval_accuracy: 0.6693066347878064
In-domain eval_runtime: 45.4129
In-domain eval_samples_per_second: 147.359
In-domain eval_steps_per_second: 18.431
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 2.0233514308929443
Out-of-domain eval_accuracy: 0.7025212591376996
Out-of-domain eval_runtime: 46.2179
Out-of-domain eval_samples_per_second: 145.03
Out-of-domain eval_steps_per_second: 18.132
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 4.421013832092285
In-domain eval_accuracy: 0.6687089061566049
In-domain eval_runtime: 45.5202
In-domain eval_samples_per_second: 147.012
In-domain eval_steps_per_second: 18.387
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.182169437408447
Out-of-domain eval_accuracy: 0.6899895569148142
Out-of-domain eval_runtime: 46.3461
Out-of-domain eval_samples_per_second: 144.629
Out-of-domain eval_steps_per_second: 18.081
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 4.731046199798584
In-domain eval_accuracy: 0.6594441123729826
In-domain eval_runtime: 45.4785
In-domain eval_samples_per_second: 147.146
In-domain eval_steps_per_second: 18.404
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.452373027801514
Out-of-domain eval_accuracy: 0.6835745188721468
Out-of-domain eval_runtime: 46.3185
Out-of-domain eval_samples_per_second: 144.715
Out-of-domain eval_steps_per_second: 18.092
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 4.7829484939575195
In-domain eval_accuracy: 0.6558577405857741
In-domain eval_runtime: 45.4814
In-domain eval_samples_per_second: 147.137
In-domain eval_steps_per_second: 18.403
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.479320526123047
Out-of-domain eval_accuracy: 0.6776070416231538
Out-of-domain eval_runtime: 46.2652
Out-of-domain eval_samples_per_second: 144.882
Out-of-domain eval_steps_per_second: 18.113
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 4.8393731117248535
In-domain eval_accuracy: 0.6534668260609683
In-domain eval_runtime: 45.4765
In-domain eval_samples_per_second: 147.153
In-domain eval_steps_per_second: 18.405
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.538915634155273
Out-of-domain eval_accuracy: 0.6762643592421304
Out-of-domain eval_runtime: 46.3224
Out-of-domain eval_samples_per_second: 144.703
Out-of-domain eval_steps_per_second: 18.091
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 4.889718532562256
In-domain eval_accuracy: 0.6536162582187687
In-domain eval_runtime: 45.4616
In-domain eval_samples_per_second: 147.201
In-domain eval_steps_per_second: 18.411
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.581279754638672
Out-of-domain eval_accuracy: 0.67656273310458
Out-of-domain eval_runtime: 46.3039
Out-of-domain eval_samples_per_second: 144.761
Out-of-domain eval_steps_per_second: 18.098
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 4.9488205909729
In-domain eval_accuracy: 0.6548117154811716
In-domain eval_runtime: 45.4893
In-domain eval_samples_per_second: 147.112
In-domain eval_steps_per_second: 18.4
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.634270668029785
Out-of-domain eval_accuracy: 0.6771594808294793
Out-of-domain eval_runtime: 46.2206
Out-of-domain eval_samples_per_second: 145.022
Out-of-domain eval_steps_per_second: 18.13
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 5.0056562423706055
In-domain eval_accuracy: 0.6557083084279737
In-domain eval_runtime: 45.3947
In-domain eval_samples_per_second: 147.418
In-domain eval_steps_per_second: 18.438
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.68603515625
Out-of-domain eval_accuracy: 0.6768611069670297
Out-of-domain eval_runtime: 46.1519
Out-of-domain eval_samples_per_second: 145.238
Out-of-domain eval_steps_per_second: 18.157
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 5.058694362640381
In-domain eval_accuracy: 0.6536162582187687
In-domain eval_runtime: 45.4531
In-domain eval_samples_per_second: 147.229
In-domain eval_steps_per_second: 18.415
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.734646797180176
Out-of-domain eval_accuracy: 0.6758167984484559
Out-of-domain eval_runtime: 46.3627
Out-of-domain eval_samples_per_second: 144.577
Out-of-domain eval_steps_per_second: 18.075
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 5.109732151031494
In-domain eval_accuracy: 0.6521219366407651
In-domain eval_runtime: 45.471
In-domain eval_samples_per_second: 147.171
In-domain eval_steps_per_second: 18.407
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 4.784972190856934
Out-of-domain eval_accuracy: 0.6743249291362077
Out-of-domain eval_runtime: 46.2069
Out-of-domain eval_samples_per_second: 145.065
Out-of-domain eval_steps_per_second: 18.136
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 3.38637638092041
In-domain eval_accuracy: 0.684997011356844
In-domain eval_runtime: 45.4267
In-domain eval_samples_per_second: 147.314
In-domain eval_steps_per_second: 18.425
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 2.8991341590881348
Out-of-domain eval_accuracy: 0.7189318215724303
Out-of-domain eval_runtime: 46.2173
Out-of-domain eval_samples_per_second: 145.032
Out-of-domain eval_steps_per_second: 18.132
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 4.764044761657715
In-domain eval_accuracy: 0.68335325762104
In-domain eval_runtime: 45.4871
In-domain eval_samples_per_second: 147.119
In-domain eval_steps_per_second: 18.401
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 4.327671527862549
Out-of-domain eval_accuracy: 0.7108757272862897
Out-of-domain eval_runtime: 46.2338
Out-of-domain eval_samples_per_second: 144.981
Out-of-domain eval_steps_per_second: 18.125
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 4.955028533935547
In-domain eval_accuracy: 0.6823072325164375
In-domain eval_runtime: 45.4111
In-domain eval_samples_per_second: 147.365
In-domain eval_steps_per_second: 18.432
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 4.4862380027771
Out-of-domain eval_accuracy: 0.7081903625242428
Out-of-domain eval_runtime: 46.1724
Out-of-domain eval_samples_per_second: 145.173
Out-of-domain eval_steps_per_second: 18.149
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 5.0297369956970215
In-domain eval_accuracy: 0.6814106395696354
In-domain eval_runtime: 45.4011
In-domain eval_samples_per_second: 147.397
In-domain eval_steps_per_second: 18.436
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 4.575053691864014
Out-of-domain eval_accuracy: 0.7061017454870954
Out-of-domain eval_runtime: 46.1565
Out-of-domain eval_samples_per_second: 145.223
Out-of-domain eval_steps_per_second: 18.156
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 5.104564189910889
In-domain eval_accuracy: 0.6814106395696354
In-domain eval_runtime: 45.3469
In-domain eval_samples_per_second: 147.574
In-domain eval_steps_per_second: 18.458
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 4.6446380615234375
Out-of-domain eval_accuracy: 0.7078919886617933
Out-of-domain eval_runtime: 46.2258
Out-of-domain eval_samples_per_second: 145.006
Out-of-domain eval_steps_per_second: 18.128
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 5.151644229888916
In-domain eval_accuracy: 0.6826060968320382
In-domain eval_runtime: 45.363
In-domain eval_samples_per_second: 147.521
In-domain eval_steps_per_second: 18.451
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 4.724886894226074
Out-of-domain eval_accuracy: 0.7071460540056691
Out-of-domain eval_runtime: 46.1982
Out-of-domain eval_samples_per_second: 145.092
Out-of-domain eval_steps_per_second: 18.139
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 5.209794998168945
In-domain eval_accuracy: 0.6826060968320382
In-domain eval_runtime: 45.509
In-domain eval_samples_per_second: 147.048
In-domain eval_steps_per_second: 18.392
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 4.78236722946167
Out-of-domain eval_accuracy: 0.7062509324183202
Out-of-domain eval_runtime: 46.2662
Out-of-domain eval_samples_per_second: 144.879
Out-of-domain eval_steps_per_second: 18.113
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 5.261279582977295
In-domain eval_accuracy: 0.6821578003586372
In-domain eval_runtime: 45.4459
In-domain eval_samples_per_second: 147.252
In-domain eval_steps_per_second: 18.418
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 4.831425189971924
Out-of-domain eval_accuracy: 0.7061017454870954
Out-of-domain eval_runtime: 46.2769
Out-of-domain eval_samples_per_second: 144.846
Out-of-domain eval_steps_per_second: 18.108
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 5.306714057922363
In-domain eval_accuracy: 0.6827555289898386
In-domain eval_runtime: 45.3607
In-domain eval_samples_per_second: 147.529
In-domain eval_steps_per_second: 18.452
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 4.87397575378418
Out-of-domain eval_accuracy: 0.7064001193495449
Out-of-domain eval_runtime: 46.1713
Out-of-domain eval_samples_per_second: 145.177
Out-of-domain eval_steps_per_second: 18.15
Out-of-domain epoch: 40.0


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 5.347441673278809
In-domain eval_accuracy: 0.6823072325164375
In-domain eval_runtime: 45.4783
In-domain eval_samples_per_second: 147.147
In-domain eval_steps_per_second: 18.404
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 4.913514614105225
Out-of-domain eval_accuracy: 0.7061017454870954
Out-of-domain eval_runtime: 46.2845
Out-of-domain eval_samples_per_second: 144.822
Out-of-domain eval_steps_per_second: 18.105
Out-of-domain epoch: 40.0


In [None]:
# Group the results by 'n' and compute the average and maximum performance for each group
grouped_results = results_df.groupby('n')

average_in_domain_accuracy = grouped_results['in_domain_accuracy'].mean()
maximum_in_domain_accuracy = grouped_results['in_domain_accuracy'].max()

average_out_of_domain_accuracy = grouped_results['out_of_domain_accuracy'].mean()
maximum_out_of_domain_accuracy = grouped_results['out_of_domain_accuracy'].max()

# Print the results for each 'n'
for n in n_values:
    print(f"For n={n}:")
    print(f"Average in-domain accuracy: {average_in_domain_accuracy[n]}")
    print(f"Maximum in-domain accuracy: {maximum_in_domain_accuracy[n]}")
    print(f"Average out-of-domain accuracy: {average_out_of_domain_accuracy[n]}")
    print(f"Maximum out-of-domain accuracy: {maximum_out_of_domain_accuracy[n]}")
    print("\n")

# Compute and print the overall average and maximum performance
print("Overall:")
print(f"Average in-domain accuracy: {results_df['in_domain_accuracy'].mean()}")
print(f"Maximum in-domain accuracy: {results_df['in_domain_accuracy'].max()}")
print(f"Average out-of-domain accuracy: {results_df['out_of_domain_accuracy'].mean()}")
print(f"Maximum out-of-domain accuracy: {results_df['out_of_domain_accuracy'].max()}")


For n=2:
Average in-domain accuracy: 0.5147489539748954
Maximum in-domain accuracy: 0.5198744769874477
Average out-of-domain accuracy: 0.5265851111442638
Maximum out-of-domain accuracy: 0.5317022228852752


For n=16:
Average in-domain accuracy: 0.5776748356246264
Maximum in-domain accuracy: 0.5827854154213987
Average out-of-domain accuracy: 0.5866925257347456
Maximum out-of-domain accuracy: 0.6006265851111443


For n=32:
Average in-domain accuracy: 0.5696652719665272
Maximum in-domain accuracy: 0.576658696951584
Average out-of-domain accuracy: 0.5818588691630613
Maximum out-of-domain accuracy: 0.5883932567507086


For n=64:
Average in-domain accuracy: 0.6576658696951584
Maximum in-domain accuracy: 0.6693066347878064
Average out-of-domain accuracy: 0.6810681784275697
Maximum out-of-domain accuracy: 0.7025212591376996


For n=128:
Average in-domain accuracy: 0.6825911536162582
Maximum in-domain accuracy: 0.684997011356844
Average out-of-domain accuracy: 0.7083992242279576
Maximum out-of-

In [None]:
# Save the DataFrame to a CSV file
results_df.to_csv("../Results/pbft_mnli_baseline_results.csv", index=False)