# **Few shot Fine Tuning on MNLI**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/DL_project/llm_finetuning/notebooks/

/content/drive/MyDrive/DL_project/llm_finetuning/notebooks


In [3]:
!pip install -q transformers accelerate bitsandbytes datasets

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/297.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/119.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/119.8 MB[0m [31m73.1 MB/s[0m eta [36m0:00:02[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/119.8 MB[0m [31m55.3 MB/s[0m eta [36m0:00:03[0m[2K     [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/119.8 MB[0m [31m93.4 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/119.8 MB[0m [31m93.4 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/119.8 MB[0m [31

## **Experimental setup**

**Fewshot setup :** Each n example uses 10 different sets of training to avoid bias

N ranges {2, 16, 32, 64, 128}

In [5]:
# load dataset
from datasets import load_dataset, ClassLabel
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import torch

# for reproducibility
np.random.seed(42)

torch.manual_seed(42)

if torch.cuda.is_available():
  torch.cuda.manual_seed_all(42)

data = load_dataset("glue", "mnli")
hans_data = load_dataset("hans")

#Below function is taken from: https://github.com/uds-lsv/llmft/blob/main/notebooks/majority_baseline.ipynb
def binarize_mnli(dataset, remove_neutral=True):
    if remove_neutral:
        # neutral class has label 1
        dataset = dataset.filter(lambda example: example["label"] != 1)

    # change labels of contradiction examples from 2 to 1
    def change_label(example):
        # convert labels 2 into labels 1. this merges the neutral and contradiction class
        example["label"] = 1 if example["label"] == 2 else example["label"]
        return example

    # change labels
    dataset = dataset.map(change_label)

    # change features to reflect the new labels
    features = dataset["train"].features.copy()
    features["label"] = ClassLabel(num_classes=2, names=['entailment', 'contradiction'], id=None)
    dataset = dataset.cast(features)  # overwrite old features

    return dataset

data = binarize_mnli(data, remove_neutral=True)


#function for computing accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

# Prepare training data and define training config
#!mkdir offload_folder

from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, AdamW, AutoConfig

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

# Change num_labels to 2 and drop-out hyperparam = 0.1
config = AutoConfig.from_pretrained("facebook/opt-125m", num_labels=2, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1)


def manipulate_inputs(batch):
    # Add pattern and verbalizer
    encoding = tokenizer(batch["premise"], batch["hypothesis"],
                         truncation=True, padding="max_length", max_length=128)
    batch["input_ids"] = encoding["input_ids"]  # Already a list
    batch["attention_mask"] = encoding["attention_mask"]  # Already a list
    return batch

data = data.map(manipulate_inputs, batched=True)
hans_data = hans_data.map(manipulate_inputs, batched=True)

# Few-shot setup
n_values = [2, 16, 32, 64, 128]  # number of examples for each class

# Create a DataFrame to store the results
results_df = pd.DataFrame(columns=["n", "run", "in_domain_accuracy", "out_of_domain_accuracy"])


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for n in n_values:
    for run in range(1):  # repeat 10 times for each n

        # Create a copy of the dataset
        data_copy = data.copy()

        # Select n random examples for each class from the original data
        indices_yes = np.where(np.array(data["train"]["label"]) == 0)[0]
        indices_no = np.where(np.array(data["train"]["label"]) == 1)[0]
        indices_yes = np.random.choice(indices_yes, n, replace=False)
        indices_no = np.random.choice(indices_no, n, replace=False)
        indices = np.concatenate([indices_yes, indices_no])

        # Select the examples for the new training set
        train_dataset = data["train"].select(indices)

        # Re-initialize the model at the start of each training cycle
        model = AutoModelForSequenceClassification.from_pretrained("facebook/opt-125m", config=config)
        model.to(device)  # Move the model to the device (gpu if available)

        # Training config
        total_steps = (len(train_dataset) // 32) * 40 # Total steps = (#samples/batch size) * epochs

        training_args = TrainingArguments(
            output_dir="./offload_folder",
            overwrite_output_dir=True,
            num_train_epochs=40,
            per_device_train_batch_size=32,
            learning_rate=1e-5,
            weight_decay=0.0,
            save_steps=10_000,
            save_total_limit=2,
            warmup_steps=int(0.1 * total_steps), # Warmup ratio = 10% of total steps
        )

        # Define the trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            compute_metrics=compute_metrics,
        )

        # Train the model
        trainer.train()

        # Evaluate in-domain performance
        print(f"Evaluating in-domain performance for n={n}...")
        eval_results = trainer.evaluate(eval_dataset=data["validation_matched"])

        # Store the in-domain accuracy
        in_domain_accuracy = eval_results["eval_accuracy"]

        # Print the in-domain evaluation results
        for key, value in eval_results.items():
            print(f"In-domain {key}: {value}")

        # Evaluate out-of-domain performance
        print(f"Evaluating out-of-domain performance for n={n}...")
        eval_results = trainer.evaluate(eval_dataset=hans_data["validation"])

        # Store the out-of-domain accuracy
        out_of_domain_accuracy = eval_results["eval_accuracy"]

        # Print the out-of-domain evaluation results
        for key, value in eval_results.items():
            print(f"Out-of-domain {key}: {value}")

        # Add the results to the DataFrame
        new_row = pd.DataFrame({
            "n": [n],
            "run": [run],
            "in_domain_accuracy": [in_domain_accuracy],
            "out_of_domain_accuracy": [out_of_domain_accuracy]
        })
        results_df = pd.concat([results_df, new_row], ignore_index=True)


Map:   0%|          | 0/6692 [00:00<?, ? examples/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=2...


In-domain eval_loss: 1.3966827392578125
In-domain eval_accuracy: 0.49222952779438134
In-domain eval_runtime: 68.2881
In-domain eval_samples_per_second: 97.997
In-domain eval_steps_per_second: 12.257
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=2...
Out-of-domain eval_loss: 1.3473879098892212
Out-of-domain eval_accuracy: 0.5000666666666667
Out-of-domain eval_runtime: 337.9896
Out-of-domain eval_samples_per_second: 88.76
Out-of-domain eval_steps_per_second: 11.095
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=16...


In-domain eval_loss: 0.8344621658325195
In-domain eval_accuracy: 0.5932456664674238
In-domain eval_runtime: 74.7191
In-domain eval_samples_per_second: 89.562
In-domain eval_steps_per_second: 11.202
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=16...
Out-of-domain eval_loss: 1.239573359489441
Out-of-domain eval_accuracy: 0.4936
Out-of-domain eval_runtime: 349.4554
Out-of-domain eval_samples_per_second: 85.848
Out-of-domain eval_steps_per_second: 10.731
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=32...


In-domain eval_loss: 0.9722184538841248
In-domain eval_accuracy: 0.6243275552898984
In-domain eval_runtime: 79.907
In-domain eval_samples_per_second: 83.747
In-domain eval_steps_per_second: 10.475
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=32...
Out-of-domain eval_loss: 2.1096720695495605
Out-of-domain eval_accuracy: 0.4982333333333333
Out-of-domain eval_runtime: 360.0104
Out-of-domain eval_samples_per_second: 83.331
Out-of-domain eval_steps_per_second: 10.416
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=64...


In-domain eval_loss: 1.3385143280029297
In-domain eval_accuracy: 0.682904961147639
In-domain eval_runtime: 72.7656
In-domain eval_samples_per_second: 91.966
In-domain eval_steps_per_second: 11.503
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=64...
Out-of-domain eval_loss: 3.291717529296875
Out-of-domain eval_accuracy: 0.4992
Out-of-domain eval_runtime: 351.4506
Out-of-domain eval_samples_per_second: 85.361
Out-of-domain eval_steps_per_second: 10.67
Out-of-domain epoch: 40.0


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Evaluating in-domain performance for n=128...


In-domain eval_loss: 1.8567631244659424
In-domain eval_accuracy: 0.6876867901972504
In-domain eval_runtime: 79.01
In-domain eval_samples_per_second: 84.698
In-domain eval_steps_per_second: 10.594
In-domain epoch: 40.0
Evaluating out-of-domain performance for n=128...
Out-of-domain eval_loss: 4.789870738983154
Out-of-domain eval_accuracy: 0.49993333333333334
Out-of-domain eval_runtime: 355.8604
Out-of-domain eval_samples_per_second: 84.303
Out-of-domain eval_steps_per_second: 10.538
Out-of-domain epoch: 40.0


In [6]:
# Group the results by 'n' and compute the average and maximum performance for each group
grouped_results = results_df.groupby('n')

average_in_domain_accuracy = grouped_results['in_domain_accuracy'].mean()
maximum_in_domain_accuracy = grouped_results['in_domain_accuracy'].max()

average_out_of_domain_accuracy = grouped_results['out_of_domain_accuracy'].mean()
maximum_out_of_domain_accuracy = grouped_results['out_of_domain_accuracy'].max()

# Print the results for each 'n'
for n in n_values:
    print(f"For n={n}:")
    print(f"Average in-domain accuracy: {average_in_domain_accuracy[n]}")
    print(f"Maximum in-domain accuracy: {maximum_in_domain_accuracy[n]}")
    print(f"Average out-of-domain accuracy: {average_out_of_domain_accuracy[n]}")
    print(f"Maximum out-of-domain accuracy: {maximum_out_of_domain_accuracy[n]}")
    print("\n")

# Compute and print the overall average and maximum performance
print("Overall:")
print(f"Average in-domain accuracy: {results_df['in_domain_accuracy'].mean()}")
print(f"Maximum in-domain accuracy: {results_df['in_domain_accuracy'].max()}")
print(f"Average out-of-domain accuracy: {results_df['out_of_domain_accuracy'].mean()}")
print(f"Maximum out-of-domain accuracy: {results_df['out_of_domain_accuracy'].max()}")


For n=2:
Average in-domain accuracy: 0.49222952779438134
Maximum in-domain accuracy: 0.49222952779438134
Average out-of-domain accuracy: 0.5000666666666667
Maximum out-of-domain accuracy: 0.5000666666666667


For n=16:
Average in-domain accuracy: 0.5932456664674238
Maximum in-domain accuracy: 0.5932456664674238
Average out-of-domain accuracy: 0.4936
Maximum out-of-domain accuracy: 0.4936


For n=32:
Average in-domain accuracy: 0.6243275552898984
Maximum in-domain accuracy: 0.6243275552898984
Average out-of-domain accuracy: 0.4982333333333333
Maximum out-of-domain accuracy: 0.4982333333333333


For n=64:
Average in-domain accuracy: 0.682904961147639
Maximum in-domain accuracy: 0.682904961147639
Average out-of-domain accuracy: 0.4992
Maximum out-of-domain accuracy: 0.4992


For n=128:
Average in-domain accuracy: 0.6876867901972504
Maximum in-domain accuracy: 0.6876867901972504
Average out-of-domain accuracy: 0.49993333333333334
Maximum out-of-domain accuracy: 0.49993333333333334


Overal

In [7]:
!ls

few_shot_context_distillation_mnli_baseline_results.csv  offload_folder
logs							 results
models							 vanilla_mnli_baseline_results.csv


In [8]:
results_df.to_csv("./vanilla_mnli_baseline_results.csv", index=False)

In [9]:
results_df

Unnamed: 0,n,run,in_domain_accuracy,out_of_domain_accuracy
0,2,0,0.49223,0.500067
1,16,0,0.593246,0.4936
2,32,0,0.624328,0.498233
3,64,0,0.682905,0.4992
4,128,0,0.687687,0.499933
