## Installation of libraries

In [1]:
%pip install datasets accelerate>=0.21.0 -U

## Import required modules

In [2]:
# Seed value used for the PyTorch generators below.
seed = 42
import random
import torch
# Seed PyTorch's default generator.
torch.manual_seed(seed)
if torch.cuda.is_available():
    # Also seed the CUDA generators when a GPU is present.
    torch.cuda.manual_seed_all(seed)
# NOTE(review): only PyTorch is seeded in this cell; the `random` module and NumPy are
# imported but their global generators are not seeded here.
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset, concatenate_datasets, Dataset, load_metric
import numpy as np

## Load and prepare dataset

In [3]:
from datasets import load_dataset

# Load the MNLI dataset under the name "multi_nli". The result is indexed by split name
# in the rest of this cell ('train', 'validation_matched', 'validation_mismatched').
dataset = load_dataset("multi_nli")

# Function to print the percentage distribution of labels
def print_label_distribution(data, name):
    """Print the share of records in `data` that carries each label.

    Args:
        data: Sized collection of records, each indexable by the key 'label'.
        name: Description of the dataset, shown in the header line.
    """
    # Tally the labels in the order they are first encountered
    tally = {}
    for record in data:
        key = record['label']
        tally[key] = tally.get(key, 0) + 1

    # Percentages are taken relative to the size reported by len()
    n_records = len(data)
    print(f"Label distribution in {name}:")
    for key in tally:
        share = (tally[key] / n_records) * 100
        print(f"Label {key}: {share:.2f}%")

# Report the label balance of every split before anything is filtered out
for split_name in ('train', 'validation_matched', 'validation_mismatched'):
    print_label_distribution(dataset[split_name], f'{split_name} (before filtering)')

# Filtering the dataset
def filter_data(data, excluded_label=1):
    """Return the records of `data` whose label differs from `excluded_label`.

    Args:
        data: Iterable of records, each indexable by the key 'label'.
        excluded_label: Label value to drop. Defaults to 1, the value this
            notebook removes from every split (presumably MNLI's "neutral"
            class; confirm against the dataset card).

    Returns:
        list: A new list with the kept records in their original order. The
        input is not modified.
    """
    # Drops label 1 by default. An earlier comment here said label 0 was
    # removed, which did not match the comparison.
    return [item for item in data if item['label'] != excluded_label]

# Filter every split; each result is a plain Python list of records
train_filtered, validation_matched_filtered, validation_mismatched_filtered = [
    filter_data(dataset[split_name])
    for split_name in ('train', 'validation_matched', 'validation_mismatched')
]

# Show the label balance again now that the splits have been filtered
for split_name, split_items in (
    ('train', train_filtered),
    ('validation_matched', validation_matched_filtered),
    ('validation_mismatched', validation_mismatched_filtered),
):
    print_label_distribution(split_items, f'{split_name} (after filtering)')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Label distribution in train (before filtering):
Label 1: 33.33%
Label 0: 33.33%
Label 2: 33.33%
Label distribution in validation_matched (before filtering):
Label 1: 31.82%
Label 2: 32.74%
Label 0: 35.45%
Label distribution in validation_mismatched (before filtering):
Label 2: 32.95%
Label 0: 35.22%
Label 1: 31.82%
Label distribution in train (after filtering):
Label 0: 50.00%
Label 2: 50.00%
Label distribution in validation_matched (after filtering):
Label 2: 48.01%
Label 0: 51.99%
Label distribution in validation_mismatched (after filtering):
Label 2: 48.34%
Label 0: 51.66%


In [4]:
# Function to change 'label' = 2 to 'label' = 1
def change_labels(data):
    """Relabel every record whose 'label' is 2 to 1, editing the records in place.

    Args:
        data: Iterable of mutable records indexable by the key 'label'.

    Returns:
        The same `data` object that was passed in, after relabelling.
    """
    # Lazily pick out the records that still carry label 2 and rewrite each one
    needs_relabel = (record for record in data if record['label'] == 2)
    for record in needs_relabel:
        record['label'] = 1
    return data

# Relabel every filtered split so the two remaining classes are numbered 0 and 1
train_filtered, validation_matched_filtered, validation_mismatched_filtered = [
    change_labels(split_items)
    for split_items in (train_filtered, validation_matched_filtered, validation_mismatched_filtered)
]

# Show the label balance once more after relabelling
for split_name, split_items in (
    ('train', train_filtered),
    ('validation_matched', validation_matched_filtered),
    ('validation_mismatched', validation_mismatched_filtered),
):
    print_label_distribution(split_items, f'{split_name} (after label change)')

Label distribution in train (after label change):
Label 0: 50.00%
Label 1: 50.00%
Label distribution in validation_matched (after label change):
Label 1: 48.01%
Label 0: 51.99%
Label distribution in validation_mismatched (after label change):
Label 1: 48.34%
Label 0: 51.66%


In [5]:
# List the split names available in the loaded dataset
dataset.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched'])

In [6]:
# Show the container type and the number of records of each filtered split
type(train_filtered), len(train_filtered), type(validation_matched_filtered), len(validation_matched_filtered), type(validation_mismatched_filtered), len(validation_mismatched_filtered)

(list, 261802, list, 6692, list, 6703)

In [7]:
# Peek at the first record of the filtered training split to see its fields
train_filtered[0]

{'promptID': 101457,
 'pairID': '101457e',
 'premise': 'you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him',
 'premise_binary_parse': '( you ( ( know ( during ( ( ( the season ) and ) ( i guess ) ) ) ) ( at ( at ( ( your level ) ( uh ( you ( ( ( lose them ) ( to ( the ( next level ) ) ) ) ( if ( ( if ( they ( decide ( to ( recall ( the ( the ( parent team ) ) ) ) ) ) ) ) ( ( the Braves ) ( decide ( to ( call ( to ( ( recall ( a guy ) ) ( from ( ( triple A ) ( ( ( then ( ( a ( double ( A guy ) ) ) ( ( goes up ) ( to ( replace him ) ) ) ) ) and ) ( ( a ( single ( A guy ) ) ) ( ( goes up ) ( to ( replace him ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) )',
 'premise_parse': '(ROOT (S (NP (PRP you)) (VP (VBP know) (PP (IN during) (NP (NP (DT the) (NN season)) (CC an

In [8]:
import random

def prepare_few_shot_data(data, num_samples, seed=42):
    """
    Draw a label-balanced, reproducible few-shot subset of `data`.

    Args:
    data (list): Records indexable by the key 'label' (e.g. the filtered training dataset).
    num_samples (int): Total number of samples to draw; must be divisible by the number
        of distinct labels found in `data`.
    seed (int): Seed of the private random generator used for sampling.

    Returns:
    list: `num_samples` records with the same number of records per label. Records are
        grouped by label, in the order each label first appears in `data`.

    Raises:
    ValueError: If `data` is empty, if `num_samples` is not divisible by the number of
        labels, or if some label has fewer records than are needed.
    """
    # A private generator produces the same draws as seeding the module-level generator
    # with `seed`, but leaves the global `random` state untouched for the rest of the notebook.
    rng = random.Random(seed)

    # Bucket the records by label, preserving first-seen label order
    label_to_items = {}
    for item in data:
        label_to_items.setdefault(item['label'], []).append(item)

    # Without this guard the division below would fail with an unhelpful ZeroDivisionError
    if not label_to_items:
        raise ValueError("Cannot prepare a few-shot dataset from empty data.")

    # Enforce the documented precondition instead of silently returning fewer samples
    num_labels = len(label_to_items)
    if num_samples % num_labels != 0:
        raise ValueError(
            f"num_samples ({num_samples}) must be divisible by the number of labels ({num_labels})."
        )
    samples_per_label = num_samples // num_labels

    few_shot_samples = []
    for label, items in label_to_items.items():
        if len(items) < samples_per_label:
            raise ValueError(f"Not enough samples for label {label}: {len(items)} available, {samples_per_label} needed.")
        few_shot_samples.extend(rng.sample(items, samples_per_label))

    return few_shot_samples

# Example usage:
# Assuming `train_filtered` is already defined and appropriately filtered from previous steps
num_samples = 128  # Total samples; must be divisible by the number of distinct labels in the data (two remain after filtering and relabelling)
few_shot_data = prepare_few_shot_data(train_filtered, num_samples)
print(f"Prepared few-shot dataset with {len(few_shot_data)} samples.")


def print_label_distribution(data, name):
    """
    Report, as percentages, how often each label occurs in a dataset.

    This definition replaces the function of the same name defined earlier in the
    notebook. The total is obtained by counting records while iterating, so `data`
    only needs to be iterable.

    Args:
    data (list): Records indexable by the key 'label'.
    name (str): Description of the dataset, shown in the header line.
    """
    # Tally labels in first-seen order
    frequencies = {}
    for record in data:
        key = record['label']
        frequencies[key] = frequencies.get(key, 0) + 1

    # Every record contributed exactly one count, so the sum is the record total
    n_records = sum(frequencies.values())

    print(f"Label distribution in {name}:")
    for key, freq in frequencies.items():
        share = (freq / n_records) * 100
        print(f"Label {key}: {share:.2f}%")

# Example usage:
# Assuming `few_shot_data` is already defined from the few-shot preparation step.
# Prints the label percentages of the sampled subset so its balance can be checked.
print_label_distribution(few_shot_data, "Few-shot Dataset")


Prepared few-shot dataset with 128 samples.
Label distribution in Few-shot Dataset:
Label 0: 50.00%
Label 1: 50.00%


In [9]:
# Show the container types of the few-shot subset and the two filtered validation splits
type(few_shot_data), type(validation_matched_filtered), type(validation_mismatched_filtered)

(list, list, list)

In [10]:
# Show how many records the few-shot subset and the two filtered validation splits hold
len(few_shot_data), len(validation_matched_filtered), len(validation_mismatched_filtered)

(128, 6692, 6703)

## Tokenization

In [11]:
# Load the tokenizer of the "facebook/opt-125m" checkpoint; `tokenize_data` below uses it
# through this module-level name.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

# Function to tokenize a dataset
def tokenize_data(data, max_length=None):
    """
    Tokenize the premise/hypothesis pairs of a dataset as a single padded batch.

    Args:
    data (list): List of dictionaries with 'premise' and 'hypothesis' keys.
    max_length (int, optional): Maximum number of tokens per encoded pair. When given,
        longer pairs are truncated to this length. When None (default) the tokenizer
        falls back to its own maximum length, if it has one.

    Returns:
    The tokenizer's batch output (a dict-like object with PyTorch tensors such as
    'input_ids' and 'attention_mask'), padded to the longest pair in `data`.
    """
    premises = [entry['premise'] for entry in data]
    hypotheses = [entry['hypothesis'] for entry in data]

    # NOTE: `truncation=True` alone does not guarantee truncation. The recorded run of this
    # notebook warns that no maximum length is known for this tokenizer and that it
    # defaults to no truncation. Pass `max_length` to actually cap the sequence length.
    return tokenizer(
        premises,
        hypotheses,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )


## Model configuration, training and evaluation

In [12]:
def convert_to_dataset(tokenized_data, labels):
    """
    Bundle tokenizer output and labels into a Hugging Face Dataset.

    Args:
    tokenized_data: Mapping that provides 'input_ids' and 'attention_mask'.
    labels: Label values, converted to a `torch.long` tensor and stored under 'labels'.

    Returns:
    Dataset: A dataset with the columns 'input_ids', 'attention_mask' and 'labels'.
    """
    # Start with the two tokenizer columns, then attach the labels as a long tensor
    columns = {
        'input_ids': tokenized_data['input_ids'],
        'attention_mask': tokenized_data['attention_mask'],
    }
    columns['labels'] = torch.tensor(labels, dtype=torch.long)
    return Dataset.from_dict(columns)


# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Two-class classifier that is evaluated as the baseline further down in this cell,
# before any few-shot training has taken place
model = AutoModelForSequenceClassification.from_pretrained("facebook/opt-125m", num_labels=2)
model.to(device)

def compute_metrics(eval_pred):
    """
    Compute classification accuracy from a (logits, labels) pair.

    Args:
    eval_pred: Object that unpacks into `(logits, labels)`; the predicted class of each
        row is the index of its largest logit along the last axis.

    Returns:
    dict: `{"accuracy": <fraction of predictions equal to the labels>}`.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    # Element-wise match mask; its mean is the fraction of correct predictions
    hits = predicted_classes == references
    return {"accuracy": hits.mean()}

def load_pretrained_model():
    """
    Load a fresh two-label "facebook/opt-125m" sequence classifier.

    Returns:
    The newly loaded model, moved to CUDA when available and to the CPU otherwise.
    """
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    classifier = AutoModelForSequenceClassification.from_pretrained(
        "facebook/opt-125m",
        num_labels=2,
    )
    classifier.to(target_device)
    return classifier

def train_and_monitor(model, train_dataset, few_shot_size):
    """
    Fine-tune `model` on `train_dataset` and return the Trainer that ran the job.

    Args:
    model: Sequence-classification model to fine-tune (updated in place by training).
    train_dataset: Dataset to train on.
    few_shot_size: Size of the few-shot set; only used to name the output and log
        directories and in the completion message.

    Returns:
    Trainer: The trainer after training, e.g. for a later `trainer.evaluate(...)` call.
    """
    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f"./results_{few_shot_size}",
        learning_rate=1e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=40,
        weight_decay=0.01,
        logging_dir=f'./logs_{few_shot_size}',  # Log metrics to a directory
        logging_strategy="epoch",  # Log metrics after each epoch
        # No evaluation runs during training. That is the TrainingArguments default, so
        # the deprecated `evaluation_strategy="no"` keyword (superseded by
        # `eval_strategy`; newer releases may reject it) is intentionally not passed.
    )

    # Trainer without passing eval_dataset; compute_metrics only takes effect when
    # evaluate() is later called with a dataset
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        compute_metrics=compute_metrics  # Function to compute metrics
    )

    # Perform training
    trainer.train()
    print(f"Training completed for few-shot size {few_shot_size}.")

    # Return the trainer for optional further use
    return trainer

def evaluate_model(model, eval_dataset):
    """
    Evaluate `model` on `eval_dataset` with a dedicated Trainer.

    Args:
    model: Model to evaluate.
    eval_dataset: Dataset to evaluate on.

    Returns:
    The metrics returned by `Trainer.evaluate()`, including whatever `compute_metrics` adds.
    """
    # Evaluation-only configuration; reporting integrations are switched off
    eval_trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir="./results_eval",
            per_device_eval_batch_size=16,
            report_to="none",
        ),
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )
    metrics = eval_trainer.evaluate()
    return metrics

# Tokenize both filtered validation splits once and wrap them as datasets; they are
# reused for the baseline and for every few-shot run below
tokenized_validation_matched = tokenize_data(validation_matched_filtered)
tokenized_validation_mismatched = tokenize_data(validation_mismatched_filtered)
val_matched_dataset = convert_to_dataset(tokenized_validation_matched, [item['label'] for item in validation_matched_filtered])
val_mismatched_dataset = convert_to_dataset(tokenized_validation_mismatched, [item['label'] for item in validation_mismatched_filtered])

# Baseline: evaluate the model loaded above, before any few-shot training
baseline_results_matched = evaluate_model(model, val_matched_dataset)
baseline_results_mismatched = evaluate_model(model, val_mismatched_dataset)

print("Baseline Accuracy and Loss (Matched):", baseline_results_matched)
print("Baseline Accuracy and Loss (Mismatched):", baseline_results_mismatched)

# Proceed with few-shot training and evaluation
# Training sizes (total number of training examples per run)
few_shot_sizes = [2, 16, 32, 64, 128]
for size in few_shot_sizes:
    # Start every run from a freshly loaded model so runs do not build on each other.
    # This rebinds the module-level name `model` used for the baseline above.
    model = load_pretrained_model()
    few_shot_data = prepare_few_shot_data(train_filtered, size)
    tokenized_few_shot_data = tokenize_data(few_shot_data)
    few_shot_dataset = convert_to_dataset(tokenized_few_shot_data, [item['label'] for item in few_shot_data])

    # Train the model; no evaluation dataset is passed, so only the training loss is
    # logged during training
    trainer = train_and_monitor(model, few_shot_dataset, size)

    # Manually evaluate on the matched validation set after training
    # NOTE(review): the mismatched split is evaluated for the baseline only, not here.
    results = trainer.evaluate(val_matched_dataset)
    print(f"Post-training evaluation results for size {size}: {results}")

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Baseline Accuracy and Loss (Matched): {'eval_loss': 1.0798803567886353, 'eval_accuracy': 0.4829647340107591, 'eval_runtime': 138.3119, 'eval_samples_per_second': 48.383, 'eval_steps_per_second': 3.029}
Baseline Accuracy and Loss (Mismatched): {'eval_loss': 1.0858485698699951, 'eval_accuracy': 0.48575264806802926, 'eval_runtime': 132.6685, 'eval_samples_per_second': 50.524, 'eval_steps_per_second': 3.158}


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
1,0.4458
2,0.1745
3,0.1419
4,0.0741
5,0.0393
6,0.0184
7,0.0102
8,0.0048
9,0.0032
10,0.0022


Training completed for few-shot size 2.


Post-training evaluation results for size 2: {'eval_loss': 0.9493414759635925, 'eval_accuracy': 0.5038852361028093, 'eval_runtime': 130.556, 'eval_samples_per_second': 51.258, 'eval_steps_per_second': 3.209, 'epoch': 40.0}


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
1,0.8747
2,0.6311
3,0.4843
4,0.4127
5,0.3241
6,0.2405
7,0.2046
8,0.1546
9,0.1104
10,0.0875


Training completed for few-shot size 16.


Post-training evaluation results for size 16: {'eval_loss': 0.9662500619888306, 'eval_accuracy': 0.5112074118350269, 'eval_runtime': 132.6833, 'eval_samples_per_second': 50.436, 'eval_steps_per_second': 3.158, 'epoch': 40.0}


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
2,0.7836
4,0.5939
6,0.4939
8,0.3771
10,0.2875
12,0.2207
14,0.1681
16,0.1237
18,0.0875
20,0.0564


Training completed for few-shot size 32.


Post-training evaluation results for size 32: {'eval_loss': 1.026452898979187, 'eval_accuracy': 0.5298864315600718, 'eval_runtime': 132.0633, 'eval_samples_per_second': 50.673, 'eval_steps_per_second': 3.173, 'epoch': 40.0}


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
4,0.7797
8,0.5178
12,0.3601
16,0.2467
20,0.1629
24,0.0976
28,0.0538
32,0.0287
36,0.0137
40,0.0063


Training completed for few-shot size 64.


Post-training evaluation results for size 64: {'eval_loss': 1.3874728679656982, 'eval_accuracy': 0.5812910938433951, 'eval_runtime': 134.8433, 'eval_samples_per_second': 49.628, 'eval_steps_per_second': 3.107, 'epoch': 40.0}


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
8,0.7487
16,0.5163
24,0.3833
32,0.2605
40,0.1532
48,0.0816
56,0.0373
64,0.0172
72,0.006
80,0.002


Training completed for few-shot size 128.


Post-training evaluation results for size 128: {'eval_loss': 1.5067530870437622, 'eval_accuracy': 0.592498505678422, 'eval_runtime': 140.3256, 'eval_samples_per_second': 47.689, 'eval_steps_per_second': 2.986, 'epoch': 40.0}


## Result analysis