#0. Header

In [98]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [99]:
!pip install safetensors
!pip install datasets
!pip install accelerate -U



In [100]:
!pip install evaluate



In [101]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from datasets import load_dataset, DatasetDict
from transformers import TrainingArguments, Trainer
from safetensors.torch import load
from sklearn.metrics import f1_score
import numpy as np
import evaluate

# 1. Pre-training on Entailment Template
We pre-train our Roberta-base model on MNLI dataset. As MNLI dataset is an entailmnent dataset it allows the LM to learn the template with ample of data. We will fine-tune on multiclassifier downstream tasks.

**Load Model and MNLI Dataset**

In [4]:
# Loading the tokenizer and the model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Loading the MNLI dataset
dataset = load_dataset('glue', 'mnli')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

**Filter MNLI for only Entailment and Contradiction**

In [5]:
def filter_and_map_labels(examples):
    entailment_idx = 0
    contradiction_idx = 1
    filtered_examples = {'premise': [], 'hypothesis': [], 'label': []}
    for premise, hypothesis, label in zip(examples['premise'], examples['hypothesis'], examples['label']):
        if label == entailment_idx:  # entailment
            filtered_examples['premise'].append(premise)
            filtered_examples['hypothesis'].append(hypothesis)
            filtered_examples['label'].append(0)  # entailment
        elif label == contradiction_idx:  # contradiction
            filtered_examples['premise'].append(premise)
            filtered_examples['hypothesis'].append(hypothesis)
            filtered_examples['label'].append(1)  # contradiction
    return filtered_examples

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})

In [7]:
train_dataset = dataset['train'].map(filter_and_map_labels, batched=True, remove_columns=dataset['train'].column_names)
validation_dataset = dataset['validation_matched'].map(filter_and_map_labels, batched=True, remove_columns=dataset['validation_matched'].column_names)
test_dataset = dataset['validation_mismatched'].map(filter_and_map_labels, batched=True, remove_columns=dataset['validation_mismatched'].column_names)


Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

**Tokenize the Dataset**

In [8]:
def tokenize(examples):
    return tokenizer(examples['premise'], examples['hypothesis'], truncation=True, padding='max_length', max_length=128)

In [9]:
# Tokenizing the train and validation dataset
token_train = train_dataset.map(tokenize, batched=True)
token_valid = validation_dataset.map(tokenize, batched=True)
token_test = test_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/261799 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/6602 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/6592 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [10]:
# Set format for PyTorch
token_train.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
token_valid.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
token_test.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

**Setting the Training Arguments**

In [11]:
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Colab Notebooks/Deep_Learning_Project/Tune_Param',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='/content/drive/MyDrive/Colab Notebooks/Deep_Learning_Project/Tune_Param',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

**Training**

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=token_train,
    eval_dataset=token_valid
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.6984,0.694228


Checkpoint destination directory /content/drive/MyDrive/Colab Notebooks/Deep_Learning_Project/Tune_Param/checkpoint-32725 already exists and is non-empty. Saving will proceed but saved results may be invalid.


**Evaluating**

In [None]:
trainer.save_model("/content/drive/MyDrive/Colab Notebooks/Deep_Learning_Project")
trainer.evaluate()

# 2. Fine-Tuning on Downstream Task
Here we Fine-Tune on a Downstream task. In this case our downstream task is AG news

In [103]:
with open("/content/drive/MyDrive/Colab Notebooks/Deep_Learning_Project/model.safetensors", "rb") as f:
    model_bytes = f.read()

state_dict = load(model_bytes)

In [104]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model_finetuned = RobertaForSequenceClassification.from_pretrained('roberta-base', state_dict=state_dict)

In [105]:
def tokenize(examples):
    return tokenizer(examples['premise'], examples['hypothesis'], truncation=True, padding='max_length', max_length=128)

## 2.1 AG News

####**Load the Dataset**

In [102]:
agnews_dataset = load_dataset('ag_news')

####**Split Train dataset into Train and Validation - K-shot**

In [106]:
train_dataset = agnews_dataset['train']

# Setting the K value here
k = 8

indices = np.random.permutation(len(train_dataset))

# Split indices into training and validation indices
train_indices = indices[:k]
valid_indices = indices[k:k+k]


new_train_dataset = train_dataset.select(train_indices)
new_valid_dataset = train_dataset.select(valid_indices)


updated_agnews = DatasetDict({
    'train': new_train_dataset,
    'valid': new_valid_dataset,
    'test': agnews_dataset['test']
})

In [107]:
updated_agnews

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 8
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

####**Convert into Entailment Template**

In [108]:
def agnews_to_entailment(examples):
    label_map = {
        0: 'it is world news',
        1: 'it is sports news',
        2: 'it is business news',
        3: 'it is science news'
    }

    filtered_examples = {'premise': [], 'hypothesis': [], 'label': []}

    for text, label in zip(examples['text'], examples['label']):

        filtered_examples['premise'].append(text)
        filtered_examples['hypothesis'].append(label_map[label])
        filtered_examples['label'].append(0)

        for other_label in label_map:
            if label != other_label:
                filtered_examples['premise'].append(text)
                filtered_examples['hypothesis'].append(label_map[other_label])
                filtered_examples['label'].append(1)

    return filtered_examples


In [109]:
train_agnews = updated_agnews['train'].map(agnews_to_entailment, batched=True, remove_columns=updated_agnews['train'].column_names)
valid_agnews = updated_agnews['valid'].map(agnews_to_entailment, batched=True, remove_columns=updated_agnews['valid'].column_names)
test_agnews = updated_agnews['test'].map(agnews_to_entailment, batched=True, remove_columns=updated_agnews['test'].column_names)

####**Explore the new Datasets**

In [80]:
train_agnews

Dataset({
    features: ['label', 'premise', 'hypothesis'],
    num_rows: 32
})

In [81]:
valid_agnews

Dataset({
    features: ['label', 'premise', 'hypothesis'],
    num_rows: 32
})

In [82]:
test_agnews

Dataset({
    features: ['label', 'premise', 'hypothesis'],
    num_rows: 30400
})

####**Tokenize the Dataset**

In [110]:
# Tokenizing the train and validation dataset
token_train = train_agnews.map(tokenize, batched=True)
token_valid = valid_agnews.map(tokenize, batched=True)
token_test = test_agnews.map(tokenize, batched=True)

In [111]:
# Set format for PyTorch
token_train.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
token_valid.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
token_test.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

####**Training**

In [85]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [86]:
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Colab Notebooks/Deep_Learning_Project/ag_news/Tune_Param',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='/content/drive/MyDrive/Colab Notebooks/Deep_Learning_Project/ag_news/Tune_Param',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [87]:
trainer = Trainer(
    model=model_finetuned,
    args=training_args,
    train_dataset=token_train,
    eval_dataset=token_valid,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [88]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.43218,0.25


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.43218,0.25
2,No log,1.409195,0.25
3,1.366600,1.371439,0.25


TrainOutput(global_step=12, training_loss=1.3401504357655842, metrics={'train_runtime': 247.8863, 'train_samples_per_second': 0.387, 'train_steps_per_second': 0.048, 'total_flos': 6314665328640.0, 'train_loss': 1.3401504357655842, 'epoch': 3.0})

####**Saving and Evaluating**

In [89]:
trainer.save_model("/content/drive/MyDrive/Colab Notebooks/Deep_Learning_Project/ag_news")
trainer.evaluate()

{'eval_loss': 1.37143874168396,
 'eval_accuracy': 0.25,
 'eval_runtime': 21.5467,
 'eval_samples_per_second': 1.485,
 'eval_steps_per_second': 0.186,
 'epoch': 3.0}

#3. Evaluation AG News

In [112]:
with open("/content/drive/MyDrive/Colab Notebooks/Deep_Learning_Project/ag_news/model.safetensors", "rb") as f:
    model_bytes = f.read()

state_dict = load(model_bytes)

In [113]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model_finetuned = RobertaForSequenceClassification.from_pretrained('roberta-base', state_dict=state_dict)

In [114]:
def tokenize(examples):
    return tokenizer(examples['premise'], examples['hypothesis'], truncation=True, padding='max_length', max_length=128)

In [None]:
import torch

model_finetuned.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_finetuned.to(device)

test_loader = torch.utils.data.DataLoader(token_test, batch_size=8)

true_labels = []
predictions = []

with torch.no_grad():
    for batch in test_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}

        outputs = model_finetuned(**inputs)

        logits = outputs.logits.detach().cpu().numpy()

        label_ids = batch['label'].to('cpu').numpy()

        true_labels.extend(label_ids)
        predictions.extend(np.argmax(logits, axis=1))


f1 = f1_score(true_labels, predictions, average='weighted')
print(f"F1 Score: {f1}")