#0. Header

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [8]:
!pip install safetensors
!pip install datasets
!pip install accelerate -U



In [3]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
from safetensors.torch import load
from sklearn.metrics import f1_score
import numpy as np

# 1. Pre-training on Entailment Template
We pre-train our Roberta-base model on MNLI dataset. As MNLI dataset is an entailmnent dataset it allows the LM to learn the template with ample of data. We will fine-tune on multiclassifier downstream tasks.

**Load Model and MNLI Dataset**

In [20]:
# Loading the tokenizer and the model
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=2)

# Loading the MNLI dataset
dataset = load_dataset('glue', 'mnli')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Filter MNLI for only Entailment and Contradiction**

In [23]:
def filter_and_map_labels(examples):
    entailment_idx = 0
    contradiction_idx = 1
    filtered_examples = {'premise': [], 'hypothesis': [], 'label': []}
    for premise, hypothesis, label in zip(examples['premise'], examples['hypothesis'], examples['label']):
        if label == entailment_idx:  # entailment
            filtered_examples['premise'].append(premise)
            filtered_examples['hypothesis'].append(hypothesis)
            filtered_examples['label'].append(0)  # entailment
        elif label == contradiction_idx:  # contradiction
            filtered_examples['premise'].append(premise)
            filtered_examples['hypothesis'].append(hypothesis)
            filtered_examples['label'].append(1)  # contradiction
    return filtered_examples

In [24]:
train_dataset = dataset['train'].map(filter_and_map_labels, batched=True, remove_columns=dataset['train'].column_names)
validation_dataset = dataset['validation_matched'].map(filter_and_map_labels, batched=True, remove_columns=dataset['validation_matched'].column_names)

**Tokenize the Dataset**

In [None]:
def tokenize(examples):
    return tokenizer(examples['premise'], examples['hypothesis'], truncation=True, padding='max_length', max_length=128)

# Tokenizing the train and validation dataset
train_dataset = train_dataset.map(tokenize, batched=True)
validation_dataset = validation_dataset.map(tokenize, batched=True)

# Set format for PyTorch
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
validation_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

**Setting the Training Arguments**

In [26]:
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Colab Notebooks/Deep_Learning_Project/Tune_Param',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='/content/drive/MyDrive/Colab Notebooks/Deep_Learning_Project/Tune_Param',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

**Training**

In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset
)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
trainer.train()

**Evaluating**

In [None]:
trainer.save_model("/content/drive/MyDrive/Colab Notebooks/Deep_Learning_Project")
trainer.evaluate()

# Evaluation

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
dataset = load_dataset('glue', 'mnli')
validation_set = dataset['validation']

def tokenize_function(example):
    return tokenizer(example['premise'], example['hypothesis'], padding='max_length', truncation=True, max_length=128)

tokenized_validation_set = validation_set.map(tokenize_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/649k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

In [None]:
model_original = RobertaForSequenceClassification.from_pretrained('roberta-large')


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

with open("/content/drive/MyDrive/Colab Notebooks/Deep_Learning_Project/model.safetensors", "rb") as f:
    model_bytes = f.read()

state_dict = load(model_bytes)

model_finetuned = RobertaForSequenceClassification.from_pretrained('roberta-large', state_dict=state_dict)

In [None]:
model_finetuned.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_finetuned.to(device)

tokenized_validation_set.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
validation_loader = torch.utils.data.DataLoader(tokenized_validation_set, batch_size=8)

true_labels = []
predictions = []

with torch.no_grad():
    for batch in validation_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}

        outputs = model_finetuned(**inputs)

        logits = outputs.logits.detach().cpu().numpy()

        label_ids = batch['label'].to('cpu').numpy()

        true_labels.extend(label_ids)
        predictions.extend(np.argmax(logits, axis=1))


f1 = f1_score(true_labels, predictions, average='weighted')
print(f"F1 Score: {f1}")

F1 Score: 0.8437002753724163
