In [6]:
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import XLMRobertaTokenizerFast, XLMRobertaForSequenceClassification
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
import time
import wandb


In [None]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


loading the corpus

In [None]:
# load dataset builder
dataset = load_dataset("snli")

# access the training, validation, and test splits
train_data = dataset['train']
valid_data = dataset['validation']
test_data = dataset['test']

# filter out instances with label -1
train_data = train_data.filter(lambda example: example['label'] != -1)
valid_data = valid_data.filter(lambda example: example['label'] != -1)

# print updated information about the dataset
print(f"Number of training examples after filtering: {len(train_data)}")
print(f"Number of validation examples after filtering: {len(valid_data)}")
print(f"Number of test examples: {len(test_data)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Filter:   0%|          | 0/550152 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Number of training examples after filtering: 549367
Number of validation examples after filtering: 9842
Number of test examples: 10000


In [None]:
print(train_data[:5]) # 1 neural, 2 contradiction, 0 entailment

{'premise': ['A person on a horse jumps over a broken down airplane.', 'A person on a horse jumps over a broken down airplane.', 'A person on a horse jumps over a broken down airplane.', 'Children smiling and waving at camera', 'Children smiling and waving at camera'], 'hypothesis': ['A person is training his horse for a competition.', 'A person is at a diner, ordering an omelette.', 'A person is outdoors, on a horse.', 'They are smiling at their parents', 'There are children present'], 'label': [1, 2, 0, 1, 0]}


tokenization/dataloader

In [None]:
# load XLM-RoBERTa tokenizer
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
batch_size = 64

# define a function for tokenization
def tokenize_function(example):
    premises = example['premise']
    hypotheses = example['hypothesis']
    labels = example['label']

    # combine corresponding strings from 'premise' and 'hypothesis'
    combined_sentences = [premise + " </s> " + hypothesis for premise, hypothesis in zip(premises, hypotheses)]
    # print(combined_sentences[:5])

    return tokenizer(combined_sentences, padding=True, truncation=True)

# apply the mapping function
tokenized_dataset_train = train_data.map(tokenize_function, batched=True)
tokenized_dataset_val = valid_data.map(tokenize_function, batched=True)


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Map:   0%|          | 0/549367 [00:00<?, ? examples/s]

Map:   0%|          | 0/9842 [00:00<?, ? examples/s]

In [None]:
# create custom collate fn for dataloaders
def custom_collate_fn(batch):
    input_ids = [torch.tensor(item["input_ids"]) for item in batch]
    attention_mask = [torch.tensor(item["attention_mask"]) for item in batch]
    labels = [torch.tensor(item["label"]) for item in batch]

    # pad the sequences to the same length
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask_padded = pad_sequence(attention_mask, batch_first=True, padding_value=0)  # 0 as the padding value for attention_mask

    return {
        "input_ids": input_ids_padded,
        "attention_mask": attention_mask_padded,
        "label": torch.stack(labels)
    }


# create PyTorch DataLoaders using custom_collate_fn
train_dataloader = DataLoader(tokenized_dataset_train, shuffle=True, collate_fn=custom_collate_fn, batch_size=batch_size)
val_dataloader = DataLoader(tokenized_dataset_val, shuffle=False, collate_fn=custom_collate_fn, batch_size=batch_size)


In [None]:
# iterating through the first couple of training DataLoader batches
for batch_idx, batch in enumerate(train_dataloader):
    if batch_idx == 0:  # check the first batch
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["label"]

        print("Unique Labels:", torch.unique(labels))

        # print relevant information about the batch
        print(f"Batch {batch_idx + 1}:")
        print("Input IDs:", input_ids)
        print("Attention Mask:", attention_mask)
        print("Labels:", labels)
        print("\n")
        break


Unique Labels: tensor([0, 1, 2])
Batch 1:
Input IDs: tensor([[     0,     62,   8753,  ...,      1,      1,      1],
        [     0,     62,  46667,  ...,      1,      1,      1],
        [     0,    581,   7158,  ...,      1,      1,      1],
        ...,
        [     0,     62,      6,  ...,      1,      1,      1],
        [     0,     62,  27150,  ...,      1,      1,      1],
        [     0,  32964, 119455,  ...,      1,      1,      1]])
Attention Mask: tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
Labels: tensor([1, 1, 2, 2, 2, 0, 0, 2, 1, 0, 1, 2, 0, 1, 1, 1, 1, 0, 2, 2, 2, 0, 0, 2,
        1, 2, 0, 2, 1, 1, 2, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 2, 2, 0, 1, 1, 1, 0,
        0, 2, 2, 2, 1, 0, 2, 0, 2, 2, 0, 0, 1, 2, 0, 0])




Defining Neural Model

In [None]:
class NliModel(nn.Module):
    def __init__(self, num_labels):
        super(NliModel, self).__init__()
        # use roberta sequence classification model
        self.roberta = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=num_labels)

    def forward(self, input_ids, attention_mask):
        return self.roberta(input_ids, attention_mask=attention_mask).logits


training

In [None]:
# instantiate the model, tokenizer, and move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NliModel(num_labels=3).to(device)

# Define the loss function and optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
wandb.finish

<function wandb.sdk.wandb_run.finish(exit_code: Optional[int] = None, quiet: Optional[bool] = None) -> None>

In [None]:
# initialize WandB
wandb.init(project="nli_a6", name="64_1e-5")

[34m[1mwandb[0m: Currently logged in as: [33myelintongji[0m ([33mcolia4[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# training loop
def train_epoch(model, train_dataloader, loss_function, optimizer, device):
    model.train()
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    for batch_idx, batch in enumerate(train_dataloader):
        # record start time for each batch
        batch_start_time = time.time()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)  # no need to change to logits
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()

        # record end time for each batch
        batch_end_time = time.time()

        total_loss += loss.item()
        _, predicted_labels = torch.max(outputs, 1)
        correct_predictions += (predicted_labels == labels).sum().item()
        total_samples += labels.size(0)

        batch_elapsed_time = batch_end_time - batch_start_time

        # rint batch loss for every 300 batches
        if (batch_idx + 1) % 300 == 0:
            print(f"Batch {batch_idx + 1}/{len(train_dataloader)} => Batch Loss: {loss.item():.4f}, Batch Acc: {(predicted_labels == labels).sum().item() / labels.size(0):.4f}, Batch Time: {batch_elapsed_time:.2f} seconds")

    avg_loss = total_loss / len(train_dataloader)
    accuracy = correct_predictions / total_samples

    return avg_loss, accuracy


In [None]:
# validation loop
def validate_epoch(model, val_dataloader, loss_function, device):
    model.eval()
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    for batch_idx, batch in enumerate(val_dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask)
            loss = loss_function(outputs, labels)

        total_loss += loss.item()
        _, predicted_labels = torch.max(outputs, 1)
        correct_predictions += (predicted_labels == labels).sum().item()
        total_samples += labels.size(0)

        # print and log batch loss and accuracy
        if (batch_idx + 1) % 100 == 0:
            batch_loss = total_loss / (batch_idx + 1)
            batch_accuracy = correct_predictions / total_samples
            print(f"Validation Batch {batch_idx + 1}/{len(val_dataloader)} => Batch Loss: {batch_loss:.4f}, Batch Acc: {batch_accuracy:.4f}")

    avg_loss = total_loss / len(val_dataloader)
    accuracy = correct_predictions / total_samples

    return avg_loss, accuracy


In [None]:
num_epochs = 5

for epoch in range(num_epochs):
    # record start time
    start_time = time.time()

    # train
    train_loss, train_acc = train_epoch(model, train_dataloader, loss_function, optimizer, device)

    # log training metrics to WandB
    wandb.log({"epoch": epoch, "train_loss": train_loss, "train_acc": train_acc})

    # record end time
    end_time = time.time()

    # calculate elapsed time
    elapsed_time = end_time - start_time

    # print training progress with elapsed time
    print(f"Epoch {epoch + 1}/{num_epochs} => Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Elapsed Time: {elapsed_time:.2f} seconds")

    # validate
    val_loss, val_acc = validate_epoch(model, val_dataloader, loss_function, device)

    # log validation metrics to WandB
    wandb.log({"epoch": epoch, "val_loss": val_loss, "val_acc": val_acc})

    # print validation progress with elapsed time
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Elapsed Time: {elapsed_time:.2f} seconds")


Batch 300/8584 => Batch Loss: 0.8296, Batch Acc: 0.6719, Batch Time: 0.03 seconds
Batch 600/8584 => Batch Loss: 0.6486, Batch Acc: 0.7812, Batch Time: 0.03 seconds
Batch 900/8584 => Batch Loss: 0.5428, Batch Acc: 0.7656, Batch Time: 0.04 seconds
Batch 1200/8584 => Batch Loss: 0.4113, Batch Acc: 0.8281, Batch Time: 0.03 seconds
Batch 1500/8584 => Batch Loss: 0.4463, Batch Acc: 0.7656, Batch Time: 0.03 seconds
Batch 1800/8584 => Batch Loss: 0.5367, Batch Acc: 0.8125, Batch Time: 0.03 seconds
Batch 2100/8584 => Batch Loss: 0.5117, Batch Acc: 0.8125, Batch Time: 0.04 seconds
Batch 2400/8584 => Batch Loss: 0.6667, Batch Acc: 0.7031, Batch Time: 0.04 seconds
Batch 2700/8584 => Batch Loss: 0.4733, Batch Acc: 0.7969, Batch Time: 0.03 seconds
Batch 3000/8584 => Batch Loss: 0.3637, Batch Acc: 0.8906, Batch Time: 0.03 seconds
Batch 3300/8584 => Batch Loss: 0.3855, Batch Acc: 0.8594, Batch Time: 0.04 seconds
Batch 3600/8584 => Batch Loss: 0.5004, Batch Acc: 0.8438, Batch Time: 0.03 seconds
Batch 3

In [8]:
!pip freeze


absl-py==1.4.0
aiohttp==3.9.1
aiosignal==1.3.1
alabaster==0.7.16
albumentations==1.3.1
altair==4.2.2
anyio==3.7.1
appdirs==1.4.4
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array-record==0.5.0
arviz==0.15.1
astropy==5.3.4
astunparse==1.6.3
async-timeout==4.0.3
atpublic==4.0
attrs==23.2.0
audioread==3.0.1
autograd==1.6.2
Babel==2.14.0
backcall==0.2.0
beautifulsoup4==4.11.2
bidict==0.22.1
bigframes==0.19.1
bleach==6.1.0
blinker==1.4
blis==0.7.11
blosc2==2.0.0
bokeh==3.3.3
bqplot==0.12.42
branca==0.7.0
build==1.0.3
CacheControl==0.13.1
cachetools==5.3.2
catalogue==2.0.10
certifi==2023.11.17
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.2
chex==0.1.7
click==8.1.7
click-plugins==1.1.1
cligj==0.7.2
cloudpickle==2.2.1
cmake==3.27.9
cmdstanpy==1.2.0
colorcet==3.0.1
colorlover==0.3.0
colour==0.1.5
community==1.0.0b1
confection==0.1.4
cons==0.4.6
contextlib2==21.6.0
contourpy==1.2.0
cryptography==41.0.7
cufflinks==0.17.3
cupy-cuda12x==12.2.0
cvxopt==1.3.2
cvxpy==1.3.2
cycler==0.12.1
c