## Dataset

In [1]:
from collections import Counter

import datasets

# "dictionary" mapping name of split (train/validation/test) to
# a Dataset for that split.
dataset_dict = datasets.load_dataset("tweet_eval", "emotion")

example_row = dataset_dict["train"][1]
print("Dataset overview:", dataset_dict)
print("Dataset features:", example_row.keys())
print("Example row:", example_row)
print()

label_distribution: dict[str, Counter] = {
    split_name: Counter([row["label"] for row in split_dataset])
    for split_name, split_dataset in dataset_dict.items()
}
num_label_classes = len(label_distribution["train"].keys())
print("Number of classes:", num_label_classes)
for split_name, split_label_distribution in label_distribution.items():
    print(f'Label distribution for "{split_name}" split:', split_label_distribution)

Downloading readme:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/233k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/105k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3257 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1421 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/374 [00:00<?, ? examples/s]

Dataset overview: DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 374
    })
})
Dataset features: dict_keys(['text', 'label'])
Example row: {'text': "My roommate: it's okay that we can't spell because we have autocorrect. #terrible #firstworldprobs", 'label': 0}

Number of classes: 4
Label distribution for "train" split: Counter({0: 1400, 3: 855, 1: 708, 2: 294})
Label distribution for "test" split: Counter({0: 558, 3: 382, 1: 358, 2: 123})
Label distribution for "validation" split: Counter({0: 160, 1: 97, 3: 89, 2: 28})


## Tokenizer and Model

In [2]:
import os.path

# Use shared copy of the model if running this example on the Vector cluster.
# Otherwise, download model from HuggingFace.
base_model_repo = "facebook/opt-350m"
local_model_path = "/projects/fta_bootcamp/downloads/opt-350m/"
if os.path.isdir(local_model_path):
    base_model_repo = local_model_path

print(f"Loading pretrained model and tokenizer from {base_model_repo}")

Loading pretrained model and tokenizer from /projects/fta_bootcamp/downloads/opt-350m/


In [3]:
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(base_model_repo)

### Hyperparameters

In [4]:
# Number of examples presented to the model in a given step.
BATCH_SIZE = 8
LEARNING_RATE = 1e-5

## Build training and evaluation batches

Combining more than one rows of data into a "batch" helps increase throughput on GPUs, both for training and for evaluation.

In [5]:
from typing import Any

from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerBase


def build_data_batches(
    dataset_rows: list[dict[str, Any]],
    batch_size: int,
    tokenizer: PreTrainedTokenizerBase,
    text_column_name: str = "text",
    label_column_name: str = "label",
) -> list[dict[str, torch.TensorType]]:
    """Build batches out of a list of examples.

    For simplicity, if some trailing examples don't fit in a batch,
    those examples would not be included in the output.

    Params
    ------
        dataset_rows: List of dictionaries, one for each row of the dataset.
        batch_size: Number of examples to include in each batch.
        tokenizer: HuggingFace Tokenizer
        text_column_name: name of text column in dataset_rows
        label_column_name: name of label column in dataset_rows

    Returns
    -------
        list of dictionaries, one for each batch.
        Each dictionary consists of:
        - input_ids: integer tensors of shape (batch_size, max_width)
        - attention_mask: boolean tensor of same shape as input_ids,
            highlighting which of the items in the input_ids tensor are paddings
            (not actual words, but added for performance reasons)
        - labels: tensor of shape (batch_size,) one per row.

        Values of the dictionary are tensors.

    """
    # list of batches. Text would be replaced with tokenization tensors,
    # while labels would be stored as PyTorch Tensors (lists)
    output: list[dict[str, torch.TensorType]] = []

    # Buffer for a batch of dataset rows, not yet tokenized.
    text_buffer: list[str] = []
    label_buffer: list[Any] = []
    num_examples_in_buffer = 0

    for row in tqdm(dataset_rows):
        # TODO: Add text from the dataset "row" to the text buffer.
        # Recall that "row" is a dictionary mapping dataset feature
        # to value. The name of the text feature is in the "text_column_name"
        # variable.
        #
        # Hint:
        #   text_buffer.append(...)
        text_buffer.append(row[text_column_name])
        label_buffer.append(row[label_column_name])

        num_examples_in_buffer += 1

        # Group (batch_size) raw dataset rows into one processed batch of tensors.
        if num_examples_in_buffer == batch_size:
            print(len(text_buffer))
            print(len(label_buffer))
            assert len(text_buffer) == batch_size
            assert len(label_buffer) == batch_size

            output.append(
                {
                    **tokenizer(text_buffer, return_tensors="pt", padding=True),
                    "labels": torch.Tensor(label_buffer).type(torch.long),
                },
            )
            text_buffer = []
            label_buffer = []
            num_examples_in_buffer = 0

    return output

In [6]:
processed_dataset_dict = {
    split_name: build_data_batches(dataset_split, BATCH_SIZE, tokenizer)
    for split_name, dataset_split in dataset_dict.items()
}

print(
    "processed batches:",
    {split_name: len(split) for split_name, split in processed_dataset_dict.items()},
)

  0%|          | 0/3257 [00:00<?, ?it/s]

8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8


  0%|          | 0/1421 [00:00<?, ?it/s]

8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8


  0%|          | 0/374 [00:00<?, ?it/s]

8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
processed batches: {'train': 407, 'test': 177, 'validation': 46}


In [7]:
example_batch = processed_dataset_dict["train"][0]
print("Example batch:", {k: v.shape for k, v in example_batch.items()})

Example batch: {'input_ids': torch.Size([8, 33]), 'attention_mask': torch.Size([8, 33]), 'labels': torch.Size([8])}


In [14]:
def evaluate_model(
    model: torch.nn.Module, processed_dataset: list[dict[str, torch.Tensor]],
) -> tuple[float, float]:
    """Evaluate model on given dataset.

    See above for a demo of the inner working of this function.

    Params:
    -------
        model: transformer classifier model to evaluate.
        processed_dataset: list of pre-processed (tokenized) batches.

    Returns
    -------
        (cross-entropy loss, accuracy)

    """
    criteria = torch.nn.CrossEntropyLoss()
    loss_values: list[torch.Tensor] = []
    accuracy_values: list[torch.Tensor] = []

    with tqdm(total=len(processed_dataset)) as progress_bar:
        for batch_cpu in processed_dataset:
            batch = {k: v.to(model.device) for k, v in batch_cpu.items()}
            logits = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
            ).logits
            labels = batch["labels"]

            loss = criteria(logits, labels).item()
            loss_values.append(loss)

            predictions = torch.argmax(logits, dim=-1)
            accuracy = torch.mean((predictions == labels).type(torch.float)).item()
            accuracy_values.append(accuracy)

            progress_bar.update(1)

        avg_loss = sum(loss_values) / len(loss_values)
        avg_accuracy = sum(accuracy_values) / len(accuracy_values)

        progress_bar.set_description(
            f"Eval loss {avg_loss:.3f} acc {avg_accuracy * 100:.1f}%",
        )

    return avg_loss, avg_accuracy

In [15]:
from transformers import AutoModelForSequenceClassification

device = torch.device(0)
torch.manual_seed(0)

train_data = processed_dataset_dict["train"]
validation_data = processed_dataset_dict["validation"]

print(
    "Adding a new classification layer on top of pretrained weights- \n"
    "you will see a reminder from HuggingFace that "
    '"You should probably TRAIN this model":',
)
model = AutoModelForSequenceClassification.from_pretrained(
    base_model_repo, num_labels=num_label_classes,
)
model = model.to(device)

criteria = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Train for 100 steps
for train_step, batch_cpu in enumerate(tqdm(processed_dataset_dict["train"][:100])):
    # evaluate model on validation set every 10 steps
    if train_step % 10 == 0:
        model.eval()  # turn off back-propagation to evaluate faster
        eval_accuracy, eval_loss = evaluate_model(model, validation_data)
        model.train()  # turn back-propagation back on

    # Send batch to accelerator device
    batch = {k: v.to(device) for k, v in batch_cpu.items()}
    model_output = model(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
    )

    training_loss = criteria(model_output.logits, batch["labels"])

    optimizer.zero_grad()
    training_loss.backward()
    optimizer.step()

Adding a new classification layer on top of pretrained weights- 
you will see a reminder from HuggingFace that "You should probably TRAIN this model":


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at /projects/fta_bootcamp/downloads/opt-350m/ and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

In [16]:
model.eval()
eval_loss, eval_accuracy = evaluate_model(model, validation_data)

print(f"Final validation loss: {eval_loss:.2f}; accuracy {eval_accuracy * 100:.1f}%")

  0%|          | 0/46 [00:00<?, ?it/s]

Final validation loss: 0.73; accuracy 73.6%


In [18]:
# labels for tweet_eval emotion:
# 0: anger; 1: joy; 2: optimism; 3: sadness
example_input = ["Good book!", "Bad book!", "It is raining all day!"]
example_input_encoded = tokenizer(example_input, return_tensors="pt", padding=True)
model_output = model(
    input_ids=example_input_encoded["input_ids"].to(device),
    attention_mask=example_input_encoded["attention_mask"].to(device),
)

predictions = torch.argmax(model_output.logits, dim=-1)
print("predictions:", predictions)

predictions: tensor([1, 0, 3], device='cuda:0')


## Trainer Integration

HuggingFace provides abstractions for common use cases- including sequence classification. 

Reproduced from [transformers/en/tasks/sequence_classification](https://huggingface.co/docs/transformers/en/tasks/sequence_classification) with simplifications.

In [12]:
import evaluate
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

device = torch.device(0)
torch.manual_seed(0)

print(
    "Adding a new classification layer on top of pretrained weights- \n"
    "you will see a reminder from HuggingFace that "
    '"You should probably TRAIN this model":',
)
tokenizer = AutoTokenizer.from_pretrained(base_model_repo)
model = AutoModelForSequenceClassification.from_pretrained(
    base_model_repo, num_labels=num_label_classes,
)
model = model.to(device)

tokenized_dataset = dataset_dict.map(
    lambda examples: tokenizer(
        examples["text"]
        # TODO: The input to tokenizer() should be a list of text (list[str]).
        # each `examples` is a dict[str, list[Any]], mapping the name
        # of the column of the dataset to a list of values from that column.
        # Which column should you tokenize?
        #
        # hint: `examples["name_of_column"]`
    ),
    batched=True,
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # TODO: How would you obtain the predictions from the logits tensor?
    # Hint: using numpy, take argmax over the dimension (-1).
    #
    # If "logits" is a 2D array of shape (batch, num_choices),
    # Your "predictions" should be a 1D array of shape (batch,)
    #
    # predictions = ...
    predictions = np.argmax(logits, axis=-1)
    print(predictions.shape)
    assert isinstance(predictions, (np.ndarray, torch.Tensor))
    assert predictions.shape == (logits.shape[0],)

    return accuracy.compute(predictions=predictions, references=labels)


training_args = TrainingArguments(
    output_dir="../../scratch/supervised_finetuning/checkpoints",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=0.25,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Adding a new classification layer on top of pretrained weights- 
you will see a reminder from HuggingFace that "You should probably TRAIN this model":


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at /projects/fta_bootcamp/downloads/opt-350m/ and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3257 [00:00<?, ? examples/s]

Map:   0%|          | 0/1421 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss,Accuracy
10,No log,1.547266,0.419786
20,No log,1.367585,0.483957
30,No log,1.283064,0.508021
40,No log,1.277318,0.505348
50,No log,1.140942,0.545455
60,No log,1.095202,0.55615
70,No log,1.019637,0.574866
80,No log,0.9316,0.622995
90,No log,0.910769,0.649733
100,No log,0.91557,0.644385


(374,)
(374,)
(374,)
(374,)
(374,)
(374,)
(374,)
(374,)
(374,)
(374,)


TrainOutput(global_step=102, training_loss=1.2115893644445084, metrics={'train_runtime': 61.4227, 'train_samples_per_second': 13.256, 'train_steps_per_second': 1.661, 'total_flos': 55682334720000.0, 'train_loss': 1.2115893644445084, 'epoch': 0.25})

In [13]:
model = trainer.model
model.eval()
eval_loss, eval_accuracy = evaluate_model(model, validation_data)

print(f"Final validation loss: {eval_loss:.2f}; accuracy {eval_accuracy * 100:.1f}%")

# labels for tweet_eval emotion:
# 0: anger; 1: joy; 2: optimism; 3: sadness
example_input = ["Good book!", "Bad book!", "It is raining all day!"]
example_input_encoded = tokenizer(example_input, return_tensors="pt", padding=True)
model_output = model(
    input_ids=example_input_encoded["input_ids"].to(device),
    attention_mask=example_input_encoded["attention_mask"].to(device),
)

predictions = torch.argmax(model_output.logits, dim=-1)
print("predictions:", predictions)

  0%|          | 0/46 [00:00<?, ?it/s]

Final validation loss: 0.92; accuracy 64.4%
predictions: tensor([1, 0, 3], device='cuda:0')
