<a href="https://colab.research.google.com/github/jonkrohn/NLP-with-LLMs/blob/main/code/Single-GPU-T5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Single GPU T5

In this notebook, we use a single GPU in conjunction with Hugging Face and PyTorch Lightning to train an LLM (a T5 architecture) to be able to convert integers (e.g., `2003`) into their corresponding string (e.g., `two thousand three`).

### Load dependencies

In [1]:
%%capture
!pip install nvidia-ml-py3==7.352.0 pytorch-lightning==2.0.1.post0 transformers==4.28.0 torchvision==0.15.1 rouge-score==0.1.2 tensorboardx==2.6 accelerate==0.18.0 deepspeed==0.9.0 peft==0.2.0

In [2]:
# PyTorch autodiff library:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# PyTorch Lightning for easier training:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

# Hugging Face for model, tokenizer, and a training function:
from transformers import T5ForConditionalGeneration
from transformers import AutoTokenizer
from transformers import get_linear_schedule_with_warmup

# LoRA for PEFT:
from peft import get_peft_model, LoraConfig, TaskType

# Filesystem/data utilities:
import os
import json

### Import helper functions

In [3]:
!git clone https://github.com/shaankhosla/NLP_with_LLMs/
%cd "NLP_with_LLMs"

fatal: destination path 'NLP_with_LLMs' already exists and is not an empty directory.
/content/NLP_with_LLMs


In [4]:
import gpu_utilities, generate_data

### Select a model

You can read more about the T5 family of models, including options for other T5 model versions in Hugging Face [here](https://huggingface.co/docs/transformers/model_doc/t5).

`T5-small` has just 60 million parameters, making it a great choice for demo purposes.

In [5]:
MODEL_NAME = 't5-small'

**Return to "T5" slide here.**

### Generate Data

In [6]:
generate_data.main(num_train=1000, num_val=200)

Generating synthetic dataset (1000 train, 200 val)...




In [7]:
!cat './data/train/0.json'

{"number": 364, "words": "three hundred sixty four"}

In [8]:
!cat './data/train/1.json'

{"number": 72680, "words": "seventy two thousand six hundred eighty"}

In [9]:
class StreamingDataset(Dataset):
    def __init__(self, path):
        self.path = path
        self.tokenizer = AutoTokenizer.from_pretrained( # automatically detects T5 and tokenizes for T5
            MODEL_NAME, cache_dir='./cache/', use_fast=True
        )

    def __len__(self):
        return len(os.listdir(self.path))

    def encode_text(self, context, text):
        ctext = str(context)
        ctext = " ".join(ctext.split())
        text = str(text)
        text = " ".join(text.split())
        source = self.tokenizer.batch_encode_plus(
            [ctext],
            max_length=16,
            truncation=True,
            # pad_to_max_length=True,
            padding="max_length",
            return_tensors="pt",
        )
        target = self.tokenizer.batch_encode_plus(
            [text],
            max_length=16,
            truncation=True,
            # pad_to_max_length=True,
            padding="max_length",
            return_tensors="pt",
        )
        y = target["input_ids"]
        target_id = y[:, :-1].contiguous()
        target_label = y[:, 1:].clone().detach()
        target_label[
            y[:, 1:] == self.tokenizer.pad_token_id
        ] = -100  # in case the labels are not provided, empty string
        return source["input_ids"], source["attention_mask"], target_id, target_label

    def __getitem__(self, idx):
        file_path = os.path.join(self.path, str(idx) + ".json")
        with open(file_path, "r") as infile:
            data = json.load(infile)
        number, words = str(data["number"]), data["words"]
        return self.encode_text(number, words)

In [10]:
train_data = StreamingDataset('./data/train/')
val_data = StreamingDataset('./data/val')

`train_data[0]` calls `StreamingDataset.__getitem__()` so this is the first time that data are being read from storage and encoded: 

In [11]:
input_ids_i, sequence_mask_i, target_ids_i, target_label_i = train_data[0]

In [12]:
input_ids_i

tensor([[ 220, 4389,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]])

In [13]:
sequence_mask_i

tensor([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [14]:
target_ids_i

tensor([[  386,  6189, 27757,   662,     1,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0]])

`target_label` represents the correct token that the autoregressive model should be predicting and `-100` is mask token:

In [15]:
target_label_i

tensor([[ 6189, 27757,   662,     1,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100]])

In [16]:
# Deconstruct individual-data-point tuples and collates into batch variables:
def collate_fn(batch):
    input_ids = torch.stack([torch.flatten(x[0]) for x in batch])
    sequence_mask = torch.stack([torch.flatten(x[1]) for x in batch])
    target_ids = torch.stack([torch.flatten(x[2]) for x in batch])
    target_label = torch.stack([torch.flatten(x[3]) for x in batch])
    return input_ids, sequence_mask, target_ids, target_label

### Set up Lightning training module

In [17]:
class T5Finetuner(pl.LightningModule):
    def __init__(self, model, args, train_data, val_data):
        super().__init__()
        self.model = model
        self.args = args
        self.tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME, cache_dir='./cache/', use_fast=True
        )
        self.train_data, self.val_data = train_data, val_data

    def forward(self, batch, batch_idx):
        source_ids, source_mask, target_ids, target_labels = batch
        return self.model(
            input_ids=source_ids,
            attention_mask=source_mask,
            decoder_input_ids=target_ids,
            labels=target_labels,
        )

    def training_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0] # self() is forward() in Lightning
        return {'loss': loss, 'log': {'train_loss': loss}}

    def validation_step(self, batch, batch_idx): # Lightning knows no backprop in validation
        loss = self(batch, batch_idx)[0]
        return {'loss': loss}

    # data loaders provide clever optimizations like pre-fetching next-needed training data point on additional worker
    def train_dataloader(self):
        return DataLoader(
            self.train_data,
            batch_size=self.args['batch_size'],
            num_workers=os.cpu_count(),
            pin_memory=True,
            collate_fn=collate_fn,
            prefetch_factor=128, # number of samples to prefetch
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_data,
            batch_size=self.args['batch_size'],
            num_workers=os.cpu_count(),
            pin_memory=True,
            collate_fn=collate_fn,
            prefetch_factor=128, 
        )

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(
            self.trainer.model.parameters(), lr=self.args['lr'], weight_decay=0.01
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=5, # initial, slow LR steps can "warm up" attention
            num_training_steps=self.args['epochs']
            * len(self.train_data)
            / self.args['batch_size'],
        )
        return {"optimizer": optimizer, "lr_scheduler": scheduler}

# TO DO

* add calculation of loss across all validation data

### Vanilla Training

In [18]:
hg_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, cache_dir='./cache/')
args = {'batch_size': 4, 'epochs': 1, 'lr': 1e-4}
pl_model = T5Finetuner(hg_model, args, train_data, val_data)
trainer = pl.Trainer(
    max_epochs=args['epochs'],
)
trainer.fit(pl_model)
gpu_utilities.print_gpu_utilization()

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


Device 0 : b'Tesla T4'
GPU memory occupied: 395 MB.


### Predict

In [19]:
!cat './data/val/0.json'

{"number": -83, "words": "negative eighty three"}

In [20]:
val_ids, val_mask, _, _ = val_data[0]

In [21]:
val_ids

tensor([[   3,   18, 4591,    1,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]])

In [22]:
val_mask

tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [23]:
generated_ids = pl_model.model.generate(input_ids=val_ids, attention_mask=val_mask, max_new_tokens=16)

In [24]:
generated_ids

tensor([[   0,    3,   18, 4591,    1]])

In [25]:
prediction =  [
              pl_model.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
              for g in generated_ids
              ]

In [26]:
prediction

['-83']

# TO DO

* Calculate loss across all validation points again to see decrease.

Check out the Colab *Resources* tab, which shows GPU memory usage. With:
* Larger model
* Larger batch size
* Larger input/target data

...GPU memory (which is often our scarcest resource when training deep learning models) could quickly run out.

We could splurge on more GPUs or we could be more clever about how we train. More on that coming up...

**Return to slides here.**

### Gradient Checkpointing

In [None]:
hg_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, cache_dir='./cache/')
hg_model.gradient_checkpointing_enable()

args = {'batch_size': 4, 'epochs': 1, 'lr': 1e-4}
pl_model = T5Finetuner(hg_model, args, train_data, val_data)
trainer = pl.Trainer(
    max_epochs=args['epochs'],
)

trainer.fit(pl_model)
gpu_utilities.print_gpu_utilization()

### Gradient Accumulation

In [None]:
hg_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, cache_dir='./cache/')
hg_model.gradient_checkpointing_enable()

args = {'batch_size': 4, 'epochs': 1, 'lr': 1e-4}
pl_model = T5Finetuner(hg_model, args, train_data, val_data)

trainer = pl.Trainer(
    max_epochs=args['epochs'],
    accumulate_grad_batches=4,
)
trainer.fit(pl_model)
gpu_utilities.print_gpu_utilization()

### Mixed Precision

In [None]:
hg_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, cache_dir='./cache/')
hg_model.gradient_checkpointing_enable()

args = {'batch_size': 4, 'epochs': 1, 'lr': 1e-4}
pl_model = T5Finetuner(hg_model, args, train_data, val_data)

trainer = pl.Trainer(
    max_epochs=args['epochs'],
    precision="16-mixed",
    accumulate_grad_batches=4,
)
trainer.fit(pl_model)
gpu_utilities.print_gpu_utilization()

### PEFT with LoRA
 
Parameter-Efficient Fine-Tuning with Low-Rank Adaptation

In [None]:
hg_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, cache_dir='./cache/')
hg_model.enable_input_require_grads()
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
hg_model = get_peft_model(hg_model, peft_config)
hg_model.print_trainable_parameters()

In [None]:
hg_model.gradient_checkpointing_enable()

args = {'batch_size': 4, 'epochs': 1, 'lr': 1e-4}
pl_model = T5Finetuner(hg_model, args, train_data, val_data)

trainer = pl.Trainer(
    max_epochs=args['epochs'],
    precision="16-mixed",
    accumulate_grad_batches=4,
)
trainer.fit(pl_model)
gpu_utilities.print_gpu_utilization()