https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb

In [1]:
!pip install transformers datasets
!pip install pytorch_lightning
!pip install apex

Collecting apex
  Downloading apex-0.9.10dev.tar.gz (36 kB)
  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting cryptacular (from apex)
  Downloading cryptacular-1.6.2.tar.gz (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.8/75.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | / - \ | / - \ | done
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25l- \ done
[?25hCollecting zope.sqlalchemy (from apex)
  Downloading zope.sqlalchemy-3.1-py3-none-any.whl.metadata (18 kB)
Collecting velruse>=1.0.3 (from apex)
  Downloading velruse-1.1.1.tar.gz (709 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m709.8/709.8 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
[?25hCollecting pyramid>1.1.2 (from apex)
  Downloading pyramid-

In [2]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    T5ForSequenceClassification,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
     

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

In [4]:
df = pd.read_csv("../data/hatebr_and_rationales.csv")

In [5]:
class T5FineTuner(pl.LightningModule):
    def __init__(self, hparams):
        super(T5FineTuner, self).__init__()
        self.save_hyperparameters(hparams)

        self.model = T5ForSequenceClassification.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

    def is_logger(self):
        return self.global_rank <= 0

    def forward(
        self, input_ids, attention_mask=None, labels=None
        ):
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

    def _step(self, batch):
        
        outputs = self(
            input_ids=batch[0],
            attention_mask=batch[1],
            labels=batch[2],
        )

        loss = outputs.loss

        return loss

    def training_step(self, batch, batch_idx):
        loss = self._step(batch)
        tensorboard_logs = {"train_loss": loss}
        self.training_outputs.append({"loss": loss})  # Append to outputs
        
        # Clear cache after each batch 
        torch.cuda.empty_cache()
        
        return {"loss": loss, "log": tensorboard_logs}

    def on_train_epoch_start(self):
        self.training_outputs = []  # Reset at the start of each epoch
    
    
    def on_train_epoch_end(self):
        # Ensure there are outputs to process
        if not self.training_outputs:
            return
        
        # Compute average loss from accumulated outputs
        avg_train_loss = torch.stack([x['loss'] for x in self.training_outputs]).mean()
        
        # Log metrics
        self.log('avg_train_loss', avg_train_loss, on_epoch=True, prog_bar=True, logger=True)
        
        # Clear the outputs list
        self.training_outputs.clear()
        
        # Clear the cache to manage GPU memory
        torch.cuda.empty_cache()


    def validation_step(self, batch, batch_idx):
        loss = self._step(batch)
        self.validation_outputs.append({'loss': loss})  # Append each step's output
        self.log('val_loss', loss, prog_bar=True, logger=True)
        
        # Clear cache after each batch 
        torch.cuda.empty_cache()
        
        return {"val_loss": loss}
    
    def on_validation_epoch_start(self):
        self.validation_outputs = []  # Reset at the start of the epoch

    def on_validation_epoch_end(self):
        if not self.validation_outputs:
            return

        avg_val_loss = torch.stack([x['loss'] for x in self.validation_outputs]).mean()
        self.log('avg_val_loss', avg_val_loss, on_epoch=True, prog_bar=True, logger=True)
        self.validation_outputs.clear()
        
        # Clear the cache to manage GPU memory
        torch.cuda.empty_cache()

    def configure_optimizers(self):
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        self.opt = optimizer

        # Set up learning rate scheduler
        t_total = (
            (len(self.train_dataloader().dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
            // self.hparams.gradient_accumulation_steps
            * float(self.hparams.num_train_epochs)
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        
        return [optimizer], [scheduler]

    def get_tqdm_dict(self):
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict

    def train_dataloader(self):
        torch.cuda.empty_cache()
    
        train_dataset = prepare_dataset(x_train, y_train, self.tokenizer)
        dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
        return dataloader

    def val_dataloader(self):
        val_dataset = prepare_dataset(x_val, y_val, self.tokenizer)
        return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

In [6]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
    def on_validation_end(self, trainer, pl_module):
        logger.info("***** Validation results *****")
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
          # Log results
            for key in sorted(metrics):
                if key not in ["log", "progress_bar"]:
                    logger.info("{} = {}\n".format(key, str(metrics[key])))

    def on_test_end(self, trainer, pl_module):
        logger.info("***** Test results *****")

        if pl_module.is_logger():
            metrics = trainer.callback_metrics

          # Log and save results to file
        output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
        with open(output_test_results_file, "w") as writer:
            for key in sorted(metrics):
                if key not in ["log", "progress_bar"]:
                    logger.info("{} = {}\n".format(key, str(metrics[key])))
                    writer.write("{} = {}\n".format(key, str(metrics[key])))

In [7]:

args_dict = dict(
    data_dir="", # path for data files
    output_dir="", # path to save the checkpoints
    model_name_or_path="unicamp-dl/ptt5-base-portuguese-vocab",
    tokenizer_name_or_path="unicamp-dl/ptt5-base-portuguese-vocab",
    max_seq_length=512,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=4,
    eval_batch_size=4,
    num_train_epochs=5,
    gradient_accumulation_steps=8,
    n_gpu=1,
    early_stop_callback=False,
    fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
    opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)

In [8]:
## split to train and val
TRAIN_SIZE = 0.8
TEST_SIZE = 0.1
VAL_SIZE = 0.1
from sklearn.model_selection import train_test_split

x_train, x_test_val, y_train, y_test_val = train_test_split(df['normalized_text'], df['label final'], test_size=TEST_SIZE + VAL_SIZE, random_state=0)
x_test, x_val, y_test, y_val = train_test_split(x_test_val, y_test_val, test_size=VAL_SIZE/(TEST_SIZE + VAL_SIZE), random_state=0)

In [9]:
from torch.utils.data import TensorDataset
import torch
import numpy as np
from transformers import T5Tokenizer

def tokenize_corpus(df, tokenizer, max_len):
    input_ids = []
    attention_masks = []

    for doc in df.tolist():
        # Tokenizing input text
        encoded_dict = tokenizer.encode_plus(
                            doc,
                            add_special_tokens=True,
                            max_length=max_len,
                            truncation=True,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt'
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

def prepare_dataset(features, labels, tokenizer):
    # Tokenize the input texts
    padded_tokens, attention_masks = tokenize_corpus(features.values, tokenizer, 512) # tokeniza as mensagens
    target = np.array(labels.values, dtype=np.int64).reshape(-1, 1) # transforma target em np array
    tensor_df = TensorDataset(padded_tokens, attention_masks, torch.from_numpy(target))

    return tensor_df


In [10]:

# args_dict.update({'data_dir': 'aclImdb', 'output_dir': 't5_imdb_sentiment', 'num_train_epochs':2})
args = argparse.Namespace(**args_dict)

checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir, filename='best-checkpoint', monitor="val_loss", mode="min", save_top_k=1
)

train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    devices=args.n_gpu,
    accelerator= 'gpu',
    max_epochs=args.num_train_epochs,
    precision= 16 if args.fp_16 else 32,
    gradient_clip_val=args.max_grad_norm,
    callbacks=[LoggingCallback(), checkpoint_callback],
)
  

In [11]:
model = T5FineTuner(args)

config.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at unicamp-dl/ptt5-base-portuguese-vocab and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


spiece.model:   0%|          | 0.00/756k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [12]:
trainer = pl.Trainer(**train_params)
torch.cuda.empty_cache()

In [13]:
trainer.fit(model)

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory  exists and is not empty.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Training: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [14]:
model.model.save_pretrained('../models/ptt5-fine-tuned')

Run in the test set

In [15]:
tokenizer = T5Tokenizer.from_pretrained("unicamp-dl/ptt5-base-portuguese-vocab")

test_dataset = prepare_dataset(x_test, y_test, tokenizer)
test_dataloader =  DataLoader(test_dataset, batch_size=8)

In [16]:
from sklearn import metrics
def compute_metrics(y_true, y_pred):
    f1 = metrics.f1_score(y_true, y_pred, zero_division = 1)       
    recall = metrics.recall_score(y_true, y_pred,zero_division = 1)
    precision = metrics.precision_score(y_true, y_pred, zero_division = 1)
    acc = metrics.accuracy_score(y_true, y_pred)
    
    return f1, recall, precision, acc

In [17]:
import torch.nn.functional as F

# Initialize lists to store predictions and true labels
y_pred = []
y_true = []
probabilities = []

# Set the model to evaluation mode
model.eval()

# Disable gradient calculation
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = batch
        
        # Get model outputs
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        # Get the predicted class indices
        preds = torch.argmax(logits, dim=-1)
        probs = logits.softmax(dim=-1)[:, 1]
        
        probabilities.extend(probs.cpu().numpy())
        
        # Convert to lists and extend the results
        y_pred.extend(preds.cpu().numpy())
        y_true.extend(labels.cpu().numpy())
        

# Compute metrics
f1, recall, precision, acc = compute_metrics(y_true, y_pred)
print("")
print("Summary Resuts")
print(" avg test acc | recall | f1 | precision | roc ")
f1, recall, precision, acc = compute_metrics(y_true, y_pred)
roc = metrics.roc_auc_score(y_true, probabilities, average='macro')
print(f"{acc:.5f} | {recall:.5f} | {f1:.5f} | {precision:.5f} | {roc:.5f}")



Summary Resuts
 avg test acc | recall | f1 | precision | roc 
0.88857 | 0.82622 | 0.87419 | 0.92808 | 0.96641
