# Fine tuning T5 with Layer

[![Open In Layer](https://app.layer.ai/assets/badge.svg)](https://app.layer.ai/layer/named-entity-recognition) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Kx4-BpHaIjp5C6AkHW_T2jZM_Azy0xzk#scrollTo=hbl_5GrpV_Qd)

T5 is an encoder-decoder model. It casts every NLP problem (language translation, summarization, text generation, question answering, and so on) as a text-to-text task.

We are going to fine-tune a pretrained T5 model to perform named entity recognition.

# Install Requirements

In [None]:
# Install/upgrade the notebook runtime and the Layer SDK, plus sentencepiece and
# transformers (both are also listed in the training function's
# @pip_requirements). -q keeps pip's output quiet.
!pip install ipython ipykernel --upgrade -q
!pip install layer-sdk --upgrade -q
!pip install sentencepiece -q
!pip install transformers -q

# Getting Started with Layer

Layer is an MLOps platform which advances ML pipelines with remote computation and tracking.

## Login to Layer

Let's log in to Layer first.

In [None]:
import layer
# Log this notebook session in to Layer; must run before layer.init() and any
# decorated function is invoked.
layer.login()

## Initialize Layer Project
Now we are ready to init our project. A Layer Project is basically an ML Repo hosted on Layer where you can store your datasets, models and metrics.

In [None]:
# Create/select the Layer project that the datasets and model defined in this
# notebook are registered under.
layer.init("named-entity-recognition")

Project(name='named-entity-recognition', raw_datasets=[], derived_datasets=[], featuresets=[], models=[], path=PosixPath('.'), project_files_hash='', readme='', organization=Organization(id=UUID('d7325da3-0646-4fa6-855d-8d19eece8b79'), name='layer'), _id=UUID('7653821a-53c6-44b9-8b91-248e6a53f225'), functions=[])

Your project is ready. Find your project here:

https://app.layer.ai/

# Upload training data
Fetch CoNLL 2003 data, convert it into a format for T5 and save it to the Layer backend.

In [None]:
import json
import pandas as pd

from layer.decorators import dataset
from layer.client import Dataset

def transform_raw_dataset(input_df):
  """Turn raw CoNLL-style rows into (sentence, entities) text pairs.

  Each row of ``input_df`` must carry two JSON-encoded lists of equal meaning:
  ``tokens`` (the words) and ``ner_tags`` (one tag per word).

  Returns a new DataFrame with two string columns:
    * ``sentence`` - the tokens joined by single spaces.
    * ``entities`` - the same tokens, where every token whose tag is not 'O'
      is rendered as ``token|tag``.
  """
  def _annotate(word, label):
    # Tokens tagged 'O' are emitted unchanged.
    return word if label == 'O' else f'{word}|{label}'

  pairs = []
  for _, record in input_df.iterrows():
    words = json.loads(record['tokens'])
    labels = json.loads(record['ner_tags'])
    annotated = [_annotate(word, label) for word, label in zip(words, labels)]
    pairs.append([' '.join(words), ' '.join(annotated)])

  return pd.DataFrame(pairs, columns=["sentence", "entities"])


# Register one derived dataset per CoNLL 2003 split: t5_ner_train_data,
# t5_ner_validation_data and t5_ner_test_data.
for DATA_TYPE in ['train', 'validation', 'test']:
  # The decorator names the output dataset and declares the matching raw split
  # as its dependency.
  @dataset(f"t5_ner_{DATA_TYPE}_data", dependencies=[Dataset(f"layer/conll2003/datasets/{DATA_TYPE}")])
  def build_dataset():
    # DATA_TYPE is looked up when the function body runs, not when it is
    # defined, so this relies on the call below happening inside the same loop
    # iteration.
    return transform_raw_dataset(layer.get_dataset(f"layer/conll2003/datasets/{DATA_TYPE}").to_pandas())
  # Invoke immediately, while DATA_TYPE still holds the current split.
  # NOTE(review): presumably this call is what triggers the build on Layer - confirm.
  build_dataset()


Output()

Output()

Output()

# Fine Tune T5

Our dataset is ready and registered to Layer. Now we are going to write the fine-tuning logic, decorate the function with `@model`, and pass it to Layer so that it runs on Layer's infrastructure and the resulting model is registered under our project.

In [None]:
import torch
from layer.client import Dataset


class NERDataSet(torch.utils.data.Dataset):
  """Map-style dataset that tokenizes (sentence, entities) pairs for T5.

  The source column is prefixed with the task prompt and the target column is
  wrapped in ``<pad>`` ... ``</s>``. ``train()`` in this notebook feeds
  ``target_ids[:, :-1]`` to the decoder, so the leading ``<pad>`` ends up as
  the first decoder input token.

  Args:
    dataframe: pandas DataFrame holding the text columns. It is not modified;
      the dataset works on its own re-indexed copy.
    tokenizer: object exposing ``batch_encode_plus`` (a Hugging Face tokenizer).
    source_len: fixed token length that source texts are padded/truncated to.
    target_len: fixed token length that target texts are padded/truncated to.
    source_text: name of the column holding the input sentences.
    target_text: name of the column holding the tagged target sentences.
  """

  # Task prompt prepended to every input sentence.
  SOURCE_PREFIX = "recognize named entities: "
  # Markers wrapped around every target sentence.
  TARGET_PREFIX = "<pad>"
  TARGET_SUFFIX = "</s>"

  def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
    self.tokenizer = tokenizer
    self.source_len = source_len
    self.summ_len = target_len

    # Work on a private copy with a 0..n-1 index: the caller's frame stays
    # untouched (no double-prefixing if it is reused, no SettingWithCopyWarning
    # on slices) and positions handed out by a DataLoader always resolve.
    self.data = dataframe.reset_index(drop=True)
    self.data[source_text] = self.SOURCE_PREFIX + self.data[source_text]
    self.data[target_text] = self.TARGET_PREFIX + self.data[target_text] + self.TARGET_SUFFIX

    # Grab the columns only AFTER the markers were added. Taking them before
    # the assignment leaves it up to the pandas version whether the markers
    # are visible in the previously extracted Series.
    self.source_text = self.data[source_text]
    self.target_text = self.data[target_text]

  def __len__(self):
    return len(self.target_text)

  def _encode(self, text, max_length):
    """Tokenize one text into fixed-length ``(input_ids, attention_mask)`` tensors."""
    # Collapse runs of whitespace before tokenizing.
    text = ' '.join(str(text).split())
    # padding="max_length" already pads to max_length; the deprecated
    # pad_to_max_length flag is therefore not passed.
    encoded = self.tokenizer.batch_encode_plus(
        [text],
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors='pt',
    )
    # Drop only the batch dimension added by encoding a one-element list.
    return encoded['input_ids'].squeeze(0), encoded['attention_mask'].squeeze(0)

  def __getitem__(self, index):
    """Return the tokenized example at position ``index``.

    The result is a dict of ``torch.long`` tensors with the keys ``source_ids``,
    ``source_mask``, ``target_ids`` and ``target_ids_y`` (the latter two hold
    the same values).
    """
    source_ids, source_mask = self._encode(self.source_text.iloc[index], self.source_len)
    target_ids, _ = self._encode(self.target_text.iloc[index], self.summ_len)

    return {
        'source_ids': source_ids.to(dtype=torch.long),
        'source_mask': source_mask.to(dtype=torch.long),
        'target_ids': target_ids.to(dtype=torch.long),
        'target_ids_y': target_ids.to(dtype=torch.long)
    }

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):
  """Run one pass over ``loader``, taking one optimizer step per batch.

  Args:
    epoch: accepted for the caller's convenience; not used in the body.
    tokenizer: only ``pad_token_id`` is read, to mask padding in the labels.
    model: called with ``input_ids``, ``attention_mask``, ``decoder_input_ids``
      and ``labels``; the first element of its output is taken as the loss.
    device: device the batch tensors are moved to.
    loader: iterable of dicts with ``source_ids``, ``source_mask`` and
      ``target_ids`` tensors.
    optimizer: optimizer that is zeroed and stepped once per batch.
  """
  import torch

  model.train()
  for batch in loader:
    targets = batch['target_ids'].to(device, dtype=torch.long)

    # Decoder sees every target token but the last; labels are the same
    # sequence shifted left by one.
    decoder_inputs = targets[:, :-1].contiguous()
    labels = targets[:, 1:].clone().detach()
    # Padding positions get -100 so they are excluded from the loss.
    labels.masked_fill_(labels == tokenizer.pad_token_id, -100)

    result = model(
        input_ids=batch['source_ids'].to(device, dtype=torch.long),
        attention_mask=batch['source_mask'].to(device, dtype=torch.long),
        decoder_input_ids=decoder_inputs,
        labels=labels,
    )

    optimizer.zero_grad()
    result[0].backward()
    optimizer.step()

Here we use three separate Layer decorators:
- [`@model`](https://docs.app.layer.ai/docs/sdk-library/model-decorator): Tells Layer that this function trains an ML model
- [`@fabric`](https://docs.app.layer.ai/docs/sdk-library/fabric-decorator): Tells Layer the computation resources (cpu, gpu etc.) needed to train the model. Here is a list of the [available fabrics](https://docs.development.layer.co/docs/reference/fabrics) you can use.
- [`@pip_requirements`](https://docs.app.layer.ai/docs/sdk-library/pip-requirements-decorator): Tells Layer which PyPI packages are needed to train the model.

In [None]:
from layer.decorators import model, pip_requirements, fabric
from layer.client import Dataset, Model

# Hyperparameters and settings read by build_model() below.
MODEL_PARAMS={
    "MODEL":"t5-small",            # base checkpoint; interpolated into the layer/t5/models/... paths
    "TRAIN_BATCH_SIZE":8,          # batch size of the training DataLoader
    "VALID_BATCH_SIZE":8,          # batch size of the validation DataLoader
    "TRAIN_EPOCHS":3,              # number of passes over the training loader
    "VAL_EPOCHS":3,  # not referenced by the code in this notebook
    "LEARNING_RATE":1e-4,          # learning rate passed to torch.optim.Adam
    "MAX_SOURCE_TEXT_LENGTH":75,   # token length source texts are padded/truncated to
    "MAX_TARGET_TEXT_LENGTH":75, # token length target texts are padded/truncated to
    "MAX_TRAINING_ROWS":2000,  # row cap for the training split; validation/test use one fifth of it
    "SEED": 33  # seed for the torch and numpy random generators
}

@model(
    "t5-named-entity-recognition",
    dependencies=[
        Model(f"layer/t5/models/{MODEL_PARAMS['MODEL']}"),
        Model(f"layer/t5/models/{MODEL_PARAMS['MODEL']}-tokenizer"),
        Dataset("t5_ner_train_data"),
        Dataset("t5_ner_validation_data"),
        Dataset("t5_ner_test_data"),
    ],
)
@fabric("f-medium")
@pip_requirements(packages=["numpy", "torch", "transformers", "sentencepiece"])
def build_model():
  """Fine tune the pretrained T5 model on the NER data and return it.

  Reads every setting from MODEL_PARAMS, loads the base model, the tokenizer
  and the three t5_ner_* datasets through Layer, then runs train() once per
  configured epoch. The validation dataset/loader are built but not consumed
  by the training loop, and the test split is only loaded and its shape
  printed.
  """
  import numpy as np
  import torch
  import torch.nn.functional as F
  from torch import cuda
  from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

  # Reproducibility: seed torch and numpy, ask cuDNN for deterministic kernels.
  seed = MODEL_PARAMS["SEED"]
  torch.manual_seed(seed)
  np.random.seed(seed)
  torch.backends.cudnn.deterministic = True

  # Prefer the GPU when one is visible.
  if cuda.is_available():
    device = 'cuda'
  else:
    device = 'cpu'

  # Pretrained model and tokenizer both come from Layer.
  print(f"""[Model]: Loading model {MODEL_PARAMS['MODEL']} for device {device}...\n""")
  model = layer.get_model(f"layer/t5/models/{MODEL_PARAMS['MODEL']}").get_train()
  model.to(device)
  tokenizer = layer.get_model(f"layer/t5/models/{MODEL_PARAMS['MODEL']}-tokenizer").get_train()

  # Load the three splits; validation and test are capped at a fifth of the
  # training row limit.
  print(f"[Data]: Reading data...\n")
  row_cap = MODEL_PARAMS['MAX_TRAINING_ROWS']
  eval_cap = row_cap // 5
  train_df = layer.get_dataset("t5_ner_train_data").to_pandas().head(row_cap)
  valid_df = layer.get_dataset("t5_ner_validation_data").to_pandas().head(eval_cap)
  test_df = layer.get_dataset("t5_ner_test_data").to_pandas().head(eval_cap)

  print(f"TRAIN Dataset: {train_df.shape}")
  print(f"VALID Dataset: {valid_df.shape}")
  print(f"TEST Dataset: {test_df.shape}")

  # Wrap the training and validation frames in tokenizing datasets.
  source_len = MODEL_PARAMS["MAX_SOURCE_TEXT_LENGTH"]
  target_len = MODEL_PARAMS["MAX_TARGET_TEXT_LENGTH"]
  train_set = NERDataSet(train_df, tokenizer, source_len, target_len, "sentence", "entities")
  valid_set = NERDataSet(valid_df, tokenizer, source_len, target_len, "sentence", "entities")

  # Training batches are shuffled, validation batches keep their order.
  training_loader = DataLoader(
      train_set,
      batch_size=MODEL_PARAMS["TRAIN_BATCH_SIZE"],
      shuffle=True,
      num_workers=0,
  )
  val_loader = DataLoader(
      valid_set,
      batch_size=MODEL_PARAMS["VALID_BATCH_SIZE"],
      shuffle=False,
      num_workers=0,
  )

  # Adam updates the model weights during fine tuning.
  optimizer = torch.optim.Adam(params=model.parameters(), lr=MODEL_PARAMS["LEARNING_RATE"])

  print(f'[Initiating Fine Tuning]...\n')
  for epoch in range(MODEL_PARAMS["TRAIN_EPOCHS"]):
    train(epoch, tokenizer, model, device, training_loader, optimizer)

  print(f"[Saving Model]...\n")
  return model

build_model()


Output()

[Model]: Loading model t5-small for device cuda...

[Data]: Reading data...

TRAIN Dataset: (2000, 2)
VALID Dataset: (400, 2)
TEST Dataset: (400, 2)
[Initiating Fine Tuning]...



  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."


[Saving Model]...

