# Fine Tuning T5 for Customer Service

In [3]:
import os
import torch
import time
import numpy as np 
import pandas as pd
import tensorflow as tf
from datasets import load_dataset, concatenate_datasets, Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, GenerationConfig, TrainingArguments, Trainer

Set configuration parameters

In [1]:
model_name='google/flan-t5-small'
# os.environ['TOKENIZERS_PARALLELISM'] = 'true'  

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = T5ForConditionalGeneration.from_pretrained(model_name)
base_model = base_model.to(device)

Load datasets

In [51]:
dataset_bitext_train = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset", split="train[:8%]")
dataset_bitext_test = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset", split="train[-2%:-1%]")
dataset_bitext_validation = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset", split="train[-1%:]")

Merge datasets

In [52]:
dataset_train_merged = concatenate_datasets(
    [
        dataset_bitext_train, 
        ]
    )
dataset_test_merged = concatenate_datasets(
    [
        dataset_bitext_test, 
        ]
    )
dataset_validation_merged = concatenate_datasets(
    [
        dataset_bitext_validation,
        ]
    )

Save datasets locally as `.csv` files

In [None]:
dataset_train_merged.to_csv('train_merged.csv', index=False)
dataset_test_merged.to_csv('test_merged.csv', index=False)
dataset_validation_merged.to_csv('validation_merged.csv', index=False)

Let's look at each dataset

In [54]:
dataset_train_merged

Dataset({
    features: ['flags', 'instruction', 'category', 'intent', 'response'],
    num_rows: 2150
})

In [55]:
dataset_test_merged

Dataset({
    features: ['flags', 'instruction', 'category', 'intent', 'response'],
    num_rows: 268
})

In [56]:
dataset_validation_merged

Dataset({
    features: ['flags', 'instruction', 'category', 'intent', 'response'],
    num_rows: 269
})

Load merged dataset from disk

In [7]:
dataset = load_dataset('csv', data_files={
    "train": "train_merged.csv", 
    "test": "test_merged.csv", 
    "validation": "validation_merged.csv"
    })

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['flags', 'instruction', 'category', 'intent', 'response'],
        num_rows: 2150
    })
    test: Dataset({
        features: ['flags', 'instruction', 'category', 'intent', 'response'],
        num_rows: 268
    })
    validation: Dataset({
        features: ['flags', 'instruction', 'category', 'intent', 'response'],
        num_rows: 269
    })
})

In [15]:
dataset['train'][0]

{'flags': 'B',
 'instruction': 'question about cancelling order {{Order Number}}',
 'category': 'ORDER',
 'intent': 'cancel_order',
 'response': "I've understood you have a question regarding canceling order {{Order Number}}, and I'm here to provide you with the information you need. Please go ahead and ask your question, and I'll do my best to assist you."}

Explore dataset

In [16]:
dataset_train = pd.read_csv('./train_merged.csv')

In [17]:
dataset_train.head()

<bound method NDFrame.head of       flags                                        instruction  category  \
0         B   question about cancelling order {{Order Number}}     ORDER   
1       BQZ  i have a question about cancelling oorder {{Or...     ORDER   
2      BLQZ    i need help cancelling puchase {{Order Number}}     ORDER   
3        BL         I need to cancel purchase {{Order Number}}     ORDER   
4     BCELN  I cannot afford this order, cancel purchase {{...     ORDER   
...     ...                                                ...       ...   
2145  BILPQ  would ugive me information about editing my de...  SHIPPING   
2146    BLM          I want supports trying to edit my address  SHIPPING   
2147     BL                          I can't change my address  SHIPPING   
2148    BLM  I need supports trying to update the shipping ...  SHIPPING   
2149    BLQ               i have to modify my shipping address  SHIPPING   

                       intent  \
0                cancel_

In [11]:
tokenizer = T5Tokenizer.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
input_text = "translate English to German: How old are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = base_model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

<pad> Wie ich er bitten?</s>




Get instruction prompt using an index

In [13]:
def get_prompt(data, index):
    instruction = data.iloc[index]['instruction']
    response = data.iloc[index]['response']
    
    prompt = f'''
    instruction:
    {instruction}
    
    response:
    {response}
    
    '''
    
    return prompt

def get_instruction(data,index):
    instruction = data.iloc[index]['instruction']
    return f'''
    
    instruction:
    {instruction}
    
    response:
    
    '''

Check model output with one shot

In [18]:
test_prompt = get_prompt(dataset_train,0) + get_instruction(dataset_train,10)
print(test_prompt)


    instruction:
    question about cancelling order {{Order Number}}
    
    response:
    I've understood you have a question regarding canceling order {{Order Number}}, and I'm here to provide you with the information you need. Please go ahead and ask your question, and I'll do my best to assist you.
    
    
    
    instruction:
    i dont know what to do to cancel order {{Order Number}}
    
    response:
    
    


In [19]:
input_text = tokenizer.encode(test_prompt,return_tensors="pt")
output = tokenizer.decode(base_model.generate(input_text)[0])
print(output)

<pad> I've understood you have a question regarding canceling order <unk> Order Number<unk>




Tokenize function

In [20]:
def tokenize_function(example):
    start_prompt = 'Instruction:\n'
    end_prompt = '\nResponse:'
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["instruction"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["response"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    
    return example

In [21]:
shuffled_dataset = dataset.shuffle(seed=42)

In [22]:
tokenized_datasets = shuffled_dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['flags', 'instruction', 'category', 'intent', 'response'])

Map:   0%|          | 0/2150 [00:00<?, ? examples/s]

Map:   0%|          | 0/268 [00:00<?, ? examples/s]

Map:   0%|          | 0/269 [00:00<?, ? examples/s]

In [23]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 2150
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 268
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 269
    })
})

Fine Tune

In [24]:
try:
    finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("finetuned_model")
    finetuned_model = finetuned_model.to('cpu')
    to_train = False

except:
    to_train = True
    finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    finetuned_model = finetuned_model.to('cpu')
    tokenizer = AutoTokenizer.from_pretrained(model_name)



In [26]:
%%time

if to_train:
    output_dir = 'training'

    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=5e-3,
        num_train_epochs=2,
        per_device_train_batch_size=16,     # batch size per device during training
        per_device_eval_batch_size=16,      # batch size for evaluation
        weight_decay=0.01,
        logging_steps=50,
        evaluation_strategy='steps',        # evaluation strategy to adopt during training
        eval_steps=500,  
        no_cuda=True,                   # number of steps between evaluation
    )

    trainer = Trainer(
        model=finetuned_model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
    )
    
    trainer.train()
    
    finetuned_model.save_pretrained("model")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjohnmosesng[0m ([33mjohnmosesng-axiis[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/270 [00:00<?, ?it/s]

{'loss': 2.2086, 'learning_rate': 0.004074074074074074, 'epoch': 0.37}
{'loss': 0.2743, 'learning_rate': 0.003148148148148148, 'epoch': 0.74}
{'loss': 0.2473, 'learning_rate': 0.0022222222222222222, 'epoch': 1.11}
{'loss': 0.2298, 'learning_rate': 0.0012962962962962963, 'epoch': 1.48}
{'loss': 0.2166, 'learning_rate': 0.00037037037037037035, 'epoch': 1.85}
{'train_runtime': 6722.4613, 'train_samples_per_second': 0.64, 'train_steps_per_second': 0.04, 'train_loss': 0.6038131413636384, 'epoch': 2.0}
CPU times: user 1h 26min 14s, sys: 1h 47min, total: 3h 13min 15s
Wall time: 1h 52min 3s


PEFT

In [70]:
from peft import LoraConfig, get_peft_model, TaskType

In [71]:
lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    lora_dropout=0.05,
    task_type=TaskType.SEQ_2_SEQ_LM
)

In [72]:
peft_model = get_peft_model(base_model, lora_config)
print(peft_model.print_trainable_parameters())

trainable params: 3,538,944 || all params: 251,116,800 || trainable%: 1.4093
None


In [73]:
import time

output_dir = 'training-peft'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    learning_rate=1e-3,
    num_train_epochs= 3, 
    no_cuda=True,  
)
    
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset= tokenized_datasets['train'],
    eval_dataset= tokenized_datasets['test']
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [74]:
from safetensors.torch import load_model, save_model

peft_trainer.train()
save_model(peft_model, "model.safetensors")

  0%|          | 0/1290 [00:00<?, ?it/s]

{'loss': 1.1595, 'learning_rate': 0.0006124031007751938, 'epoch': 1.16}


NotADirectoryError: [Errno 20] Not a directory: '/Users/johnmoses/miniforge3/envs/mforge39/lib/python3.9/site-packages/huggingface_hub-0.24.0-py3.8.egg/huggingface_hub/templates/modelcard_template.md'