<a href="https://colab.research.google.com/github/jayc279/GenAI_LLM/blob/main/pre_trained_fine_tuned/LoRA_pretrain_google_flan_t5_small_samsum.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## PreTraining and FineTuning 'Google/Flan-T5-small' against 'samsum' dataset

In [1]:
PYDEVD_DISABLE_FILE_VALIDATION = 1

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
%load_ext autoreload
%autoreload

In [4]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

<h3>Import Packages</h3>

In [5]:
list_of_packges = ['datasets',
                   'torch',
                   'torchdata',
                   'transformers',
                   'evaluate',
                   'rouge_score',
                   'peft',
                   'bitsandbytes',
                   'sentencepiece',
                   'accelerate',
                   'bert_score',
                   'trl']

import os
with open('requirements_colab_hf.txt','w') as f:
  for ip in list_of_packges:
    f.write(ip)
    f.write(os.linesep)
  f.close()

!pip install -r 'requirements_colab_hf.txt'

Collecting datasets (from -r requirements_colab_hf.txt (line 1))
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate (from -r requirements_colab_hf.txt (line 5))
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score (from -r requirements_colab_hf.txt (line 6))
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting peft (from -r requirements_colab_hf.txt (line 7))
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes (from -r requirements_colab_hf.txt (line 8))
  Downloading bitsandbytes-0.43.

In [6]:
# HF_TOKEN=os.environ.get('HF_WRITE')
# print(HF_TOKEN)
# from huggingface_hub import notebook_login
# notebook_login()

In [7]:
## HF collect (or care about) any personally identifiable information
# from transformers.utils import send_example_telemetry
# send_example_telemetry("image_captioning_blip_notebook", framework="pytorch")

In [8]:
import torch
## Storing ID of current CUDA device
cuda_id = torch.cuda.current_device()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print(f"device: {device}")
print(f"Is CUDA supported by this system? {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"ID of current CUDA device: {cuda_id}")
print(f"Name of current CUDA device: {torch.cuda.get_device_name(cuda_id)}")

device: cuda
Is CUDA supported by this system? True
CUDA version: 12.1
ID of current CUDA device: 0
Name of current CUDA device: Tesla T4


### Load the training and test datasets to be fine-tuned with the Tokenizer

In [9]:
# Load image captioning dataset
from datasets import load_dataset
from datasets import load_metric
dataset_id = "samsum"   # another dataset - yahoo_answers_qa

In [10]:
# download dataset
dataset=load_dataset(dataset_id)

Downloading data:   0%|          | 0.00/6.06M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/347k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/335k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

### Data Exploration

In [11]:
print(dataset.column_names)
print(dataset.keys())

{'train': ['id', 'dialogue', 'summary'], 'test': ['id', 'dialogue', 'summary'], 'validation': ['id', 'dialogue', 'summary']}
dict_keys(['train', 'test', 'validation'])


In [12]:
## get each dataset into separate sets and continue
train_set = dataset['train']
test_set = dataset['test']
val_set = dataset['validation']
print(train_set.shape, test_set.shape, val_set.shape)

(14732, 3) (819, 3) (818, 3)


In [13]:
train_set[0]

{'id': '13818513',
 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}

### load google/flan-t5-small and pretrain / finetune

In [14]:
# load model
from transformers import AutoModelForSeq2SeqLM
model_name = "google/flan-t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### tokenizer settings

In [15]:
# tokenizer settings
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# setting PADDING instructions for Tokenizer
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

### prompt instructions for this dataset - instructions change with set used

In [16]:
# Columns  - ['id', 'dialogue', 'summary']
# prompt instructions for this dataset - instructions change with set used
def prompt_template(sample):
  """Add prefix to each sentence, tokenize and set label"""
  return f"""
  ### Instruction:
  Use the Task below and the Input given to write the Response

  ### Task:
  Please Summarize the dialogue

  ### Input:
  {sample['dialogue']}

  ### Response:
  {sample['summary']}
  """


In [18]:
# PEFT config
from peft import LoraConfig

peft_config = LoraConfig(
  lora_alpha=16,
  lora_dropout=0.1,
  target_modules=["q","v"],
  r=64,
  bias = "lora_only",                  # bias="lora_only",  "none"
  task_type='SEQ_2_SEQ_LM',     # task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5 task_type='CAUSAL_LLM',
)

In [20]:
# weight_decay = 0.011,
# Create the Trainer
from transformers import TrainingArguments
from transformers import pipeline
import evaluate
import accelerate

trainingArgs = TrainingArguments(
  output_dir='output',
  num_train_epochs=3,       # 3
  auto_find_batch_size=True,
  per_device_train_batch_size=8,
  save_strategy='epoch',
  evaluation_strategy='epoch',
  per_device_eval_batch_size=4,
  # save_total_limit=3,     # 3
  push_to_hub=False,
  learning_rate=2e-4,
  # logging_steps=2,        # 2
  # max_steps=10,           # 10
)

In [21]:
# Supervised FineTuning Trainer
from trl import SFTTrainer
trainer = SFTTrainer(
  model = model,
  train_dataset=train_set,
  eval_dataset=val_set,
  peft_config=peft_config,
  tokenizer=tokenizer,
  packing=True,
  formatting_func=prompt_template,
  args=trainingArgs,
  max_seq_length=512,   # default is 512
)

Generating train split: 0 examples [00:00, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (658 > 512). Running this sequence through the model will result in indexing errors


Generating train split: 0 examples [00:00, ? examples/s]

### Train Model

In [22]:
## Train model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2933,0.02426
2,0.0555,0.018239
3,0.0501,0.016371


TrainOutput(global_step=2316, training_loss=0.10772308372668858, metrics={'train_runtime': 1472.9863, 'train_samples_per_second': 12.568, 'train_steps_per_second': 1.572, 'total_flos': 3597931599888384.0, 'train_loss': 0.10772308372668858, 'epoch': 3.0})

### Save Model locally

In [51]:
# save pretrained model to a different name on local directory
save_pretrained_to = "./flan_t5_small_pretrained_samsum"
model.save_pretrained(save_pretrained_to)
tokenizer.save_pretrained(save_pretrained_to)

('./flan_t5_small_pretrained_samsum/tokenizer_config.json',
 './flan_t5_small_pretrained_samsum/special_tokens_map.json',
 './flan_t5_small_pretrained_samsum/spiece.model',
 './flan_t5_small_pretrained_samsum/added_tokens.json',
 './flan_t5_small_pretrained_samsum/tokenizer.json')

### Using the WRITE token push your model to Huggingface_Hub

In [55]:
from transformers import AutoConfig
config = AutoConfig.from_pretrained(model_name)

In [60]:
## write token
from huggingface_hub import notebook_login
print(os.environ.get('HF_WRITE'))
notebook_login()

None


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [64]:
### https://huggingface.co/docs/transformers/model_sharing
## save tokenizer and create model card

## Before pushing to HUB load your best Model and Best Model tokenizer and push them
## CHECK STEPS in above links BEFORE SAVING MODEL TO HUB
## push to the hub
## finetune_model.push_to_hub(<repo on Huggingface_hub, config=config)
## fine_tokenizer.push_to_hub(<repo>)
## trainer.push_to_hub(<repo>)

## create model card   -- best to create on web
## trainer.create_model_card()

### To use uploaded dataset, use notebook_login with READ token

In [65]:
# # Read token
# print(os.environ.get('HF_READ'))
# notebook_login()

## reload
# model = <model-name>.from_pretrained("username/<model-name-saved-to-by-HF>")

### load model from local directory

In [26]:
last_checkpoint = "./output/checkpoint-2316"
# last_checkpoint = "./flan_t5_small_pretrained_samsum_dataset"
fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained(last_checkpoint, local_files_only=True)
tuned_tokenizer = AutoTokenizer.from_pretrained(last_checkpoint)

### select one sample from test set to verify

In [27]:
# select a random test sample  -- selects only one
from random import randrange
sample = test_set[randrange(len(test_set))]

# sample is a dict with dict_keys(['id', 'dialogue', 'summary'])
print(f"dialogue: \n{sample['dialogue']}", end="\n\n")

dialogue: 
Richie: Pogba
Clay: Pogboom
Richie: what a s strike yoh!
Clay: was off the seat the moment he chopped the ball back to his right foot
Richie: me too dude
Clay: hope his form lasts
Richie: This season he's more mature
Clay: Yeah, Jose has his trust in him
Richie: everyone does
Clay: yeah, he really deserved to score after his first 60 minutes
Richie: reward
Clay: yeah man
Richie: cool then 
Clay: cool



In [28]:
inputs = tuned_tokenizer(sample["dialogue"], return_tensors="pt")
output = fine_tuned_model.generate(**inputs, max_new_tokens=300)      # set max_new_tokens to limit
## print outputs
answer = tuned_tokenizer.decode(output[0])
print(answer)

<pad> Richie: Pogba Clay: Pogboom Richie: what a s strike yoh! Clay: was off the seat the moment he chopped the ball back to his right foot Richie: me too dude Clay: hope his form lasts Richie: This season he's more mature Clay: Yeah, Jose has his trust in him Richie: everyone does Clay: yeah, he really deserved to score after his first 60 minutes Richie: reward Clay: yeah man Richie: cool then Clay: cool</s>


### Inference test set

In [31]:
# execute below code to inference test model
from transformers import pipeline

# summarizer = pipeline("summarization", model="your model path in huggingface-hub", tokenizer="tokenizer")
summarizer = pipeline(task="summarization", model=fine_tuned_model, tokenizer=tuned_tokenizer)

# select a random test sample
sample = test_set[randrange(len(test_set))]
print(f"dialogue: \n{sample['dialogue']}\n---------------")

# summarize dialogue
res = summarizer(sample["dialogue"])

print(f"flan-t5-small summary:\n{res[0]['summary_text']}")

dialogue: 
Lincoln: Heeyyy ;* whats up
Fatima: I talked to Jenson, he’s not too happy ;p
Lincoln: the place sucks??
Fatima: No, the place is ok, I think, we can go there, it’s about Alene
Lincoln: typical, dont worry about it
Fatima: He thinks she may have a depression :[
Lincoln: nothin new, everyone has it, she needs a doctor then
Fatima: But she won’t go ;/
Lincoln: so she’s destroying her life fuck it its not your problem
Fatima: It is, they’re both my friends!
Lincoln: you better think what to do if they break up
Fatima: Ehh yes Ill have a problem ;//
Lincoln: both blaming each other and talking with you about it, perfect
Fatima: Alene is just troubled… She’d been through a lot…
Lincoln: everyone has their problems, the question is are ya doin sth about them
Fatima: She has problems facing it, don’t be surprised :[
Lincoln: then it is her problem
Fatima: You are so cruel at times… o.O
Lincoln: maybe, for me its just a common sense
Fatima: Why can’t everyone be j

In [32]:
test_data = test_set.to_pandas()
input_dialogue = test_data['dialogue'].values.tolist()
input_summary = test_data['summary'].values.tolist()

In [33]:
## REF https://medium.com/geekculture/pipelines-for-performant-inferences-with-hugging-face-5140300522de
from transformers import pipeline
def predict_using_pipelines(dialogue):
    def dialog_summa_pair_data():
        for d1 in dialogue:
            yield d1

    # https://huggingface.co/docs/transformers/pad_truncation
    pipe = pipeline("summarization",
                    model=fine_tuned_model,
                    tokenizer=tuned_tokenizer,
                    truncation="do_not_truncate", # must use this setting
                    # padding=True,    # error use any value
                    # max_length=66,   # ignore warnings - don't set: since truncation="do_not_truncate"
                    device=-1)         # device -1 for CPU
    completion = []
    for out in pipe(dialog_summa_pair_data(), batch_size=1):
        # print(type(out), out[])
        completion.append(out[0]['summary_text'])
    return completion

In [34]:
outputs = predict_using_pipelines(dialogue=input_dialogue)

Your max_length is set to 200, but your input_length is only 133. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=66)
Your max_length is set to 200, but your input_length is only 155. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=77)
Your max_length is set to 200, but your input_length is only 196. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=98)
Your max_length is set to 200, but your input_length is only 140. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=70)


In [35]:
# Capture predictions to a Pandas DF
flan_t5_small_samsum_predicted = pd.DataFrame.from_dict(outputs, orient='columns')
flan_t5_small_samsum_predicted.columns=['Predicted']
flan_t5_small_samsum_predicted.head()

Unnamed: 0,Predicted
0,"Amanda: Bye bye. Alright Hannah: Hey, do you h..."
1,:) Rob: And it's really funny! Eric: I know! I...
2,: what matters is what you'll give you the mos...
3,"will be home soon, i'll tell you when I get ho..."
4,you just call me and the all thing i heard was...


In [37]:
# create a Dataframe with original testset and predicted summary
test_data_with_predicted = test_data.copy()
test_data_with_predicted['predicted'] = flan_t5_small_samsum_predicted['Predicted']
test_data_with_predicted.head()

Unnamed: 0,id,dialogue,summary,predicted
0,13862856,"Hannah: Hey, do you have Betty's number?\nAman...",Hannah needs Betty's number but Amanda doesn't...,"Amanda: Bye bye. Alright Hannah: Hey, do you h..."
1,13729565,Eric: MACHINE!\r\nRob: That's so gr8!\r\nEric:...,Eric and Rob are going to watch a stand-up on ...,:) Rob: And it's really funny! Eric: I know! I...
2,13680171,"Lenny: Babe, can you help me with something?\r...",Lenny can't decide which trousers to buy. Bob ...,: what matters is what you'll give you the mos...
3,13729438,"Will: hey babe, what do you want for dinner to...",Emma will be home soon and she will let Will k...,"will be home soon, i'll tell you when I get ho..."
4,13828600,"Ollie: Hi , are you in Warsaw\r\nJane: yes, ju...",Jane is in Warsaw. Ollie and Jane has a party....,you just call me and the all thing i heard was...


In [39]:
# A sample print
print(test_data_with_predicted['predicted'][1], '\n', input_summary[1])

:) Rob: And it's really funny! Eric: I know! I especially like the train part! Rob: Hahaha! No one talks to the machine like that! Erik: Is this his only stand-up? Rob: Idk. I'll check. Eric: Sure. Rob: Turns out no! There are some of his stand- ups on youtube. Erik: Gr8! I' will watch them now! Rob! Me too! Eric' MACHINE! Rob; MACHine! Eric! TTYL? Rob! Sure .) 
 Eric and Rob are going to watch a stand-up on youtube.


[Fabiano Falcão - Metrics for evaluating summarization of texts performed by Transformers: how to evaluate the quality of summaries](https://fabianofalcao.medium.com/metrics-for-evaluating-summarization-of-texts-performed-by-transformers-how-to-evaluate-the-b3ce68a309c3)

### Calculate Metric - Rouge Score

**ROUGE-1:** also known as unigram, measures the overlap of unigrams (individual words) between the generated summary and the reference summary. It calculates the proportion of words in the generated summary that are also present in the reference summary. Example: Reference text: “The cat is on the rug” Generated text: “The dog is on the rug” ROUGE-1 = 3/5 = 0.6.

**ROUGE-2:** also known as bigram, measures the overlap of bigrams (pairs of consecutive words) between the generated summary and the reference summary. It calculates the proportion of bigrams in the generated summary that are also present in the reference summary.

**ROUGE-L:** measures the similarity between the word sequence of the generated abstract and the reference abstract using the longest sequence of words in common. Unlike ROUGE-1 and ROUGE-2, which use a simple word count approach, ROUGE-L uses a string matching approach.

**ROUGE-Lsum:** is a variation of ROUGE-L that divides the generated summary and the reference summary into sentence units and measures the similarity between these sentence units.

In [40]:
from evaluate import load

# Load the ROUGE metric
import evaluate
rouge = evaluate.load('rouge')
# candidates = ["Summarization is cool","I love Machine Learning","Good night"]
# references = [["Summarization is beneficial and cool","Summarization saves time"],
# ["People are getting used to Machine Learning","I think i love Machine Learning"],
# ["Good night everyone!","Night!"]]
results = rouge.compute(predictions=test_data_with_predicted['predicted'], references=test_data['summary'])
print(results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.26705610185379536, 'rouge2': 0.08099523698855657, 'rougeL': 0.1961449276122904, 'rougeLsum': 0.19608133993792493}


### Calculate Metric - BLUE Score

**BLEU** (Bilingual Evaluation Understudy)
BLEU a metric used to evaluate the quality of machine translation from one language to another. However, it can also be used to evaluate the quality of automatic text summarization.

The BLEU metric compares the template-generated text to the reference text (either the original text or a shortened version of the original text) and assigns a score based on word overlap between the two texts.

The **BLEU** score ranges from 0 to 1. The closer to 1, the better the quality of the summary. To calculate the BLEU score, we use a formula that takes into account the amount of overlapping words between the generated text and the reference text.

The more words in common, the higher the BLEU score.

In [41]:
## https://huggingface.co/spaces/evaluate-metric/bleu
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=test_data_with_predicted['predicted'], references=test_data['summary'])
print(results)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'bleu': 0.029657085644641264, 'precisions': [0.13898711242292294, 0.03923453467220751, 0.017080838323353293, 0.008305421257634773], 'brevity_penalty': 1.0, 'length_ratio': 3.7029542257331456, 'translation_length': 68438, 'reference_length': 18482}


<h3>BERTScore</h3>
**BERTScore** is an automatic evaluation metric for text generation that computes a similarity score for each token in the candidate sentence with each token in the reference sentence. It leverages the pre-trained contextual embeddings from BERT - https://huggingface.co/bert-base-uncased models and matches words in candidate and reference sentences by cosine similarity.

Moreover, **BERTScore** computes precision, recall, and F1 measure, which can be useful for evaluating different language generation tasks.

REF: https://huggingface.co/spaces/evaluate-metric/bertscore

<h3>Thank you for checking out this Notebook</h3>