# T5 Fine-Tuning

## Part I - Data Preprocessing

### A. Data Loading

In [24]:
import pandas as pd
import os

datasets = '../data/dataset'
files_list = os.listdir(datasets)

print(files_list)

['dataset_20240704_150822.csv', 'dataset_20240704_151424.csv', 'dataset_20240704_151901.csv', 'dataset_20240704_152357.csv', 'dataset_20240704_152812.csv', 'dataset_20240704_154639.csv', 'dataset_20240704_155436.csv', 'dataset_20240704_160014.csv', 'dataset_20240704_160759.csv', 'dataset_20240704_161256.csv', 'dataset_20240704_162030.csv', 'dataset_20240704_164120.csv', 'dataset_20240704_165808.csv', 'dataset_20240704_172021.csv', 'dataset_20240704_173948.csv', 'dataset_20240704_175152.csv', 'dataset_20240704_180117.csv', 'dataset_20240704_181213.csv', 'dataset_20240704_182558.csv', 'dataset_20240704_184053.csv', 'dataset_20240704_185236.csv', 'dataset_20240704_190449.csv', 'dataset_20240704_191929.csv', 'dataset_20240704_192809.csv', 'dataset_20240704_193055.csv', 'dataset_20240704_193834.csv', 'dataset_20240704_194959.csv', 'dataset_20240704_200424.csv', 'dataset_20240704_201639.csv', 'dataset_20240704_202421.csv', 'dataset_20240704_203553.csv', 'dataset_20240704_203820.csv', 'datase

In [25]:
data = pd.DataFrame()

for file in files_list:
    filepath = os.path.join(datasets, file)
    df = pd.read_csv(filepath)
    data = pd.concat([data, df], ignore_index=True)

data.head(5)

Unnamed: 0,audio_path,transcript,summary
0,../data/audio\000000.mp3,"\n\tON the north-east coast of Scotland, in th...",\n \n\tThe history of the family of the...
1,../data/audio\000001.mp3,\n\tALLEYN was no where to be found. The Earl ...,\n \n\tThere is an attack and an impend...
2,../data/audio\000002.mp3,\n\tTHE Count was walking on the ramparts of t...,\n \n\tMalcolm reveals an important sec...
3,../data/audio\000003.mp3,\n\tMEANWHILE the Earl remained a solitary pri...,\n \n\tMatilda falls into despair over ...
4,../data/audio\000004.mp3,"\n\tMARY, in the mean time, suffered all the t...",\n \n\tIdentities are revealed and the ...


In [26]:
data.insert(0,'id','')
data['id'] = data.audio_path.str[-10:]
data.head(5)

Unnamed: 0,id,audio_path,transcript,summary
0,000000.mp3,../data/audio\000000.mp3,"\n\tON the north-east coast of Scotland, in th...",\n \n\tThe history of the family of the...
1,000001.mp3,../data/audio\000001.mp3,\n\tALLEYN was no where to be found. The Earl ...,\n \n\tThere is an attack and an impend...
2,000002.mp3,../data/audio\000002.mp3,\n\tTHE Count was walking on the ramparts of t...,\n \n\tMalcolm reveals an important sec...
3,000003.mp3,../data/audio\000003.mp3,\n\tMEANWHILE the Earl remained a solitary pri...,\n \n\tMatilda falls into despair over ...
4,000004.mp3,../data/audio\000004.mp3,"\n\tMARY, in the mean time, suffered all the t...",\n \n\tIdentities are revealed and the ...


In [27]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name='google/flan-t5-base'

T5 = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

### B. Data Preprocessing

In [28]:
from datasets import Dataset
dataset = Dataset.from_pandas(data)

In [29]:
def preprocess_function(examples):
    inputs = examples['transcript']
    targets = examples['summary']
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [30]:
def preprocess_function(examples):
    inputs = [
        f"Summarize the following conversation.\n\n### Input:\n{transcript}\n\n### Summary:\n"
        for transcript in examples['transcript']
    ]
    
    targets = examples['summary']
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [31]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)
split_dataset = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

Map:   0%|          | 0/390 [00:00<?, ? examples/s]



In [32]:
example = tokenized_dataset[0]
input_text = tokenizer.decode(example['input_ids'], skip_special_tokens=True)
summary_text = tokenizer.decode(example['labels'], skip_special_tokens=True)

print("Training Prompt Example:")
print("Input Text:\n", input_text)
print("\nSummary Text:\n", summary_text)

Training Prompt Example:
Input Text:
 Summarize the following conversation. ### Input: ON the north-east coast of Scotland, in the most romantic part of the Highlands, stood the Castle of Athlin; an edifice built on the summit of a rock whose base was in the sea. This pile was venerable from its antiquity, and from its Gothic structure; but more venerable from the virtues which it enclosed. It was the residence of the still beautiful widow, and the children of the noble Earl of Athlin, who was slain by the hand of Malcolm, a neighbouring chief, proud, oppressive, revengeful; and still residing in all the pomp of feudal greatness, within a few miles of the castle of Athlin. Encroachment on the domain of Athlin, was the occasion of the animosity which subsisted between the chiefs. Frequent broils had happened between their clans, in which that of Athlin had generally been victorious. Malcolm, whose pride was touched by the defeat of his people; whose ambition was curbed by the authority,

## Part II - Model Fine-Tuning

### A. Peft Config

In [33]:
import time
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model, PeftModel, TaskType

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

peft_model = get_peft_model(T5, lora_config)

output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'


In [34]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=T5)

In [35]:
rouge = evaluate.load('rouge')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    return result


training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=100,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=10_000,
    eval_steps=10_000,
    logging_dir='./logs',
    logging_steps=200,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)



### B. Training Phase

In [36]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Clearing any existing cache (if applicable) in case we run the code a multiple times and RAM is full
if device.type == "cuda":
    torch.cuda.empty_cache()

In [37]:
peft_trainer.train()

  0%|          | 0/100 [00:00<?, ?it/s]

{'loss': 2.4734, 'grad_norm': 0.55859375, 'learning_rate': 0.0009000000000000001, 'epoch': 0.23}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 2.4270832538604736, 'eval_runtime': 25.6343, 'eval_samples_per_second': 1.521, 'eval_steps_per_second': 0.195, 'epoch': 0.23}
{'loss': 2.4773, 'grad_norm': 0.8828125, 'learning_rate': 0.0008, 'epoch': 0.45}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 2.427884578704834, 'eval_runtime': 25.7856, 'eval_samples_per_second': 1.512, 'eval_steps_per_second': 0.194, 'epoch': 0.45}
{'loss': 2.5531, 'grad_norm': 0.62109375, 'learning_rate': 0.0007, 'epoch': 0.68}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 2.4054486751556396, 'eval_runtime': 25.4291, 'eval_samples_per_second': 1.534, 'eval_steps_per_second': 0.197, 'epoch': 0.68}
{'loss': 2.4133, 'grad_norm': 0.70703125, 'learning_rate': 0.0006, 'epoch': 0.91}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 2.3994390964508057, 'eval_runtime': 25.4482, 'eval_samples_per_second': 1.533, 'eval_steps_per_second': 0.196, 'epoch': 0.91}
{'loss': 2.4172, 'grad_norm': 0.875, 'learning_rate': 0.0005, 'epoch': 1.14}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 2.399038553237915, 'eval_runtime': 30.7231, 'eval_samples_per_second': 1.269, 'eval_steps_per_second': 0.163, 'epoch': 1.14}
{'loss': 2.3328, 'grad_norm': 0.92578125, 'learning_rate': 0.0004, 'epoch': 1.36}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 2.3866186141967773, 'eval_runtime': 25.452, 'eval_samples_per_second': 1.532, 'eval_steps_per_second': 0.196, 'epoch': 1.36}
{'loss': 2.3523, 'grad_norm': 0.7421875, 'learning_rate': 0.0003, 'epoch': 1.59}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 2.3890223503112793, 'eval_runtime': 25.4191, 'eval_samples_per_second': 1.534, 'eval_steps_per_second': 0.197, 'epoch': 1.59}
{'loss': 2.4328, 'grad_norm': 0.87109375, 'learning_rate': 0.0002, 'epoch': 1.82}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 2.379807710647583, 'eval_runtime': 25.2805, 'eval_samples_per_second': 1.543, 'eval_steps_per_second': 0.198, 'epoch': 1.82}
{'loss': 2.4172, 'grad_norm': 0.59765625, 'learning_rate': 0.0001, 'epoch': 2.05}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 2.3669872283935547, 'eval_runtime': 25.324, 'eval_samples_per_second': 1.54, 'eval_steps_per_second': 0.197, 'epoch': 2.05}
{'loss': 2.2898, 'grad_norm': 0.58203125, 'learning_rate': 0.0, 'epoch': 2.27}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 2.364182710647583, 'eval_runtime': 25.3438, 'eval_samples_per_second': 1.539, 'eval_steps_per_second': 0.197, 'epoch': 2.27}
{'train_runtime': 1601.6538, 'train_samples_per_second': 0.499, 'train_steps_per_second': 0.062, 'train_loss': 2.4159375, 'epoch': 2.27}


TrainOutput(global_step=100, training_loss=2.4159375, metrics={'train_runtime': 1601.6538, 'train_samples_per_second': 0.499, 'train_steps_per_second': 0.062, 'total_flos': 1101548300009472.0, 'train_loss': 2.4159375, 'epoch': 2.2727272727272725})

In [38]:
peft_model_path = "./peft-t5-summary"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

('./peft-t5-summary\\tokenizer_config.json',
 './peft-t5-summary\\special_tokens_map.json',
 './peft-t5-summary\\spiece.model',
 './peft-t5-summary\\added_tokens.json',
 './peft-t5-summary\\tokenizer.json')

### C. Evaluation

In [39]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Clearing any existing cache (if applicable) in case we run the code a multiple times and RAM is full
if device.type == "cuda":
    torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(peft_model_path)

original_model = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-base').to(device)
fine_tuned_model = PeftModel.from_pretrained(original_model, peft_model_path).to(device)

In [41]:
ARTICLE_TO_SUMMARIZE = data['transcript'][0]

prompt = f"""

Summarize the following conversation:
{ARTICLE_TO_SUMMARIZE}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).input_ids.to(device)
output_PEFT = tokenizer.decode(fine_tuned_model.model.generate(input_ids, max_new_tokens=100)[0], skip_special_tokens=True)

In [42]:
summary = data['summary'][0]

if len(output_PEFT) > len(summary):
    output_PEFT = output_PEFT[:len(summary)]
else:
    summary = summary[:len(output_PEFT)]

PEFT_model_results = rouge.compute(
    predictions=output_PEFT,
    references=summary,
    use_aggregator=True,
    use_stemmer=True,
)

print("----"*20)
print("PEFT MODEL:")
print(PEFT_model_results)

--------------------------------------------------------------------------------
PEFT MODEL:
{'rouge1': 0.08163265306122448, 'rouge2': 0.0, 'rougeL': 0.08163265306122448, 'rougeLsum': 0.08163265306122448}


In [45]:
output_T5 = tokenizer.decode(T5.generate(input_ids, max_new_tokens=100)[0], skip_special_tokens=True)

In [46]:
dash_line = "----"*15

print(dash_line)
print("ORIGINAL TEXT:")
print(data['transcript'][0])

print(dash_line)
print("HUMAN SUMMARY:")
print(data['summary'][0])

print(dash_line)
print("ORIGINAL MODEL SUMMARY:")
print(output_T5)

print(dash_line)
print("FINE-TUNED MODEL SUMMARY:")
print(output_PEFT)

------------------------------------------------------------
ORIGINAL TEXT:

	ON the north-east coast of Scotland, in the most romantic part of the Highlands, stood the Castle of Athlin; an edifice built on the summit of a rock whose base was in the sea. This pile was venerable from its antiquity, and from its Gothic structure; but more venerable from the virtues which it enclosed. It was the residence of the still beautiful widow, and the children of the noble Earl of Athlin, who was slain by the hand of Malcolm, a neighbouring chief, proud, oppressive, revengeful; and still residing in all the pomp of feudal greatness, within a few miles of the castle of Athlin. Encroachment on the domain of Athlin, was the occasion of the animosity which subsisted between the chiefs. Frequent broils had happened between their clans, in which that of Athlin had generally been victorious. Malcolm, whose pride was touched by the defeat of his people; whose ambition was curbed by the authority, and whos