### Install Libraries

In [27]:
# %pip install transformers datasets evaluate accelerate
# %pip install torch torchdata
# %pip intsall peft
# %pip install rouge_score
# %pip install loralib

In [1]:
import pandas as pd
import numpy as np
import gc
from datetime import date
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig, TrainingArguments, Trainer
import torch
import evaluate
from peft import LoraConfig, get_peft_model, TaskType, PeftModel

In [2]:
# training directory
DIR_TRAIN = f'./peft/train/{date.today().strftime("%Y-%m-%d")}/'
DIR_MODEL = "./peft/models/"

### Select the Fine Tuning Dataset

In [3]:
hf_dataset = 'knkarthick/dialogsum'
dataset = load_dataset(hf_dataset)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

### Select the Foundational Model

In [4]:
model_name = 'google/flan-t5-base'
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer =AutoTokenizer.from_pretrained(model_name)

In [5]:
# config = GenerationConfig(max_new_tokens = 50, do_sample = False)
# config = GenerationConfig(max_new_tokens = 50, do_sample = True,temperature = 0.1)

In [6]:
def prepare_prompt(dialogue):
    """"""
    prompt = "Summarize the following conversation:\n\n"
    prompt += f"Conversation: << {dialogue} >> "
    prompt += "\n\n Summary:"
    return prompt

Summarize the following conversation:

Conversation: << this is a test prompt >> 

 Summary:


In [7]:
def generate_summaries(data, model, tokenizer, indexes):
    """"""
    for idx in indexes:
        dialogue = data['test'][idx]['dialogue']
        human_summary = data['test'][idx]['summary']

        # tokenize the input dialogue into tokens
        tokens = tokenizer(prepare_prompt(dialogue), return_tensors='pt')

        # get the model summary, decode from tokens back to text
        genai_summary = tokenizer.decode(
            model.generate(
                inputs=tokens['input_ids'],
                max_new_tokens = 50,
            )[0],
            skip_special_tokens = True
        )

        # print the summaries
        print(f"Dialogue:{idx}")
        print("="*100)
        print(dialogue)
        print("\nHuman Summary",'-'*88)
        print(human_summary,'\n')
        print("GenAI Summary",'-'*88)
        print(genai_summary)
        print("="*100,'\n')

generate_summaries(dataset,base_model,tokenizer,[40,200])

Dialogue:40
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

Human Summary ----------------------------------------------------------------------------------------
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time. 

GenAI Summary ----------------------------------------------------------------------------------------
The train is about to leave, but Tom is late.

Dialogue:200
#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: 

### PEFT/LoRA


In [8]:
def preprocess(example):
    """"""
    prompt_start = "Summarize the following conversation: \n\n"
    prompt_end = "\n\nSummary:"
    prompt = [prompt_start + d + prompt_end for d in example['dialogue']]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors='pt').input_ids
    example['labels'] = tokenizer(example['summary'], padding="max_length",truncation=True, return_tensors='pt').input_ids

    return example

In [9]:

# preprocess the data (prepare the prompts and tokenize the inputs)
tokenized_datasets = dataset.map(preprocess, batched=True)
tokenized_datasets =tokenized_datasets.remove_columns(['id','dialogue','summary','topic'])
print(tokenized_datasets)

# Filter to speed up training on CPU (keep every 100th observation)
tokenized_datasets = tokenized_datasets.filter(lambda example, index:index % 100 ==0, with_indices=True)
print(tokenized_datasets)


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
})


Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})


In [10]:
lora_config = LoraConfig(
    r = 32,
    lora_alpha=32,
    target_modules = ["q","v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

peft_model = get_peft_model(base_model, lora_config)

In [12]:
gc.collect()
config_training = TrainingArguments(
    output_dir=DIR_TRAIN,
    auto_find_batch_size=True,
    learning_rate=0.003,
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1
)
trainer_peft = Trainer(
    model=peft_model,
    args=config_training,
    train_dataset=tokenized_datasets['train']
)

# train
trainer_peft.train()
# trainer_peft.model.save_pretrained(DIR_MODEL)
# tokenizer.save_pretrained(DIR_MODEL)


max_steps is given, it will override any value given in num_train_epochs


In [None]:
trainer_peft.model.save_pretrained(DIR_MODEL)
tokenizer.save_pretrained(DIR_MODEL)

In [None]:
peft_base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)