# Fine Tuning LLM
### Install Libraries

In [None]:
%pip install transformers datasets evaluate accelerate
%pip install torch torchdata
%pip install peft
%pip install loralib
%pip install bert_score
%pip install rouge_score

In [None]:
import pandas as pd
import numpy as np
import gc
from datetime import date
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig, TrainingArguments, Trainer
import torch
import evaluate
from peft import LoraConfig, get_peft_model, TaskType, PeftModel, PeftConfig

### Config

In [None]:
# output directory handed to TrainingArguments for the fine-tuning run
DIR_TRAIN = "./peft/train/"
# directory where the trained PEFT model and the tokenizer are saved
DIR_MODEL = "./peft/models/"

### Fine Tuning Dataset

In [None]:
# download the dialogue-summarization dataset; records carry 'dialogue' and
# 'summary' fields that the rest of the notebook reads
dataset = load_dataset('knkarthick/dialogsum')
# bare expression so the notebook cell displays the loaded dataset
dataset

### Foundational Model

In [None]:
# checkpoint name shared by the model weights and the tokenizer
model_name = 'google/flan-t5-base'
# load the seq2seq model, requesting bfloat16 weights
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
# tokenizer belonging to the same checkpoint
tokenizer =AutoTokenizer.from_pretrained(model_name)

In [None]:
def prepare_prompt(dialogue):
    """Wrap a dialogue in the summarization instruction prompt.

    Args:
        dialogue: Conversation text to embed in the prompt.

    Returns:
        The prompt string: the instruction line, the dialogue enclosed in
        ``<<`` / ``>>`` markers, and a trailing ``Summary:`` cue.
    """
    return (
        "Summarize the following conversation:\n\n"
        f"Conversation: << {dialogue} >> "
        "\n\n Summary:"
    )

In [None]:
def generate_summaries(data, model, tokenizer, indexes, verbose=True,
                       split='test', max_new_tokens=200):
    """Generate model summaries for selected dialogues, paired with the human ones.

    Args:
        data: Mapping from split name to records; every record must provide
            'dialogue' and 'summary' entries.
        model: Model exposing ``parameters()`` and ``generate()``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        indexes: Iterable of record positions within the chosen split.
        verbose: When true, print the dialogue and both summaries.
        split: Name of the split to read records from (default 'test').
        max_new_tokens: Upper bound on generated tokens per summary.

    Returns:
        A list with one dict per index, holding the keys 'index', 'human'
        and 'genai'.
    """
    # Inputs have to sit on the same device as the weights: once a Trainer has
    # moved the model to an accelerator, CPU tensors would make generate() fail.
    device = next(model.parameters()).device
    records = data[split]

    summaries = []
    for idx in indexes:
        record = records[idx]
        dialogue = record['dialogue']
        human_summary = record['summary']

        # tokenize the prompt and keep every returned tensor (input ids and,
        # when the tokenizer provides one, the attention mask)
        encoded = tokenizer(prepare_prompt(dialogue), return_tensors='pt')
        inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

        # pure inference: no autograd bookkeeping needed
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

        # decode the first (only) generated sequence back to text
        genai_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # accumulate summaries
        summaries.append(
            {'index': idx, 'human': human_summary, 'genai': genai_summary}
        )

        # optional print
        if verbose:
            print(f"Dialogue:{idx}")
            print("="*100)
            print(dialogue)
            print("\nHuman Summary",'-'*88)
            print(human_summary,'\n')
            print("GenAI Summary",'-'*88)
            print(genai_summary)
            print("="*100,'\n')
    return summaries

# sample generations from the base model: test records 40 and 200, printed verbosely
summaries = generate_summaries(dataset,base_model,tokenizer,[40,200],True)

### Preprocessing

In [None]:
def preprocess(example):
    """Tokenize a batch of dialogues and summaries for seq2seq fine-tuning.

    Meant to be used with ``dataset.map(..., batched=True)``, so
    ``example['dialogue']`` and ``example['summary']`` are lists of strings.

    Args:
        example: Batch mapping with 'dialogue' and 'summary' lists.

    Returns:
        The same mapping with three added entries:
        'input_ids' and 'attention_mask' for the prompt-wrapped dialogues, and
        'labels' holding the summary token ids with padding positions set
        to -100.
    """
    # Use the same prompt builder as inference so the model is tuned on the
    # exact prompt format it is later evaluated with.
    prompts = [prepare_prompt(dialogue) for dialogue in example['dialogue']]

    model_inputs = tokenizer(prompts, padding="max_length", truncation=True)
    example['input_ids'] = model_inputs['input_ids']
    # keep the mask so the encoder does not attend to padding tokens
    example['attention_mask'] = model_inputs['attention_mask']

    target_ids = tokenizer(
        example['summary'], padding="max_length", truncation=True
    )['input_ids']

    # -100 is the index ignored by the loss; without it the model is also
    # trained to predict the padding that fills most of each label row.
    pad_id = tokenizer.pad_token_id
    example['labels'] = [
        [token if token != pad_id else -100 for token in row]
        for row in target_ids
    ]

    return example

In [None]:
# Run preprocess() over every split in batches, then drop the raw columns so
# only the fields produced by preprocess() remain.
tokenized_datasets = dataset.map(preprocess, batched=True).remove_columns(
    ['id', 'dialogue', 'summary', 'topic']
)
print(tokenized_datasets)

# Subsample to speed up training on CPU: keep rows whose position is a
# multiple of 100.
tokenized_datasets = tokenized_datasets.filter(
    lambda _row, position: position % 100 == 0,
    with_indices=True,
)
print(tokenized_datasets)

## PEFT
#### LoRA Config

In [None]:
# LoRA hyperparameters: rank-32 adapters on the modules named "q" and "v"
# (the attention query/value projections in T5), no bias training.
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# get_peft_model injects the adapter layers into the model object it receives
# (in place). Wrap a freshly loaded copy of the checkpoint so `base_model`
# stays an untouched baseline for adapter loading and base-vs-tuned comparison.
peft_model = get_peft_model(
    AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16),
    lora_config,
)

### Training Config

In [None]:
# Trainer hyperparameters; run output goes under DIR_TRAIN
config_training = TrainingArguments(
    output_dir=DIR_TRAIN,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    # NOTE(review): max_steps is also set below; per the TrainingArguments
    # docs a positive max_steps takes precedence over num_train_epochs —
    # confirm which limit is intended
    num_train_epochs=100,
    logging_steps=1,
    max_steps=100
)
# train the PEFT-wrapped model on the (subsampled) train split only;
# no evaluation dataset is passed
trainer_peft = Trainer(
    model=peft_model,
    args=config_training,
    train_dataset=tokenized_datasets['train']
)

# train
trainer_peft.train()
# save the trained PEFT model and the tokenizer side by side in DIR_MODEL
trainer_peft.model.save_pretrained(DIR_MODEL)
tokenizer.save_pretrained(DIR_MODEL)

### Merge Models

In [None]:
# Attach the saved LoRA adapter to the base checkpoint for inference.
# - The adapter is read from DIR_MODEL, the directory it was saved to, instead
#   of an absolute path that only resolves when the notebook runs from /content.
# - A fresh copy of the checkpoint is loaded so the adapter is applied to
#   unmodified pretrained weights, whatever happened to other model objects
#   earlier in the session. The dtype is selected here, on the base weights.
tuned_model = PeftModel.from_pretrained(
    AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16),
    DIR_MODEL,
    is_trainable=False,
)

In [None]:
# sample generations from the tuned model: test records 40 and 200, printed verbosely
generate_summaries(dataset,tuned_model,tokenizer,[40,200],True)

### Evaluate Model Performance

In [None]:
def evaluate_models(summaries, eval_metric='bert', aggregate=True):
    """Score generated summaries against their human references.

    Args:
        summaries: Sequence of dicts holding a 'genai' entry (prediction) and
            a 'human' entry (reference).
        eval_metric: Either 'rouge' or 'bert'.
        aggregate: For 'rouge', forwarded as ``use_aggregator``. For 'bert',
            when true the per-example F1 values are averaged.

    Returns:
        'rouge': whatever the ROUGE metric's ``compute`` returns for rougeL.
        'bert': the mean F1 when ``aggregate`` is true, otherwise the raw
        result of the BERTScore ``compute`` call.

    Raises:
        ValueError: If ``eval_metric`` is neither 'rouge' nor 'bert'.
    """
    # Fail fast on an unknown metric, before any metric is loaded.
    # ValueError is still an Exception, so existing broad handlers keep working.
    if eval_metric not in ('rouge', 'bert'):
        raise ValueError("eval_metric must be bert or rouge!")

    predictions = [s['genai'] for s in summaries]
    references = [s['human'] for s in summaries]

    # rouge scores
    if eval_metric == 'rouge':
        rouge = evaluate.load('rouge')
        return rouge.compute(
            predictions=predictions,
            references=references,
            use_aggregator=aggregate,
            use_stemmer=True,
            rouge_types=['rougeL'],
        )

    # BERT scores
    bert = evaluate.load("bertscore")
    scores = bert.compute(
        predictions=predictions,
        references=references,
        lang='en',
    )
    if aggregate:
        scores = np.mean(scores['f1'])
    return scores

In [None]:
# generate summaries for sample test data (silently, verbose=False) with both models
test_indexes = [40,100,150,200]
summaries_base_model = generate_summaries(dataset, base_model, tokenizer, test_indexes,False)
summaries_tuned_model = generate_summaries(dataset, tuned_model, tokenizer, test_indexes,False)

# Evaluate the performance vs the human labels (mean BERTScore F1 per model)
# base model:
print(evaluate_models(summaries_base_model,'bert',True))

# tuned model
print(evaluate_models(summaries_tuned_model,'bert',True))
