## Module Imports

In [None]:
!pip install datasets

In [None]:
!pip install evaluate --quiet

In [None]:
!pip install accelerate -U

In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np
import accelerate

In [None]:
import locale


def _always_utf8():
  """Stand-in for locale.getpreferredencoding that always reports UTF-8."""
  return "UTF-8"


# Swap out the stdlib lookup so every later caller is told the preferred
# encoding is UTF-8.
# NOTE(review): presumably a workaround for an environment whose default
# locale is not UTF-8 — confirm it is still needed.
locale.getpreferredencoding = _always_utf8

In [None]:
!pip install rouge-score --quiet

## Loading the Dataset and FLAN-T5 Model

In [None]:
# Load the DialogSum corpus; later cells read its 'train', 'validation' and
# 'test' splits and the 'dialogue' / 'summary' fields of each example.
dataset = load_dataset(path="knkarthick/dialogsum")

In [None]:
# Checkpoint identifier shared by the tokenizer and the model weights.
model_name = "google/flan-t5-base"

# The two loads are independent of each other; the weights are requested in
# bfloat16.
tokenizer = AutoTokenizer.from_pretrained(model_name)
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)

In [None]:
def get_nof_trainable_parameters(model):
  """Summarise how many of a model's parameters are trainable.

  Args:
    model: any object exposing ``named_parameters()`` that yields
      ``(name, param)`` pairs, where each ``param`` provides ``numel()`` and
      a ``requires_grad`` flag.

  Returns:
    A three-line string with the trainable parameter count, the total
    parameter count, and the trainable share as a percentage. A model with
    no parameters reports 0.0 percent instead of raising ZeroDivisionError.
  """
  trainable_model_params = 0
  all_model_parameters = 0
  for _, param in model.named_parameters():
    count = param.numel()
    all_model_parameters += count
    if param.requires_grad:
      trainable_model_params += count

  # Guard the division: a parameter-free model would otherwise crash here.
  if all_model_parameters:
    percentage = trainable_model_params*100/all_model_parameters
  else:
    percentage = 0.0
  return f"trainable model parameters: {trainable_model_params} \n all model parameters: {all_model_parameters} \n percentage of trainable parameters: {percentage}"

# Report the parameter counts of the freshly loaded model; as the last
# expression of the cell, the returned string is displayed as the cell output.
get_nof_trainable_parameters(original_model)

'trainable model parameters: 247577856 \n all model parameters: 247577856 \n percentage of trainable parameters: 100.0'

## Performing Dialogue Summarization with Prompt Engineering - Zero Shot

In [None]:
index = 100

# Pick one test example. `summary` is the human-written reference and is only
# used for display; it must never be part of the model input.
test_example = dataset['test'][index]
dialogue = test_example['dialogue']
summary = test_example['summary']

# Zero-shot prompt: instruction + dialogue, ending at "Summary:" so the model
# has to produce the summary itself. (Interpolating the reference summary here
# would leak the answer into the input.)
prompt = f"""
Summarize the following conversation:
{dialogue}

Summary:
"""

inputs = tokenizer(prompt, return_tensors='pt')
generated_ids = original_model.generate(inputs["input_ids"], max_new_tokens=200)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# 99 dashes, used as a visual separator here and in later cells.
dash_line = "-" * 99
print(dash_line)
print(f"INPUT_PROMPT: \n {prompt}")
print(dash_line)
print(f"BASELINE HUMAN SUMMARY: \n {summary} \n")
print(dash_line)
print(f"MODEL_GENERATION - ZERO SHOT: \n {output}")

---------------------------------------------------------------------------------------------------
INPUT_PROMPT: 
 
Summarize the following conversation:
#Person1#: OK, that's a cut! Let's start from the beginning, everyone.
#Person2#: What was the problem that time?
#Person1#: The feeling was all wrong, Mike. She is telling you that she doesn't want to see you any more, but I want to get more anger from you. You're acting hurt and sad, but that's not how your character would act in this situation.
#Person2#: But Jason and Laura have been together for three years. Don't you think his reaction would be one of both anger and sadness?
#Person1#: At this point, no. I think he would react the way most guys would, and then later on, we would see his real feelings.
#Person2#: I'm not so sure about that.
#Person1#: Let's try it my way, and you can see how you feel when you're saying your lines. After that, if it still doesn't feel right, we can try something else.

Summary:
#Person1# and Mike

## Performing Dialogue Summarization with Full Fine-tuning

### Preprocess Data

In [None]:
def tokenize_function(example):
  """Turn a batch of dialogues/summaries into model inputs and labels.

  Intended for ``dataset.map(..., batched=True)``: ``example['dialogue']`` and
  ``example['summary']`` are lists of strings.

  Each dialogue is wrapped in an instruction prompt and tokenized into
  ``input_ids``; each summary is tokenized into ``labels``. Both are padded to
  the maximum length and truncated. Padding positions in ``labels`` are set to
  -100 so that the loss ignores them instead of training the model to emit
  pad tokens.

  Returns:
    The same batch dict with ``input_ids`` and ``labels`` added.
  """
  start = "Summarize the following conversation.\n\n"
  end = "\n\nSummary:"
  prompt = [start + dialogue + end for dialogue in example['dialogue']]
  example['input_ids'] = tokenizer(prompt, return_tensors='pt', padding='max_length', truncation=True).input_ids

  labels = tokenizer(example['summary'], return_tensors='pt', padding='max_length', truncation=True).input_ids
  # -100 is the index ignored by the cross-entropy loss used for seq2seq
  # training; without this, most of the loss comes from padding positions.
  labels[labels == tokenizer.pad_token_id] = -100
  example['labels'] = labels

  return example

# Tokenize every split in batches, then drop the raw text and metadata
# columns so that only the columns added by tokenize_function remain.
tokenized_dataset = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(['id', 'topic', 'dialogue', 'summary'])
)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

#### Subsampling

In [None]:
def _is_every_hundredth(example, index):
  """Keep only rows whose position in the split is a multiple of 100."""
  return index%100 == 0


# NOTE: this rebinds `tokenized_dataset`, so re-running the cell subsamples
# the already-subsampled data again.
tokenized_dataset = tokenized_dataset.filter(_is_every_hundredth, with_indices=True)

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
# Show (rows, columns) for each split after subsampling.
print("Shape of datasets:\n")
for label, split in (("Training", "train"), ("Testing", "test"), ("Validation", "validation")):
  print(f"{label}: {tokenized_dataset[split].shape}")

Shape of datasets:

Training: (125, 2)
Testing: (15, 2)
Validation: (5, 2)


In [None]:
# Print the split names, remaining features and row counts as a sanity check.
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})


## Initialise Trainer for Training the LLM

The training code below runs for only a single optimization step (`max_steps=1`, which takes precedence over `num_train_epochs`) on a subsample of the dataset due to resource constraints. When trained on a machine with more memory (preferably 32 GB of RAM, as provided by AWS), the performance should be much better and on par with benchmark results.

In [None]:
# Directory where checkpoints and the final model are written.
output_dir = "/content/model"

# Deliberately tiny run: max_steps=1 stops training after a single optimizer
# step. Per the transformers documentation a positive max_steps overrides
# num_train_epochs, so the epoch setting below has no practical effect here.
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    max_steps=1,
    logging_steps=1,
)

# Fine-tunes `original_model` itself on the subsampled train split.
trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

In [None]:
# Run the training loop configured above; the returned TrainOutput is shown as
# the cell output.
trainer.train()

Step,Training Loss
1,51.5


TrainOutput(global_step=1, training_loss=51.5, metrics={'train_runtime': 2.9665, 'train_samples_per_second': 2.697, 'train_steps_per_second': 0.337, 'total_flos': 5478058819584.0, 'train_loss': 51.5, 'epoch': 0.06})

In [None]:
# Persist the trained model to `output_dir` so it can be reloaded below.
trainer.save_model(output_dir)

In [None]:
# Reload the checkpoint written by trainer.save_model(output_dir), reusing the
# same path variable instead of repeating the literal.
instruct_model = AutoModelForSeq2SeqLM.from_pretrained(output_dir, torch_dtype=torch.bfloat16)

# from_pretrained leaves the model on the CPU by default, while the evaluation
# cells send their inputs to the GPU. Put the reloaded model on the same
# device as `original_model` so both can consume the same input tensors.
instruct_model = instruct_model.to(original_model.device)

## Comparing and Evaluating Model using ROUGE

In [None]:
index = 100

test_example = dataset['test'][index]
dialogue = test_example['dialogue']
baseline_summary = test_example['summary']

# The prompt ends at "Summary:" so each model must write the summary itself.
# The human reference is shown below for comparison only; it is not part of
# the model input.
prompt = f"""
Summarize the following conversation:
{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors='pt').input_ids
generation_config = GenerationConfig(max_new_tokens=200, num_beams=1)

# Move the inputs to whichever device each model lives on instead of assuming
# 'cuda', so the cell works for CPU and GPU models alike.
original_model_outputs = original_model.generate(input_ids=input_ids.to(original_model.device), generation_config=generation_config)
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids.to(instruct_model.device), generation_config=generation_config)
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f"INPUT_PROMPT: \n {prompt}")
print(dash_line)
print(f"BASELINE HUMAN SUMMARY: \n {baseline_summary} \n")
print(dash_line)
print(f"ORIGINAL MODEL: \n {original_model_text_output}")
print(dash_line)
print(f"INSTRUCT MODEL: \n {instruct_model_text_output}")

---------------------------------------------------------------------------------------------------
INPUT_PROMPT: 
 
Summarize the following conversation:
#Person1#: OK, that's a cut! Let's start from the beginning, everyone.
#Person2#: What was the problem that time?
#Person1#: The feeling was all wrong, Mike. She is telling you that she doesn't want to see you any more, but I want to get more anger from you. You're acting hurt and sad, but that's not how your character would act in this situation.
#Person2#: But Jason and Laura have been together for three years. Don't you think his reaction would be one of both anger and sadness?
#Person1#: At this point, no. I think he would react the way most guys would, and then later on, we would see his real feelings.
#Person2#: I'm not so sure about that.
#Person1#: Let's try it my way, and you can see how you feel when you're saying your lines. After that, if it still doesn't feel right, we can try something else.

Summary:
#Person1# and Mike

In [None]:
dialogues = dataset['test'][0:10]['dialogue']
baseline_summaries = dataset['test'][0:10]['summary']

# Built once and shared by every generate() call below.
generation_config = GenerationConfig(max_new_tokens=200)


def _generate_summary(model, input_ids, generation_config):
  """Generate with `model` on its own device and decode the first sequence."""
  output_ids = model.generate(input_ids=input_ids.to(model.device), generation_config=generation_config)
  return tokenizer.decode(output_ids[0], skip_special_tokens=True)


original_model_summaries = []
instruct_model_summaries = []

for dialogue in dialogues:
  prompt = f"""
Summarize the following conversation:
{dialogue}

Summary:
"""
  # Tokenize once per dialogue; each model gets the tensor on its own device.
  input_ids = tokenizer(prompt, return_tensors='pt').input_ids

  original_model_summaries.append(_generate_summary(original_model, input_ids, generation_config))
  instruct_model_summaries.append(_generate_summary(instruct_model, input_ids, generation_config))

zipped_summaries = list(zip(baseline_summaries, original_model_summaries, instruct_model_summaries))

# One row per dialogue: human reference next to both model outputs.
df = pd.DataFrame(zipped_summaries, columns=['Human Baseline Summary', 'Original Model Summary', 'Instruct Model Summary'])
df

Unnamed: 0,Human Baseline Summary,Original Model Summary,Instruct Model Summary
0,Ms. Dawson helps #Person1# to write a memo to ...,#Person1#: This memo should go out as an intra...,#Person1#: I need to take a dictation for you....
1,In order to prevent employees from wasting tim...,The memo is to be sent to all employees by thi...,#Person1#: I need to take a dictation for you....
2,Ms. Dawson takes a dictation for #Person1# abo...,Employees who use instant messaging will be su...,#Person1#: I need to take a dictation for you....
3,#Person2# arrives late because of traffic jam....,The driver of the car is a man who is a man wh...,The traffic jam at the Carrefour intersection ...
4,#Person2# decides to follow #Person1#'s sugges...,"#Person1: It's a long, long trip. #Person2: It...",The traffic jam at the Carrefour intersection ...
5,#Person2# complains to #Person1# about the tra...,#Person1#: I'm finally here. #Person2#: I got ...,The traffic jam at the Carrefour intersection ...
6,#Person1# tells Kate that Masha and Hero get d...,"Kate, Masha and Hero are getting divorced.",Masha and Hero are getting divorced.
7,#Person1# tells Kate that Masha and Hero are g...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
8,#Person1# and Kate talk about the divorce betw...,#Person1: Masha and Hero are getting divorced....,Masha and Hero are getting divorced.
9,#Person1# and Brian are at the birthday party ...,Brian's birthday is today.,"#Person1#: Happy Birthday, Brian. #Person2#: I..."


In [None]:
# Load the ROUGE metric used to score the generated summaries below.
rouge = evaluate.load('rouge')

In [None]:
def _rouge_against_baseline(predictions):
  """Score predictions against the same number of leading human summaries."""
  return rouge.compute(
      predictions=predictions,
      references=baseline_summaries[0:len(predictions)],
      use_aggregator=True,
      use_stemmer=True
  )


original_model_results = _rouge_against_baseline(original_model_summaries)
instruct_model_results = _rouge_against_baseline(instruct_model_summaries)

# Print both result dicts under their headings, original model first.
for heading, results in (("ORIGINAL MODEL:", original_model_results), ("INSTRUCT MODEL:", instruct_model_results)):
  print(heading)
  print(results)

ORIGINAL MODEL:
{'rouge1': 0.2668182071603781, 'rouge2': 0.09052095573834704, 'rougeL': 0.23027698838052996, 'rougeLsum': 0.23345880656234813}
INSTRUCT MODEL:
{'rouge1': 0.30356701280839216, 'rouge2': 0.12987050662427171, 'rougeL': 0.2531638942433545, 'rougeLsum': 0.2565751595286828}


In [None]:
# The difference below is instruct model minus original model, so the heading
# names the original model as the reference point (not the human baseline).
print("Absolute Percentage Improvement of Instruct Model over Original Model")

# Subtract metric by metric using the key, rather than relying on both result
# dicts listing their values in the same order.
improvement = np.array([instruct_model_results[key] - original_model_results[key] for key in instruct_model_results])
for key, value in zip(instruct_model_results.keys(), improvement):
  print(f"{key}: {value*100:.2f}%")

Absolute Percentage Improvement of Instruct Model over Human Baseline
rouge1: 3.67%
rouge2: 3.93%
rougeL: 2.29%
rougeLsum: 2.31%
