In [1]:
#%pip install --upgrade pip
#%pip install --disable-pip-version-check \
#    torch==1.13.1 \
#    torchdata==0.5.1 --quiet
#
#%pip install \
#    transformers==4.27.2 \
#    datasets==2.11.0 \
#    evaluate==0.4.0 \
#    rouge_score==0.1.2 \
#    loralib==0.1.1 \
#    peft==0.3.0 --quiet
%pip install evaluate rouge_score loralib peft

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting loralib
  Downloading loralib-0.1.2-py3-none-any.whl.metadata (15 kB)
Collecting peft
  Downloading peft-0.5.0-py3-none-any.whl.metadata (22 kB)
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting absl-py (from rouge_score)
  Downloading absl_py-2.0.0-py3-none-any.whl.metadata (2.3 kB)
Collecting nltk (from rouge_score)
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting accelerate (from peft)
  Downloading accelerate-0.24.1-py3-none-any.whl.metadata (18 kB)
Collecting click (from nltk->rouge_score)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting joblib (f

In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [2]:
huggingface_dataset_name = "knkarthick/dialogsum"

dataset = load_dataset(huggingface_dataset_name)

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name='google/flan-t5-base'
#model_name='google/flan-t5-large'
#model_name='google/flan-t5-xl'dddd
#model_name='google/flan-t5-xxl'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


In [5]:
original_model = original_model.to(device)
index = 200

dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

inputs = tokenizer(prompt, return_tensors='pt').to(device)
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"], 
        max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-------------------------------------------------------------------

In [6]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    
    return example


In [7]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [8]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
        num_rows: 1500
    })
})

In [9]:
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])

In [10]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
})

In [11]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (12460, 2)
Validation: (500, 2)
Test: (1500, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
})


In [12]:
#output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'
output_dir = "./dialogue-summary-training"

training_args = TrainingArguments(
    output_dir=output_dir,
    #learning_rate=1e-5,
    learning_rate=1e-4,
    num_train_epochs=1,
    #num_train_epochs=45,
    #num_train_epochs=15,
    weight_decay=0.01,
    logging_steps=100,
    #max_steps=1
    #max_steps=100
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

In [13]:
trainer.train()
#trainer.train(resume_from_checkpoint=True)

Step,Training Loss
100,9.7978
200,1.4244
300,0.6618
400,0.4216
500,0.3288
600,0.2795
700,0.2649
800,0.2282
900,0.2317
1000,0.2238


TrainOutput(global_step=1558, training_loss=0.9709720244303595, metrics={'train_runtime': 306.8255, 'train_samples_per_second': 40.609, 'train_steps_per_second': 5.078, 'total_flos': 8532076611502080.0, 'train_loss': 0.9709720244303595, 'epoch': 1.0})

In [14]:
instruct_model_path="./dialogue-summary-checkpoint-local"

trainer.model.save_pretrained(instruct_model_path)
tokenizer.save_pretrained(instruct_model_path)

('./dialogue-summary-checkpoint-local/tokenizer_config.json',
 './dialogue-summary-checkpoint-local/special_tokens_map.json',
 './dialogue-summary-checkpoint-local/tokenizer.json')

In [15]:
instruct_model = AutoModelForSeq2SeqLM.from_pretrained("./dialogue-summary-checkpoint-local", torch_dtype=torch.bfloat16)
#instruct_model = AutoModelForSeq2SeqLM.from_pretrained("./flan-dialogue-summary-checkpoint", torch_dtype=torch.bfloat16)
#instruct_model = AutoModelForSeq2SeqLM.from_pretrained("./fine-tune-test/dialogue-summary-checkpoint-local", torch_dtype=torch.bfloat16)

In [16]:
instruct_model = instruct_model.to(device)

In [17]:
#index = 200
index = 40

dialogue = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(device)

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
#Person1# is off of the train.
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
#Person1# tells #Person2# that it's ten to nine by his watch. #Person2# tells #Person1# that #Person2# must catch the nine-thirty train.


In [18]:
rouge = evaluate.load('rouge')

In [19]:
#dialogues = dataset['test'][0:10]['dialogue']
dialogues = dataset['test'][0:20]['dialogue']
#human_baseline_summaries = dataset['test'][0:10]['summary']
human_baseline_summaries = dataset['test'][0:20]['summary']

original_model_summaries = []
instruct_model_summaries = []

for _, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """
    #input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)

    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    instruct_model_summaries.append(instruct_model_text_output)
    
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries))
 
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,#Person1# takes a memo from Ms.W Dawson. #Pers...,#Person1# asks #Person2# to take a dictation f...
1,In order to prevent employees from wasting tim...,Memos to employees.,#Person1# asks #Person2# to take a dictation f...
2,Ms. Dawson takes a dictation for #Person1# abo...,#Person1# is a senior and wants to take a dict...,#Person1# asks #Person2# to take a dictation f...
3,#Person2# arrives late because of traffic jam....,#Person1# is stuck in traffic again. #Person1#...,#Person1# is stuck in traffic again. #Person2#...
4,#Person2# decides to follow #Person1#'s sugges...,#Person2# is finally here. #Person1# is a form...,#Person1# is stuck in traffic again. #Person2#...
5,#Person2# complains to #Person1# about the tra...,#Person1# is finally here.,#Person1# is stuck in traffic again. #Person2#...
6,#Person1# tells Kate that Masha and Hero get d...,#Person1# is a divorcee.,#Person1# tells #Person2# that Masha and Hero ...
7,#Person1# tells Kate that Masha and Hero are g...,#Person1# tells #Person1# that #Person2# is th...,#Person1# tells #Person2# that Masha and Hero ...
8,#Person1# and Kate talk about the divorce betw...,#Person1# and #Person2# are getting divorced.,#Person1# tells #Person2# that Masha and Hero ...
9,#Person1# and Brian are at the birthday party ...,#Person1# is Brian's birthday.,#Person1# tells #Person2# that #Person1# is al...


In [20]:
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)

ORIGINAL MODEL:
{'rouge1': 0.271809963917092, 'rouge2': 0.06325247893213615, 'rougeL': 0.22200634021494214, 'rougeLsum': 0.2222869000073402}
INSTRUCT MODEL:
{'rouge1': 0.3305109704622176, 'rouge2': 0.10548967256427133, 'rougeL': 0.2628220708096669, 'rougeLsum': 0.2640990127221273}


In [21]:
#results = pd.read_csv("data/dialogue-summary-training-results.csv")
results = pd.read_csv("ddl-projects/Generative-AI-with-LLMs-Practices-on-Perlmutter/data/dialogue-summary-training-results.csv")

human_baseline_summaries = results['human_baseline_summaries'].values
original_model_summaries = results['original_model_summaries'].values
instruct_model_summaries = results['instruct_model_summaries'].values

print(len(instruct_model_summaries))

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)


instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)

1500
ORIGINAL MODEL:
{'rouge1': 0.2334158581572823, 'rouge2': 0.07603964187010573, 'rougeL': 0.20145520923859048, 'rougeLsum': 0.20145899339006135}
INSTRUCT MODEL:
{'rouge1': 0.42161291557556113, 'rouge2': 0.18035380596301792, 'rougeL': 0.3384439349963909, 'rougeLsum': 0.33835653595561666}


In [22]:
print("Absolute percentage improvement of INSTRUCT MODEL over HUMAN BASELINE")

improvement = (np.array(list(instruct_model_results.values())) - np.array(list(original_model_results.values())))
for key, value in zip(instruct_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of INSTRUCT MODEL over HUMAN BASELINE
rouge1: 18.82%
rouge2: 10.43%
rougeL: 13.70%
rougeLsum: 13.69%


In [23]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

In [24]:
peft_model = get_peft_model(original_model, 
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%


In [25]:
output_dir = "./peft-dialogue-summary-training"

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=2,
    logging_steps=500,
    #max_steps=1    
)
    
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

In [26]:
peft_trainer.train()

Step,Training Loss
500,0.1259
1000,0.1063
1500,0.103
2000,0.1107
2500,0.0979
3000,0.0967


TrainOutput(global_step=3116, training_loss=0.1062980916288912, metrics={'train_runtime': 580.2135, 'train_samples_per_second': 42.95, 'train_steps_per_second': 5.37, 'total_flos': 1.733507439132672e+16, 'train_loss': 0.1062980916288912, 'epoch': 2.0})

In [27]:
peft_model_path="./peft-dialogue-summary-checkpoint-local"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

('./peft-dialogue-summary-checkpoint-local/tokenizer_config.json',
 './peft-dialogue-summary-checkpoint-local/special_tokens_map.json',
 './peft-dialogue-summary-checkpoint-local/tokenizer.json')

In [28]:
from peft import PeftModel, PeftConfig

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
#peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large", torch_dtype=torch.bfloat16)
#tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")

peft_model = PeftModel.from_pretrained(peft_model_base, 
                                       './peft-dialogue-summary-checkpoint-local/',  
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

In [29]:
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 0
all model parameters: 251116800
percentage of trainable model parameters: 0.00%


In [30]:
peft_model = peft_model.to(device)

index = 200
dialogue = dataset['test'][index]['dialogue']
baseline_human_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(device)

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')
print(dash_line)
print(f'PEFT MODEL: {peft_model_text_output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
#Person1# suggests adding a painting program to #Person2#'s software and adding a painting program to make up flyers and banners for advertising. #Person2# also wants to upgrade the hardware because it's pretty outdated now. #Person1# suggests adding a CD-ROM drive too.
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
#Person1# suggests adding a painting program to his software. #Person2# suggests adding a painting program to his software. #Person2# suggests adding a CD-ROM drive.
---------------------------------------------------------------------------------------------------
PEFT MODEL: #Person1

In [31]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for idx, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """
    
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    input_ids = input_ids.to(device)

    human_baseline_text_output = human_baseline_summaries[idx]
    
    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    original_model_summaries.append(original_model_text_output)
    instruct_model_summaries.append(instruct_model_text_output)
    peft_model_summaries.append(peft_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries, peft_model_summaries))
 
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries', 'peft_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries,peft_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,#Person1# asks Ms. Dawson to take a dictation ...,#Person1# asks #Person2# to take a dictation f...,Ms. Dawson tells #Person1# the new policy appl...
1,In order to prevent employees from wasting tim...,#Person1# takes dictation for Ms. Dawson. Ms. ...,#Person1# asks #Person2# to take a dictation f...,Ms. Dawson tells #Person1# the new policy appl...
2,Ms. Dawson takes a dictation for #Person1# abo...,#Person1# asks Ms. Dawson to take a dictation ...,#Person1# asks #Person2# to take a dictation f...,Ms. Dawson tells #Person1# the new policy appl...
3,#Person2# arrives late because of traffic jam....,#Person2# got stuck in traffic and #Person2# t...,#Person1# is stuck in traffic again. #Person2#...,#Person2# got stuck in traffic. #P1# suggests ...
4,#Person2# decides to follow #Person1#'s sugges...,#Person2# tells #Person1# that #Person2# got s...,#Person1# is stuck in traffic again. #Person2#...,#Person2# got stuck in traffic. #P1# suggests ...
5,#Person2# complains to #Person1# about the tra...,#Person2# gets stuck in traffic and thinks #Pe...,#Person1# is stuck in traffic again. #Person2#...,#Person2# got stuck in traffic. #P1# suggests ...
6,#Person1# tells Kate that Masha and Hero get d...,Kate tells #Person1# that Masha and Hero are g...,#Person1# tells #Person2# that Masha and Hero ...,Kate tells #P2# Masha and Hero are getting div...
7,#Person1# tells Kate that Masha and Hero are g...,Kate tells #Person2# about the divorce of Mash...,#Person1# tells #Person2# that Masha and Hero ...,Kate tells #P2# Masha and Hero are getting div...
8,#Person1# and Kate talk about the divorce betw...,Kate tells Kate Masha and Hero are getting div...,#Person1# tells #Person2# that Masha and Hero ...,Kate tells #P2# Masha and Hero are getting div...
9,#Person1# and Brian are at the birthday party ...,Brian is celebrating his birthday and Brian in...,#Person1# tells #Person2# that #Person1# is al...,Brian is celebrating his birthday. He has a da...


In [32]:
rouge = evaluate.load('rouge')

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)
print('PEFT MODEL:')
print(peft_model_results)

ORIGINAL MODEL:
{'rouge1': 0.408332685870745, 'rouge2': 0.13002963539728246, 'rougeL': 0.2774424428066578, 'rougeLsum': 0.2790249850306735}
INSTRUCT MODEL:
{'rouge1': 0.3345959541711421, 'rouge2': 0.12333337761902405, 'rougeL': 0.2504438205904372, 'rougeLsum': 0.2531080135403444}
PEFT MODEL:
{'rouge1': 0.35177716445488216, 'rouge2': 0.09902176655001385, 'rougeL': 0.23439288945469816, 'rougeLsum': 0.2347627108061042}


In [33]:
human_baseline_summaries = results['human_baseline_summaries'].values
original_model_summaries = results['original_model_summaries'].values
instruct_model_summaries = results['instruct_model_summaries'].values
peft_model_summaries     = results['peft_model_summaries'].values

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)
print('PEFT MODEL:')
print(peft_model_results)

ORIGINAL MODEL:
{'rouge1': 0.2334158581572823, 'rouge2': 0.07603964187010573, 'rougeL': 0.20145520923859048, 'rougeLsum': 0.20145899339006135}
INSTRUCT MODEL:
{'rouge1': 0.42161291557556113, 'rouge2': 0.18035380596301792, 'rougeL': 0.3384439349963909, 'rougeLsum': 0.33835653595561666}
PEFT MODEL:
{'rouge1': 0.40810631575616746, 'rouge2': 0.1633255794568712, 'rougeL': 0.32507074586565354, 'rougeLsum': 0.3248950182867091}


In [34]:
print("Absolute percentage improvement of PEFT MODEL over HUMAN BASELINE")

improvement = (np.array(list(peft_model_results.values())) - np.array(list(original_model_results.values())))
for key, value in zip(peft_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of PEFT MODEL over HUMAN BASELINE
rouge1: 17.47%
rouge2: 8.73%
rougeL: 12.36%
rougeLsum: 12.34%


In [35]:
print("Absolute percentage improvement of PEFT MODEL over INSTRUCT MODEL")

improvement = (np.array(list(peft_model_results.values())) - np.array(list(instruct_model_results.values())))
for key, value in zip(peft_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of PEFT MODEL over INSTRUCT MODEL
rouge1: -1.35%
rouge2: -1.70%
rougeL: -1.34%
rougeLsum: -1.35%


In [36]:
#output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'
output_dir = "./dialogue-summary-training-instruct"

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    #num_train_epochs=45,
    #num_train_epochs=15,
    weight_decay=0.01,
    logging_steps=10,
    #max_steps=1
    max_steps=100
)

trainer = Trainer(
    #model=original_model,
    model = instruct_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)
trainer.train()

Step,Training Loss
10,0.2199
20,0.2479
30,0.213
40,0.2253
50,0.1963
60,0.2138
70,0.221
80,0.2266
90,0.2145
100,0.2321


TrainOutput(global_step=100, training_loss=0.22103515625, metrics={'train_runtime': 18.8542, 'train_samples_per_second': 42.431, 'train_steps_per_second': 5.304, 'total_flos': 547805881958400.0, 'train_loss': 0.22103515625, 'epoch': 0.06})