# Building a Summarizer for Product Reviews
* Build an LLM that can generate concise and informative summaries for product reviews. 


In [None]:
import time
import numpy as np
import pandas as pd

#Hugging Face
import torch
import evaluate
import numpy as np
from datasets import load_dataset, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer,Seq2SeqTrainingArguments, Seq2SeqTrainer

1. Extractive summarization - the most important sentences from the original text are selected and combined to form a summary. Transformers in this case are used to process the text, extract features and perform sentence ranking.
2. Abstractive summarization - a new summary is generated by understanding the context of the original text and producing new phrases and sentences that summarise its content, e.g., with encoder-decoder models.

Loading datasets

In [None]:
# Read the tab-separated train/val/test files from the local data folder.
dataset = load_dataset(
    '../aws_data',
    data_files={
        'train': 'train.csv',
        'val': 'val.csv',
        'test': 'test.csv',
    },
    sep='\t',
)
dataset

In [None]:
# Show the first review and the first reference summary of the first training row.
print(dataset['train']['rev1'][0], dataset['train']['summ1'][0], sep='\n')

Transform features

In [None]:
# Names of the eight rating columns (rating1..rating8) and review columns (rev1..rev8).
rating_columns = [f"rating{i}" for i in range(1, 9)]
rev_columns = [f"rev{i}" for i in range(1, 9)]

def merge_reviews(example):
    """Join the individual review columns of one example into a single text field.

    Args:
        example: Mapping holding one value per name in ``rev_columns``.

    Returns:
        The same mapping with an added ``'reviews'`` key: the review texts in
        column order, separated by newlines.
    """
    # Skip missing reviews: str(None) would otherwise put the literal word
    # "None" into the text that the model is asked to summarize.
    present_reviews = [
        str(example[col]) for col in rev_columns if example[col] is not None
    ]
    example['reviews'] = '\n'.join(present_reviews)
    return example

# Randomly keep one of the three reference summaries per row, to reduce bias
# towards any single annotator. The generator is seeded ONCE here: reseeding
# inside the function would make every call draw the same index (so the same
# summary column would always win) and would reset NumPy's global RNG state.
_summary_rng = np.random.default_rng(42)

def combine_summaries(row):
    """Return one randomly chosen reference summary for a row.

    Args:
        row: Mapping with the keys ``"summ1"``, ``"summ2"`` and ``"summ3"``.

    Returns:
        One of the three summary values, unchanged.
    """
    summaries = [row["summ1"], row["summ2"], row["summ3"]]
    # Draw an index rather than the element so the original object is returned
    # instead of a NumPy scalar.
    return summaries[int(_summary_rng.integers(len(summaries)))]

In [None]:
# For every split: merge the reviews into one column, drop the per-review and
# rating columns, then keep a single reference summary per example.
clean_dataset = (
    dataset
    .map(merge_reviews)
    .remove_columns(rev_columns + rating_columns)
    .map(
        lambda example: {"summaries": combine_summaries(example)},
        remove_columns=["cat", "group_id", "summ1", "summ2", "summ3"],
    )
)
clean_dataset

In [None]:
# Look at the reference summary kept for the third training example.
clean_dataset['train'][2]['summaries']

Zero Shot Inferencing

In [None]:
# Checkpoint used throughout the notebook (alternative: 'google/flan-t5-base').
model_name = "facebook/bart-large-cnn"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Baseline (not fine-tuned) model, loaded in bfloat16.
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)

In [None]:
# Zero-shot baseline: summarize one training example with the untouched model
# and compare it with the human-written summary.
review = clean_dataset['train']['reviews'][10]
summary = clean_dataset['train']['summaries'][10]

prompt = f"""
Summarize the following product reviews.

{review}

Summary:"""

# Merged reviews can be long; truncate to the same 1024-token input budget that
# tokenize_function uses so the model is never fed an over-length sequence.
inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=1024)
output = tokenizer.decode(
    original_model.generate(inputs['input_ids'], max_new_tokens=200)[0],
    skip_special_tokens=True,
)

# Separator line of 99 dashes, reused by later cells.
dash_line = '-' * 99
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')


Zero-shot BART doesn't do a great job of summarizing the reviews; if anything, it just copied and pasted the second review.

Preprocess the Dataset

In [None]:
# Instruction wrapped around every merged review text.
start_prompt = 'Summarize the following product reviews:\n\n'
end_prompt = "\n\nSummary: "

def tokenize_function(examples, max_input_length=1024, max_target_length=128):
    """Tokenize a batch of examples into model inputs and training labels.

    Args:
        examples: Batch mapping with a list of review texts under ``"reviews"``
            and a list of target summaries under ``"summaries"``.
        max_input_length: Length every prompt is padded/truncated to.
        max_target_length: Length every label sequence is padded/truncated to.

    Returns:
        The tokenizer output for the prompts with an added ``"labels"`` entry:
        one fixed-length list of token ids per summary, where padding
        positions hold -100.
    """
    inputs = [start_prompt + rev + end_prompt for rev in examples["reviews"]]
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        padding='max_length',
        truncation=True,
    )
    label_ids = tokenizer(
        text_target=examples["summaries"],
        max_length=max_target_length,
        padding='max_length',
        truncation=True,
    ).input_ids
    # Replace padding in the labels with -100 so the loss ignores it; left as
    # pad ids, the padding would be scored like real summary tokens.
    # compute_metrics already maps -100 back to the pad id before decoding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in seq]
        for seq in label_ids
    ]
    return model_inputs

In [None]:
# Tokenize every split in batches, then drop the raw text columns.
tokenized_data = (
    clean_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(['reviews', 'summaries'])
)
tokenized_data

Fine-Tune the Model with the preprocessed dataset

In [None]:
from transformers import DataCollatorForSeq2Seq

# Data collators form a batch from a list of dataset elements, applying any
# processing needed to do so (such as padding).
# The collator's `model` argument expects a model object, not a checkpoint
# name. A string has no `prepare_decoder_input_ids_from_labels` method, so
# passing `model_name` had no effect; the argument is left out and the model
# derives its decoder inputs from the labels itself.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [None]:
# ROUGE metric object; compute_metrics below calls rouge.compute on decoded text.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Either
            array may use -100 as padding, and ``predictions`` may be a tuple
            whose first element holds the generated ids.

    Returns:
        Dict of the values returned by ``rouge.compute`` plus ``"gen_len"``
        (mean number of non-pad tokens per prediction), each rounded to four
        decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is not a valid token id and cannot be decoded. Map it back to the
    # pad id in BOTH arrays: labels use it for ignored positions, and
    # predictions can be padded with it when batches are concatenated.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Fresh copy of the checkpoint (default dtype) that the trainer below fine-tunes.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Full fine-tuning setup: two epochs, evaluation after each epoch, with
# generation-based predictions so compute_metrics can score decoded text.
training_args = Seq2SeqTrainingArguments(
    output_dir="revs_summarizer_model",
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    predict_with_generate=True,
    save_total_limit=3,
    fp16=False,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["val"],
)

trainer.train()

In [None]:
# Persist the fine-tuned model and its tokenizer to separate folders.
model.save_pretrained("../models/bart_summ_model")
tokenizer.save_pretrained("../models/bart_summ_tokenizer")

In [None]:
# Display the cleaned dataset (splits and columns) before running inference.
clean_dataset

Inference

Human Evaluation

In [None]:
# Generate summaries for the first ten test examples and put them next to the
# human-written references for a side-by-side read.
test_sample = clean_dataset['test'][0:10]
reviews = test_sample['reviews']
human_baseline_summaries = test_sample['summaries']

model_summaries = []

for review in reviews:
    # Use the exact template the model was fine-tuned on (see tokenize_function).
    prompt = start_prompt + review + end_prompt
    # Truncate to the training input budget, and place the ids on the model's
    # device: after training the model may no longer be on the CPU.
    input_ids = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=1024
    ).input_ids.to(model.device)

    model_outputs = model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=False)
    model_text_output = tokenizer.decode(model_outputs[0], skip_special_tokens=True)
    model_summaries.append(model_text_output)

zipped_summaries = list(zip(reviews, human_baseline_summaries, model_summaries))

df = pd.DataFrame(zipped_summaries, columns=['reviews', 'human_baseline_summaries', 'model_summaries'])
df

In [None]:
# Print every column of one comparison row, framed by separator lines.
idx = 4
for col in df.columns:
    print(col, dash_line, df[col][idx], dash_line, sep='\n')

In [None]:
# Build the prompt for one test example with the full fine-tuning template
# (start_prompt + reviews + end_prompt), so inference sees the same input
# format as training.
text = clean_dataset['test']['reviews'][5]
review_prompt = start_prompt + text + end_prompt
print(review_prompt)

In [None]:
# To run from the saved artefacts instead of the in-memory objects:
#tokenizer = AutoTokenizer.from_pretrained("../models/bart_summ_tokenizer")
#model = AutoModelForSeq2SeqLM.from_pretrained("../models/bart_summ_model")

# Truncate to the 1024-token training budget and move the ids to the model's
# device, which may not be the CPU after training.
inputs = tokenizer(
    review_prompt, return_tensors="pt", truncation=True, max_length=1024
).input_ids.to(model.device)

outputs = model.generate(inputs, max_new_tokens=80, do_sample=False)

print(review_prompt)
print(dash_line)
print('Human Summary:\n',clean_dataset['test']['summaries'][5])
print(dash_line)
print('Model Summary:\n', tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Build a short report of trainable versus total parameter counts.

    Despite the name, nothing is printed; the report is returned as a string.

    Args:
        model: Object whose ``named_parameters()`` yields ``(name, param)``
            pairs, where ``param`` has ``numel()`` and ``requires_grad``.

    Returns:
        Three-line string with the trainable count, the total count and the
        trainable percentage (two decimals).
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count
    # A model without parameters would otherwise raise ZeroDivisionError.
    percentage = 100 * trainable_model_params / all_model_params if all_model_params else 0.0
    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

# Parameter counts for the baseline model loaded before fine-tuning.
print(print_number_of_trainable_model_parameters(original_model))

Model Evaluation with ROUGE Metric

In [None]:
#compare the human generated summaries with the model summaries
rouge_metric = load_metric("rouge")

results = rouge_metric.compute(
    predictions = model_summaries,
    references = human_baseline_summaries,
    use_aggregator = True,
    use_stemmer = True
)

print('BART Model Results: \n', results)

In [None]:
# Flatten each ROUGE entry into a plain dict of three numbers. Element [1] of
# an entry is presumably the "mid" aggregate - TODO confirm against the metric.
results_dict = {
    rouge_type: {
        'precision': scores[1][0],
        'recall': scores[1][1],
        'fmeasure': scores[1][2],
    }
    for rouge_type, scores in results.items()
}

results_dict


In [None]:
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Grouped bar chart: one bar group per ROUGE type, one bar per score kind.
metrics = ['precision', 'recall', 'fmeasure']

fig = go.Figure()

for metric in metrics:
    fig.add_trace(
        go.Bar(
            x=list(results_dict.keys()),
            y=[results_dict[key][metric] for key in results_dict],
            name=metric.capitalize(),
        )
    )

fig.update_layout(
    title='ROUGE Metrics: Fine-tuned BART Summarization Model',
    xaxis={'title': 'ROUGE Type'},
    yaxis={'title': 'Score'},
    barmode='group',
    template='plotly_dark',
)

fig.show()

Parameter-Efficient Fine-Tuning (PEFT)

* PEFT is more efficient than full fine-tuning, especially because it is less memory intensive.
* PEFT methods include LoRA, which lets a user fine-tune a model using fewer compute resources.

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

In [None]:
# LoRA settings for the sequence-to-sequence model used in this notebook.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # encoder-decoder language modelling
    r=32,  # rank of the low-rank update matrices
    lora_alpha=32,  # scaling factor applied to the LoRA update
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# Wrap the model with LoRA adapters and report how many parameters stay trainable.
# NOTE(review): `model` is the object the full fine-tuning run above trained -
# confirm that starting from it, rather than the base checkpoint, is intended.
peft_model = get_peft_model(model, lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

In [None]:
# Timestamped output folder so repeated runs do not overwrite each other.
output_dir = f'./peft-review-summary-training-{int(time.time())}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-3,  # Higher learning rate than full fine-tuning.
    auto_find_batch_size=True,
    num_train_epochs=1,
    max_steps=1,
    logging_steps=1,
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_data["train"],
)

In [None]:
# Folder that receives both the trained adapter and the tokenizer.
peft_model_path = "./peft-review-summary-checkpoint-local"

peft_trainer.train()

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)