# Building a Summarizer for Product Reviews
* Build an LLM that can generate concise and informative summaries for product reviews. 


In [1]:
import time
import numpy as np
import pandas as pd

#Hugging Face
import torch
import evaluate
import numpy as np
from datasets import load_dataset, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer,Seq2SeqTrainingArguments, Seq2SeqTrainer

1. Extractive summarization - most important sentences from the original text are selected and combined to form a summary. Transformers in this case are used to process the text, extract features and perform sentence ranking
2. Abstractive summarization - a new summary is generated by understanding the context of the original text and generating new phrases and sentences that summarise its content e.g encoder-decoder models

Loading datasets

In [2]:
dataset = load_dataset('../aws_data',data_files={'train':'train.csv','val':'val.csv','test':'test.csv'},sep='\t')
dataset

DatasetDict({
    train: Dataset({
        features: ['cat', 'group_id', 'rev1', 'rev2', 'rev3', 'rev4', 'rev5', 'rev6', 'rev7', 'rev8', 'summ1', 'summ2', 'summ3', 'rating1', 'rating2', 'rating3', 'rating4', 'rating5', 'rating6', 'rating7', 'rating8'],
        num_rows: 28
    })
    test: Dataset({
        features: ['cat', 'group_id', 'rev1', 'rev2', 'rev3', 'rev4', 'rev5', 'rev6', 'rev7', 'rev8', 'summ1', 'summ2', 'summ3', 'rating1', 'rating2', 'rating3', 'rating4', 'rating5', 'rating6', 'rating7', 'rating8'],
        num_rows: 20
    })
    val: Dataset({
        features: ['cat', 'group_id', 'rev1', 'rev2', 'rev3', 'rev4', 'rev5', 'rev6', 'rev7', 'rev8', 'summ1', 'summ2', 'summ3', 'rating1', 'rating2', 'rating3', 'rating4', 'rating5', 'rating6', 'rating7', 'rating8'],
        num_rows: 12
    })
})

In [3]:
print(dataset['train']['rev1'][0])
print(dataset['train']['summ1'][0])

This pendant is so unique!! The design is beautiful and the bail is a ring instead of the typical bail which gives it a nice touch!! All the corners are smooth and my daughter loves it - looks great on her.I cannot say anything about the chain because used our own chain.:) Satisfied.
This silver chain and pendant are elegant and unique. The necklace is very well made, making it a great buy for the cost, and is of high enough quality to be worn every day. The necklace looks beautiful when worn bringing many compliments. Overall, it is highly recommended.


Transform features

In [4]:
rating_columns = [f"rating{i+1}" for i in range(8)]
rev_columns = [f"rev{i+1}" for i in range(8)]

def merge_reviews(example):
    #merge reviews into one column
    example['reviews'] = '\n'.join(str(example[col]) for col in rev_columns)
    return example

#randomly combine summaries into one column, random to reduce bias
def combine_summaries(row):
    summaries = [row["summ1"], row["summ2"], row["summ3"]]
    np.random.seed(42)
    return np.random.choice(summaries)

In [5]:
# Use the map function to apply the merge_reviews and combine_summaries functions to each example in each split
clean_dataset = dataset.map(merge_reviews)
clean_dataset = clean_dataset.remove_columns(rev_columns+rating_columns)
clean_dataset = clean_dataset.map(lambda example: {"summaries":combine_summaries(example)}, remove_columns=["cat","group_id","summ1", "summ2", "summ3"])
clean_dataset

DatasetDict({
    train: Dataset({
        features: ['reviews', 'summaries'],
        num_rows: 28
    })
    test: Dataset({
        features: ['reviews', 'summaries'],
        num_rows: 20
    })
    val: Dataset({
        features: ['reviews', 'summaries'],
        num_rows: 12
    })
})

In [6]:
clean_dataset['train']['summaries'][2]

'These are very comfortable, quality shoes that can be worn casually. They are functional and incredibly light-weight. However, the shoes tend to run a bit large. Overall, this quality product is recommended.'

Zero Shot Inferencing

In [7]:
#model_name='google/flan-t5-base'
model_name = "facebook/bart-large-cnn"
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
review = clean_dataset['train']['reviews'][10]
summary = clean_dataset['train']['summaries'][10]

prompt = f"""
Summarize the following product reviews.

{review}

Summary:"""

inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    original_model.generate(
        inputs['input_ids'], max_new_tokens = 200)[0],
        skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')


---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following product reviews.

Took a long time to get and when I finally got them, they didn't seem to work to well joined up (I have 3). Also, they should have come with DC chargers, they don't seem to work well with batteries, imho.
The units work fine and the setup is described in the manual works well. The one thing that my wife and I do not like is the Audio. Sounds like you are in a box. The calling button is also a bit strange. It first beeps and then you can speak. The distance it covers is very good.
Don't waste your money on these. Although if you try hard enough you can make out what the other person is saying, the audio quality is extremely poor, as is the noise cancellation (if any), and the general sound quality. I wish I had saved the boxes. I'd be sending these back.
And the problem is having to wake the kids up in the morning to go to school. T

Zero-shot (BART) doesn't do a great job at summarizing the reviews, if anything, it just copied and oasted the second review

Preprocess the Dataset

In [9]:
start_prompt = 'Summarize the following product reviews:\n\n'
end_prompt = "\n\nSummary: "
def tokenize_function(examples):
    inputs = [start_prompt + rev + end_prompt for rev in examples["reviews"]]
    model_inputs = tokenizer(inputs, max_length= 1024, padding='max_length', truncation=True)
    model_inputs["labels"] = tokenizer(text_target=examples["summaries"],max_length=128, padding='max_length', truncation=True).input_ids
    #labels = tokenizer(text_target=examples["summaries"], max_length=128, truncation=True)
    #model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [10]:
tokenized_data = clean_dataset.map(tokenize_function, batched=True)
tokenized_data = tokenized_data.remove_columns(['reviews','summaries'])
tokenized_data

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 28
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 20
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 12
    })
})

Fine-Tune the Model with the preprocessed dataset

In [11]:
from transformers import DataCollatorForSeq2Seq

#Data collators are objects that will form a batch by using a list of dataset elements as input. 
#To be able to build batches, data collators may apply some processing (like padding).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)

In [12]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [13]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [14]:
training_args = Seq2SeqTrainingArguments(
    output_dir="revs_summarizer_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=False,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 4.831182479858398, 'eval_rouge1': 0.3711, 'eval_rouge2': 0.1033, 'eval_rougeL': 0.2337, 'eval_rougeLsum': 0.2353, 'eval_gen_len': 79.0, 'eval_runtime': 312.7734, 'eval_samples_per_second': 0.038, 'eval_steps_per_second': 0.01, 'epoch': 1.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.7244367599487305, 'eval_rouge1': 0.3884, 'eval_rouge2': 0.1144, 'eval_rougeL': 0.2474, 'eval_rougeLsum': 0.2481, 'eval_gen_len': 79.6667, 'eval_runtime': 322.8665, 'eval_samples_per_second': 0.037, 'eval_steps_per_second': 0.009, 'epoch': 2.0}
{'train_runtime': 1918.0751, 'train_samples_per_second': 0.029, 'train_steps_per_second': 0.007, 'train_loss': 5.267559051513672, 'epoch': 2.0}


TrainOutput(global_step=14, training_loss=5.267559051513672, metrics={'train_runtime': 1918.0751, 'train_samples_per_second': 0.029, 'train_steps_per_second': 0.007, 'train_loss': 5.267559051513672, 'epoch': 2.0})

In [15]:
model.save_pretrained("../models/bart_summ_model")
tokenizer.save_pretrained("../models/bart_summ_tokenizer")

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('../models/bart_summ_tokenizer\\tokenizer_config.json',
 '../models/bart_summ_tokenizer\\special_tokens_map.json',
 '../models/bart_summ_tokenizer\\vocab.json',
 '../models/bart_summ_tokenizer\\merges.txt',
 '../models/bart_summ_tokenizer\\added_tokens.json',
 '../models/bart_summ_tokenizer\\tokenizer.json')

In [16]:
clean_dataset

DatasetDict({
    train: Dataset({
        features: ['reviews', 'summaries'],
        num_rows: 28
    })
    test: Dataset({
        features: ['reviews', 'summaries'],
        num_rows: 20
    })
    val: Dataset({
        features: ['reviews', 'summaries'],
        num_rows: 12
    })
})

Inference

Human Evaluation

In [17]:
reviews = clean_dataset['test'][0:10]['reviews']
human_baseline_summaries = clean_dataset['test'][0:10]['summaries']

model_summaries = []

for _, review in enumerate(reviews):
    prompt = f"""
Summarize the following product reviews.

{review}

Summary: """
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    model_outputs = model.generate(input_ids=input_ids, max_new_tokens=100,do_sample=False)
    model_text_output = tokenizer.decode(model_outputs[0], skip_special_tokens=True)
    model_summaries.append(model_text_output)

zipped_summaries = list(zip(reviews,human_baseline_summaries, model_summaries))
 
df = pd.DataFrame(zipped_summaries, columns = ['reviews','human_baseline_summaries', 'model_summaries'])
df

Unnamed: 0,reviews,human_baseline_summaries,model_summaries
0,"I chose it because it's a beautiful purse, but...","This is a beautiful purse, but it lacks durabi...","This is a great purse, but the straps are too ..."
1,These are the perfect tights for my 5-year old...,Great soft feeling fabric and beautiful color ...,These tights are great tights for ballet tight...
2,The shoes are the perfect fit for me. They sup...,"These are very stylish, well-fitting, supporti...",The Reebok Reeboks are very comfortable and ru...
3,The description say it long... NOT so it is av...,This is a good looking and comfortable tank to...,This is a basic tank top that fits well but is...
4,My son is 3 and this fits him perfectly. He'll...,This Thomas the Tank costume is perfect for sm...,This Thomas the train costume is a great fit f...
5,This product recharges my wife's iPhone 4 with...,This solar-powered charger performs inconsiste...,This solar charger works perfectly. It can be ...
6,I recently bought this film and slide scanner ...,The results with this scanner are sporadic at ...,The VuPoint Digital Scanner converts negatives...
7,"Since I purchased this backpack a month ago, I...","This backpack is compact, durable and can hold...",This camera bag is well made and is comfortabl...
8,"Yes, HP DVD's are DVD's for the better. Better...",These DVDs are good quality from a reliable br...,"This product is made by CMC, with id of CMCMAG..."
9,of the fact that an SD card is NOT included wi...,Consumers love this camera for its small size ...,This is a great camera that is easy to use and...


In [18]:
idx = 4
for col in df.columns:
    print(col)
    print(dash_line)
    print(df[col][idx])
    print(dash_line)

reviews
---------------------------------------------------------------------------------------------------
My son is 3 and this fits him perfectly. He'll probably be able to wear it for the next two years if he'd like. It's cute too. The hat is thin, but completes the outfit. And the candy pocket is huge. Perfect! I'm so glad we bought this costume over any other Thomas costume.
I ordered this for my 3 yr old for Halloween. He loved it!! The candy catcher in the front is really neat, but probably need to take a pail or something else along also because it can get to be heavy if they get a lot of candy. I was very pleased with the way it fit and everything.
Received from Toynk Toys and was very disappointed when I opened the pkg...... very flimsy felt fabric. No 3D sculpted face that I expected. I became creative and added huge googly eyes, pumpkin patch, spiders, bats, and train tracks to the outfit to make it a bit more suitable for my taste.
This is a cute costume that we got for my

In [19]:
text = clean_dataset['test']['reviews'][5]
review_prompt = start_prompt +text
print(review_prompt)

Summarize the following product reviews:

This product recharges my wife's iPhone 4 with no problem. One downside - it does not have enough juice to charge an iPad. It would benefit from some indication of the percentage charge of the battery. Also, the LED light is VERY bright.
The product is great .The issue is that it came broken and I only found out after I had it for a month. There is no was to call there support. Very very bad idea. Will do more research next time .I don't think I would buy it again!!!!
I bought this to test it out for charging my phone on AT section hikes. For the added weight, it was a nice-to-have item. I probably wouldn't have it on a through-hike, but it worked well and if you aren't going to have access to a power source for recharging your phone while you're out, it does the job pretty well.
This solar charger works perfectly. It can be charged with electricity or solar power making it a great backup for charging devices. For its size I would recommend get

In [20]:
#tokenizer = AutoTokenizer.from_pretrained("../models/summ_tokenizer")
inputs = tokenizer(review_prompt, return_tensors="pt").input_ids

#model = AutoModelForSeq2SeqLM.from_pretrained("../models/summ_model")
outputs = model.generate(inputs, max_new_tokens=80, do_sample=False)

print(review_prompt)
print(dash_line)
print('Human Summary:\n',clean_dataset['test']['summaries'][5])
print(dash_line)
print('Model Summary:\n', tokenizer.decode(outputs[0], skip_special_tokens=True))

Summarize the following product reviews:

This product recharges my wife's iPhone 4 with no problem. One downside - it does not have enough juice to charge an iPad. It would benefit from some indication of the percentage charge of the battery. Also, the LED light is VERY bright.
The product is great .The issue is that it came broken and I only found out after I had it for a month. There is no was to call there support. Very very bad idea. Will do more research next time .I don't think I would buy it again!!!!
I bought this to test it out for charging my phone on AT section hikes. For the added weight, it was a nice-to-have item. I probably wouldn't have it on a through-hike, but it worked well and if you aren't going to have access to a power source for recharging your phone while you're out, it does the job pretty well.
This solar charger works perfectly. It can be charged with electricity or solar power making it a great backup for charging devices. For its size I would recommend get

In [21]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 406290432
all model parameters: 406290432
percentage of trainable model parameters: 100.00%


Model Evaluation with ROUGE Metric

In [23]:
#compare the human generated summaries with the model summaries
rouge_metric = load_metric("rouge")

results = rouge_metric.compute(
    predictions = model_summaries,
    references = human_baseline_summaries,
    use_aggregator = True,
    use_stemmer = True
)

print('BART Model Results: \n', results)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


BART Model Results: 
 {'rouge1': AggregateScore(low=Score(precision=0.2996058040231507, recall=0.4125435160476389, fmeasure=0.3479670127261747), mid=Score(precision=0.33958729071520244, recall=0.46723289597896533, fmeasure=0.38983783736963784), high=Score(precision=0.3757346822644893, recall=0.5255026904934822, fmeasure=0.42718663208307767)), 'rouge2': AggregateScore(low=Score(precision=0.06312952562021687, recall=0.08265921840434287, fmeasure=0.0720059721877162), mid=Score(precision=0.09082496023088557, recall=0.12509826572138288, fmeasure=0.10463030289492095), high=Score(precision=0.12028449871199387, recall=0.17830552220888354, fmeasure=0.1426202125379947)), 'rougeL': AggregateScore(low=Score(precision=0.17449850192537536, recall=0.2353347013393347, fmeasure=0.2001341864041679), mid=Score(precision=0.19933670746470106, recall=0.27537028494137017, fmeasure=0.2293013360437636), high=Score(precision=0.22525622845569337, recall=0.3143351405262885, fmeasure=0.25982343943412983)), 'rougeL

In [24]:
results_dict = {}
for k, v in results.items():
    results_dict[k] = {
        'precision': v[1][0],
        'recall': v[1][1],
        'fmeasure': v[1][2]
    }

results_dict


{'rouge1': {'precision': 0.33958729071520244,
  'recall': 0.46723289597896533,
  'fmeasure': 0.38983783736963784},
 'rouge2': {'precision': 0.09082496023088557,
  'recall': 0.12509826572138288,
  'fmeasure': 0.10463030289492095},
 'rougeL': {'precision': 0.19933670746470106,
  'recall': 0.27537028494137017,
  'fmeasure': 0.2293013360437636},
 'rougeLsum': {'precision': 0.1989191793271708,
  'recall': 0.27300010351873616,
  'fmeasure': 0.22886544449464924}}

In [25]:
import plotly.express as px
import plotly.graph_objects as go

In [27]:
metrics = ['precision', 'recall', 'fmeasure']

fig = go.Figure()

for metric in metrics:
    values = [results_dict[key][metric] for key in results_dict]
    fig.add_trace(go.Bar(x=list(results_dict.keys()), y=values, name=metric.capitalize()))

fig.update_layout(
    title='ROUGE Metrics: Fine-tuned BART Summarization Model',
    xaxis=dict(title='ROUGE Type'),
    yaxis=dict(title='Score'),
    barmode='group',
    template='plotly_dark'
)

fig.show()

Parametric Efficient Fine Tuning (PEFT)

* PEFT is more efficient that full fine-tuning, especially because it's less memory intensive.
* PEFT incorporates LoRA which allows a user to fine-tune their model using fewer compute resources

In [28]:
from peft import LoraConfig, get_peft_model, TaskType

In [29]:
lora_config = LoraConfig(
    r=32, # Rank - determines the dimensionality of the space
    lora_alpha=32, #controls the power of the regularization term
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

In [30]:
peft_model = get_peft_model(model, 
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 4718592
all model parameters: 411009024
percentage of trainable model parameters: 1.15%


In [31]:
output_dir = f'./peft-review-summary-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1    
)
    
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_data["train"],
)

In [32]:
peft_trainer.train()

peft_model_path="./peft-review-summary-checkpoint-local"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

  0%|          | 0/1 [00:00<?, ?it/s]

: 