In [5]:
import html

import pandas as pd
import torch
from transformers import BartTokenizer, BartForConditionalGeneration

In [6]:
# Load the BART tokenizer and model from the same checkpoint identifier
MODEL_NAME = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)
model = BartForConditionalGeneration.from_pretrained(MODEL_NAME)

In [7]:
# LOAD the DATA (the output DS from K-means clustering)
file_path = 'categorized_dataset_k5_with_names.csv'

# Data cleaning: drop rows with a missing review FIRST, then lowercase.
# Casting to str before the null check turns NaN into the literal string
# "nan", which then passes notnull() and ends up joined into the review text.
df = (
    pd.read_csv(file_path)
    .dropna(subset=['reviews.text'])
    .assign(**{'reviews.text': lambda frame: frame['reviews.text'].astype(str).str.lower()})
)

# Group all reviews under each category_name into one space-joined string
grouped_reviews = (
    df.groupby('category_name')['reviews.text']
    .agg(' '.join)
    .reset_index()
)

print(grouped_reviews.shape)
print(grouped_reviews.head())

(5, 2)
        category_name                                       reviews.text
0           Fire HD 8  this product so far has not disappointed. my c...
1           Fire KIDS  the tablet is very light and streams well. i o...
2       Fire Tablet 7  good basic tablet for checking email , web bro...
3              Kindle  very lightweight and portable with excellent b...
4  Speakers/Streaming  i really enjoy the echo. i got an echo dot and...


In [8]:
# VERSION 2: aims to make Summary, Highlights and Issues mutually exclusive.

def _generate_text(prompt, max_length, length_penalty):
    """Tokenize `prompt`, run one beam-search generation pass, and decode it.

    The encoded prompt is truncated to 1024 tokens, so any text past that
    limit is not seen by the model. Uses the module-level `tokenizer` and
    `model`.

    Args:
        prompt: Full input string (instruction text followed by reviews).
        max_length: `max_length` passed to `model.generate`.
        length_penalty: `length_penalty` passed to `model.generate`.

    Returns:
        The first generated sequence decoded with special tokens removed.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt", max_length=1024, truncation=True)
    output_ids = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=2,
        length_penalty=length_penalty,
        early_stopping=True,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


def generate_summary_highlights_issues(text, category_name):
    """Generate a blog summary, highlights, and issues for one category.

    Three generation passes are chained: the summary is embedded in the
    highlights prompt, and both the summary and the highlights are embedded in
    the issues prompt, with instructions asking for content not already
    covered.

    NOTE(review): each prompt is truncated to 1024 tokens, and the reviews are
    appended last, so the longer the instruction prefix (which grows with the
    embedded summary/highlights), the fewer review tokens remain. Whether the
    model actually honours these instructions should be confirmed against the
    generated output.

    Args:
        text: All reviews for the category, joined into one string.
        category_name: Category label inserted into each prompt.

    Returns:
        A `(summary, highlights, issues)` tuple of decoded strings.
    """
    # Blog-style summary
    summary_prompt = f"Generate a summary for a blog that covers the specifications, features, and configurations of a product in the category: {category_name}. Here are the reviews: " + text
    summary = _generate_text(summary_prompt, max_length=500, length_penalty=2.0)

    # Highlights: positive aspects not already in the summary
    highlight_prompt = f"Generate 3-4 bullet points of positive features and advantages over competitors that are not mentioned in the summary: {summary}. Focus on product strengths and customer satisfaction for the product in the category: {category_name}. Here are the reviews: " + text
    highlights = _generate_text(highlight_prompt, max_length=150, length_penalty=1.5)

    # Issues: negative aspects not already in the summary or highlights
    issue_prompt = f"Generate 2-3 bullet points of negative features or issues with the product that are not mentioned in the summary: {summary} or highlights: {highlights}. Focus on customer complaints, problems, or disadvantages compared to competitors for the product in the category: {category_name}. Here are the reviews: " + text
    issues = _generate_text(issue_prompt, max_length=100, length_penalty=1.5)

    return summary, highlights, issues

# Run the generator once per category (in row order), then attach the three
# resulting text columns to grouped_reviews.
per_category = [
    generate_summary_highlights_issues(row['reviews.text'], row['category_name'])
    for _, row in grouped_reviews.iterrows()
]
summaries, highlight_texts, issue_texts = zip(*per_category)

grouped_reviews['blog_summary'] = summaries
grouped_reviews['highlights'] = highlight_texts
grouped_reviews['issues'] = issue_texts

print(grouped_reviews.head())

        category_name                                       reviews.text  \
0           Fire HD 8  this product so far has not disappointed. my c...   
1           Fire KIDS  the tablet is very light and streams well. i o...   
2       Fire Tablet 7  good basic tablet for checking email , web bro...   
3              Kindle  very lightweight and portable with excellent b...   
4  Speakers/Streaming  i really enjoy the echo. i got an echo dot and...   

                                        blog_summary  \
0  i love being able to easily access all of the ...   
1  i only use it to stream movies and it's much l...   
2  i love this tablet as much as my firsdt one! e...   
3  The Kindle Oasis is very lightweight and porta...   
4   echo dot is great at voice recognition - you ...   

                                          highlights  \
0  i love being able to easily access all of the ...   
1  i only use it to stream movies and it's much l...   
2  i love this tablet as much as my fi

In [9]:
# Save the output to an HTML file
def _bullet_list(text):
    """Render `text` as an HTML <ul> with one <li> per '. '-separated fragment.

    Blank fragments are skipped and every fragment is HTML-escaped, since the
    text is derived from customer reviews and may contain <, > or &.
    """
    fragments = [part for part in str(text).split('. ') if part.strip()]
    items = ''.join(f"<li>{html.escape(part)}</li>" for part in fragments)
    return f"<ul>{items}</ul>"


# Write as UTF-8 explicitly (the platform default encoding may not be able to
# represent every character in the reviews) and declare the charset so
# browsers decode the file the same way.
with open('BART_summaries_prefinal.html', 'w', encoding='utf-8') as f:
    f.write('<html><head><meta charset="utf-8"></head><body>')

    for _, row in grouped_reviews.iterrows():
        f.write(f"<h2>Product: {html.escape(str(row['category_name']))}</h2>")

        f.write("<h3>Summary</h3>")
        f.write(f"<p>{html.escape(str(row['blog_summary']))}</p>")

        f.write("<h3>Highlights</h3>")
        f.write(_bullet_list(row['highlights']))

        f.write("<h3>Issues</h3>")
        f.write(_bullet_list(row['issues']))

        f.write('<hr>')

    f.write('</body></html>')

In [10]:
# Write the model and tokenizer into one directory.
# The tokenizer save is kept as the last expression so the cell still shows
# the tuple of written file paths.
SAVE_DIR = "./summarizer-BART_Prefinal_v2"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

('./summarizer-BART_Prefinal_v2/tokenizer_config.json',
 './summarizer-BART_Prefinal_v2/special_tokens_map.json',
 './summarizer-BART_Prefinal_v2/vocab.json',
 './summarizer-BART_Prefinal_v2/merges.txt',
 './summarizer-BART_Prefinal_v2/added_tokens.json')