In [1]:
#!pip install transformers
#!pip install transformers huggingface_hub
import html
import re

import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration


In [3]:
# Load the pretrained T5 checkpoint: the seq2seq model first, then its tokenizer.
# Both are pulled from the same checkpoint name so they always stay in sync.
MODEL_NAME = 't5-base'
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

In [4]:
# Load the data (the output dataset from K-means clustering)
file_path = 'categorized_dataset_k5_with_names.csv'
df = pd.read_csv(file_path)

# Data cleaning: drop rows with a missing review text FIRST, then lowercase.
# Casting to str before the null check would turn NaN into the literal string
# 'nan', so the null filter would keep every row and 'nan' would leak into the
# joined review text. `.copy()` gives an independent frame so the column
# assignment below does not trigger a chained-assignment warning.
df = df[df['reviews.text'].notnull()].copy()
df['reviews.text'] = df['reviews.text'].astype(str).str.lower()

# Group all reviews under each category: one row per category_name whose
# 'reviews.text' is every review of that category joined with single spaces.
grouped_reviews = df.groupby('category_name')['reviews.text'].agg(' '.join).reset_index()

# Disabled alternative, kept for reference:
# group by both category_name and label to retain the sentiment.
# grouped_reviews = df.groupby(['category_name', 'label'])['reviews.text'].apply(lambda texts: ' '.join(texts)).reset_index()

print(grouped_reviews.head())

        category_name                                       reviews.text
0           Fire HD 8  this product so far has not disappointed. my c...
1           Fire KIDS  the tablet is very light and streams well. i o...
2       Fire Tablet 7  good basic tablet for checking email , web bro...
3              Kindle  very lightweight and portable with excellent b...
4  Speakers/Streaming  i really enjoy the echo. i got an echo dot and...


In [5]:
# VERSION 2
# Generate the summaries: one long blog-style summary and one compact Highlights/Issues summary per category, later written to an HTML file for deployment in Gradio


# Words stripped from the review text before summarization
# (mostly pronouns, plus the fragments 'll' and 'have').
pronouns = [
    'i', 'you', 'he', 'she', 'we', 'they',
    'my', 'your', 'his', 'her', 'our', 'their',
    'us', 'me', 'll', 'have',
]


def remove_pronouns(text):
    """Delete every whole-word occurrence of an entry of `pronouns` from `text`.

    Matching is case-sensitive and bounded by regex word boundaries, so
    substrings inside longer words (e.g. 'his' in 'this') are left alone.
    Only the word itself is removed; the surrounding whitespace is kept,
    which can leave consecutive spaces in the result.

    Args:
        text: The string to clean.

    Returns:
        The string with the listed words removed.
    """
    alternation = '|'.join(pronouns)
    word_pattern = re.compile(r'\b(?:' + alternation + r')\b')
    return word_pattern.sub('', text)

# Prompt prefixes placed in front of the cleaned review text.
_BLOG_PROMPT = "summarize: Focus on the product features and exclude any personal mentions. "
_COMPACT_PROMPT = "summarize: Highlight 2-4 key features and mention 2-4 issues of this product. "


def _summarize(text, prompt, **generate_kwargs):
    """Shared pipeline: clean `text`, prepend `prompt`, generate and decode a summary.

    NOTE: the encoded input (prompt + cleaned text) is truncated to 512 tokens,
    so for a long concatenation of reviews only the leading part can influence
    the summary.

    Args:
        text: Raw review text to summarize.
        prompt: Prefix string prepended to the cleaned text.
        **generate_kwargs: Forwarded to `model.generate` (e.g. max_length,
            min_length, num_beams, length_penalty).

    Returns:
        The decoded summary string with special tokens stripped.
    """
    cleaned_text = remove_pronouns(text)
    inputs = tokenizer.encode(prompt + cleaned_text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs, early_stopping=True, **generate_kwargs)
    # generate() returns a batch; there is a single input, so take the first sequence.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# Summarization function for blog-style summaries
def generate_blog_style_summary(text):
    """Return a long (150-300 generated tokens) blog-style summary of `text`."""
    return _summarize(
        text,
        _BLOG_PROMPT,
        max_length=300,
        min_length=150,
        num_beams=6,
        length_penalty=2.5,
    )


# Function for compact summary on highlights and issues
def generate_compact_summary(text):
    """Return a short (50-100 generated tokens) highlights/issues summary of `text`."""
    return _summarize(
        text,
        _COMPACT_PROMPT,
        max_length=100,
        min_length=50,
        num_beams=4,
        length_penalty=1.5,
    )

# Build one summary column per summarizer, in order: blog-style first, compact second.
# Each summarizer is applied to the joined review text of every category.
for summary_column, summarizer in (
    ('blog_style_summary', generate_blog_style_summary),
    ('compact_summary', generate_compact_summary),
):
    grouped_reviews[summary_column] = grouped_reviews['reviews.text'].apply(summarizer)


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Save blog-style and compact summaries to an HTML file.
# - The encoding is pinned to UTF-8 (and declared in the document) so the
#   result does not depend on the platform's default encoding.
# - Category names and generated summaries are HTML-escaped so characters such
#   as '<', '>' or '&' in the text cannot break the markup.
with open('product summaries_T5_v2.html', 'w', encoding='utf-8') as f:
    f.write('<html><head><meta charset="utf-8"></head><body>')

    for _, row in grouped_reviews.iterrows():
        category_name = html.escape(str(row['category_name']))
        blog_summary = html.escape(str(row['blog_style_summary']))
        compact_summary = html.escape(str(row['compact_summary']))

        f.write(f"<h2>Product name: {category_name}</h2>")

        f.write("<h3>Summary</h3>")
        f.write(f"<p>{blog_summary}</p>")

        # A literal ampersand must be written as an entity in HTML text.
        f.write("<h3>Highlights &amp; Issues</h3>")
        f.write(f"<p>{compact_summary}</p>")
        f.write('<hr>')

    f.write('</body></html>')

In [None]:
# Persist the model and its tokenizer side by side in a single directory.
# NOTE(review): the directory name says "large", but the checkpoint loaded in
# this notebook is 't5-base' - confirm the name before relying on it.
save_dir = "./summarizer-T5_large"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
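# VERSION 4 - modified version 3 (goes back to the blog-level summary and separates highlights & issues)
# NOTE: everything below is wrapped in a triple-quoted string, so this cell defines and runs nothing.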
""" 

# List of common pronouns to remove (including 'us', 'me', etc.)
pronouns = ['i', 'you', 'he', 'she', 'we', 'they', 'my', 'your', 'his', 'her', 'our', 'their', 'us', 'me', 'll', 'have']

# Function to remove pronouns
def remove_pronouns(text):
    text = re.sub(r'\b(?:{})\b'.format('|'.join(pronouns)), '', text)
    return text

# Generate blog-style summary (previous version you liked)
def generate_blog_style_summary(text):
    cleaned_text = remove_pronouns(text)
    input_text = "summarize: Focus on the product's features and exclude any personal mentions. " + cleaned_text
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    
    summary_ids = model.generate(inputs, max_length=300, min_length=150, num_beams=6, length_penalty=2.5, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    return summary

# Generate bullet points for highlights and issues based on labels
def generate_bullet_points(df, label):
    text = ' '.join(df[df['label'] == label]['reviews.text'].tolist())
    cleaned_text = remove_pronouns(text)
    
    if label == 2:  # Highlights
        input_text = "summarize: Provide 2-4 bullet points with product highlights. " + cleaned_text
    elif label == 0:  # Issues
        input_text = "summarize: Provide 2-4 bullet points with product issues. " + cleaned_text
    
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    
    summary_ids = model.generate(inputs, max_length=150, min_length=50, num_beams=8, length_penalty=1.5, early_stopping=True)
    bullet_points = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    return bullet_points

# Apply the summaries and bullet points generation
grouped_reviews['blog_style_summary'] = grouped_reviews['reviews.text'].apply(generate_blog_style_summary)
grouped_reviews['highlights'] = grouped_reviews.apply(lambda row: generate_bullet_points(df[df['category_name'] == row['category_name']], label=2), axis=1)
grouped_reviews['issues'] = grouped_reviews.apply(lambda row: generate_bullet_points(df[df['category_name'] == row['category_name']], label=0), axis=1)

# Save to HTML with bullet points for highlights and issues
with open('product_summaries_with_highlights_issues_v4.html', 'w') as f:
    f.write('<html><body>')
    
    for index, row in grouped_reviews.iterrows():
        f.write(f"<h2>Prodct: {row['category_name']}</h2>")
        
        f.write("<h3>Summary</h3>")
        f.write(f"<p>{row['blog_style_summary']}</p>")
        
        f.write("<h3>Highlights</h3>")
        f.write(f"<ul><li>{'</li><li>'.join(row['highlights'].split('. '))}</li></ul>")
        
        f.write("<h3>Issues</h3>")
        f.write(f"<ul><li>{'</li><li>'.join(row['issues'].split('. '))}</li></ul>")
        
        f.write('<hr>')
    
    f.write('</body></html>') """
