In [1]:
#!pip install transformers
#!pip install transformers huggingface_hub
import html
import re

import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [2]:
# Pick the compute device name: 'cuda' when a GPU is visible to PyTorch, otherwise 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:

# Load the tokenizer and the seq2seq model from the 't5-3b' checkpoint
tokenizer = T5Tokenizer.from_pretrained('t5-3b')
model = T5ForConditionalGeneration.from_pretrained('t5-3b')

# Alternative (disabled): the larger 't5-11b' checkpoint
#model = T5ForConditionalGeneration.from_pretrained('t5-11b')
#tokenizer = T5Tokenizer.from_pretrained('t5-11b')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-3b automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# LOAD the DATA (the output DS from K-means clustering)
file_path = 'categorized_dataset_k5_with_names.csv'
df = pd.read_csv(file_path)

# Data cleaning: remove rows with missing review text FIRST, then lowercase.
# Casting to str before filtering would turn NaN into the literal string "nan",
# so a later notnull() check could never drop anything and "nan" would end up
# inside the concatenated review text.
df = df.dropna(subset=['reviews.text']).copy()
df['reviews.text'] = df['reviews.text'].astype(str).str.lower()

# Group all reviews under each Category: one row per category_name, with the
# category's reviews joined into a single space-separated string
grouped_reviews = df.groupby('category_name')['reviews.text'].agg(' '.join).reset_index()

print(grouped_reviews.head())

        category_name                                       reviews.text
0           Fire HD 8  this product so far has not disappointed. my c...
1           Fire KIDS  the tablet is very light and streams well. i o...
2       Fire Tablet 7  good basic tablet for checking email , web bro...
3              Kindle  very lightweight and portable with excellent b...
4  Speakers/Streaming  i really enjoy the echo. i got an echo dot and...


In [5]:
# Clear the CUDA cache before the generation cell runs.
# NOTE(review): the visible cells never move the model to a GPU, so this is
# presumably a no-op in this notebook — confirm.
torch.cuda.empty_cache()

In [6]:
# VERSION 1

# Function to remove overlapping content
def remove_overlapping_content(summary, highlights, issues):
    """Drop highlight/issue sentences that repeat an earlier section.

    ``highlights`` and ``issues`` are split on ``'. '``. Every fragment is
    lower-cased and stripped *before* any comparison, so the overlap check is
    insensitive to case and surrounding whitespace. Empty fragments are
    discarded.

    Args:
        summary: Summary text; highlight/issue fragments found in it
            (substring match, case-insensitive) are removed.
        highlights: Sentence-like text separated by ``'. '``.
        issues: Sentence-like text separated by ``'. '``; fragments that also
            appear among the kept highlights (exact match after
            normalisation) are removed as well.

    Returns:
        tuple[str, str]: ``(highlights, issues)``, each re-joined with
        ``'. '`` from the surviving lower-cased fragments.
    """
    # Normalise the summary once; all fragments are compared against this form
    summary = summary.lower().strip()

    def _fragments(text):
        # Normalise first, then drop blanks left over from splitting
        normalised = (part.lower().strip() for part in text.split('. '))
        return [frag for frag in normalised if frag]

    kept_highlights = [frag for frag in _fragments(highlights) if frag not in summary]
    kept_issues = [
        frag for frag in _fragments(issues)
        if frag not in summary and frag not in kept_highlights
    ]

    return '. '.join(kept_highlights), '. '.join(kept_issues)

# Function to generate summary, highlights, and issues with distinct instructions
def generate_summary_highlights_issues(text, category_name):
    """Generate a summary, highlights and issues for one product category.

    Three prompts (summary, positive features, issues) are built from
    ``category_name`` and ``text`` and each is run through the module-level
    ``model`` / ``tokenizer``. Highlights and issues are then passed through
    ``remove_overlapping_content`` to remove repeated fragments.

    Note that every prompt is truncated to 1024 tokens before generation, so
    only the beginning of a long ``text`` is seen by the model.

    NOTE(review): the stored output for some categories echoes the prompt
    text; the stock T5 checkpoints are documented as using task prefixes such
    as "summarize: " — verify whether these free-form prompts are suitable.

    Args:
        text: Concatenated review text for the category.
        category_name: Category label interpolated into each prompt.

    Returns:
        tuple[str, str, str]: ``(summary, highlights, issues)``.
    """
    # Blog-style summary
    summary_prompt = f"Write a blog-style summary covering the specifications, features, and performance of a product in the category: {category_name}. Reviews: " + text
    summary = _generate_section(summary_prompt, max_length=500, length_penalty=2.0)

    # Highlights: positive aspects
    highlight_prompt = f"List 3-4 positive features and advantages of the product in the category: {category_name}. Reviews: " + text
    highlights = _generate_section(highlight_prompt, max_length=150, length_penalty=1.5)

    # Issues: negative aspects
    issue_prompt = f"List 2-3 issues or disadvantages of the product in the category: {category_name}. Reviews: " + text
    issues = _generate_section(issue_prompt, max_length=100, length_penalty=1.5)

    # Remove overlaps between sections
    highlights, issues = remove_overlapping_content(summary, highlights, issues)

    return summary, highlights, issues


def _generate_section(prompt, max_length, length_penalty, max_input_tokens=1024):
    """Encode ``prompt``, run beam-search generation and decode the first result."""
    input_ids = tokenizer.encode(
        prompt, return_tensors="pt", max_length=max_input_tokens, truncation=True
    )
    # Keep the input on the same device as the model so generation also works
    # when the model has been moved to a GPU
    input_ids = input_ids.to(model.device)
    output_ids = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=4,
        length_penalty=length_penalty,
        early_stopping=True,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Run the generator once per category row and collect the (summary, highlights, issues) triples
generated_sections = [
    generate_summary_highlights_issues(reviews_text, category)
    for category, reviews_text in zip(grouped_reviews['category_name'], grouped_reviews['reviews.text'])
]

# Transpose the list of triples into three parallel columns
blog_summaries, highlight_texts, issue_texts = zip(*generated_sections)
grouped_reviews['blog_summary'] = blog_summaries
grouped_reviews['highlights'] = highlight_texts
grouped_reviews['issues'] = issue_texts

# Inspect the generated summaries
print(grouped_reviews.head())


        category_name                                       reviews.text  \
0           Fire HD 8  this product so far has not disappointed. my c...   
1           Fire KIDS  the tablet is very light and streams well. i o...   
2       Fire Tablet 7  good basic tablet for checking email , web bro...   
3              Kindle  very lightweight and portable with excellent b...   
4  Speakers/Streaming  i really enjoy the echo. i got an echo dot and...   

                                        blog_summary  \
0  .: Fire HD 8.: Fire HD 8. Write a blog-style s...   
1  KIDS.               i   . i love it. i love it...   
2  this is a great tablet for the price. i got it...   
3  the kindle oasis is the smallest and lightest ...   
4  alexa is a great assistant. i use it all the t...   

                                          highlights  \
0  the kindle fire hd 8 is a great tablet. it's s...   
1                                                      
2                                     

In [7]:
# Save blog-style and compact summaries to an HTML file.
# The text originates from customer reviews and model output, so it is
# HTML-escaped before being embedded in markup. The file is written as UTF-8
# (and declares it) so non-ASCII characters do not depend on the platform's
# default encoding.
with open('T5_3B_summaries_v2.html', 'w', encoding='utf-8') as f:
    f.write('<html><head><meta charset="utf-8"></head><body>')

    for index, row in grouped_reviews.iterrows():
        f.write(f"<h2>Product: {html.escape(str(row['category_name']))}</h2>")

        f.write("<h3>Summary</h3>")
        f.write(f"<p>{html.escape(str(row['blog_summary']))}</p>")

        # Highlights and issues share the same '. '-separated list layout
        for heading, column in (("Highlights", "highlights"), ("Issues", "issues")):
            # Skip blank fragments so an empty section does not render an empty bullet
            items = [part.strip() for part in row[column].split('. ') if part.strip()]
            list_markup = ''.join(f"<li>{html.escape(item)}</li>" for item in items)
            f.write(f"<h3>{heading}</h3>")
            f.write(f"<ul>{list_markup}</ul>")

        f.write('<hr>')

    f.write('</body></html>')

In [8]:
# Persist the model weights and the tokenizer files side by side in one directory.
# NOTE(review): no fine-tuning happens in the visible cells, so this appears to
# store a copy of the checkpoint as loaded — confirm this is intended.
save_dir = "./summarizer-T5_3B"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./summarizer-T5_3B/tokenizer_config.json',
 './summarizer-T5_3B/special_tokens_map.json',
 './summarizer-T5_3B/spiece.model',
 './summarizer-T5_3B/added_tokens.json')

In [None]:
# Install Gradio, which the demo cells below import.
# NOTE(review): consider `%pip install` with a pinned version so the install
# targets the running kernel's environment and is reproducible.
!pip install gradio


In [None]:
import gradio as gr

# Function to load and return the HTML content
def display_html(html_path="T5_3B_summaries_v2.html"):
    """Read an HTML file and return its contents as a string.

    Args:
        html_path: Path of the HTML file to load. Defaults to the report file
            written earlier in this notebook, so calling the function with no
            arguments shows the generated summaries.

    Returns:
        str: The full text of the file, decoded as UTF-8.

    Raises:
        FileNotFoundError: If ``html_path`` does not exist (for example when
            the report cell has not been run yet).
    """
    with open(html_path, "r", encoding="utf-8") as file:
        return file.read()

# Build a Gradio app with no inputs whose single HTML output shows whatever display_html returns
demo = gr.Interface(
    fn=display_html,
    inputs=[],
    outputs=gr.HTML(),
)

# Start serving the app
demo.launch()

In [None]:
import gradio as gr

# Function to dynamically generate HTML based on input
def generate_html(category):
    """Return placeholder HTML for the given category.

    The markup is a heading containing ``category`` followed by a fixed
    placeholder paragraph; the real HTML generation logic is still to be
    supplied here.
    """
    page_template = "<h1>Product Category: {}</h1><p>Generated blog content goes here...</p>"
    return page_template.format(category)

# Build a Gradio app: a dropdown of category labels feeds generate_html, and the result is rendered as HTML.
# (This rebinds the name `demo` used by the previous cell.)
category_dropdown = gr.Dropdown(choices=["Category 1", "Category 2", "Category 3"])
demo = gr.Interface(
    fn=generate_html,
    inputs=category_dropdown,
    outputs=gr.HTML(),
)

# Start serving the app
demo.launch()