In [17]:
#!pip install transformers
#!pip install transformers huggingface_hub
import html
import re

import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [18]:
# Pick the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [19]:

# Load the pretrained 't5-base' checkpoint (note: t5-base, not T5-3B) and its
# matching tokenizer. Both objects are used as module-level globals by the
# summarization function later in the notebook.
model = T5ForConditionalGeneration.from_pretrained('t5-base')
tokenizer = T5Tokenizer.from_pretrained('t5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
# LOAD the DATA (the output DS from K-means clustering)
file_path = 'categorized_dataset_k5_with_names.csv'
df = pd.read_csv(file_path)

# Data cleaning: drop rows with a missing review FIRST, then lowercase.
# Casting to str before the null check turns NaN into the literal string
# 'nan', which notnull() can no longer detect, so the placeholder would leak
# into the concatenated text below. .copy() makes the filtered frame
# independent so the column assignment does not hit a view of the raw frame.
df = df.dropna(subset=['reviews.text']).copy()
df['reviews.text'] = df['reviews.text'].astype(str).str.lower()

# Group all reviews under each Category: one row per category_name whose
# 'reviews.text' holds every review of that category joined by single spaces.
grouped_reviews = df.groupby('category_name')['reviews.text'].apply(' '.join).reset_index()

print(grouped_reviews.head())

        category_name                                       reviews.text
0           Fire HD 8  this product so far has not disappointed. my c...
1           Fire KIDS  the tablet is very light and streams well. i o...
2       Fire Tablet 7  good basic tablet for checking email , web bro...
3              Kindle  very lightweight and portable with excellent b...
4  Speakers/Streaming  i really enjoy the echo. i got an echo dot and...


In [21]:
# Clear the CUDA cache: ask PyTorch to release the unused GPU memory held by
# its caching allocator before the generation cell runs.
torch.cuda.empty_cache()

In [22]:
# FROM T5 BASE - 11:30AM

# Words stripped from the review text before summarization: common personal
# pronouns plus 'll' and 'have' (presumably to catch contraction leftovers and
# first-person phrasing such as "i'll" / "i have" -- TODO confirm intent).
pronouns = ['i', 'you', 'he', 'she', 'we', 'they', 'my', 'your', 'his', 'her', 'our', 'their', 'us', 'me', 'll', 'have']

# Compiled once at definition time instead of being rebuilt on every call.
# re.escape keeps every list entry literal, \b restricts matches to whole
# words (so 'he' does not hit 'the'), and IGNORECASE also catches "I" / "My"
# when the caller has not lowercased the text.
_PRONOUN_PATTERN = re.compile(
    r'\b(?:{})\b'.format('|'.join(re.escape(word) for word in pronouns)),
    re.IGNORECASE,
)


# Function to remove pronouns
def remove_pronouns(text):
    """Return ``text`` with every whole-word entry of ``pronouns`` removed.

    Matching is case-insensitive. Runs of spaces left behind by the removed
    words are squeezed to a single space and the result is stripped of
    leading/trailing whitespace. Punctuation is left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (empty if nothing but listed words remained).
    """
    without_pronouns = _PRONOUN_PATTERN.sub('', text)
    # Each removed word leaves its surrounding spaces behind; collapse them.
    return re.sub(r' {2,}', ' ', without_pronouns).strip()

# Summarization function for blog-style summaries
def generate_blog_style_summary(text, max_input_length=512, max_length=300,
                                min_length=150, num_beams=6, length_penalty=2.5):
    """Generate a blog-style summary of ``text`` with the global T5 model.

    The text is first passed through ``remove_pronouns``, prefixed with the
    summarization prompt, tokenized with the global ``tokenizer`` and decoded
    from a beam search run on the global ``model``.

    Note that the prompt plus text is truncated to ``max_input_length``
    tokens, so for long inputs only the leading portion is summarized.

    Args:
        text: Review text to summarize.
        max_input_length: Token budget for the encoded prompt + text.
        max_length: Upper bound on the generated summary length (tokens).
        min_length: Lower bound on the generated summary length (tokens).
        num_beams: Beam width passed to ``model.generate``.
        length_penalty: Length penalty passed to ``model.generate``.

    Returns:
        The decoded summary string, or "" when nothing is left to summarize
        after cleaning.
    """
    cleaned_text = remove_pronouns(text)
    if not cleaned_text.strip():
        # With no review content the model would only be summarizing the
        # prompt itself while being forced to emit min_length tokens.
        return ""

    input_text = "summarize: write a blog-style summary about the product features and exclude any personal mentions. " + cleaned_text
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=max_input_length, truncation=True)
    # Keep the input ids on the same device as the model weights so the call
    # works whether the model lives on the CPU or has been moved to a GPU.
    inputs = inputs.to(model.device)

    summary_ids = model.generate(
        inputs,
        max_length=max_length,
        min_length=min_length,
        num_beams=num_beams,
        length_penalty=length_penalty,
        early_stopping=True,
    )
    # Only one sequence is generated per call; decode it without <pad>/</s>.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Produce one blog-style summary per category from its concatenated reviews
review_texts = grouped_reviews['reviews.text']
grouped_reviews['blog_style_summary'] = review_texts.map(generate_blog_style_summary)


In [24]:
# Preview the first rows of the grouped frame as plain text
print(grouped_reviews.head())

        category_name                                       reviews.text  \
0           Fire HD 8  this product so far has not disappointed. my c...   
1           Fire KIDS  the tablet is very light and streams well. i o...   
2       Fire Tablet 7  good basic tablet for checking email , web bro...   
3              Kindle  very lightweight and portable with excellent b...   
4  Speakers/Streaming  i really enjoy the echo. i got an echo dot and...   

                                  blog_style_summary  
0  amazon fire 8 is a great tablet for e-reading ...  
1  this is the second amazon fire 7 tablet purcha...  
2  this is a great tablet for kids 6 and older. i...  
3  the kindle oasis is the smallest of all the ki...  
4  the echo dot has the same capability as the fu...  


In [25]:
# Save the final summaries to a CSV file
grouped_reviews.to_csv("T5-base_summary_prefinal.csv", index=False)

# Write the summaries to an HTML file: one <h2>/<p> pair per category,
# separated by <hr>.
# - encoding="utf-8" is explicit because the loader further down reads the
#   file back as UTF-8; relying on the platform default can corrupt
#   non-ASCII characters.
# - html.escape (quote=False: only &, < and >) keeps generated text from
#   being parsed as markup, which would break the heading/paragraph pairing
#   when the file is parsed again.
with open("T5-base_summary_prefinal.html", "w", encoding="utf-8") as f:
    for category, summary in zip(grouped_reviews['category_name'], grouped_reviews['blog_style_summary']):
        f.write(f"<h2>Product: {html.escape(str(category), quote=False)}</h2>\n")
        f.write(f"<p>{html.escape(str(summary), quote=False)}</p>\n")
        f.write("<hr>\n")


In [12]:
# Persist the model weights and the tokenizer files side by side in one folder
save_dir = "./summarizer-T5_Base_Prefinal"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./summarizer-T5_Base_v0/tokenizer_config.json',
 './summarizer-T5_Base_v0/special_tokens_map.json',
 './summarizer-T5_Base_v0/spiece.model',
 './summarizer-T5_Base_v0/added_tokens.json')

In [1]:
#!pip install gradio
#!pip install gradio beautifulsoup4

In [4]:
import gradio as gr
from bs4 import BeautifulSoup

# Function to load and parse the HTML file
def load_html_file(file_path):
    """Read an HTML file and map each <h2> text to a <p> text.

    All <h2> elements and all <p> elements are collected in document order
    and paired positionally; pairing stops at the shorter of the two lists,
    and a repeated heading keeps the last paragraph paired with it.

    Args:
        file_path: Path of the UTF-8 encoded HTML file to read.

    Returns:
        dict mapping heading text to paragraph text.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        soup = BeautifulSoup(handle.read(), 'html.parser')

    headings = soup.find_all('h2')
    paragraphs = soup.find_all('p')
    return {
        heading.get_text(): paragraph.get_text()
        for heading, paragraph in zip(headings, paragraphs)
    }

# Load the HTML content.
# The name must match the file produced by the export cell
# ("T5-base_summary_prefinal.html"); the previous value
# "T5-base_summaries_Prefinal.html" names a file this notebook never writes,
# so a fresh Restart & Run All could not find it.
html_file_path = "T5-base_summary_prefinal.html"  # Path to your HTML file
data = load_html_file(html_file_path)

# Gradio callback: look up the summary stored for the selected category
def show_summary(category):
    """Return the summary stored in ``data`` for ``category``.

    Falls back to a fixed placeholder message when the category has no entry.
    """
    if category in data:
        return data[category]
    return "Summary not available."

# Gradio interface
category_list = [*data]  # List of categories (the keys of the parsed HTML)

# Components use the top-level gr.Dropdown / gr.Textbox classes (Gradio 3.x+)
product_dropdown = gr.Dropdown(choices=category_list, label="Select Product")
summary_box = gr.Textbox()

demo = gr.Interface(
    fn=show_summary,
    inputs=product_dropdown,
    outputs=summary_box,
    title="Product Review",
    description="Select a product to see the review.",
)
# share=True additionally requests a public share link for the app
demo.launch(share=True)


* Running on local URL:  http://127.0.0.1:7862
* Running on public URL: https://59f658175f598076ec.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


