# Summarizer with T5-Base model!

In [1]:
#!pip install transformers
#!pip install transformers huggingface_hub
import html
import re

import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [2]:
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
# NOTE(review): no cell visible in this notebook references `device` afterwards —
# confirm whether the model and inputs were meant to be moved onto it.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# Load the T5-base model and tokenizer (the checkpoint id below is "t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")
tokenizer = T5Tokenizer.from_pretrained("t5-base")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# LOAD the DATA (the output DS from K-means clustering)
file_path = "categorized_dataset_k5_with_names.csv"
df = pd.read_csv(file_path)

# Data cleaning: drop rows without review text FIRST, then normalise the text
# to lowercase strings. Casting to str before filtering would turn missing
# values into the literal string "nan", which notnull() can no longer detect.
df = df[df["reviews.text"].notnull()].copy()
df["reviews.text"] = df["reviews.text"].astype(str).str.lower()

# Group all reviews under each Category (one space-joined string per category)
grouped_reviews = (
    df.groupby("category_name")["reviews.text"]
    .apply(lambda texts: " ".join(texts))
    .reset_index()
)

print(grouped_reviews.head())

        category_name                                       reviews.text
0           Fire HD 8  this product so far has not disappointed. my c...
1           Fire KIDS  the tablet is very light and streams well. i o...
2       Fire Tablet 7  good basic tablet for checking email , web bro...
3              Kindle  very lightweight and portable with excellent b...
4  Speakers/Streaming  i really enjoy the echo. i got an echo dot and...


In [5]:
# Clear the CUDA cache: ask PyTorch to release cached GPU memory it is not
# currently using before the summarization cell below runs.
torch.cuda.empty_cache()

In [6]:
# FROM T5 BASE

# Lowercase words that get stripped from the review text: personal pronouns
# plus the fragments "ll" and "have".
pronouns = "i you he she we they my your his her our their us me ll have".split()


# Function to remove pronouns
def remove_pronouns(text):
    """Return ``text`` with every whole-word match of an entry in ``pronouns`` deleted.

    Matching is case-sensitive and uses word boundaries, so only standalone
    words are removed. The whitespace around a removed word is left in place.
    """
    alternatives = "|".join(pronouns)
    pronoun_pattern = r"\b(?:" + alternatives + r")\b"
    return re.sub(pronoun_pattern, "", text)


# Summarization function for blog-style summaries
def generate_blog_style_summary(text):
    """Generate a long, blog-style summary of ``text`` with the loaded T5 model.

    Args:
        text: Review text to summarize (the notebook passes all reviews of one
            category joined into a single string).

    Returns:
        The decoded summary string, or ``""`` when ``text`` is empty or only
        whitespace. Without that guard, beam search with ``min_length=150``
        would still emit at least 150 tokens based on the instruction prefix
        alone.
    """
    if not text or not text.strip():
        return ""

    cleaned_text = remove_pronouns(text)
    input_text = (
        "summarize: write a blog-style summary about the product features and exclude any personal mentions. "
        + cleaned_text
    )
    # The encoded input (instruction prefix included) is truncated to 512 tokens.
    inputs = tokenizer.encode(
        input_text, return_tensors="pt", max_length=512, truncation=True
    )
    # Keep the input on the same device as the model so generation keeps
    # working if the model is moved to a GPU.
    inputs = inputs.to(model.device)

    summary_ids = model.generate(
        inputs,
        max_length=300,
        min_length=150,
        num_beams=6,
        length_penalty=2.5,
        early_stopping=True,
    )
    # generate() returns a batch; there is exactly one input, so take row 0.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# Generate the blog-style summary for every category
review_texts = grouped_reviews["reviews.text"]
grouped_reviews["blog_style_summary"] = review_texts.apply(generate_blog_style_summary)

In [7]:
# Preview the first rows, which now include the blog_style_summary column.
print(grouped_reviews.head())

        category_name                                       reviews.text  \
0           Fire HD 8  this product so far has not disappointed. my c...   
1           Fire KIDS  the tablet is very light and streams well. i o...   
2       Fire Tablet 7  good basic tablet for checking email , web bro...   
3              Kindle  very lightweight and portable with excellent b...   
4  Speakers/Streaming  i really enjoy the echo. i got an echo dot and...   

                                  blog_style_summary  
0  amazon fire 8 inch tablet is great for e-readi...  
1  this is the second amazon fire 7 tablet purcha...  
2  great basic tablet for checking email, web bro...  
3  the kindle oasis is very tiny, portable & fits...  
4  the echo dot has the same capability as the fu...  


In [8]:
# Save the final summaries to a CSV file
# grouped_reviews.to_csv("T5-base_summary_prefinal.csv", index=False)

# Write the summaries to an HTML file.
# The file is written as UTF-8 explicitly because it is read back with
# encoding="utf-8" later in the notebook, and the text is HTML-escaped so that
# characters such as "&" or "<" inside a summary cannot corrupt the markup.
with open("T5-base_summary_prefinal_1.html", "w", encoding="utf-8") as f:
    for _, row in grouped_reviews.iterrows():
        f.write(f"<h2>Product: {html.escape(str(row['category_name']))}</h2>\n")
        f.write(f"<p>{html.escape(str(row['blog_style_summary']))}</p>\n")
        f.write("<hr>\n")

In [9]:
# Save the model and tokenizer into the same local directory.
# NOTE(review): no fine-tuning happens in the cells visible here, so this looks
# like a local copy of the pretrained "t5-base" checkpoint — confirm intent.
model.save_pretrained("./summarizer-T5_Base_Prefinal")
tokenizer.save_pretrained("./summarizer-T5_Base_Prefinal")

('./summarizer-T5_Base_Prefinal/tokenizer_config.json',
 './summarizer-T5_Base_Prefinal/special_tokens_map.json',
 './summarizer-T5_Base_Prefinal/spiece.model',
 './summarizer-T5_Base_Prefinal/added_tokens.json')

In [10]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import re
import torch

# Load T5-Base model and tokenizer
# NOTE(review): this loads the same "t5-base" checkpoint as the first loading
# cell and rebinds the global `tokenizer` / `model` names.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
# NOTE(review): `device` is not referenced by any cell visible in this notebook.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [12]:
# Words stripped from the review text before summarization: personal pronouns
# plus the fragments "ll" and "have".
pronouns = "i you he she we they my your his her our their us me ll have".split()


def remove_pronouns(text, pronouns):
    """Delete every whole-word, case-insensitive occurrence of ``pronouns`` from ``text``.

    Args:
        text: The string to clean.
        pronouns: Iterable of words to remove. Each entry is matched literally:
            regex metacharacters inside an entry are escaped before the
            pattern is built.

    Returns:
        ``text`` with each matched word replaced by the empty string. The
        surrounding whitespace is not collapsed. ``text`` is returned unchanged
        when ``pronouns`` is empty.
    """
    escaped_words = [re.escape(word) for word in pronouns]
    if not escaped_words:
        return text
    # \b on both sides restricts matches to standalone words.
    pattern = r"\b(?:" + "|".join(escaped_words) + r")\b"
    return re.sub(pattern, "", text, flags=re.IGNORECASE)


def generate_summary(text, prompt="summarize:"):
    """Summarize ``text`` with the loaded T5 model.

    Args:
        text: Review text to summarize.
        prompt: Task prefix placed in front of the cleaned text.

    Returns:
        The decoded summary string, or ``""`` when there is nothing to
        summarize (``text`` is empty/whitespace, or becomes so once the
        pronouns are removed). Without that guard the model would generate a
        summary from the prompt alone, e.g. for a category that has no reviews
        with the requested label.
    """
    if not text or not text.strip():
        return ""

    # Clean the text by removing pronouns
    text = remove_pronouns(text, pronouns)
    if not text.strip():
        return ""

    # Create input prompt for T5
    input_text = f"{prompt} {text}"
    # NOTE(review): max_length=1024 is above the 512-token limit mentioned in
    # the tokenizer warning printed when the model is loaded — confirm this is
    # intentional.
    inputs = tokenizer.encode(
        input_text, return_tensors="pt", max_length=1024, truncation=True
    )
    # Keep the input on the same device as the model so generation keeps
    # working if the model is moved to a GPU.
    inputs = inputs.to(model.device)
    summary_ids = model.generate(
        inputs, max_length=150, num_beams=4, length_penalty=1.5, early_stopping=True
    )
    # generate() returns a batch; there is exactly one input, so take row 0.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# Helper: collect the review text of one label within one category
def _join_reviews(category_df, label):
    """Join the non-null review texts of the ``category_df`` rows whose ``label`` matches."""
    texts = category_df.loc[category_df["label"] == label, "reviews.text"]
    # Missing reviews are read from the CSV as NaN (a float) and would make
    # str.join raise TypeError, so they are dropped before joining.
    return " ".join(texts.dropna().astype(str))


# Function to generate general and issue summaries per category
def generate_summaries(df):
    """Build one general summary and one issues summary per category.

    Args:
        df: DataFrame with the columns ``category_name``, ``label`` and
            ``reviews.text``.

    Returns:
        A DataFrame with the columns ``category_name``, ``general_summary``
        and ``issues_summary``, one row per distinct ``category_name`` in
        order of first appearance. The columns are present even when ``df``
        has no rows.
    """
    columns = ["category_name", "general_summary", "issues_summary"]
    results = []

    for category in df["category_name"].unique():
        category_df = df[df["category_name"] == category]

        # General summary (label = 2)
        general_summary = generate_summary(_join_reviews(category_df, 2))

        # Issue summary (label = 0)
        issue_summary = generate_summary(
            _join_reviews(category_df, 0), prompt="summarize the issues:"
        )

        results.append(
            {
                "category_name": category,
                "general_summary": general_summary,
                "issues_summary": issue_summary,
            }
        )

    return pd.DataFrame(results, columns=columns)


# Load data for processing
# `file_path` comes from the earlier data-loading cell. Re-reading the CSV
# rebinds `df` to the raw file contents, i.e. without the lowercasing and
# null filtering applied in that cell.
df = pd.read_csv(file_path)
summaries_df = generate_summaries(df)

In [13]:
# Save to HTML.
# The file is written as UTF-8 explicitly because it is read back with
# encoding="utf-8" later in the notebook, and the text is HTML-escaped so that
# characters such as "&" or "<" inside a summary cannot corrupt the markup.
with open("T5-base_summary_prefinal_2.html", "w", encoding="utf-8") as f:
    for _, row in summaries_df.iterrows():
        f.write(f"<h2>Product: {html.escape(str(row['category_name']))}</h2>\n")
        f.write(
            f"<h3>Highlights:</h3>\n<p>{html.escape(str(row['general_summary']))}</p>\n"
        )
        f.write(
            f"<h3>Issues:</h3>\n<p>{html.escape(str(row['issues_summary']))}</p>\n"
        )
        f.write("<hr>\n")

In [14]:
from bs4 import BeautifulSoup


# Function to load HTML content and parse it
def load_html_file(file_path):
    """Read the UTF-8 file at ``file_path`` and return it parsed as a BeautifulSoup tree."""
    with open(file_path, "r", encoding="utf-8") as handle:
        markup = handle.read()
    soup = BeautifulSoup(markup, "html.parser")
    return soup


# Load the two HTML files
general_soup = load_html_file("T5-base_summary_prefinal_1.html")  # General summaries
issues_soup = load_html_file("T5-base_summary_prefinal_2.html")  # Highlights and issues

# Create a dictionary from the highlights and issues file, keyed by the <h2>
# heading text. `string=` replaces the deprecated `text=` filter argument that
# triggered the warning printed under this cell.
issues_dict = {}
for h2 in issues_soup.find_all("h2"):
    category = h2.get_text()
    highlights = h2.find_next("h3", string="Highlights:").find_next("p").get_text()
    issues = h2.find_next("h3", string="Issues:").find_next("p").get_text()
    issues_dict[category] = {"highlights": highlights, "issues": issues}

# Combine the content by matching category_name.
# get_text() returns unescaped text, so every value is escaped again before it
# is embedded in the new markup.
html_parts = ["<html><body>\n"]
for h2 in general_soup.find_all("h2"):
    category = h2.get_text()
    summary = h2.find_next("p").get_text()

    html_parts.append(f"<h2>{html.escape(category)}</h2>\n")
    html_parts.append(f"<h3>General Summary:</h3>\n<p>{html.escape(summary)}</p>\n")

    # Add highlights and issues if available
    if category in issues_dict:
        entry = issues_dict[category]
        html_parts.append(
            f"<h3>Highlights:</h3>\n<p>{html.escape(entry['highlights'])}</p>\n"
        )
        html_parts.append(f"<h3>Issues:</h3>\n<p>{html.escape(entry['issues'])}</p>\n")

    html_parts.append("<hr>\n")

html_parts.append("</body></html>")
final_html = "".join(html_parts)

# Save the combined result to a new HTML file
with open(
    "T5_Base_Final_Product_Summary.html", "w", encoding="utf-8"
) as f:  # Consolidated output HTML file
    f.write(final_html)

print("Final HTML output created successfully.")

Final HTML output created successfully.


  highlights = h2.find_next("h3", text="Highlights:").find_next("p").get_text()
  issues = h2.find_next("h3", text="Issues:").find_next("p").get_text()


In [1]:
import gradio as gr
from bs4 import BeautifulSoup


# Function to load and parse the HTML file
def load_html_file(file_path):
    """Parse the consolidated summary HTML into one record per product.

    Every ``<h2>`` starts a product section. The ``<p>`` elements that follow
    it, up to the next ``<h2>``, are read in document order as general summary,
    highlights and issues. A section with fewer than three paragraphs gets
    ``"Not available"`` for the missing fields instead of picking up the next
    product's paragraphs (or failing when no further ``<p>`` exists).

    Args:
        file_path: Path of the UTF-8 encoded HTML file.

    Returns:
        A list of dicts with the keys ``product``, ``general_summary``,
        ``highlights`` and ``issues``.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, "html.parser")

    products = []
    for product in soup.find_all("h2"):
        product_name = product.get_text().replace("Product: ", "").strip()

        # Collect this section's paragraphs, stopping at the next heading.
        paragraphs = []
        for node in product.find_all_next(["h2", "p"]):
            if node.name == "h2":
                break
            paragraphs.append(node.get_text().strip())

        # Pad to three entries so the unpacking below always succeeds.
        paragraphs += ["Not available"] * (3 - len(paragraphs))
        general_summary, highlights, issues = paragraphs[:3]

        products.append(
            {
                "product": product_name,
                "general_summary": general_summary,
                "highlights": highlights,
                "issues": issues,
            }
        )

    return products


# Load the HTML content: parse the consolidated summary file into a list of
# per-product dicts that the lookup function below searches.
html_file_path = "T5_Base_Final_Product_Summary.html"  # Consolidated File
products_data = load_html_file(html_file_path)


# Gradio function to return the summary based on the selected product
def show_product_details(product_name):
    """Look up ``product_name`` in ``products_data``.

    Returns a ``(general_summary, highlights, issues)`` tuple for the first
    matching entry, or three ``"Not available"`` strings when nothing matches.
    """
    match = next(
        (entry for entry in products_data if entry["product"] == product_name),
        None,
    )
    if match is None:
        return "Not available", "Not available", "Not available"
    return match["general_summary"], match["highlights"], match["issues"]


# Gradio interface setup
product_list = [
    product["product"] for product in products_data
]  # List of product names

# One dropdown input (the product name) feeds show_product_details, whose three
# return values fill the three text boxes in order.
# NOTE(review): share=True also publishes the app under a public gradio.live
# URL (see the output below) — confirm that exposing it is intended.
gr.Interface(
    fn=show_product_details,
    inputs=gr.Dropdown(choices=product_list, label="Select Product"),
    outputs=[
        gr.Textbox(label="General Summary"),
        gr.Textbox(label="Highlights"),
        gr.Textbox(label="Issues"),
    ],
    title="Product Review Summaries",
    description="Select a product to view the general summary, highlights, and issues.",
).launch(share=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://583035d64525dd8799.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


