In [2]:
import pandas as pd
from transformers import BartForConditionalGeneration, BartTokenizer
import torch

# Step 1: Load the dataset and drop irrelevant columns
columns_to_drop = [
    'id', 'dateAdded', 'dateUpdated', 'asins', 'brand', 'imageURLs', 'keys',
    'manufacturer', 'manufacturerNumber', 'reviews.id', 'reviews.date',
    'reviews.dateSeen', 'sourceURLs',
    'reviews.doRecommend',  'reviews.numHelpful', 
    'reviews.sourceURLs', 'reviews.username'
]

# Load the dataset
df = pd.read_csv('./dataset/output_dataset_with_sentiment.csv')

# Drop irrelevant columns
df = df.drop(columns=columns_to_drop)

# Display the remaining columns
print("Remaining columns in the dataset:", df.columns)

# Step 2: Group reviews by (sentiment, product category) and concatenate the
# review texts of each group into one string.
# Rows without review text are skipped and every value is cast to str first:
# read_csv yields NaN (a float) for empty cells and may parse purely numeric
# reviews as numbers, and either would make ' '.join raise a TypeError.
reviews_with_text = df.dropna(subset=['reviews.text'])
grouped_reviews = (
    reviews_with_text
    .groupby(['sentiment', 'categories'])['reviews.text']
    .apply(lambda texts: ' '.join(texts.astype(str)))
    .reset_index()
)
grouped_reviews.columns = ['sentiment', 'category', 'combined_reviews']

# Step 3: Set up the pre-trained BART model and tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

# Step 4: Function to summarize reviews
def summarize_text(text, max_length=130, min_length=30):
    """Summarize ``text`` using the module-level ``tokenizer`` and ``model``.

    Args:
        text: The text to summarize. Input longer than 1024 tokens is
            truncated before generation.
        max_length: Maximum length of the generated summary, in tokens.
        min_length: Minimum length of the generated summary, in tokens.

    Returns:
        The decoded summary with special tokens removed, or an empty string
        when ``text`` is empty or contains only whitespace.
    """
    # Nothing to summarize: skip generation instead of producing a summary
    # of an empty input.
    if not text or not text.strip():
        return ""

    # The text is passed as-is. A "summarize: " task prefix is a T5
    # convention; BART summarization checkpoints take raw text, so a prefix
    # would only add noise and use up part of the 1024-token budget.
    encoded = tokenizer(text, return_tensors='pt', max_length=1024, truncation=True)

    # Keep the input tensors on the same device as the model so the function
    # works whether the model lives on CPU or has been moved to an accelerator.
    encoded = encoded.to(model.device)

    # Generate the summary with beam search
    summary_ids = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )

    # Decode the best beam and return it
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Step 5: Produce one summary per grouped row, announcing each category first
def _announce_and_summarize(category_label, review_blob):
    """Print a progress line for the category, then summarize its reviews."""
    print(f"Generating summary for category: {category_label}")
    return summarize_text(review_blob)


summaries = [
    _announce_and_summarize(category_label, review_blob)
    for category_label, review_blob in zip(
        grouped_reviews['category'], grouped_reviews['combined_reviews']
    )
]

# Attach the summaries as a new column alongside their groups
grouped_reviews['summary'] = summaries

# Step 6: Write the grouped reviews and their summaries to disk as CSV
output_file = 'summarized_reviews.csv'
grouped_reviews.to_csv(output_file, index=False)
print(f"Summaries generated and saved to {output_file}")

Remaining columns in the dataset: Index(['name', 'categories', 'primaryCategories', 'reviews.didPurchase',
       'reviews.rating', 'reviews.text', 'reviews.title', 'sentiment'],
      dtype='object')
Generating summary for category: AA,AAA,Electronics Features,Health,Electronics,Health & Household,Camcorder Batteries,Camera & Photo,Batteries,Household Batteries,Accessories,Camera Batteries,Health and Beauty,Household Supplies,Batteries & Chargers,Health, Household & Baby Care,Health Personal Care
Generating summary for category: AA,AAA,Health,Electronics,Health & Household,Camcorder Batteries,Camera & Photo,Batteries,Household Batteries,Robot Check,Accessories,Camera Batteries,Health and Beauty,Household Supplies,Batteries & Chargers,Health, Household & Baby Care,Health Personal Care
Generating summary for category: Amazon Echo,Home Theater & Audio,MP3 MP4 Player Accessories,Smart Speakers,Electronics,Portable Audio,Compact Radios Stereos,Smart Hubs & Wireless Routers,Featured Brands,