# Downloading the Dataset from Kaggle
The dataset is `amazon-product-reviews`, available at https://www.kaggle.com/datasets/arhamrumi/amazon-product-reviews



In [1]:
from google.colab import files
files.upload()  # Upload the kaggle.json file
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json  # Secure the API key
!pip install kaggle
!kaggle datasets list
!kaggle datasets download -d arhamrumi/amazon-product-reviews
import zipfile

# Unzip the dataset
!unzip amazon-product-reviews.zip -d amazon_reviews


Saving kaggle.json to kaggle.json
ref                                                        title                                                  size  lastUpdated                 downloadCount  voteCount  usabilityRating  
---------------------------------------------------------  -----------------------------------------------  ----------  --------------------------  -------------  ---------  ---------------  
adilshamim8/student-depression-dataset                     Student Depression Dataset                           467020  2025-03-13 03:12:30.423000          15414        243  1.0              
zahidmughal2343/amazon-sales-2025                          Amazon Sales 2025                                      3617  2025-04-03 22:08:13.607000           2613         40  1.0              
atharvasoundankar/chocolate-sales                          Chocolate Sales Data 📊🍫                               14473  2025-03-19 03:51:40.270000          23296        369  1.0              
atharv

In [2]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=f8a9f80692a99c322e85a178f8a384ecb15af722552b611e6e0f9111f7aeee0a
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [3]:
!pip install -U transformers


Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.1
    Uninstalling transformers-4.51.1:
      Successfully uninstalled transformers-4.51.1
Successfully installed transformers-4.51.3


In [4]:
!pip install datasets
!pip install evaluate
!pip install warnings

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

# Importing the Libraries

In [5]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import datasets
import evaluate
from datasets import Dataset
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore")

**Download** ALL necessary NLTK data with explicit download commands

In [6]:
# Download the 'punkt' tokenizer package into the local nltk_data directory.
# NOTE(review): the metric code visible in this notebook splits sentences with
# custom_sent_tokenize rather than nltk's sent_tokenize, so this download may
# be unnecessary — confirm before removing.
print("Downloading NLTK data...")
nltk.download('punkt')

Downloading NLTK data...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

**Check** for GPU availability

In [7]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


# Load the dataset

In [8]:
print("Loading dataset...")
# The archive is extracted into ./amazon_reviews relative to the working
# directory, so read it from the same relative location instead of a
# hard-coded absolute /content/... path.
# Only the two columns used by the summarizer are read; rows missing either
# value are dropped.
df = pd.read_csv("amazon_reviews/Reviews.csv", usecols=["Text", "Summary"]).dropna()
print(f"Total dataset size: {len(df)} rows")

Loading dataset...
Total dataset size: 568427 rows


**Clean** and preprocess text

In [9]:
def clean_text(text):
    """Normalize a review string.

    Replaces HTML line-break tags (``<br>``, ``<br/>``, ``<br />``) with a
    space, collapses every run of whitespace into a single space, and trims
    leading/trailing whitespace. Other HTML tags are left untouched.
    """
    # Order matters: break tags become spaces first, then all whitespace runs
    # (including the ones just introduced) are collapsed.
    for pattern in (r'<br\s*/?>', r'\s+'):
        text = re.sub(pattern, ' ', text)
    return text.strip()

# Preprocess the dataset

In [10]:
print("Preprocessing dataset...")
# Run the same cleaning routine over both text columns.
for column in ('Text', 'Summary'):
    df[column] = df[column].apply(clean_text)

Preprocessing dataset...


**Remove** very short reviews and summaries (likely noise)

In [11]:
# Whitespace-delimited word counts per row for each column.
text_word_counts = df['Text'].str.split().str.len()
summary_word_counts = df['Summary'].str.split().str.len()

# Keep rows whose review has at least 5 words and whose summary has at least 2.
long_enough = (text_word_counts >= 5) & (summary_word_counts >= 2)
df = df[long_enough]

# Sample 10,000 rows for the experiment (split into train, validation, and test sets below)

In [12]:
# Draw a 10,000-row random subset; the fixed random_state keeps the sample
# identical across runs.
sampled_df = df.sample(n=10000, random_state=42)

# Split into train, validation, and test sets

In [13]:
# Hold out 4,000 rows first, then halve that hold-out into validation and test.
train_df, temp_df = train_test_split(sampled_df, test_size=4000, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=2000, random_state=42)

# Report the size of every split.
for split_label, split_frame in (("Training", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{split_label} set: {len(split_frame)} rows")

Training set: 6000 rows
Validation set: 2000 rows
Test set: 2000 rows


# Choosing the BART-base model, which is smaller and faster than BART-large and easier to fine-tune with limited data

In [14]:
# Load the pretrained checkpoint named below together with its matching
# tokenizer; both are fetched by the same model_name.
model_name = "facebook/bart-base" # Smaller than bart-large models
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

**Maximum** lengths for input and output (reduced to save memory)

In [15]:
# Token budgets passed as max_length (with truncation=True) to the tokenizer.
MAX_INPUT_LENGTH = 384  # cap for the review text fed to the model
MAX_TARGET_LENGTH = 48  # cap for reference summaries; also the default generation max_length

**Prepare** the dataset for training

In [16]:
def preprocess_function(examples):
    """Tokenize a batch of reviews and their reference summaries.

    Parameters:
    examples: batch mapping with a "Text" list (model inputs) and a
        "Summary" list (targets), as supplied by Dataset.map(batched=True).

    Returns:
    The tokenized inputs with an added "labels" entry holding the token ids
    of the summaries. Inputs are truncated to MAX_INPUT_LENGTH tokens and
    targets to MAX_TARGET_LENGTH tokens; no padding is applied here.
    """
    inputs = examples["Text"]
    targets = examples["Summary"]

    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # Tokenize targets. `text_target=` replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager; a separate call is
    # used so the targets get their own, shorter max_length.
    labels = tokenizer(text_target=targets, max_length=MAX_TARGET_LENGTH, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

**Convert** dataframes to HuggingFace datasets

In [17]:
# Wrap the pandas train/validation splits as Hugging Face Dataset objects so
# they can be tokenized with Dataset.map() in the next cell.
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

**Apply** preprocessing

In [18]:
print("Tokenizing datasets...")
# Tokenize the training split first, then the validation split, in batches.
train_tokenized, val_tokenized = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset)
)

Tokenizing datasets...


Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [19]:
# Data collator: built from the tokenizer and model and handed to the
# Seq2SeqTrainer as `data_collator`. The tokenization step does not pad, so
# batching variable-length examples is left to this object.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [20]:
# ROUGE metric object; its .compute() is called by compute_metrics and by the
# manual test-set evaluation loop.
rouge = evaluate.load("rouge")


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

**Custom** function for sentence splitting

In [21]:
# Custom function for sentence splitting that doesn't rely on nltk's sent_tokenize
def custom_sent_tokenize(text):
    """Split text into sentences at whitespace that follows '.', '!' or '?'.

    The terminator stays attached to the sentence it ends. Pieces that are
    empty or whitespace-only are dropped, so an empty input yields [].
    """
    # The look-behind keeps the punctuation; only the whitespace is consumed.
    pieces = re.split(r'(?<=[.!?])\s+', text)
    # str.strip returns "" (falsy) for blank pieces, filtering them out.
    return list(filter(str.strip, pieces))

def compute_metrics(eval_pred):
    """Compute ROUGE scores and mean generated length for an evaluation pass.

    Parameters:
    eval_pred: pair ``(predictions, labels)`` of token-id arrays as handed
        over by the trainer. ``predictions`` may also be a tuple whose first
        element holds the generated ids.

    Returns:
    dict mapping each ROUGE key, plus "gen_len" (mean whitespace-delimited
    word count of the decoded predictions), to a value rounded to 4 decimals.
    """
    # Get predictions and labels
    predictions, labels = eval_pred

    # Some models return extra outputs alongside the generated ids; keep the ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
    # decoded. It can appear in predictions as well as labels (likely the
    # source of the OverflowError this notebook works around at test time),
    # so map it to the pad token in both arrays before decoding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode predictions and references
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Use our custom tokenizer instead of nltk's: one sentence per line.
    decoded_preds = ["\n".join(custom_sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(custom_sent_tokenize(label.strip())) for label in decoded_labels]

    # Compute ROUGE scores
    result = rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    # Add mean generated length
    prediction_lens = [len(pred.split()) for pred in decoded_preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

**Training** arguments and Trainer

In [22]:
# Training arguments
# NOTE(review): no evaluation strategy is passed here and the recorded training
# log shows only the training loss, so eval_dataset / compute_metrics appear
# not to be exercised during training — TODO confirm and enable if intended.
training_args = Seq2SeqTrainingArguments(
    output_dir="./amazon-review-summarizer-small",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=1,                 # keep only the most recent checkpoint
    num_train_epochs=4,
    predict_with_generate=True,         # evaluate/predict via generate()
    fp16=torch.cuda.is_available(),     # mixed precision only when a GPU is present
    gradient_accumulation_steps=2,      # effective batch of 8 per device
    report_to="none"
)

# Trainer
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
try:
    # Newer transformers releases take the tokenizer as `processing_class`;
    # the `tokenizer=` keyword is deprecated there.
    trainer = Seq2SeqTrainer(processing_class=tokenizer, **trainer_kwargs)
except TypeError:
    # Older releases reject `processing_class` as an unexpected keyword and
    # only know the original `tokenizer=` argument.
    trainer = Seq2SeqTrainer(tokenizer=tokenizer, **trainer_kwargs)


# Fine-tune the model

In [23]:
# Run fine-tuning. trainer.train() is left as the last expression so the cell
# displays the returned TrainOutput (global step, training loss, run metrics).
print("Fine-tuning the model...")
trainer.train()

Fine-tuning the model...


Step,Training Loss
500,3.3414
1000,2.8667
1500,2.6053
2000,2.1798
2500,2.0203
3000,1.8725


TrainOutput(global_step=3000, training_loss=2.4809959920247397, metrics={'train_runtime': 631.9123, 'train_samples_per_second': 37.98, 'train_steps_per_second': 4.747, 'total_flos': 2668692624998400.0, 'train_loss': 2.4809959920247397, 'epoch': 4.0})

# Save the fine-tuned model

In [24]:
# Persist the fine-tuned model and the tokenizer into the same directory.
model_path = "./amazon-review-summarizer-small-final"
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(model_path)
print(f"Model saved to {model_path}")


Model saved to ./amazon-review-summarizer-small-final


Evaluate on the test set - using a direct approach to avoid **OverflowError**

In [25]:
# Evaluate on the test set - using a direct approach to avoid OverflowError
# This cell only wraps and tokenizes the test split; the scoring itself is done
# by the manual generation loop further down.
# NOTE(review): test_tokenized is not referenced by any later cell visible
# here — confirm whether it is still needed.
print("Evaluating on test set...")
test_dataset = Dataset.from_pandas(test_df)
test_tokenized = test_dataset.map(preprocess_function, batched=True)


Evaluating on test set...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

# Function for generating summaries

In [26]:
def generate_summary(text, max_length=MAX_TARGET_LENGTH):
    """Generate a summary for a single review with the fine-tuned model.

    Parameters:
    text (str): Raw review text; it is cleaned with clean_text first.
    max_length (int): Upper bound on the generated length in tokens.

    Returns:
    str: The decoded summary with special tokens removed.
    """
    # Clean the text
    cleaned_text = clean_text(text)

    # Tokenize
    inputs = tokenizer(cleaned_text, max_length=MAX_INPUT_LENGTH, truncation=True, return_tensors="pt")
    # Send the tensors to wherever the model currently lives rather than to the
    # notebook-level `device`, so the call works regardless of which cells have
    # been run or where the model was moved.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Generate summary
    with torch.no_grad():
        summary_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs.get("attention_mask", None),
            max_length=max_length,
            # Never demand more tokens than max_length allows.
            min_length=min(10, max_length),
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


**Manually** evaluate on test set

In [27]:
print("Generating summaries for test set...")
model.to(device)
model.eval()

test_summaries, rouge_scores = [], []

# Walk the reviews and their reference summaries in lockstep.
review_pairs = zip(test_df['Text'], test_df['Summary'])
for review_text, reference_summary in tqdm(review_pairs, total=len(test_df)):
    # Generate summary
    generated_summary = generate_summary(review_text)

    # Store results; reviews longer than 200 characters are shortened for display
    if len(review_text) > 200:
        review_preview = review_text[:200] + '...'
    else:
        review_preview = review_text
    test_summaries.append({
        'Text': review_preview,
        'Actual_Summary': reference_summary,
        'Generated_Summary': generated_summary
    })

    # Score this single example with ROUGE, one sentence per line on both sides
    score = rouge.compute(
        predictions=["\n".join(custom_sent_tokenize(generated_summary.strip()))],
        references=["\n".join(custom_sent_tokenize(reference_summary.strip()))],
        use_stemmer=True
    )
    rouge_scores.append(score)

Generating summaries for test set...


  0%|          | 0/2000 [00:00<?, ?it/s]

**Create** results dataframe

In [34]:
# Collect the per-review records into a table and write it to CSV without the
# index column.
test_results_df = pd.DataFrame(test_summaries)
test_results_df.to_csv("amazon_review_test_summaries.csv", index=False)

**Calculate** average ROUGE scores and additional metrics

In [35]:
# Average every ROUGE variant over the per-example scores
avg_rouge = {}
for key in rouge_scores[0].keys():
    avg_rouge[key] = np.mean([score[key] for score in rouge_scores])

print("\nTest ROUGE Scores:")
for metric_name, metric_value in avg_rouge.items():
    print(f"{metric_name}: {metric_value:.4f}")

# Calculate additional metrics
def calculate_length_accuracy(row):
    """Score how closely the generated summary's word count matches the reference.

    Returns 1 minus the relative word-count error (measured against the
    reference length, treated as at least 1), with the error capped at 1 so
    the score stays between 0 and 1.
    """
    generated_count = len(row['Generated_Summary'].split())
    reference_count = len(row['Actual_Summary'].split())
    relative_error = abs(generated_count - reference_count) / max(reference_count, 1)
    if relative_error > 1:
        relative_error = 1  # Bound between 0 and 1
    return 1 - relative_error

def calculate_token_overlap(row):
    """Fraction of distinct reference tokens that also occur in the generated summary.

    Tokens are lower-cased, whitespace-delimited words. Returns 0 when the
    reference summary has no tokens.
    """
    generated_vocab = set(row['Generated_Summary'].lower().split())
    reference_vocab = set(row['Actual_Summary'].lower().split())
    if len(reference_vocab) == 0:
        return 0
    shared_vocab = generated_vocab & reference_vocab
    return len(shared_vocab) / len(reference_vocab)

# Compute each row-level metric and store it as a new column
row_metrics = (
    ('Length_Accuracy', calculate_length_accuracy),
    ('Token_Overlap', calculate_token_overlap),
)
for column_name, metric_fn in row_metrics:
    test_results_df[column_name] = test_results_df.apply(metric_fn, axis=1)

# Average the metrics over the test set and report them
avg_length_accuracy = test_results_df['Length_Accuracy'].mean()
avg_token_overlap = test_results_df['Token_Overlap'].mean()
print(f"\nAverage Length Accuracy: {avg_length_accuracy:.4f}")
print(f"Average Token Overlap (simple accuracy): {avg_token_overlap:.4f}")


Test ROUGE Scores:
rouge1: 0.1673
rouge2: 0.0555
rougeL: 0.1590
rougeLsum: 0.1599

Average Length Accuracy: 0.3489
Average Token Overlap (simple accuracy): 0.1754


# Sample review function for demonstration

In [36]:
def summarize_review(review_text):
    """
    Summarize a single product review.

    Thin convenience wrapper that forwards the text to generate_summary
    with its default settings.

    Parameters:
    review_text (str): The product review text to summarize

    Returns:
    str: The summary produced by generate_summary
    """
    generated = generate_summary(review_text)
    return generated

In [37]:
# Print the first five test examples (or fewer, if the table is shorter)
print("\nExample summaries:")
preview_rows = test_results_df.head(5)
for review_number, (_, example) in enumerate(preview_rows.iterrows(), start=1):
    print(f"\nReview {review_number}: {example['Text']}")
    print(f"Actual summary: {example['Actual_Summary']}")
    print(f"Generated summary: {example['Generated_Summary']}")


Example summaries:

Review 1: The shipment arrived on time.The Keurig bundle is a perfect gif for coffee lovers.It contains aside assortment of flavors and blends
Actual summary: Keurig Coffee
Generated summary: Great Gift for Coffee Lovers and Coffee Drinkers

Review 2: I was pleased with the price of these breakfast "cookies" since I generally try to find a larger granola-bar-type item to each each morning. The cookie is a good size, and it kept me full until lunch....
Actual summary: Breakfast Cookie - wish it was organic
Generated summary: Good, but not as good as I would like

Review 3: I have tried a lot of different arthritis supplements for my small dogs, but this one is the best product I have found for the price. My dogs love it and it is wonderful to see them running around whe...
Actual summary: Best Product for the Price!
Generated summary: Best arthritis supplement I have found for the price

Review 4: The tea is very good and tasty. As soon as you open the bag, you can 

# Example of generating a summary for a new review

In [40]:
# Example of generating a summary for a new review
# The review below is a hand-written sample defined inline, not a dataset row.
print("\nExample of generating a summary for a new review:")
example_review = """
I recently purchased this wireless speaker, and I’m impressed with the sound quality! The bass is deep, and the treble is clear. It pairs easily with my phone, and the Bluetooth range is excellent. The battery life is long-lasting, and it charges quickly. It’s also lightweight and portable, making it perfect for taking on the go. I love using it for outdoor gatherings!
"""

# summarize_review forwards to generate_summary, which cleans the text itself;
# .strip() below only trims the literal's surrounding newlines for display.
summary = summarize_review(example_review)
print(f"Input review: {example_review.strip()}")
print(f"Generated summary: {summary}")

print("\nProcessing complete!")


Example of generating a summary for a new review:
Input review: I recently purchased this wireless speaker, and I’m impressed with the sound quality! The bass is deep, and the treble is clear. It pairs easily with my phone, and the Bluetooth range is excellent. The battery life is long-lasting, and it charges quickly. It’s also lightweight and portable, making it perfect for taking on the go. I love using it for outdoor gatherings!
Generated summary: Great sound, portable and great for outdoor gatherings

Processing complete!
