# Review Summarizer using RAG


In [1]:
import torch
import gc
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import faiss
import numpy as np
import pandas as pd
import spacy
import pytextrank
import evaluate


# Setup Device
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on: {DEVICE}")

# Model Config
RETRIEVER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
GENERATOR_MODEL_ID = "Qwen/Qwen3-4B-Instruct-2507" 


c:\Users\usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages
Running on: cuda


Loading Datasets

In [2]:
print("--- LOADING DATASETS ---")

# 1. Define Direct URLs to the raw data files
url_beauty = "https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main/raw/review_categories/All_Beauty.jsonl"
url_music = "https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main/raw/review_categories/Digital_Music.jsonl"

ds_beauty = load_dataset("json", data_files={"train": url_beauty}, split="train")
ds_music = load_dataset("json", data_files={"train": url_music}, split="train")

# 2. Load SAMSum (Benchmark Data)
# Only loading 'test' split to keep it lightweight
ds_samsum = load_dataset("knkarthick/samsum", split="test")

print(f"Beauty Dataset Loaded: {len(ds_beauty)} rows")
print(f"Music Dataset Loaded: {len(ds_music)} rows")
print(f"SAMSum Dataset Loaded: {len(ds_samsum)} rows")

--- LOADING DATASETS ---
Beauty Dataset Loaded: 701528 rows
Music Dataset Loaded: 130434 rows
SAMSum Dataset Loaded: 819 rows


Prepare the full external knowledge corpus

In [3]:
def prepare_full_corpus(dataset, limit=100000):
    """
    Converts dataset rows to a list of strings.
    Limits to 100k rows to keep RAM usage safe during prototyping.
    Set limit=None to use everything.
    """
    corpus = []
    for i, row in enumerate(dataset):
        if limit and i >= limit:
            break
        text = f"Rating: {row['rating']}\nTitle: {row['title']}\nReview: {row['text']}"
        corpus.append(text)
    return corpus

corpus_beauty = prepare_full_corpus(ds_beauty)
corpus_music = prepare_full_corpus(ds_music)

corpus = corpus_beauty+corpus_music
print('Corpus prepared within beauty and music')

Corpus prepared within beauty and music


Employ pre-trained embedding model and utilize a dense vector indexing to store the embeddings of the entire corpus

In [4]:
def build_faiss_index_and_clean(corpus, model_name):
    print(f"Loading Embedder {model_name} on GPU...")
    embed_model = SentenceTransformer(model_name, device=DEVICE)
    
    print(f"Encoding {len(corpus)} documents...")
    embeddings = embed_model.encode(corpus, batch_size=32, show_progress_bar=True, convert_to_numpy=True)
    
    # Build FAISS Index
    vector_dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(vector_dim)
    index.add(embeddings)
    
    print("Encoding complete. Cleaning up VRAM...")
    
    del embed_model
    gc.collect()
    torch.cuda.empty_cache()
    
    return index

# Build Indices sequentially
index = build_faiss_index_and_clean(corpus, RETRIEVER_MODEL)

print("SUCCESS: Indices built and GPU memory cleared.")

embed_model_cpu = SentenceTransformer(RETRIEVER_MODEL, device="cpu")

Loading Embedder sentence-transformers/all-MiniLM-L6-v2 on GPU...
Encoding 200000 documents...


Batches:   0%|          | 0/6250 [00:00<?, ?it/s]

Encoding complete. Cleaning up VRAM...
SUCCESS: Indices built and GPU memory cleared.


Pre-trained LLM model loading

In [5]:
# 4-bit Quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

try:
    print(f"Loading {GENERATOR_MODEL_ID}...")
    tokenizer = AutoTokenizer.from_pretrained(GENERATOR_MODEL_ID, trust_remote_code=True)
    
    model_qwen3 = AutoModelForCausalLM.from_pretrained(
        GENERATOR_MODEL_ID, 
        quantization_config=bnb_config, 
        device_map="auto",
        trust_remote_code=True
    )
    
    # Safety check for pad token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        
    print(f"Success! {GENERATOR_MODEL_ID} loaded.")

except Exception as e:
    print(f"Error loading Qwen3: {e}")

def generate_with_qwen3(prompt, content, temperature=0.7):
    messages = [
        {"role": "system", "content": content},
        {"role": "user", "content": prompt}
    ]
    
    text = tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
    )
    
    model_inputs = tokenizer([text], return_tensors="pt").to(DEVICE)

    with torch.inference_mode():
        generated_ids = model_qwen3.generate(
            **model_inputs,
            max_new_tokens=512,
            temperature=temperature,
            top_p=0.8,         
            top_k=20,          
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
        )

    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return response.strip()

Loading Qwen/Qwen3-4B-Instruct-2507...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Success! Qwen/Qwen3-4B-Instruct-2507 loaded.


Construct the query rag with a specific prompt

In [6]:
def query_rag(query, index, corpus, k=5):
    # Encode query
    q_emb = embed_model_cpu.encode([query])
    
    # Retrieve (FAISS)
    D, I = index.search(np.array(q_emb), k)
    retrieved_docs = [corpus[i] for i in I[0]]
    
    # Format Context
    context_str = "\n\n".join([f"Review {i+1}: {doc}" for i, doc in enumerate(retrieved_docs)])
    
    # Construct Prompt
    prompt = f"""Summarize the user's question based ONLY on the provided reviews below. 
If the reviews discuss different products, specify which product has which pros/cons.

Reviews:
{context_str}

Question:
{query}
"""
    return generate_with_qwen3(prompt, content="You are a helpful and professional assistant specialized in summarizing reviews.")

Build a LLM-as-a-Judge to evaluate with predefined criteria

In [7]:
def get_judge_score(question, source_text, summary_to_evaluate):
    prompt = f"""
You are a very strict, professional evaluator.
Act as an expert teacher grading a student answer.

User Question:
{question}

Source Text:
{source_text}

Summary to Evaluate:
{summary_to_evaluate}

Evaluate the summary from 1 to 10. You must prioritize the following CRITICAL criteria above all else:

1. ACCURACY (Critical): The summary must be factually correct and match the source text exactly.
2. FAITHFULNESS (Critical): The summary must NOT add information that is not explicitly supported by the source. Hallucinations should result in a low score.
3. RELEVANCE (Critical): The summary must directly answer the user's specific question.
4. COVERAGE (Critical): The summary must include the most important points needed to answer the question.

Secondary Criteria:
5. CONCISENESS: The summary is clear, brief, and avoids unnecessary details.
6. COHERENCE: The summary is logically structured.
7. REDUNDANCY: The summary avoids repetition.

Final instruction:
- If the summary fails on Accuracy or Faithfulness, the score must be below 5.
- Give a SINGLE score from 1 (very poor) to 10 (excellent).
- Reply ONLY with the number (e.g., 7 or 10). Do not write sentences.
"""
    try:
        score_text = generate_with_qwen3(
            prompt, 
            content="You are an impartial judge. Reply ONLY with a digit."
        )
        
        digits = ''.join(filter(str.isdigit, score_text))
        
        if not digits:
            return 3
            
        score = int(digits)
        
        return min(score, 10)
        
    except Exception as e:
        print(f"Judge Error: {e}")
        return 3

Before vs. After Retriever Comparison

In [8]:
# Setup Spacy TextRank
try:
    nlp = spacy.load("en_core_web_sm")
except:
    nlp = spacy.blank("en")

if "textrank" not in nlp.pipe_names:
    nlp.add_pipe("textrank")

# Setup BART (Abstractive Summarization)
bart_pipeline = pipeline("summarization", model="facebook/bart-large-cnn", device=0) # Change to -1 for CPU

queries = {
    "Beauty": "What do users with sensitive skin say about the texture and greasiness of the moisturizers?",
    "Music": "Do customers think the sound quality justifies the price, and are there complaints about comfort?"
}

results_table = []
k = 5  # Number of chunks to use

# We need this helper so we can force Qwen to use "Naive" data without searching
def manual_qwen_generate(context, question):
    prompt = f"""Summarize the user's question based ONLY on the provided reviews below.
Reviews:
{context}

Question:
{question}
"""
    return generate_with_qwen3(prompt, "You are a helpful and professional assistant specialized in summarizing reviews.")

# Comparison loop
for domain_name, query_text in queries.items():
    print(f"\nProcessing Domain: {domain_name} | Query: {query_text}")
    
    # ==========================================================
    # SCENARIO A: BEFORE RETRIEVER (NAIVE / RANDOM DATA)
    # We simulate "No Search" by just grabbing the first 5 docs from the dataset.
    # ==========================================================
    print("  > Running 'Before' (Naive) scenario...")
    
    naive_docs = corpus[:k]
    context_naive = "\n\n".join(naive_docs)
    
    # 1. Naive TextRank
    try:
        doc = nlp(context_naive[:100000])
        tr_naive = " ".join([s.text for s in doc._.textrank.summary(limit_sentences=2)])
    except: tr_naive = "Error"
    
    # 2. Naive BART
    try:
        bart_naive = bart_pipeline(context_naive, max_length=150, min_length=30, truncation=True)[0]['summary_text']
    except: bart_naive = "Error"
    
    # 3. Naive Qwen (Direct Generation, No Search)
    qwen_naive = manual_qwen_generate(context_naive, query_text)

    print("  > Judge is evaluating TextRank...")
    score_tr = get_judge_score(query_text, context_naive, tr_naive)
    
    print("  > Judge is evaluating BART...")
    score_bart = get_judge_score(query_text, context_naive, bart_naive)
    
    print("  > Judge is evaluating Qwen RAG...")
    score_qwen = get_judge_score(query_text, context_naive, qwen_naive)
    
    # Save "Before" Results
    results_table.append({"Domain": domain_name, "Mode": "Before (Naive)", "Model": "TextRank", "Output": tr_naive, "Judge Score": score_tr})
    results_table.append({"Domain": domain_name, "Mode": "Before (Naive)", "Model": "BART", "Output": bart_naive, "Judge Score": score_bart})
    results_table.append({"Domain": domain_name, "Mode": "Before (Naive)", "Model": "Qwen3", "Output": qwen_naive, "Judge Score": score_qwen})



    # ==========================================================
    # SCENARIO B: AFTER RETRIEVER (RAG / RELEVANT DATA)
    # We use the Retriever to find the top 5 BEST reviews.
    # ==========================================================
    print("  > Running 'After' (Retriever) scenario...")
    
    q_emb = embed_model_cpu.encode([query_text])
    D, I = index.search(np.array(q_emb), k)
    rag_docs = [corpus[i] for i in I[0]]
    context_rag = "\n\n".join(rag_docs)
    
    # 1. TextRank
    try:
        doc = nlp(context_rag[:100000])
        tr_retriever = " ".join([s.text for s in doc._.textrank.summary(limit_sentences=2)])
    except: tr_retriever = "Error"
    
    # 2. BART
    try:
        bart_retriever = bart_pipeline(context_rag, max_length=150, min_length=30, truncation=True)[0]['summary_text']
    except: bart_retriever = "Error"
    
    # 3. RAG Qwen (This is your standard RAG function)
    qwen_rag = query_rag(query_text, index, corpus, k)

    print("  > Judge is evaluating TextRank...")
    score_tr = get_judge_score(query_text, context_rag, tr_retriever)
    
    print("  > Judge is evaluating BART...")
    score_bart = get_judge_score(query_text, context_rag, bart_retriever)
    
    print("  > Judge is evaluating Qwen RAG...")
    score_qwen = get_judge_score(query_text, context_rag, qwen_rag)
    
    # Save "After" Results
    results_table.append({"Domain": domain_name, "Mode": "After (With Retriever)", "Model": "TextRank", "Output": tr_retriever, "Judge Score": score_tr})
    results_table.append({"Domain": domain_name, "Mode": "After (With Retriever)", "Model": "BART", "Output": bart_retriever, "Judge Score": score_bart})
    results_table.append({"Domain": domain_name, "Mode": "After (RAG)", "Model": "Qwen3", "Output": qwen_rag, "Judge Score": score_qwen})

df = pd.DataFrame(results_table)
pd.set_option('display.max_colwidth', None)

df = df.sort_values(by=["Domain", "Model", "Mode"]) 

print("\n--- FINAL COMPARISON: BEFORE vs AFTER RETRIEVER ---")
df

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Processing Domain: Beauty | Query: What do users with sensitive skin say about the texture and greasiness of the moisturizers?
  > Running 'Before' (Naive) scenario...
  > Judge is evaluating TextRank...
  > Judge is evaluating BART...
  > Judge is evaluating Qwen RAG...
  > Running 'After' (Retriever) scenario...
  > Judge is evaluating TextRank...
  > Judge is evaluating BART...
  > Judge is evaluating Qwen RAG...

Processing Domain: Music | Query: Do customers think the sound quality justifies the price, and are there complaints about comfort?
  > Running 'Before' (Naive) scenario...
  > Judge is evaluating TextRank...
  > Judge is evaluating BART...
  > Judge is evaluating Qwen RAG...
  > Running 'After' (Retriever) scenario...
  > Judge is evaluating TextRank...
  > Judge is evaluating BART...
  > Judge is evaluating Qwen RAG...

--- FINAL COMPARISON: BEFORE vs AFTER RETRIEVER ---


Unnamed: 0,Domain,Mode,Model,Output,Judge Score
4,Beauty,After (With Retriever),BART,"Review: Excellent for sensitive skin! Extremely moisturizing! Stock up because you're going to love how your skin feels and smells. Rating: 5.0 emphatically. Reviewer: ""I have pretty sensitive skin, and they feel great so far""",3
1,Beauty,Before (Naive),BART,"Review: This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want.",2
5,Beauty,After (RAG),Qwen3,"Users with sensitive skin praise the moisturizer for its smooth texture, quick absorption, and non-greasy feel. They highlight that it is extremely moisturizing, gentle, and does not cause breakouts or irritation. There is no mention of greasiness, as all reviews emphasize that the product is pleasant, non-irritating, and well-tolerated on sensitive skin. \n\nProduct: All reviews refer to the same moisturizer, with no distinction between different products.",10
2,Beauty,Before (Naive),Qwen3,"The provided reviews do not contain information about users with sensitive skin or their experiences with texture and greasiness of moisturizers. Therefore, based solely on the reviews, the question cannot be accurately summarized or answered.",0
3,Beauty,After (With Retriever),TextRank,Rating: 5.0\nTitle: Great for sensitive skin\nReview: Excellent for sensitive skin! so if you have sensitive skin it's just wonderful it doesn't make me break out is just so good\n\nRating:,8
0,Beauty,Before (Naive),TextRank,Rating: 1.0\nTitle: Synthetic feeling\nReview: Felt synthetic\n\nRating: 5.0\nTitle: A+\nReview: Love it I am comparing to other brands with yucky chemicals so I'm gonna stick with this.,2
10,Music,After (With Retriever),BART,"The Fit is snug, quality is silky and a good thickness. Very comfortable. Also recvd two. Great product at a good price.",2
7,Music,Before (Naive),BART,"Review: This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want.",0
11,Music,After (RAG),Qwen3,"The user is asking whether customers find the sound quality justifies the price and if there are complaints about comfort.\n\nBased on the reviews:\n\n- Sound quality is mentioned positively in Review 1 (""Excellent sound"") and is implied in Review 5 (""It's the quality and comfort as you see in picture""), though no explicit comparison to price is made.\n- Comfort is consistently praised: Reviews 3, 4, and 5 highlight comfort (""Super comfortable"", ""comfortable enough to wear all day"", ""very comfortable""), with no complaints.\n- Price is discussed in Reviews 2 and 3, where users note it is ""a little bit pricey"" or ""tad pricey,"" but still consider it ""worth it"" or ""good value,"" indicating that despite the cost, customers feel it's justified.\n\nNo customer complaints about comfort are present. The sound quality is viewed positively, and while the price is acknowledged as higher, it is not seen as a dealbreaker.\n\nFinal Summary: \nCustomers do not have complaints about comfort. Sound quality is considered excellent. While the product is priced higher than some alternatives (e.g., ""not buying the ears in the park""), customers believe the price is justified due to comfort, quality, and performance. \n\nThus, the answer to the user's question is: \nYes, customers think the sound quality justifies the price, and there are no complaints about comfort.",10
8,Music,Before (Naive),Qwen3,"The user's question is based on a misunderstanding of the review content. The reviews discuss a product related to scent and texture (likely a hair spray or similar), not sound quality or price justification. There are no mentions of sound quality, price, or comfort in the context of auditory or physical comfort. Therefore, the question is irrelevant to the provided reviews. \n\nCorrect summary: The user is asking about sound quality and comfort, but the reviews pertain to scent, texture, and fragrance preference—there are no references to sound quality or comfort in the context provided.",1


Benchmark the generator models, such as extractive, abstractive, and generative

In [9]:
rouge = evaluate.load('rouge')

ds_samsum_sample = ds_samsum.select(range(100))
print("--- RUNNING GENERATOR BENCHMARK ---")

eval_results = []

for i, item in enumerate(ds_samsum_sample):
    dialogue = item['dialogue']
    reference = item['summary']
    
    # TextRank
    try:
        doc = nlp(dialogue)
        tr_sum = " ".join([s.text for s in doc._.textrank.summary(limit_sentences=2)])
    except: tr_sum = ""
    
    # BART
    try:
        bart_out = bart_pipeline(dialogue, max_length=160, min_length=5, truncation=True)
        bart_sum = bart_out[0]['summary_text']
    except: bart_sum = ""
    
    # Qwen3
    prompt = f"""Summarize the conversation below in one sentence.
Conversation:
{dialogue}

Summary:"""
    
    qwen3_sum = generate_with_qwen3(prompt, content="You are a helpful assistant specialized in summarizing dialogues.")
    
    eval_results.append({
        "Reference": reference,
        "TextRank": tr_sum,
        "BART": bart_sum,
        "Qwen3": qwen3_sum
    })
    
# COMPUTE SCORES
print("\nCalculating ROUGE Scores...")
refs = [r['Reference'] for r in eval_results]
preds_tr = [r['TextRank'] for r in eval_results]
preds_bart = [r['BART'] for r in eval_results]
preds_qwen3 = [r['Qwen3'] for r in eval_results]

score_tr = rouge.compute(predictions=preds_tr, references=refs)
score_bart = rouge.compute(predictions=preds_bart, references=refs)
score_qwen3 = rouge.compute(predictions=preds_qwen3, references=refs)

df_scores = pd.DataFrame([
    {"Model": "TextRank", "ROUGE-1": score_tr['rouge1'], "ROUGE-L": score_tr['rougeL']},
    {"Model": "BART",     "ROUGE-1": score_bart['rouge1'], "ROUGE-L": score_bart['rougeL']},
    {"Model": "Qwen3",  "ROUGE-1": score_qwen3['rouge1'], "ROUGE-L": score_qwen3['rougeL']}
])

print("\n--- BENCHMARK RESULTS ---")
display(df_scores.round(4))

Your max_length is set to 160, but your input_length is only 139. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)


--- RUNNING GENERATOR BENCHMARK ---


Your max_length is set to 160, but your input_length is only 156. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=78)
Your max_length is set to 160, but your input_length is only 141. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=70)
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Your max_length is set to 160, but your input_length is only 126. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)
Your max_length is set to 160, but your input_length is only 143. Since this is a summarization task, where outputs shorter than the input ar


Calculating ROUGE Scores...

--- BENCHMARK RESULTS ---


Unnamed: 0,Model,ROUGE-1,ROUGE-L
0,TextRank,0.2771,0.2121
1,BART,0.3197,0.2486
2,Qwen3,0.3922,0.302
