# Comparative Evaluation: Zero-Shot vs Fine-Tuned vs RAG

This notebook compares three approaches for e-commerce FAQ response generation:
1. **Zero-shot**: Base Falcon-7B without fine-tuning
2. **Fine-tuned**: Falcon-7B with LoRA adapters
3. **RAG**: Retrieval-Augmented Generation with vector similarity search

In [None]:
!pip install -q torch transformers datasets peft bitsandbytes accelerate nltk pandas matplotlib seaborn faiss-cpu sentence-transformers

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
import matplotlib.pyplot as plt
from tqdm import tqdm

# Fetch the NLTK resources requested by this notebook (used for word tokenization).
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

## 1. Load Models and Data

In [None]:
# Hugging Face id of the base model and local directory holding the LoRA adapter.
MODEL_NAME = "tiiuae/falcon-7b"
ADAPTER_PATH = "./falcon-7b-ecommerce-lora"

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# NOTE(review): trust_remote_code=True allows Python code shipped with the model
# repository to run locally — confirm it is still required for this model and
# consider pinning a revision.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Attach the LoRA adapter to the already-loaded quantized base model.
# NOTE(review): PEFT wraps `base_model` and (for LoRA) injects the adapter layers
# into it in place, so `base_model` and `finetuned_model` presumably share the same
# adapted modules — confirm, and use `finetuned_model.disable_adapter()` wherever
# un-adapted base-model outputs are needed.
finetuned_model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
print("Models loaded")

In [None]:
# Load the Bitext customer-support corpus; the cells below read its 'train' split.
dataset = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset")
full_data = dataset['train']

# Evaluation set: the first 50 rows of a fixed-seed shuffle of the train split.
# NOTE(review): if the LoRA adapter was trained on this same split, these rows may
# have been seen during fine-tuning — confirm against the training notebook.
shuffled_data = full_data.shuffle(seed=42)
test_data = shuffled_data.select(range(50))
print(f"Test samples: {len(test_data)}")

## 2. Setup RAG System

In [None]:
# Sentence encoder used for both the knowledge base and incoming queries.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Candidate knowledge base: the first 5000 rows (or every row if there are fewer).
rag_candidates = full_data.select(range(min(5000, len(full_data))))

# Leakage guard: test_data is sampled from the same split, so some evaluation
# queries would otherwise sit in the index together with their gold answers and
# RAG would simply be handed the reference. Drop every candidate whose instruction
# text is one of the evaluated queries.
held_out_queries = set(test_data['instruction'])
kept_rows = [
    row_idx
    for row_idx, text in enumerate(rag_candidates['instruction'])
    if text not in held_out_queries
]
if not kept_rows:
    raise ValueError("RAG knowledge base is empty after removing evaluation queries")

rag_data = rag_candidates.select(kept_rows)
instructions = list(rag_data['instruction'])
responses = list(rag_data['response'])
print(f"RAG corpus: {len(instructions)} rows "
      f"({len(rag_candidates) - len(instructions)} removed to avoid test leakage)")

print("Creating embeddings...")
embeddings = embedding_model.encode(instructions, show_progress_bar=True)
# FAISS expects float32 vectors.
embeddings = np.array(embeddings).astype('float32')

# Exact (brute-force) L2 index over the instruction embeddings.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print(f"FAISS index created with {index.ntotal} vectors")

In [None]:
def retrieve_similar(query, k=3):
    """Return up to ``k`` knowledge-base entries closest to ``query``.

    The query is embedded with ``embedding_model`` and searched against the
    module-level FAISS ``index``; hits are mapped back to the parallel
    ``instructions`` / ``responses`` lists.

    Args:
        query: Customer query text.
        k: Maximum number of neighbours to return.

    Returns:
        A list of ``{'instruction': ..., 'response': ...}`` dicts in the order
        FAISS returned them. The list is empty when ``k <= 0`` or the index
        holds no vectors, and may be shorter than ``k``.
    """
    if k <= 0 or index.ntotal == 0:
        return []

    query_embedding = embedding_model.encode([query])
    # Never request more neighbours than the index holds.
    k = min(k, index.ntotal)
    _, indices = index.search(np.asarray(query_embedding, dtype='float32'), k)

    # FAISS pads missing neighbours with -1; as a Python list index that would
    # silently select the last entry, so such ids are skipped.
    return [
        {'instruction': instructions[idx], 'response': responses[idx]}
        for idx in indices[0]
        if idx >= 0
    ]

## 3. Define Generation Functions

In [None]:
def generate_zero_shot(query):
    """Generate a reply with the base model only (no LoRA adapter, no retrieval).

    Args:
        query: Customer query text inserted into the prompt.

    Returns:
        The generated reply, stripped of surrounding whitespace and cut at the
        first subsequent ``###`` section header the model may have produced.
    """
    prompt = f"""### Instruction:
You are a helpful e-commerce customer support assistant. Answer the customer's question professionally and helpfully.

### Customer Query:
{query}

### Response:"""

    inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # `base_model` was wrapped by PeftModel.from_pretrained, which injects the LoRA
    # layers into it in place. Without disabling them, this "zero-shot" baseline
    # would actually run with the fine-tuned adapter active.
    with torch.no_grad(), finetuned_model.disable_adapter():
        outputs = base_model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. Splitting the full text on
    # "### Response:" and keeping the last piece drops the real answer whenever
    # the model hallucinates another "### Response:" section.
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    # Keep only the first answer; anything after a new "###" header is a made-up turn.
    return completion.split("###")[0].strip()

In [None]:
def generate_finetuned(query):
    """Generate a reply with the LoRA fine-tuned model.

    Args:
        query: Customer query text inserted into the prompt.

    Returns:
        The generated reply, stripped of surrounding whitespace and cut at the
        first subsequent ``###`` section header the model may have produced.
    """
    prompt = f"""### Instruction:
You are a helpful e-commerce customer support assistant. Answer the customer's question professionally and helpfully.

### Customer Query:
{query}

### Response:"""

    inputs = tokenizer(prompt, return_tensors="pt").to(finetuned_model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = finetuned_model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. Splitting the full text on
    # "### Response:" and keeping the last piece drops the real answer whenever
    # the model hallucinates another "### Response:" section.
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    # Keep only the first answer; anything after a new "###" header is a made-up turn.
    return completion.split("###")[0].strip()

In [None]:
def _build_rag_prompt(query, examples):
    """Render the RAG prompt: instruction, numbered Q/A examples, then the query."""
    context = ""
    for i, item in enumerate(examples, 1):
        context += f"Example {i}:\nQ: {item['instruction']}\nA: {item['response']}\n\n"

    return f"""### Instruction:
You are a helpful e-commerce customer support assistant. Use the following examples to help answer the customer's question.

### Context:
{context}

### Customer Query:
{query}

### Response:"""


def generate_rag(query):
    """Generate a reply with the base model, conditioned on retrieved Q/A examples.

    Up to three similar examples are retrieved and placed in the prompt. If the
    prompt exceeds the 1024-token budget, the last (least similar) examples are
    dropped until it fits.

    Args:
        query: Customer query text.

    Returns:
        The generated reply, stripped of surrounding whitespace and cut at the
        first subsequent ``###`` section header the model may have produced.
    """
    max_prompt_tokens = 1024
    retrieved = retrieve_similar(query, k=3)

    # Right-side truncation would cut the END of the prompt, i.e. the customer
    # query and the "### Response:" marker. Shrink the context instead.
    prompt = _build_rag_prompt(query, retrieved)
    while retrieved and len(tokenizer(prompt)["input_ids"]) > max_prompt_tokens:
        retrieved = retrieved[:-1]
        prompt = _build_rag_prompt(query, retrieved)

    # Truncation stays on as a last resort for a query that alone exceeds the budget.
    inputs = tokenizer(
        prompt, return_tensors="pt", max_length=max_prompt_tokens, truncation=True
    ).to(base_model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # `base_model` was wrapped by PeftModel.from_pretrained, which injects the LoRA
    # layers into it in place. Disable them so RAG is measured on the un-tuned model.
    with torch.no_grad(), finetuned_model.disable_adapter():
        outputs = base_model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens; string-splitting on "### Response:"
    # fails when the marker was truncated away or the model emits it again.
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    # Keep only the first answer; anything after a new "###" header is a made-up turn.
    return completion.split("###")[0].strip()

## 4. Run Comparative Evaluation

In [None]:
def calculate_bleu(reference, generated):
    """Score ``generated`` against ``reference`` with smoothed sentence-level BLEU.

    Both texts are lower-cased and word-tokenized with NLTK before scoring.

    Args:
        reference: Gold response text.
        generated: Model output text.

    Returns:
        A ``(bleu1, bleu4)`` tuple: unigram-only BLEU and uniformly weighted
        1- to 4-gram BLEU, both using NLTK smoothing ``method1``.
    """
    reference_set = [nltk.word_tokenize(reference.lower())]
    hypothesis = nltk.word_tokenize(generated.lower())
    smoothing = SmoothingFunction().method1

    # Weight vectors in return order: BLEU-1 first, then BLEU-4.
    weight_schemes = ((1, 0, 0, 0), (0.25, 0.25, 0.25, 0.25))
    bleu1, bleu4 = (
        sentence_bleu(reference_set, hypothesis, weights=weights, smoothing_function=smoothing)
        for weights in weight_schemes
    )
    return bleu1, bleu4

In [None]:
# The generators sample (do_sample=True), so without a fixed seed every run yields
# different responses and BLEU scores. Seed torch once so a Restart & Run All
# reproduces the same numbers on the same hardware and software stack.
EVAL_SEED = 42
torch.manual_seed(EVAL_SEED)

results = []

for sample in tqdm(test_data, desc="Evaluating"):
    query = sample['instruction']
    reference = sample['response']

    # One response per approach for the same query.
    zero_shot_resp = generate_zero_shot(query)
    finetuned_resp = generate_finetuned(query)
    rag_resp = generate_rag(query)

    # BLEU-1 / BLEU-4 of each response against the dataset's reference answer.
    zs_b1, zs_b4 = calculate_bleu(reference, zero_shot_resp)
    ft_b1, ft_b4 = calculate_bleu(reference, finetuned_resp)
    rag_b1, rag_b4 = calculate_bleu(reference, rag_resp)

    results.append({
        'query': query,
        'reference': reference,
        'zero_shot_response': zero_shot_resp,
        'finetuned_response': finetuned_resp,
        'rag_response': rag_resp,
        'zero_shot_bleu1': zs_b1,
        'zero_shot_bleu4': zs_b4,
        'finetuned_bleu1': ft_b1,
        'finetuned_bleu4': ft_b4,
        'rag_bleu1': rag_b1,
        'rag_bleu4': rag_b4
    })

# One row per evaluated query; persisted for later inspection.
results_df = pd.DataFrame(results)
results_df.to_csv('comparison_results.csv', index=False)

## 5. Analyze Results

In [None]:
# Display name -> column prefix used in results_df, in presentation order.
approach_prefixes = {'Zero-Shot': 'zero_shot', 'Fine-Tuned': 'finetuned', 'RAG': 'rag'}


def _per_approach(metric, stat):
    """Return ``stat`` ('mean' or 'std') of ``metric`` for each approach, in order."""
    return [
        getattr(results_df[f'{prefix}_{metric}'], stat)()
        for prefix in approach_prefixes.values()
    ]


# Mean and standard deviation of BLEU-1 / BLEU-4 per approach.
summary = pd.DataFrame({
    'Approach': list(approach_prefixes),
    'BLEU-1 Mean': _per_approach('bleu1', 'mean'),
    'BLEU-1 Std': _per_approach('bleu1', 'std'),
    'BLEU-4 Mean': _per_approach('bleu4', 'mean'),
    'BLEU-4 Std': _per_approach('bleu4', 'std'),
})

print("\nComparative Results Summary:")
print(summary.to_string(index=False))

In [None]:
# Side-by-side bar charts of mean BLEU per approach, with std-dev error bars.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

x = np.arange(3)
width = 0.35

# (metric label, extra bar styling) for each panel, left to right.
bar_panels = [('BLEU-1', {}), ('BLEU-4', {'color': 'orange'})]

for bar_ax, (metric, bar_style) in zip(axes, bar_panels):
    bar_ax.bar(
        x,
        summary[f'{metric} Mean'],
        width,
        yerr=summary[f'{metric} Std'],
        capsize=5,
        **bar_style
    )
    bar_ax.set_ylabel(f'{metric} Score')
    bar_ax.set_title(f'{metric} Comparison')
    bar_ax.set_xticks(x)
    bar_ax.set_xticklabels(summary['Approach'])

plt.tight_layout()
plt.savefig('comparison_chart.png', dpi=150)
plt.show()

In [None]:
# Per-approach distribution of BLEU-4 scores, one histogram per panel.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (results_df column, panel title, extra histogram styling), left to right.
hist_panels = [
    ('zero_shot_bleu4', 'Zero-Shot BLEU-4', {}),
    ('finetuned_bleu4', 'Fine-Tuned BLEU-4', {'color': 'orange'}),
    ('rag_bleu4', 'RAG BLEU-4', {'color': 'green'}),
]

for hist_ax, (column, title, hist_style) in zip(axes, hist_panels):
    hist_ax.hist(results_df[column], bins=15, alpha=0.7, edgecolor='black', **hist_style)
    hist_ax.set_title(title)
    hist_ax.set_xlabel('Score')

plt.tight_layout()
plt.savefig('score_distributions.png', dpi=150)
plt.show()

## 6. Qualitative Comparison

In [None]:
# Print the first few evaluated queries with the reference and all three responses.
banner = "=" * 100
print(banner)
print("SAMPLE COMPARISONS")
print(banner)

# head(5) yields at most five rows, and fewer when results_df is shorter.
for _, row in results_df.head(5).iterrows():
    print(f"\n{'='*100}")
    print(f"Query: {row['query'][:100]}...")
    print(f"\nReference: {row['reference'][:200]}...")
    print(f"\nZero-Shot (BLEU-4: {row['zero_shot_bleu4']:.3f}): {row['zero_shot_response'][:200]}...")
    print(f"\nFine-Tuned (BLEU-4: {row['finetuned_bleu4']:.3f}): {row['finetuned_response'][:200]}...")
    print(f"\nRAG (BLEU-4: {row['rag_bleu4']:.3f}): {row['rag_response'][:200]}...")

## 7. Win Rate Analysis

In [None]:
def get_winner(row):
    """Name the approach with the highest BLEU-4 score for one evaluated query.

    Args:
        row: Mapping (e.g. a DataFrame row) with the keys ``'zero_shot_bleu4'``,
            ``'finetuned_bleu4'`` and ``'rag_bleu4'``.

    Returns:
        ``'Zero-Shot'``, ``'Fine-Tuned'`` or ``'RAG'`` when one approach has the
        strictly highest score, otherwise ``'Tie'``.
    """
    scores = {
        'Zero-Shot': row['zero_shot_bleu4'],
        'Fine-Tuned': row['finetuned_bleu4'],
        'RAG': row['rag_bleu4']
    }
    best_score = max(scores.values())
    leaders = [approach for approach, score in scores.items() if score == best_score]
    # max(scores, key=scores.get) would hand every tie to the first key (Zero-Shot),
    # which inflates its win count; shared top scores are reported as a tie instead.
    return leaders[0] if len(leaders) == 1 else 'Tie'

# Label each evaluated query with its winner, then tally the labels.
results_df['winner'] = results_df.apply(get_winner, axis='columns')
win_counts = results_df['winner'].value_counts()

print("\nWin Rate Analysis (based on BLEU-4):")
for approach in win_counts.index:
    count = win_counts[approach]
    print(f"{approach}: {count} wins ({count/len(results_df)*100:.1f}%)")

In [None]:
# Pie chart of the share of queries credited to each label in win_counts.
pie_fig, pie_ax = plt.subplots(figsize=(8, 6))
colors = ['#ff9999', '#66b3ff', '#99ff99']
pie_ax.pie(
    win_counts.values,
    labels=win_counts.index,
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
)
pie_ax.set_title('Win Rate Distribution (BLEU-4)')
pie_fig.savefig('win_rate.png', dpi=150)
plt.show()

## 8. Save Summary

In [None]:
# Persist the per-approach summary table next to the notebook.
summary_path = 'comparison_summary.csv'
summary.to_csv(summary_path, index=False)
print("Results saved to comparison_summary.csv")

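## Key Findings

> **Note:** The points below are qualitative expectations, not values computed by this notebook. Check them against the summary table and win-rate analysis above before citing them. BLEU measures n-gram overlap with a single reference answer, so it is only a rough proxy for response quality.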

### 1. Fine-Tuning Advantages
- Best performance on domain-specific queries
- Learns e-commerce terminology and response patterns
- Consistent response quality

### 2. Zero-Shot Limitations
- Generic responses lacking domain context
- Struggles with specific e-commerce scenarios
- Lower BLEU scores overall

### 3. RAG Trade-offs
- Good for factual queries with similar examples
- Performance depends on retrieval quality
- No training required, faster to deploy

### 4. Recommendations
- Use fine-tuning for production customer support
- RAG is suitable for knowledge-intensive queries
- Consider hybrid approaches for best results