# Model Evaluation

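This notebook evaluates the LoRA fine-tuned Falcon-7B customer-support model on 100 sampled queries, using BLEU-1/2/4 scores against the reference responses together with qualitative and response-length analysis.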

In [None]:
!pip install -q torch transformers datasets peft bitsandbytes accelerate nltk pandas matplotlib seaborn

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from datasets import load_dataset
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Download the NLTK 'punkt' / 'punkt_tab' data packages.
# Presumably these back the nltk.word_tokenize calls in the BLEU section — verify
# which of the two the installed NLTK version actually needs.
nltk.download('punkt')
nltk.download('punkt_tab')

## 1. Load Fine-Tuned Model

In [None]:
MODEL_NAME = "tiiuae/falcon-7b"
ADAPTER_PATH = "./falcon-7b-ecommerce-lora"

# 4-bit NF4 quantization with double quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# NOTE(review): trust_remote_code=True runs Python code shipped with the model
# repository; confirm it is still required for this checkpoint.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Attach the fine-tuned adapter stored at ADAPTER_PATH to the quantized base model.
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
# This notebook only runs inference: explicitly switch to eval mode so that
# train-time layers such as dropout are disabled during generation.
model.eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Reuse EOS as the padding token (presumably the tokenizer defines none — verify).
tokenizer.pad_token = tokenizer.eos_token

print("Model loaded successfully")

## 2. Load Test Data

In [None]:
# Evaluation sample configuration (kept together so it is easy to tune).
DATASET_NAME = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"
SAMPLE_SEED = 42
N_TEST_SAMPLES = 100

dataset = load_dataset(DATASET_NAME)

# NOTE(review): the evaluation rows are drawn from the 'train' split. If the adapter
# was fine-tuned on these same rows, the scores below are optimistic — confirm that
# a held-out subset is being used.
train_split = dataset['train']
n_samples = min(N_TEST_SAMPLES, len(train_split))  # never request more rows than exist
test_data = train_split.shuffle(seed=SAMPLE_SEED).select(range(n_samples))
print(f"Test samples: {len(test_data)}")

## 3. Generate Responses

In [None]:
def generate_response(query, max_new_tokens=200, temperature=0.7, top_p=0.9, do_sample=True):
    """Generate a customer-support reply for a single query.

    Relies on the notebook-level ``model`` and ``tokenizer`` objects.

    Args:
        query: The customer's question, inserted verbatim into the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.
        do_sample: Whether to sample (True) or decode deterministically (False).

    Returns:
        The generated continuation as a whitespace-stripped string; the prompt
        itself is not included.
    """
    prompt = f"""### Instruction:
You are a helpful e-commerce customer support assistant. Answer the customer's question professionally and helpfully.

### Customer Query:
{query}

### Response:"""

    # Skip token_type_ids: they would be forwarded to generate() through **inputs,
    # and generate() rejects keyword arguments the model does not consume.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=do_sample,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. Splitting the full decoded text on
    # "### Response:" would silently drop content whenever the model repeats
    # that marker inside its own answer.
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

In [None]:
# Generation samples from the model, so fix torch's RNG state first; otherwise every
# run of this cell produces different responses and therefore different scores.
torch.manual_seed(42)

results = []

for sample in tqdm(test_data, desc="Generating responses"):
    generated = generate_response(sample['instruction'])
    results.append({
        'query': sample['instruction'],
        'reference': sample['response'],
        'generated': generated,
        'category': sample['category']
    })

# Build the frame once from the collected records and persist the raw generations.
results_df = pd.DataFrame(results)
results_df.to_csv('evaluation_results.csv', index=False)
print("Results saved")

## 4. Calculate BLEU Scores

In [None]:
def calculate_bleu(reference, generated):
    """Score a generated text against one reference with sentence-level BLEU.

    Both texts are lower-cased and tokenized with ``nltk.word_tokenize``;
    ``SmoothingFunction().method1`` is applied to every score.

    Returns:
        A 3-tuple ``(bleu1, bleu2, bleu4)`` computed with n-gram weights
        (1, 0, 0, 0), (0.5, 0.5, 0, 0) and (0.25, 0.25, 0.25, 0.25).
    """
    references = [nltk.word_tokenize(reference.lower())]
    hypothesis = nltk.word_tokenize(generated.lower())

    smoothing = SmoothingFunction().method1

    weight_schemes = (
        (1, 0, 0, 0),              # BLEU-1
        (0.5, 0.5, 0, 0),          # BLEU-2
        (0.25, 0.25, 0.25, 0.25),  # BLEU-4
    )
    return tuple(
        sentence_bleu(references, hypothesis, weights=scheme, smoothing_function=smoothing)
        for scheme in weight_schemes
    )

In [None]:
BLEU_COLUMNS = ['bleu1', 'bleu2', 'bleu4']

# One record per evaluated sample, in the same order as results_df.
bleu_scores = [
    dict(zip(BLEU_COLUMNS, calculate_bleu(reference, generated)))
    for reference, generated in zip(results_df['reference'], results_df['generated'])
]

bleu_df = pd.DataFrame(bleu_scores, columns=BLEU_COLUMNS, index=results_df.index)

# Drop any score columns left over from a previous run of this cell before attaching
# the fresh ones; a plain concat would duplicate the columns on re-run and break
# every later `results_df['bleu4']` lookup.
results_df = results_df.drop(columns=BLEU_COLUMNS, errors='ignore').join(bleu_df)

In [None]:
# Report mean and standard deviation for each BLEU variant.
print("BLEU Score Summary:")
for label, column in (("BLEU-1", "bleu1"), ("BLEU-2", "bleu2"), ("BLEU-4", "bleu4")):
    scores = results_df[column]
    print(f"{label}: {scores.mean():.4f} (+/- {scores.std():.4f})")

## 5. Visualize Results

In [None]:
# One histogram panel per BLEU variant, laid out side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

panels = (('bleu1', 'BLEU-1'), ('bleu2', 'BLEU-2'), ('bleu4', 'BLEU-4'))
for ax, (column, label) in zip(axes, panels):
    ax.hist(results_df[column], bins=20, edgecolor='black')
    ax.set_title(f'{label} Distribution')
    ax.set_xlabel('Score')

# Only the left-most panel carries the y-axis label.
axes[0].set_ylabel('Frequency')

plt.tight_layout()
plt.savefig('bleu_distributions.png')
plt.show()

In [None]:
# Mean BLEU-1 / BLEU-4 per category, best BLEU-4 first.
category_bleu = (
    results_df
    .groupby('category')[['bleu1', 'bleu4']]
    .mean()
    .sort_values('bleu4', ascending=False)
)

# Create the axes explicitly and hand them to pandas. Without `ax=`, DataFrame.plot
# opens its own figure, which leaves a separately created plt.figure(...) empty and
# its figsize unused.
fig, ax = plt.subplots(figsize=(12, 6))
category_bleu.head(10).plot(kind='bar', ax=ax)
ax.set_title('BLEU Scores by Category (Top 10)')
ax.set_xlabel('Category')
ax.set_ylabel('BLEU Score')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend(['BLEU-1', 'BLEU-4'])
fig.tight_layout()
fig.savefig('bleu_by_category.png')
plt.show()

## 6. Qualitative Analysis

In [None]:
# Show the five samples with the highest BLEU-4, with truncated text previews.
banner = "=" * 80
print(banner)
print("TOP 5 BEST RESPONSES (by BLEU-4)")
print(banner)

top_samples = results_df.nlargest(5, 'bleu4')
for record in top_samples.to_dict('records'):
    query_preview = record['query'][:100]
    reference_preview = record['reference'][:150]
    generated_preview = record['generated'][:150]
    print(f"\nQuery: {query_preview}...")
    print(f"Reference: {reference_preview}...")
    print(f"Generated: {generated_preview}...")
    print(f"BLEU-4: {record['bleu4']:.4f}")
    print("-" * 40)

In [None]:
# Show the five samples with the lowest BLEU-4, with truncated text previews.
banner = "=" * 80
print(banner)
print("BOTTOM 5 RESPONSES (by BLEU-4)")
print(banner)

bottom_samples = results_df.nsmallest(5, 'bleu4')
for record in bottom_samples.to_dict('records'):
    query_preview = record['query'][:100]
    reference_preview = record['reference'][:150]
    generated_preview = record['generated'][:150]
    print(f"\nQuery: {query_preview}...")
    print(f"Reference: {reference_preview}...")
    print(f"Generated: {generated_preview}...")
    print(f"BLEU-4: {record['bleu4']:.4f}")
    print("-" * 40)

## 7. Response Length Analysis

In [None]:
# Lengths are measured in characters (Series.str.len).
results_df['ref_length'] = results_df['reference'].str.len()
results_df['gen_length'] = results_df['generated'].str.len()

fig, (ax_length, ax_bleu) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: generated vs. reference length, with a y = x guide line.
ax_length.scatter(results_df['ref_length'], results_df['gen_length'], alpha=0.5)
# Span the guide line over the observed range instead of a fixed 0..1000, so it
# still covers the data when responses are longer (or much shorter) than 1000.
max_length = max(results_df['ref_length'].max(), results_df['gen_length'].max())
ax_length.plot([0, max_length], [0, max_length], 'r--')
ax_length.set_xlabel('Reference Length')
ax_length.set_ylabel('Generated Length')
ax_length.set_title('Response Length Comparison')

# Right panel: does the generated length relate to the BLEU-4 score?
ax_bleu.scatter(results_df['gen_length'], results_df['bleu4'], alpha=0.5)
ax_bleu.set_xlabel('Generated Response Length')
ax_bleu.set_ylabel('BLEU-4 Score')
ax_bleu.set_title('Length vs BLEU-4')

fig.tight_layout()
fig.savefig('length_analysis.png')
plt.show()

## 8. Save Final Results

In [None]:
# Display label -> results_df column for each reported metric.
metric_columns = {'BLEU-1': 'bleu1', 'BLEU-2': 'bleu2', 'BLEU-4': 'bleu4'}

# One row per metric, one column per summary statistic.
summary = {'metric': list(metric_columns)}
for stat in ('mean', 'std', 'min', 'max'):
    summary[stat] = [
        getattr(results_df[column], stat)() for column in metric_columns.values()
    ]

summary_df = pd.DataFrame(summary)
summary_df.to_csv('evaluation_summary.csv', index=False)

# Re-export the per-sample results: the earlier export of evaluation_results.csv ran
# before any BLEU score had been computed, so that file did not contain the scores.
results_df.to_csv('evaluation_results.csv', index=False)

print(summary_df)

## Summary

Evaluation complete. Output files:
- `evaluation_results.csv` - All generated responses with BLEU scores
- `evaluation_summary.csv` - Summary statistics
- `bleu_distributions.png` - BLEU score distributions
- `bleu_by_category.png` - Performance by category
- `length_analysis.png` - Response length analysis