# Lab 4 — Building a Mini Evaluation Pipeline

**Module reference:** [Module 5, §5.4](https://github.com/kunalsuri/prompt-engineering-playbook/blob/main/learn/05-advanced-patterns.md)

This lab builds a complete prompt evaluation pipeline:
1. Define a test suite of (input, criteria) pairs
2. Run **two prompt variants** against the suite
3. Score outputs using **heuristics** and **LLM-as-Judge**
4. Aggregate metrics and declare a winner

This mirrors the methodology in the Advanced Patterns module and the `evaluation-template.md` shared resource.

---

### Free API Options
| Provider | Free Tier | Sign Up |
|---|---|---|
| **Google Gemini** (recommended) | 15 RPM, 1M tokens/day | [aistudio.google.com/apikey](https://aistudio.google.com/apikey) |
| **Groq** | 30 RPM, 14.4K tokens/min | [console.groq.com](https://console.groq.com) |
| **OpenAI** (paid) | Pay-per-token | [platform.openai.com](https://platform.openai.com/api-keys) |

In [None]:
#@title 🔧 Setup — Run this cell first
!pip install -q openai pandas

import getpass, os, json, time

# Provider table: menu choice -> (env var that stores the key, key prompt,
# base URL for the OpenAI-compatible endpoint, model id).
# A base URL of None means the OpenAI client is built without a base_url override.
_PROVIDERS = {
    "1": (
        "GOOGLE_API_KEY",
        "Enter your Google API key: ",
        "https://generativelanguage.googleapis.com/v1beta/openai/",
        "gemini-2.0-flash",
    ),
    "2": (
        "GROQ_API_KEY",
        "Enter your Groq API key: ",
        "https://api.groq.com/openai/v1",
        "llama-3.1-8b-instant",
    ),
}
_OPENAI_PROVIDER = (
    "OPENAI_API_KEY",
    "Enter your OpenAI API key: ",
    None,
    "gpt-4o-mini",
)

for _menu_line in (
    "Choose your LLM provider (all work with this lab):",
    "  1. Google Gemini (FREE ‚Äî recommended)",
    "  2. Groq (FREE)",
    "  3. OpenAI (paid)",
):
    print(_menu_line)
choice = input("\nEnter 1, 2, or 3: ").strip()

# Any answer other than "1" or "2" selects the OpenAI entry.
_env_name, _key_prompt, BASE_URL, MODEL = _PROVIDERS.get(choice, _OPENAI_PROVIDER)
os.environ[_env_name] = getpass.getpass(_key_prompt)
API_KEY = os.environ[_env_name]

from openai import OpenAI

# Only pass base_url when a provider-specific endpoint was selected.
client_kwargs = {"api_key": API_KEY, **({"base_url": BASE_URL} if BASE_URL else {})}
client = OpenAI(**client_kwargs)

def complete(prompt, *, system="", temperature=0.3, max_tokens=1024):
    """Send one chat-completion request to the configured model and return its text.

    Args:
        prompt: Text sent as the user message.
        system: Optional system message; omitted from the request when empty.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.

    Returns:
        The content of the first choice, or "" when that content is empty/None.
    """
    user_turn = {"role": "user", "content": prompt}
    conversation = (
        [{"role": "system", "content": system}, user_turn] if system else [user_turn]
    )
    reply = client.chat.completions.create(
        model=MODEL,
        messages=conversation,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    text = reply.choices[0].message.content
    return text if text else ""

print(f"\n‚úÖ Connected to {MODEL}")

## Test Suite & Prompt Variants

We test **email subject-line generation** with 5 diverse email bodies. Two prompt variants are compared:
- **Variant A** — basic, minimal instruction
- **Variant B** — constrained (length, verb usage, no spam words)

In [None]:
# Each pair is (email body, quality criteria handed to the judge for that body).
_CASES = (
    (
        "We're launching a 30% off sale on all running shoes this weekend.",
        "mentions discount percentage and product category",
    ),
    (
        "Reminder: your annual subscription renews in 3 days. Update payment info if needed.",
        "conveys urgency and mentions renewal timeline",
    ),
    (
        "Thank you for attending our webinar on cloud security. Here are the slides and recording.",
        "references the webinar topic and mentions deliverables",
    ),
    (
        "We've updated our privacy policy effective January 1. Please review the changes.",
        "mentions policy update and effective date",
    ),
    (
        "Congratulations! You've been selected for early access to our new AI assistant.",
        "conveys exclusivity and names the product",
    ),
)
TEST_SUITE = [{"input": body, "criteria": criteria} for body, criteria in _CASES]

# System message shared by both prompt variants.
SYSTEM = "You are an email marketing specialist."

# Variant A: bare instruction, no constraints. `{body}` is filled via str.format.
VARIANT_A = "\n\n".join([
    "Write a subject line for this email:",
    "{body}",
    "Subject line:",
])

# Variant B: same task with an explicit list of constraints ahead of the email body.
_VARIANT_B_RULES = "\n".join([
    "Write a compelling email subject line that is:",
    "- Under 60 characters",
    "- Action-oriented (uses a verb)",
    "- Specific about the key benefit or information",
    "- Free of spam trigger words (FREE, URGENT, !!!)",
])
VARIANT_B = "\n\n".join([
    _VARIANT_B_RULES,
    "Email body:\n{body}",
    "Subject line:",
])

_n_cases = len(TEST_SUITE)
_len_a = len(VARIANT_A)
_len_b = len(VARIANT_B)
print(f"Test suite: {_n_cases} cases")
print(f"Variant A: basic prompt ({_len_a} chars)")
print(f"Variant B: constrained prompt ({_len_b} chars)")

## Step 1 — Generate Subject Lines

In [None]:
import pandas as pd

def _first_subject_line(raw):
    """Return the first non-blank line of *raw* without surrounding quotes.

    Quotes are stripped per line, after the line is selected, so a reply such
    as '"Subject"' followed by extra commentary yields 'Subject'.
    Returns "" when *raw* has no non-blank line (e.g. the model returned
    nothing), instead of raising IndexError.
    """
    for line in raw.splitlines():
        candidate = line.strip().strip("\"'").strip()
        if candidate:
            return candidate
    return ""


# Generated subject lines per variant, in TEST_SUITE order.
variant_outputs = {"A": [], "B": []}

for i, case in enumerate(TEST_SUITE):
    for variant_name, template in (("A", VARIANT_A), ("B", VARIANT_B)):
        prompt = template.format(body=case["input"])
        raw = complete(prompt, system=SYSTEM, temperature=0.3)
        subject = _first_subject_line(raw)
        variant_outputs[variant_name].append(subject)
    # Denominator follows the suite size rather than a hard-coded 5.
    print(f"  Case {i + 1}/{len(TEST_SUITE)} done")
    time.sleep(0.3)  # gentle rate limiting

# Preview table; strings are truncated for display only, the full
# subject lines stay in variant_outputs.
df_gen = pd.DataFrame({
    "Email (truncated)": [c["input"][:45] + "..." for c in TEST_SUITE],
    "Variant A": [s[:55] for s in variant_outputs["A"]],
    "Variant B": [s[:55] for s in variant_outputs["B"]],
})
display(df_gen)
print("\n✅ All subject lines generated.")

## Step 2 — Heuristic Checks (Length ≤ 60 chars)

In [None]:
# Length budget used by this heuristic (a subject passes when len <= this value).
MAX_SUBJECT_LEN = 60


def _fits(subject):
    """Return True when *subject* is within MAX_SUBJECT_LEN characters."""
    return len(subject) <= MAX_SUBJECT_LEN


# One table row per test case, pairing the A and B outputs for the same email.
rows = [
    {
        "Case": case_no,
        "A Length": len(a_subj),
        "A Pass": "✓" if _fits(a_subj) else "✗",
        "B Length": len(b_subj),
        "B Pass": "✓" if _fits(b_subj) else "✗",
    }
    for case_no, (a_subj, b_subj) in enumerate(
        zip(variant_outputs["A"], variant_outputs["B"]), start=1
    )
]

df_len = pd.DataFrame(rows)
display(df_len)

# Pass counts are read again in the aggregation step, so keep them as named values.
a_pass = sum(1 for s in variant_outputs["A"] if _fits(s))
b_pass = sum(1 for s in variant_outputs["B"] if _fits(s))
print(f"\nLength compliance: Variant A = {a_pass}/{len(TEST_SUITE)}, Variant B = {b_pass}/{len(TEST_SUITE)}")

## Step 3 — LLM-as-Judge Scoring

In [None]:
# System message for the judge model.
JUDGE_SYSTEM = "You are a strict email marketing evaluator. Return ONLY valid JSON."

# Scoring rubric: (JSON key the judge should emit, question shown to the judge).
_RUBRIC = (
    ("relevance", "Does it accurately reflect the email content?"),
    ("clarity", "Is it clear and easy to understand?"),
    ("engagement", "Would it entice the reader to open the email?"),
    ("criteria_met", "Does it meet the specific criteria above?"),
    ("conciseness", "Is it appropriately brief?"),
)

# Prompt for the judge; {body}, {subject} and {criteria} are filled via str.format.
_JUDGE_HEADER = (
    "Evaluate this email subject line on each criterion (1-5 scale).\n\n"
    "Email body: {body}\n"
    "Subject line: {subject}\n"
    "Quality criteria: {criteria}\n\n"
    "Score these dimensions and return ONLY a JSON object:\n"
)
_JUDGE_RUBRIC_LINES = "".join(
    f"- {key}: {question} (1-5)\n" for key, question in _RUBRIC
)
JUDGE_TEMPLATE = _JUDGE_HEADER + _JUDGE_RUBRIC_LINES + "\nJSON:"

# Score keys, in the order they are reported.
dimensions = [key for key, _question in _RUBRIC]
def _parse_judge_scores(raw):
    """Decode the JSON object embedded in a judge reply.

    Takes the span from the first "{" to the last "}" in *raw* and parses it
    with json.loads. Returns the decoded dict, or {} when no such span exists
    or the span is not valid JSON.
    """
    start, end = raw.find("{"), raw.rfind("}")
    if start < 0 or end < start:
        return {}
    try:
        parsed = json.loads(raw[start:end + 1])
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return {}
    return parsed if isinstance(parsed, dict) else {}


# Judge scores per variant, in TEST_SUITE order; each entry is a dict (possibly empty).
all_scores = {"A": [], "B": []}

for i, case in enumerate(TEST_SUITE):
    for variant_name in ("A", "B"):
        subject = variant_outputs[variant_name][i]
        prompt = JUDGE_TEMPLATE.format(
            body=case["input"], subject=subject, criteria=case["criteria"]
        )
        raw = complete(prompt, system=JUDGE_SYSTEM, temperature=0.0)
        scores = _parse_judge_scores(raw)
        if not scores:
            # Surface dropped results instead of letting them vanish from the averages.
            print(f"    Warning: no parseable scores for case {i + 1}, variant {variant_name}")
        all_scores[variant_name].append(scores)
        time.sleep(0.3)
    # Denominator follows the suite size rather than a hard-coded 5.
    print(f"  Case {i + 1}/{len(TEST_SUITE)} judged")

# Per-case detail
for i, case in enumerate(TEST_SUITE):
    print(f"\n  Email {i+1}: {case['input'][:50]}...")
    for v in ("A", "B"):
        s = all_scores[v][i]
        # '?' marks a dimension the judge did not return for this case.
        score_str = ", ".join(f"{d}={s.get(d, '?')}" for d in dimensions)
        print(f"    Variant {v}: {score_str}")

print("\n✅ Judging complete!")

## Step 4 — Aggregate Results & Declare Winner

In [None]:
def avg_score(variant, dim):
    """Return the mean judge score for one variant on one dimension.

    Args:
        variant: Key into the module-level ``all_scores`` dict ("A" or "B").
        dim: Score key to average (e.g. "relevance").

    Returns:
        The mean of the numeric values found under *dim*, or 0.0 when no case
        has a numeric value for it. Missing keys, strings and booleans are
        skipped; booleans are excluded explicitly because ``bool`` is a
        subclass of ``int`` and a JSON ``true`` would otherwise count as 1.
    """
    vals = [
        value
        for value in (entry.get(dim) for entry in all_scores[variant])
        if isinstance(value, (int, float)) and not isinstance(value, bool)
    ]
    return sum(vals) / len(vals) if vals else 0.0

# Compare the two variants dimension by dimension on their mean judge scores.
rows = []
a_wins, b_wins = 0, 0
for dim in dimensions:
    a_avg = avg_score("A", dim)
    b_avg = avg_score("B", dim)
    if a_avg > b_avg:
        winner = "A"
        a_wins += 1
    elif b_avg > a_avg:
        winner = "B"
        b_wins += 1
    else:
        winner = "Tie"
    rows.append({
        "Dimension": dim.replace("_", " ").title(),
        "Variant A": f"{a_avg:.1f}",
        "Variant B": f"{b_avg:.1f}",
        "Winner": winner,
    })

# Add length compliance row. It is shown in the table but does not feed
# a_wins / b_wins, which count judged dimensions only.
if a_pass > b_pass:
    length_winner = "A"
elif b_pass > a_pass:
    length_winner = "B"
else:
    length_winner = "Tie"
rows.append({
    "Dimension": "Length ≤60",
    "Variant A": f"{a_pass}/{len(TEST_SUITE)}",
    "Variant B": f"{b_pass}/{len(TEST_SUITE)}",
    "Winner": length_winner,
})

df_results = pd.DataFrame(rows)
display(df_results)

overall = "Variant B" if b_wins > a_wins else ("Variant A" if a_wins > b_wins else "Tie")
print(f"\n🏆 Variant A won {a_wins} dimensions, Variant B won {b_wins} dimensions.")
print(f"   Overall winner: {overall}")
print()
# Only claim that Variant B is better when the measured results say so.
if overall == "Variant B":
    print("📝 Takeaway: Variant B's explicit constraints (length, verb usage, no spam words)")
    print("give the model concrete success criteria, producing more consistent, higher-quality")
    print("output. This demonstrates the value of building evaluation pipelines to objectively")
    print("compare prompt variants.")
else:
    print("📝 Takeaway: in this run Variant B's explicit constraints did not beat the basic")
    print("prompt on the judged dimensions. A small test suite scored by a single LLM judge")
    print("can vary between runs, which is why prompt variants should be compared with an")
    print("evaluation pipeline rather than by intuition.")
print("See Module 5, §5.4 and prompts/shared/evaluation-template.md for methodology.")