# 🎯 Cebu Pacific Customer Support Agent Optimization

**Demonstration of DSPy Agent Optimization**

This notebook demonstrates how DSPy automatically optimizes a customer support agent:
- **Step 1**: Show the Problem (unoptimized agent)
- **Step 2**: Run DSPy Optimization (MIPROv2)
- **Step 3**: Show the Results (optimized agent)
- **Step 4**: Calculate Business Impact ($821K/year savings)

**Technology Stack:**
- DSPy Framework
- Groq LLM (llama-3.1-8b-instant)
- MIPROv2 Optimizer
- MLflow Tracking

**Dataset:**
- 50 training examples (past successful resolutions)
- 20 validation examples (test scenarios)
- Real Cebu Pacific customer support tickets

In [None]:
# ============================================================================
# CELL 2: Install and Import Required Packages
# ============================================================================

# Verify required packages (run once per environment).
# This cell does not install anything itself, so it checks that every
# third-party package the notebook imports can actually be found and only
# reports success when that is true.
import importlib.util
import sys

# Import name -> pip distribution name for the third-party packages imported
# by the cells below.
REQUIRED_PACKAGES = {
    "dspy": "dspy",
    "mlflow": "mlflow",
    "matplotlib": "matplotlib",
    "numpy": "numpy",
    "dotenv": "python-dotenv",
}


def find_missing_packages(required):
    """Return the pip names of the packages in *required* that are not importable.

    Args:
        required: Mapping of top-level import name to pip distribution name.

    Returns:
        List of pip distribution names whose import name could not be located,
        in the mapping's iteration order. Empty when everything is available.
    """
    return [
        pip_name
        for module_name, pip_name in required.items()
        if importlib.util.find_spec(module_name) is None
    ]


missing_packages = find_missing_packages(REQUIRED_PACKAGES)
if missing_packages:
    print(f"❌ Missing packages: {', '.join(missing_packages)}")
    # Use the running interpreter so the install lands in this kernel's environment.
    print(f"   Install with: {sys.executable} -m pip install {' '.join(missing_packages)}")
else:
    print("✅ Packages installed successfully!")

In [None]:
# ============================================================================
# CELL 3: Import Libraries
# ============================================================================

# Imports shared by the notebook cells that follow.
import os        # reading/setting the GROQ_API_KEY environment variable
import json      # parsing the JSONL dataset files
import time      # timing agent responses
import dspy      # agent module, LM configuration and optimizer
import mlflow    # NOTE(review): not referenced in the cells visible here - presumably used for tracking later; confirm
import matplotlib.pyplot as plt  # dataset statistics charts
import numpy as np               # means for charts and score averages
from typing import List          # NOTE(review): not referenced in the cells visible here - confirm it is needed
from datetime import datetime    # NOTE(review): not referenced in the cells visible here - confirm it is needed

print("‚úÖ All libraries imported successfully!")

In [None]:
# ============================================================================
# CELL 4: Setup Groq API Key
# ============================================================================

# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()

# Load Groq API key from environment variable (None when it is not set)
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

if GROQ_API_KEY:
    # Set in environment for DSPy
    os.environ["GROQ_API_KEY"] = GROQ_API_KEY
    print("✅ Groq API key configured")
    # Show only a short prefix: notebook output is saved and shared, so the
    # bulk of the secret must never be echoed.
    print(f"   Key: {GROQ_API_KEY[:4]}... (hidden)")
else:
    # os.environ only accepts str values; assigning None would raise TypeError
    # before the "not found" message could be shown, so skip the assignment.
    print("❌ Groq API key not configured")
    print("   Key: Not found!")

In [None]:
# ============================================================================
# CELL 5: Configure DSPy with Groq LLM
# ============================================================================

# Configure DSPy to use Groq's llama-3.1-8b-instant.
# The settings are named once and reused for both the LM and the printed
# summary, so the summary cannot drift from the real configuration.
LM_MODEL = 'groq/llama-3.1-8b-instant'
LM_MAX_TOKENS = 800      # Sufficient for support responses
LM_TEMPERATURE = 0.7     # Balance between consistency and creativity

lm = dspy.LM(
    LM_MODEL,
    api_key=GROQ_API_KEY,
    max_tokens=LM_MAX_TOKENS,
    temperature=LM_TEMPERATURE,
)

dspy.configure(lm=lm)

# Drop the "groq/" provider prefix for the headline.
print(f"✅ DSPy configured with Groq {LM_MODEL.split('/', 1)[-1]}")
print(f"   Model: {LM_MODEL}")
print(f"   Max tokens: {LM_MAX_TOKENS}")
print(f"   Temperature: {LM_TEMPERATURE}")

In [None]:
# ============================================================================
# CELL 6: Load Training and Validation Datasets
# ============================================================================

def load_examples(path, require_answer=True):
    """Load a JSONL file of support tickets as DSPy examples.

    Each non-blank line must be a JSON object with a "customer_query" key,
    which becomes the example's `query` input, and optionally a "resolution"
    key, which becomes the example's `answer`.

    Args:
        path: Path to the JSONL file (read as UTF-8).
        require_answer: When True a missing "resolution" raises KeyError;
            when False it defaults to an empty string.

    Returns:
        List of `dspy.Example` objects with `query` marked as the input field.

    Raises:
        FileNotFoundError: If *path* does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a required key is missing from a record.
    """
    examples = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            record = json.loads(line)
            if require_answer:
                answer = record["resolution"]
            else:
                answer = record.get("resolution", "")
            examples.append(
                dspy.Example(
                    query=record["customer_query"],
                    answer=answer,
                ).with_inputs("query")
            )
    return examples


# Load training dataset (examples of successful resolutions; answer required)
trainset = load_examples("cebu_pacific_trainset.jsonl")

# Load validation dataset (test scenarios; resolution may be absent)
valset = load_examples("cebu_pacific_valset.jsonl", require_answer=False)

print("✅ Datasets loaded successfully!")
print(f"   Training set: {len(trainset)} examples")
print(f"   Validation set: {len(valset)} examples")
if trainset:
    # Only show a sample when there is one; indexing an empty list would raise.
    print("\n📊 Sample training example:")
    print(f"   Query: {trainset[0].query[:100]}...")
    print(f"   Answer: {trainset[0].answer[:100]}...")

In [None]:
# ============================================================================
# CELL 7: Visualize Dataset Statistics
# ============================================================================

# Calculate query and answer lengths (in characters)
train_query_lengths = [len(ex.query) for ex in trainset]
train_answer_lengths = [len(ex.answer) for ex in trainset]
# NOTE(review): not used by the charts below; kept in case a later cell reads it.
val_query_lengths = [len(ex.query) for ex in valset]

# Compute each mean once and reuse it for the marker line, legend and summary.
avg_query_length = np.mean(train_query_lengths)
avg_answer_length = np.mean(train_answer_lengths)


def plot_length_histogram(ax, lengths, mean_length, title, bar_color, mean_color):
    """Draw a 20-bin length histogram on *ax* with a dashed line at the mean.

    Args:
        ax: Matplotlib axes to draw on.
        lengths: Sequence of lengths (characters) to bin.
        mean_length: Value at which to draw the vertical marker line.
        title: Axes title.
        bar_color: Fill colour of the histogram bars.
        mean_color: Colour of the dashed mean line.
    """
    ax.hist(lengths, bins=20, alpha=0.7, color=bar_color, edgecolor='black')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel('Characters')
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)
    ax.axvline(mean_length, color=mean_color, linestyle='--',
               label=f'Avg: {mean_length:.0f}')
    ax.legend()


# Create visualizations
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Query length distribution
plot_length_histogram(axes[0], train_query_lengths, avg_query_length,
                      'Customer Query Length Distribution', '#4ECDC4', 'red')

# Answer length distribution
plot_length_histogram(axes[1], train_answer_lengths, avg_answer_length,
                      'Expert Resolution Length Distribution', '#FF6B6B', 'blue')

# Dataset size comparison: labels are built from the real sizes so they always
# agree with the bar heights.
sizes = [len(trainset), len(valset)]
datasets = [f'Training\n({sizes[0]})', f'Validation\n({sizes[1]})']
bars = axes[2].bar(datasets, sizes, color=['#4ECDC4', '#FF6B6B'], alpha=0.7, edgecolor='black')
axes[2].set_title('Dataset Sizes', fontsize=12, fontweight='bold')
axes[2].set_ylabel('Number of Examples')
axes[2].grid(True, alpha=0.3, axis='y')

# Annotate each bar with its count, just above the bar top.
for bar, size in zip(bars, sizes):
    axes[2].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                 f'{size}', ha='center', va='bottom', fontweight='bold', fontsize=11)

plt.tight_layout()
plt.show()

print("✅ Dataset statistics:")
print(f"   Avg query length: {avg_query_length:.0f} characters")
print(f"   Avg resolution length: {avg_answer_length:.0f} characters")
if train_query_lengths:
    # min()/max() raise ValueError on an empty sequence.
    print(f"   Query length range: {min(train_query_lengths)} - {max(train_query_lengths)}")

In [None]:
# ============================================================================
# CELL 8: Create Support Agent Module (Simple ChainOfThought)
# ============================================================================

# Create a simple support agent using DSPy's ChainOfThought
# This will be optimized later with MIPROv2

class SupportAgent(dspy.Module):
    """Customer support agent backed by a single ChainOfThought predictor.

    The predictor uses the "query -> answer" signature: it receives the
    customer's query and produces an answer.
    """

    def __init__(self):
        """Build the ChainOfThought predictor used by `forward`."""
        super().__init__()
        self.generate_response = dspy.ChainOfThought("query -> answer")

    def forward(self, query):
        """Run the predictor on *query* and return its prediction unchanged."""
        return self.generate_response(query=query)

# Create original (unoptimized) agent; kept under this name so later cells can
# compare it against the optimized version.
original_agent = SupportAgent()

print("✅ Support agent created!")
print("   Architecture: ChainOfThought (query -> answer)")
print("   Status: Unoptimized (no instructions, no few-shot examples)")

In [None]:
# ============================================================================
# CELL 9: STEP 1 - Show the Problem (Unoptimized Agent)
# ============================================================================

print("=" * 80)
print("STEP 1: THE PROBLEM - Unoptimized Agent Performance")
print("=" * 80)

# Test on a challenging customer query (informal, urgent, no punctuation)
test_query = "hi cant check in online it says booking not found but i have confirmation email flight tomorrow help!!!"

print("\n🔴 UNOPTIMIZED AGENT TEST\n")
print("Customer Query:")
print(f'"{test_query}"')
print("\n" + "-" * 80)

# Time the response with a monotonic clock; time.time() can jump if the system
# clock is adjusted mid-call.
start_time = time.perf_counter()
unoptimized_response = original_agent(query=test_query)
unoptimized_time = time.perf_counter() - start_time

print("\n💬 Unoptimized Agent Response:")
print(f"{unoptimized_response.answer}")
print(f"\n⏱️  Response time: {unoptimized_time:.2f} seconds")
# NOTE: the analysis lines below are fixed presenter commentary; they are
# printed regardless of what the agent actually answered.
print("\n📊 Analysis:")
print("   ❌ Generic and unhelpful")
print("   ❌ No specific troubleshooting steps")
print("   ❌ No actionable solutions")
print("   ❌ Customer still frustrated")
print("\n" + "=" * 80)

# Store for comparison
unoptimized_result = {
    "query": test_query,
    "response": unoptimized_response.answer,
    "time": unoptimized_time
}


In [None]:
# ============================================================================
# CELL 10: Define Evaluation Metric
# ============================================================================

# Symbols the metric looks for, written as escape sequences so the checks keep
# working even if this file is re-saved with the wrong text encoding.
_CHECK_MARKS = ("\u2705", "\u2713")  # white heavy check mark, check mark
_PESO_SIGN = "\u20b1"                # Philippine peso sign


def support_quality_metric(example, pred, trace=None):
    """Score a support response by the fraction of quality indicators present.

    Five indicators are checked with simple substring tests:
      1. structured guidance ("step" or "option"),
      2. detail (more than 200 characters),
      3. a positive indicator (a check mark or "yes"),
      4. contact information ("@", "www" or "phone"),
      5. specific fee/price information (peso sign, "php" or "fee").

    NOTE: matching is by substring, so e.g. "yesterday" satisfies "yes" and
    "feel" satisfies "fee".

    Args:
        example: The dataset example being scored (not used by this metric).
        pred: A prediction exposing an `answer` attribute; any other object
            is scored on its `str()` form.
        trace: Accepted for compatibility with the metric call signature
            (not used by this metric).

    Returns:
        Float in [0.0, 1.0]: number of satisfied indicators divided by five.
    """
    answer = pred.answer if hasattr(pred, 'answer') else str(pred)
    if answer is None:
        # A prediction with no parsed answer satisfies no indicator.
        answer = ""
    elif not isinstance(answer, str):
        answer = str(answer)

    lowered = answer.lower()

    # Key elements of good support response
    quality_indicators = [
        "step" in lowered or "option" in lowered,  # Structured guidance
        len(answer) > 200,  # Detailed response
        any(mark in answer for mark in _CHECK_MARKS) or "yes" in lowered,  # Positive indicators
        "@" in answer or "www" in answer or "phone" in lowered,  # Contact info
        _PESO_SIGN in answer or "php" in lowered or "fee" in lowered,  # Specific info
    ]

    # Score is the fraction of quality indicators present
    return sum(quality_indicators) / len(quality_indicators)


print("✅ Evaluation metric defined: support_quality_metric")
print("   Checks for:")
print("   - Structured guidance (steps/options)")
print("   - Detailed response (>200 chars)")
print("   - Positive indicators")
print("   - Contact information")
print("   - Specific details (fees, policies)")

In [None]:
# ============================================================================
# CELL 11: Evaluate Original Agent (Baseline)
# ============================================================================

print("\n" + "=" * 80)
print("BASELINE EVALUATION: Original Agent on Validation Set")
print("=" * 80)

# Minimum score for a response (or the average) to count as acceptable.
PASS_THRESHOLD = 0.6

# Evaluate on a subset of validation set for speed (first 10 examples)
eval_subset = valset[:10]
# The subset can be shorter than 10, so progress is shown against its real size.
eval_total = len(eval_subset)

baseline_scores = []
print(f"\nEvaluating on {eval_total} validation examples...\n")

for i, example in enumerate(eval_subset, 1):
    try:
        pred = original_agent(query=example.query)
        score = support_quality_metric(example, pred)
    except Exception as e:
        # Best-effort evaluation: a failed call is reported and scored as 0
        # so one bad example does not abort the whole baseline run.
        print(f"  {i}/{eval_total}: Error - {str(e)[:50]}")
        baseline_scores.append(0.0)
    else:
        baseline_scores.append(score)
        status = "✅" if score >= PASS_THRESHOLD else "❌"
        print(f"  {i}/{eval_total}: Score={score:.2f} {status}")

# Guard against an empty subset, where np.mean would return nan.
baseline_avg = np.mean(baseline_scores) if baseline_scores else 0.0

print(f"\n{'=' * 80}")
print("📊 BASELINE RESULTS:")
print(f"   Average Score: {baseline_avg:.2%}")
print(f"   Status: {'✅ Acceptable' if baseline_avg >= PASS_THRESHOLD else '❌ Needs Improvement'}")

In [None]:
# ============================================================================
# CELL 12: Configure MIPROv2 Optimizer
# ============================================================================

print("=" * 80)
print("STEP 2: DSPy OPTIMIZATION - Configuring MIPROv2")
print("=" * 80)

# Optimizer settings, named once so the summary printed below always reflects
# what was actually passed to MIPROv2.
OPTIMIZER_MODE = "light"      # Light mode for faster optimization
OPTIMIZER_THREADS = 8         # Parallel evaluation
OPTIMIZER_MAX_DEMOS = 3       # Cap for both bootstrapped and labeled demos

# Configure MIPROv2 optimizer
optimizer = dspy.MIPROv2(
    metric=support_quality_metric,               # Custom quality metric
    auto=OPTIMIZER_MODE,
    num_threads=OPTIMIZER_THREADS,
    max_bootstrapped_demos=OPTIMIZER_MAX_DEMOS,  # Few-shot examples per module
    max_labeled_demos=OPTIMIZER_MAX_DEMOS,       # Maximum demos to use
)

print("\n✅ MIPROv2 Optimizer configured:")
print(f"   Mode: {OPTIMIZER_MODE} (fast, efficient)")
print(f"   Metric: {support_quality_metric.__name__}")
print(f"   Threads: {OPTIMIZER_THREADS} (parallel evaluation)")
print(f"   Max demos: {OPTIMIZER_MAX_DEMOS} per module")
print("\n🔄 The optimizer will:")
print("   1. Bootstrap few-shot examples from training data")
print("   2. Generate instruction candidates using LLM")
print("   3. Evaluate combinations on validation set")
print("   4. Select best performing configuration")
print("\n⏳ Optimization typically takes 3-5 minutes...")