# Week 6: Best Practices and Production Patterns

## Learning Objectives
- Implement robust error handling
- Optimize costs and performance
- Test LLM applications effectively
- Design for production deployment
- Consider security and privacy

In [None]:
import os
import time
from dotenv import load_dotenv
from openai import OpenAI, OpenAIError, RateLimitError, APIError

# Populate the process environment via python-dotenv before the key is read.
load_dotenv()
# Shared client used by the code cells below; the key comes from the
# OPENAI_API_KEY environment variable instead of being hard-coded here.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

## Part 1: Error Handling

In [None]:
def robust_api_call(messages, max_retries=3, **kwargs):
    """Call the chat completions API, retrying transient failures.

    Failures are reported through the return value (a string starting with
    "Error:") rather than by raising, so callers can always print the result.

    Args:
        messages: Chat messages to send, e.g.
            ``[{"role": "user", "content": "Say hello!"}]``.
        max_retries: Total number of attempts (not the number of *extra*
            retries). With 0 or less, no request is made at all.
        **kwargs: Request options forwarded to
            ``client.chat.completions.create``. ``model`` defaults to
            'gpt-4o-mini', ``temperature`` to 0.7 and ``max_tokens`` to None;
            any other option (``top_p``, ``response_format``, ...) is passed
            through unchanged instead of being silently dropped.

    Returns:
        The text content of the first choice on success, otherwise an
        "Error: ..." string describing what went wrong.
    """
    # Defaults first, then caller overrides, so every supplied option is sent.
    request = {"model": "gpt-4o-mini", "temperature": 0.7, "max_tokens": None}
    request.update(kwargs)

    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        try:
            response = client.chat.completions.create(messages=messages, **request)
            return response.choices[0].message.content

        except RateLimitError:
            if is_last_attempt:
                return "Error: Rate limit exceeded. Please try again later."
            wait_time = 2 ** attempt  # Exponential backoff: 1s, 2s, 4s, ...
            print(f"Rate limit hit. Waiting {wait_time}s...")
            time.sleep(wait_time)

        except APIError as e:
            print(f"API error: {e}")
            # Errors without an HTTP status (e.g. connection problems) and
            # 408/409/5xx responses may succeed on a later attempt. Other 4xx
            # responses (bad request, invalid key, ...) will fail identically
            # every time, so retrying them only wastes time.
            status = getattr(e, "status_code", None)
            retryable = status is None or status >= 500 or status in (408, 409)
            if is_last_attempt or not retryable:
                return f"Error: API unavailable - {str(e)}"
            time.sleep(1)

        except Exception as e:
            # Deliberate catch-all: this helper never raises to its caller.
            print(f"Unexpected error: {e}")
            return f"Error: {str(e)}"

    # Only reachable when the loop body never ran (max_retries <= 0).
    return "Error: Max retries exceeded"

# Smoke test: send one prompt through the retry wrapper and print what comes
# back. On failure the wrapper returns an "Error: ..." string, so this cell
# prints that text rather than raising.
result = robust_api_call(
    messages=[{"role": "user", "content": "Say hello!"}]
)
print(result)

## Part 2: Cost Management

In [None]:
class CostTracker:
    """Accumulate token usage and estimated dollar cost across API calls.

    Attributes:
        calls: One dict per tracked call with the keys "model",
            "input_tokens", "output_tokens" and "cost".
        total_cost: Running sum of the cost of every tracked call, in dollars.
    """

    # Price in dollars per 1M tokens, keyed by model name. Update this table
    # when prices change or a new model is used.
    PRICING = {
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},  # per 1M tokens
        "gpt-4o": {"input": 2.50, "output": 10.00},
        "text-embedding-3-small": {"input": 0.02, "output": 0}
    }

    def __init__(self):
        self.calls = []
        self.total_cost = 0.0

    def track_completion(self, response, model="gpt-4o-mini"):
        """Record the token usage of one completion and return its cost.

        Args:
            response: Completion response exposing ``usage.prompt_tokens`` and
                ``usage.completion_tokens``.
            model: Key into ``PRICING`` used to price the call. It is not read
                from the response, so pass it explicitly for other models.

        Returns:
            The estimated cost of this call in dollars.

        Raises:
            KeyError: If ``model`` has no entry in ``PRICING``. Nothing is
                recorded in that case.
        """
        input_tokens = response.usage.prompt_tokens
        output_tokens = response.usage.completion_tokens

        try:
            rates = self.PRICING[model]
        except KeyError:
            # Same exception type as a plain dict lookup, but with a message
            # that tells the caller what to fix.
            known = ", ".join(sorted(self.PRICING))
            raise KeyError(
                f"No pricing configured for model {model!r}; known models: {known}"
            ) from None

        cost = (
            (input_tokens / 1_000_000) * rates["input"] +
            (output_tokens / 1_000_000) * rates["output"]
        )

        self.calls.append({
            "model": model,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "cost": cost
        })

        self.total_cost += cost
        return cost

    def report(self):
        """Return a multi-line summary of all tracked calls.

        Returns:
            "No calls tracked" when nothing has been recorded; otherwise the
            call count, token totals, total cost and average cost per call.
        """
        if not self.calls:
            return "No calls tracked"

        total_input = sum(c['input_tokens'] for c in self.calls)
        total_output = sum(c['output_tokens'] for c in self.calls)

        lines = [
            "Cost Report:",
            f"Total calls: {len(self.calls)}",
            f"Total input tokens: {total_input:,}",
            f"Total output tokens: {total_output:,}",
            f"Total cost: ${self.total_cost:.6f}",
            f"Average cost per call: ${self.total_cost/len(self.calls):.6f}",
        ]
        return "\n".join(lines)

# Demo: make three small requests, price each one, then print the summary.
tracker = CostTracker()

for i in range(3):
    # Each iteration asks the model to count one number higher (1, 2, 3).
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": f"Count to {i+1}"}]
    )
    # No model argument is passed, so the tracker prices the call with its
    # default, which matches the model requested above.
    cost = tracker.track_completion(response)
    print(f"Call {i+1} cost: ${cost:.6f}")

print("\n" + tracker.report())

## Part 3: Testing Strategies

In [None]:
class LLMTestSuite:
    """Substring-based test suite for LLM prompts.

    Each test sends one prompt and checks, case-insensitively, that the
    reply contains every expected phrase and none of the forbidden ones.

    Attributes:
        tests: Registered test cases, in the order they were added.
        results: Result dicts from every ``run_tests`` call so far. The list
            keeps growing across runs; each run's printed summary counts only
            that run.
    """

    def __init__(self):
        self.tests = []
        self.results = []

    @staticmethod
    def _as_list(values):
        """Normalize an expectation argument to a list of phrases."""
        if values is None:
            return []
        if isinstance(values, str):
            # A bare string would otherwise be checked character by character.
            return [values]
        return list(values)

    @staticmethod
    def _check_output(output, test):
        """Return the failure messages for an already-lowercased reply."""
        failures = [
            f"Missing: {expected}"
            for expected in test["expected_contains"]
            if expected.lower() not in output
        ]
        failures.extend(
            f"Should not contain: {not_expected}"
            for not_expected in test["expected_not_contains"]
            if not_expected.lower() in output
        )
        return failures

    def add_test(self, name, prompt, expected_contains=None, expected_not_contains=None):
        """Register a test case.

        Args:
            name: Label printed next to the PASS/FAIL status.
            prompt: User message sent to the model.
            expected_contains: Phrase or list of phrases the reply must
                contain (case-insensitive).
            expected_not_contains: Phrase or list of phrases the reply must
                not contain (case-insensitive).
        """
        self.tests.append({
            "name": name,
            "prompt": prompt,
            "expected_contains": self._as_list(expected_contains),
            "expected_not_contains": self._as_list(expected_not_contains)
        })

    def run_tests(self, temperature=0, model="gpt-4o-mini"):
        """Run every registered test and print per-test status and a summary.

        Args:
            temperature: Sampling temperature sent with each request.
            model: Model sent with each request.

        Results are appended to ``self.results``; nothing is returned. An API
        exception propagates to the caller and aborts the remaining tests.
        """
        print(f"Running {len(self.tests)} tests...\n")

        # Results of this run only, so the summary stays correct when the
        # suite is run more than once.
        run_results = []

        for test in self.tests:
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": test["prompt"]}],
                temperature=temperature
            )

            # The message content can be None; treat that as an empty reply
            # so the checks fail cleanly instead of crashing on .lower().
            output = (response.choices[0].message.content or "").lower()

            failures = self._check_output(output, test)
            passed = not failures

            result = {
                "name": test["name"],
                "passed": passed,
                "output": output,
                "failures": failures
            }

            run_results.append(result)
            self.results.append(result)

            status = "✓ PASS" if passed else "✗ FAIL"
            print(f"{status}: {test['name']}")
            for failure in failures:
                print(f"  - {failure}")

        # Summary
        passed_count = sum(1 for r in run_results if r['passed'])
        print(f"\n{passed_count}/{len(self.tests)} tests passed")

# Example tests: register two cases, then run them against the live API.
suite = LLMTestSuite()

# The reply must contain the exact answer.
suite.add_test(
    "Math calculation",
    "What is 15 + 27? Reply with only the number.",
    expected_contains=["42"]
)

# The reply must explain the term in plain words and must not repeat the
# jargon itself.
suite.add_test(
    "Medical terminology",
    "Translate 'hypertension' to plain language in one sentence.",
    expected_contains=["blood pressure", "high"],
    expected_not_contains=["hypertension"]  # Should use plain language
)

# Uses the default temperature of 0 defined by run_tests.
suite.run_tests()

## Part 4: Caching for Cost Reduction

In [None]:
import hashlib
import json

class CachedLLM:
    """Chat-completion wrapper that caches responses in memory.

    Two requests share a cache entry only when their messages and every
    request option are identical. The cache is an unbounded dict that lives
    as long as the instance.

    Attributes:
        cache: Maps request key -> response text.
        hits: Number of requests answered from the cache.
        misses: Number of requests that had to call the API.
    """

    DEFAULT_MODEL = "gpt-4o-mini"

    def __init__(self):
        self.cache = {}
        self.hits = 0
        self.misses = 0

    def _request_options(self, kwargs):
        """Return the options actually sent to the API (defaults + overrides)."""
        return {"model": self.DEFAULT_MODEL, **kwargs}

    def _make_key(self, messages, **kwargs):
        """Create a cache key from the full request.

        Every option is part of the key, so requests that differ only in,
        say, ``max_tokens`` never share a cached answer.
        """
        key_dict = {
            "messages": messages,
            "options": self._request_options(kwargs),
        }
        # sort_keys makes the key independent of keyword order; default=repr
        # keeps options that are not JSON-serializable from raising here.
        key_str = json.dumps(key_dict, sort_keys=True, default=repr)
        return hashlib.sha256(key_str.encode()).hexdigest()

    def complete(self, messages, **kwargs):
        """Return the model's reply, reusing a cached reply when available.

        Args:
            messages: Chat messages to send.
            **kwargs: Request options forwarded to
                ``client.chat.completions.create``. ``model`` defaults to
                'gpt-4o-mini'; no other default is added.

        Returns:
            The text content of the first choice.
        """
        key = self._make_key(messages, **kwargs)

        if key in self.cache:
            self.hits += 1
            print("  [Cache hit]")
            return self.cache[key]

        self.misses += 1
        print("  [Cache miss - calling API]")

        # Build the options once so a caller-supplied model replaces the
        # default instead of being passed twice (which raises TypeError).
        response = client.chat.completions.create(
            messages=messages,
            **self._request_options(kwargs)
        )

        result = response.choices[0].message.content
        self.cache[key] = result
        return result

    def stats(self):
        """Return a one-line summary of hits, misses and hit rate."""
        total = self.hits + self.misses
        hit_rate = self.hits / total if total > 0 else 0
        return f"Cache stats: {self.hits} hits, {self.misses} misses ({hit_rate:.1%} hit rate)"

# Demo: send the identical request three times through one cache instance.
cached_llm = CachedLLM()

# Same request multiple times: expect one cache miss followed by two hits.
for i in range(3):
    print(f"\nRequest {i+1}:")
    # The reply is stored in `result` but not printed; the hit/miss lines
    # printed by complete() are what this demo is showing.
    result = cached_llm.complete(
        [{"role": "user", "content": "What is the capital of France?"}],
        temperature=0
    )

print("\n" + cached_llm.stats())

## Part 5: Input Validation and Safety

In [None]:
class SafeLLMApp:
    """LLM application with input validation and bounded output.

    All methods are called on the class itself. Limits are read through
    ``cls``, so a subclass can tighten or relax them by overriding
    ``MAX_INPUT_LENGTH`` / ``MAX_TOKENS``.
    """

    MAX_INPUT_LENGTH = 2000  # characters
    MAX_TOKENS = 500  # upper bound on generated tokens per reply

    @classmethod
    def validate_input(cls, user_input):
        """Validate user input before it is sent to the model.

        Args:
            user_input: Raw text supplied by the user.

        Returns:
            A ``(is_valid, message)`` tuple; ``message`` is "OK" when valid,
            otherwise the reason for rejection.
        """
        if not user_input:
            return False, "Input cannot be empty"

        # Reject non-text input explicitly rather than crashing on .strip().
        if not isinstance(user_input, str):
            return False, "Input must be a string"

        if not user_input.strip():
            return False, "Input cannot be empty"

        if len(user_input) > cls.MAX_INPUT_LENGTH:
            return False, f"Input too long (max {cls.MAX_INPUT_LENGTH} chars)"

        # Add other validation as needed
        # - Check for malicious patterns
        # - Filter PII if required
        # - Check for prompt injection attempts

        return True, "OK"

    @staticmethod
    def sanitize_output(output):
        """Clean up LLM output (currently: trim surrounding whitespace)."""
        # Remove potential PII or sensitive info
        # Format output consistently
        # Add disclaimers if needed
        return output.strip()

    @classmethod
    def safe_query(cls, user_input, system_message="You are a helpful assistant."):
        """Validate the input, query the model, and return a result dict.

        Args:
            user_input: Raw text supplied by the user.
            system_message: System prompt sent ahead of the user message.

        Returns:
            ``{"response": text}`` on success or ``{"error": reason}`` when
            validation or the API call fails. This method does not raise.
        """
        # Validate
        valid, message = cls.validate_input(user_input)
        if not valid:
            return {"error": message}

        try:
            # Call API with limits
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_input}
                ],
                max_tokens=cls.MAX_TOKENS,
                temperature=0.7
            )

            output = response.choices[0].message.content
            if output is None:
                # Report a missing reply explicitly instead of surfacing an
                # AttributeError message from .strip().
                return {"error": "Processing failed: model returned no text"}
            return {"response": cls.sanitize_output(output)}

        except Exception as e:
            # Deliberate catch-all: callers always receive a dict.
            return {"error": f"Processing failed: {str(e)}"}

# Happy path: a valid question goes to the API and prints a result dict.
result = SafeLLMApp.safe_query("What is machine learning?")
print(result)

# Validation failures: both calls below are rejected before any API request
# is made and print an {"error": ...} dict.
result = SafeLLMApp.safe_query("")  # Empty input
print(result)

result = SafeLLMApp.safe_query("x" * 3000)  # Too long (limit is 2000 chars)
print(result)

## Part 6: Production Checklist

### Before deploying to production:

#### Security
- [ ] API keys stored securely (environment variables, secret manager)
- [ ] Input validation implemented
- [ ] Output sanitization for PII
- [ ] Rate limiting on user requests
- [ ] Prompt injection protection

#### Reliability
- [ ] Error handling for all API calls
- [ ] Retry logic with exponential backoff
- [ ] Timeout handling
- [ ] Graceful degradation when API unavailable
- [ ] Logging for debugging

#### Cost Management
- [ ] Cost tracking implemented
- [ ] Token limits set appropriately
- [ ] Caching for repeated queries
- [ ] Budget alerts configured
- [ ] Regular cost monitoring

#### Quality
- [ ] Test suite with expected behaviors
- [ ] Edge case testing
- [ ] Output quality evaluation
- [ ] User feedback mechanism
- [ ] A/B testing capability

#### User Experience
- [ ] Clear error messages
- [ ] Loading indicators
- [ ] Source attribution (for RAG)
- [ ] Disclaimers where appropriate
- [ ] Feedback collection

## Key Takeaways

1. **Always handle errors** - APIs fail, networks drop, and rate limits get hit
2. **Track costs** - They add up quickly in production
3. **Test systematically** - LLM outputs are probabilistic, so test thoroughly
4. **Cache when possible** - Save money and improve speed
5. **Validate inputs** - Protect against misuse and errors
6. **Monitor in production** - Watch costs, errors, and quality

## Congratulations!

You've completed the series! You now know how to:
- Work with LLM APIs
- Build conversations
- Engineer prompts programmatically
- Use embeddings for semantic search
- Build RAG systems
- Deploy production-ready applications

### Next Steps
- Build real applications in your domain
- Explore advanced topics (fine-tuning, agents, etc.)
- Share your work with colleagues
- Keep learning as the field evolves!