# 12: Fine-Tuning and Deployment

**Duration:** 3-4 hours | **Difficulty:** Advanced

## Learning Objectives
- Model fine-tuning strategies
- Deployment optimization techniques
- Production monitoring and maintenance
- Complete chatbot deployment pipeline

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import json
import time
import os
from typing import Dict, Any

import sys
sys.path.append('../')
from utils.text_utils import SimpleTokenizer
from utils.model_helpers import get_device, count_parameters, save_checkpoint

# Resolve the compute device once; later cells read this module-level `device`.
# NOTE(review): "auto" presumably picks an accelerator when one is available —
# confirm against utils.model_helpers.get_device.
device = get_device("auto")
print(f"Using device: {device}")

## Fine-Tuning Strategies

**Key approaches**: Transfer learning, domain adaptation, parameter-efficient tuning

In [None]:
class FineTuningManager:
    """Manage the fine-tuning process for chatbot models.

    Bundles three steps around a model: freezing parameters by name,
    building an AdamW optimizer with a reduced learning rate for embedding
    parameters, and running a cross-entropy training loop.
    """

    def __init__(self, model, tokenizer):
        """Store the model and tokenizer and start with an empty history."""
        self.model = model
        self.tokenizer = tokenizer
        # One {'epoch': ..., 'loss': ...} dict is appended per finished epoch.
        self.training_history = []
        # Created by setup_optimizer(); fine_tune() builds it on demand.
        self.optimizer = None

    def freeze_layers(self, freeze_embeddings=True, freeze_layers=2):
        """Freeze specific layers for transfer learning.

        A parameter is frozen (``requires_grad = False``) when either

        * ``freeze_embeddings`` is true and its name contains ``embedding``
          (case-insensitive), or
        * its name contains ``layer.<N>`` with ``N < freeze_layers``.

        Args:
            freeze_embeddings: Whether to freeze embedding parameters.
            freeze_layers: Number of leading ``layer.<N>`` indices to freeze.

        Returns:
            Tuple ``(frozen_params, total_params)`` of element counts.
        """
        import re

        # Capture the complete index so that "layer.1" does not also match
        # "layer.10", "layer.11", ... as a plain substring test would.
        layer_index = re.compile(r'layer\.(\d+)')

        frozen_params = 0
        total_params = 0

        for name, param in self.model.named_parameters():
            total_params += param.numel()

            is_embedding = freeze_embeddings and 'embedding' in name.lower()
            in_frozen_layer = any(
                int(match.group(1)) < freeze_layers
                for match in layer_index.finditer(name)
            )
            if is_embedding or in_frozen_layer:
                param.requires_grad = False
                frozen_params += param.numel()

        print(f"Frozen {frozen_params:,} / {total_params:,} parameters")
        return frozen_params, total_params

    def setup_optimizer(self, learning_rate=2e-5):
        """Setup optimizer with layer-specific learning rates.

        Trainable parameters whose name contains ``embedding`` get one tenth
        of ``learning_rate``; all other trainable parameters get the full
        rate. Frozen parameters are left out of the optimizer.

        Args:
            learning_rate: Base learning rate for non-embedding parameters.
        """
        embedding_params = []
        other_params = []
        for name, param in self.model.named_parameters():
            if not param.requires_grad:
                continue
            if 'embedding' in name.lower():
                embedding_params.append(param)
            else:
                other_params.append(param)

        # Both groups are always present (even if one is empty) so that the
        # group layout stays stable for callers that index param_groups.
        param_groups = [
            {'params': embedding_params, 'lr': learning_rate * 0.1},
            {'params': other_params, 'lr': learning_rate},
        ]

        self.optimizer = optim.AdamW(param_groups, weight_decay=0.01)
        print(f"Optimizer configured with {len(param_groups)} parameter groups")

    def fine_tune(self, dataloader, epochs=3):
        """Execute fine-tuning process.

        Each batch is an ``(inputs, targets)`` pair. Model outputs are
        flattened to ``(-1, outputs.size(-1))`` and targets to ``(-1)`` before
        the cross-entropy loss; target id 0 is ignored by the loss
        (presumably the padding id — confirm against the tokenizer).

        Args:
            dataloader: Iterable of ``(inputs, targets)`` tensor pairs.
            epochs: Number of passes over ``dataloader``.

        Raises:
            ValueError: If ``dataloader`` yields no batches in an epoch.
        """
        # Build a default optimizer instead of failing with AttributeError
        # when setup_optimizer() was not called explicitly.
        if getattr(self, 'optimizer', None) is None:
            self.setup_optimizer()

        # Send batches to wherever the model actually lives rather than to a
        # notebook-level global that may not match it.
        first_param = next(self.model.parameters(), None)
        target_device = (first_param.device if first_param is not None
                         else torch.device('cpu'))

        criterion = nn.CrossEntropyLoss(ignore_index=0)

        for epoch in range(epochs):
            self.model.train()
            total_loss = 0
            num_batches = 0

            for batch_idx, (inputs, targets) in enumerate(dataloader):
                inputs = inputs.to(target_device)
                targets = targets.to(target_device)

                self.optimizer.zero_grad()
                outputs = self.model(inputs)
                loss = criterion(outputs.reshape(-1, outputs.size(-1)), targets.reshape(-1))
                loss.backward()

                # Clip the global gradient norm to 1.0 before stepping.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                self.optimizer.step()

                total_loss += loss.item()
                num_batches += 1

                if batch_idx % 5 == 0:
                    print(f'Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}')

            if num_batches == 0:
                raise ValueError("dataloader yielded no batches; nothing to fine-tune on")

            avg_loss = total_loss / num_batches
            self.training_history.append({'epoch': epoch + 1, 'loss': avg_loss})
            print(f'Epoch {epoch+1} Average Loss: {avg_loss:.4f}')

# Load data
# Each JSON item supplies a 'question' and an 'answer'; they are kept as
# (question, answer) tuples. The file is opened as UTF-8 explicitly so the
# result does not depend on the platform's default text encoding.
with open('../data/conversations/simple_qa_pairs.json', 'r', encoding='utf-8') as f:
    conversations = [(item['question'], item['answer']) for item in json.load(f)]

# Fit the tokenizer on every question and answer string, flattened into one list.
tokenizer = SimpleTokenizer(vocab_size=1500)
all_text = [text for conv in conversations for text in conv]
tokenizer.fit(all_text)

print(f"Fine-tuning setup: {len(conversations)} conversations, {len(tokenizer.vocab)} vocab")

## Model Optimization and Deployment

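**Optimization techniques**: Quantization, pruning, model export. The code in this section demonstrates dynamic quantization and TorchScript export; pruning is listed for completeness and is not implemented here.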

In [None]:
class ModelOptimizer:
    """Optimize models for production deployment."""

    @staticmethod
    def quantize_model(model):
        """Apply dynamic quantization.

        ``nn.Linear`` modules are converted to dynamically quantized int8
        versions. The input model is left untouched: quantization is applied
        to a CPU copy, because PyTorch's quantized kernels target CPU
        backends and the caller's model may live on an accelerator.

        Args:
            model: The floating-point model to quantize.

        Returns:
            A quantized copy of ``model`` located on the CPU.
        """
        import copy

        cpu_model = copy.deepcopy(model).cpu()
        # The copy is ours, so quantize it in place rather than copying twice.
        quantized = torch.quantization.quantize_dynamic(
            cpu_model, {nn.Linear}, dtype=torch.qint8, inplace=True
        )
        return quantized

    @staticmethod
    def benchmark_model(model, sample_input, runs=50):
        """Benchmark inference speed.

        Puts the model in eval mode, performs 5 warm-up calls, then times
        ``runs`` forward passes under ``torch.no_grad()``.

        Args:
            model: Callable model to benchmark.
            sample_input: Input passed to ``model`` on every call.
            runs: Number of timed forward passes (must be >= 1).

        Returns:
            Dict with ``avg_time_ms`` (mean latency in milliseconds) and
            ``throughput`` (calls per second; ``inf`` if the measured time
            is zero).

        Raises:
            ValueError: If ``runs`` is less than 1.
        """
        if runs < 1:
            raise ValueError("runs must be a positive integer")

        model.eval()

        # CUDA kernels launch asynchronously; without a synchronize the timer
        # would only measure launch overhead.
        on_cuda = bool(getattr(sample_input, 'is_cuda', False))

        with torch.no_grad():
            # Warmup
            for _ in range(5):
                model(sample_input)
            if on_cuda:
                torch.cuda.synchronize()

            # Benchmark with a monotonic, high-resolution clock.
            start = time.perf_counter()
            for _ in range(runs):
                model(sample_input)
            if on_cuda:
                torch.cuda.synchronize()
            elapsed = time.perf_counter() - start

        avg_time = elapsed / runs
        throughput = 1.0 / avg_time if avg_time > 0 else float('inf')
        return {'avg_time_ms': avg_time * 1000, 'throughput': throughput}

class DeploymentPipeline:
    """Complete deployment pipeline.

    Benchmarks a model, optionally quantizes it, and exports the result as
    a TorchScript file together with a JSON dump of the tokenizer.
    """

    def __init__(self, model, tokenizer):
        """Store the model and tokenizer and create the optimizer helper."""
        self.model = model
        self.tokenizer = tokenizer
        self.optimizer = ModelOptimizer()
        # Set by prepare_for_deployment(); export_model() falls back to
        # self.model while this is still None.
        self.optimized_model = None

    @staticmethod
    def _device_of(module):
        """Return the device of the module's first parameter or buffer (CPU if it has none)."""
        tensor = next(module.parameters(), None)
        if tensor is None:
            tensor = next(module.buffers(), None)
        return tensor.device if tensor is not None else torch.device('cpu')

    def _sample_input(self, target_device):
        """Build a random (1, 20) batch of token ids on ``target_device``."""
        return torch.randint(0, len(self.tokenizer.vocab), (1, 20), device=target_device)

    def prepare_for_deployment(self, quantize=True):
        """Optimize model for deployment.

        Benchmarks the original model, optionally quantizes it and
        benchmarks the quantized version, printing the results.

        Args:
            quantize: Whether to apply dynamic quantization.

        Returns:
            The optimized model (the original model when ``quantize`` is
            false). It is also stored on ``self.optimized_model``.
        """
        # Create the sample on the model's own device; a CPU-only sample
        # would fail against a model that was moved to an accelerator.
        sample_input = self._sample_input(self._device_of(self.model))

        # Original performance
        original_params = count_parameters(self.model)
        original_perf = self.optimizer.benchmark_model(self.model, sample_input)

        print("Original model:")
        print(f"  Parameters: {original_params['total']:,}")
        print(f"  Inference: {original_perf['avg_time_ms']:.2f} ms")

        # Apply optimizations
        optimized_model = self.model

        if quantize:
            optimized_model = self.optimizer.quantize_model(optimized_model)
            # The quantized model may live on a different device than the
            # original, so move the sample to wherever it ended up.
            quantized_input = sample_input.to(self._device_of(optimized_model))
            quantized_perf = self.optimizer.benchmark_model(optimized_model, quantized_input)
            quantized_ms = quantized_perf['avg_time_ms']
            speedup = (original_perf['avg_time_ms'] / quantized_ms
                       if quantized_ms > 0 else float('inf'))
            print("\nQuantized model:")
            print(f"  Inference: {quantized_ms:.2f} ms")
            print(f"  Speedup: {speedup:.2f}x")

        self.optimized_model = optimized_model
        return optimized_model

    def export_model(self, export_path):
        """Export model for deployment.

        Traces the optimized model (or ``self.model`` if
        ``prepare_for_deployment`` has not been run) to TorchScript, saves it
        to ``export_path`` and writes the tokenizer next to it as
        ``<export_path without extension>_tokenizer.json``.

        Args:
            export_path: Destination file for the TorchScript model.

        Returns:
            Tuple ``(export_path, tokenizer_path)``.
        """
        # dirname is '' for a bare filename, and os.makedirs('') raises.
        export_dir = os.path.dirname(export_path)
        if export_dir:
            os.makedirs(export_dir, exist_ok=True)

        model = getattr(self, 'optimized_model', None)
        if model is None:
            model = self.model
        # Trace in eval mode so train-only behaviour is not baked into the graph.
        model.eval()

        # Export to TorchScript
        sample_input = self._sample_input(self._device_of(model))
        scripted_model = torch.jit.trace(model, sample_input)
        scripted_model.save(export_path)

        # Save tokenizer
        # Swap only the final extension: str.replace('.pt', ...) would touch
        # every '.pt' in the path and, for a path without '.pt', return the
        # model path itself so the tokenizer JSON overwrote the model.
        path_root, _ = os.path.splitext(export_path)
        tokenizer_path = path_root + '_tokenizer.json'
        with open(tokenizer_path, 'w', encoding='utf-8') as f:
            json.dump({
                'vocab': self.tokenizer.vocab,
                'special_tokens': self.tokenizer.special_tokens,
                'vocab_size': len(self.tokenizer.vocab)
            }, f)

        print(f"Model exported: {export_path}")
        print(f"Tokenizer saved: {tokenizer_path}")
        return export_path, tokenizer_path

# Demo model for optimization
class SimpleModel(nn.Module):
    """Small demo network used to exercise optimization and export.

    Token ids are embedded, averaged over dimension 1, passed through a
    ReLU-activated linear layer and projected to ``vocab_size`` outputs.
    """

    def __init__(self, vocab_size, d_model=128):
        super().__init__()
        # Submodules are registered in this order: embedding, linear1, linear2.
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.linear1 = nn.Linear(d_model, d_model)
        self.linear2 = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return one row of ``vocab_size`` scores per input row."""
        pooled = self.embedding(x).mean(dim=1)
        hidden = self.linear1(pooled).relu()
        scores = self.linear2(hidden)
        return scores

# Create and optimize model
# The demo model is moved to the module-level `device` before it is handed to
# the pipeline; `optimized_model` is reused by the production chatbot cell below.
demo_model = SimpleModel(len(tokenizer.vocab)).to(device)
pipeline = DeploymentPipeline(demo_model, tokenizer)
optimized_model = pipeline.prepare_for_deployment(quantize=True)

# Export for deployment
# export_model returns the model path and the path of the tokenizer JSON it wrote.
model_path, tokenizer_path = pipeline.export_model('../models/production_model.pt')
print(f"\nDeployment ready!")

## Production Monitoring

**Key metrics**: Response time, accuracy, safety, user satisfaction

In [None]:
class ProductionMonitor:
    """Collect request, safety and rating metrics for a deployed chatbot."""

    def __init__(self):
        # Running counters plus the raw samples the dashboard averages over.
        self.metrics = dict(
            total_requests=0,
            successful_responses=0,
            response_times=[],
            safety_flags=0,
            user_ratings=[],
        )
        # Chronological list of alert dicts (slow responses and safety flags).
        self.alerts = []

    def log_request(self, response_time, success=True):
        """Record one request and raise a slow-response alert above 2 seconds."""
        stats = self.metrics
        stats['total_requests'] += 1
        stats['response_times'].append(response_time)
        stats['successful_responses'] += 1 if success else 0

        if response_time > 2.0:  # 2 second threshold
            slow_alert = {
                'type': 'slow_response',
                'time': response_time,
                'timestamp': time.time(),
            }
            self.alerts.append(slow_alert)

    def log_safety_flag(self, reason):
        """Count a safety violation and store an alert carrying its reason."""
        self.metrics['safety_flags'] += 1
        flag_alert = {
            'type': 'safety_flag',
            'reason': reason,
            'timestamp': time.time(),
        }
        self.alerts.append(flag_alert)

    def log_user_rating(self, rating):
        """Store a user satisfaction rating (1-5)."""
        self.metrics['user_ratings'].append(rating)

    def get_dashboard(self):
        """Summarise the collected metrics and the five most recent alerts."""
        import numpy as np

        stats = self.metrics
        latencies = stats['response_times']
        ratings = stats['user_ratings']

        # max(1, ...) keeps the ratio defined before any request is logged.
        request_count = stats['total_requests']
        success_rate = stats['successful_responses'] / max(1, request_count)

        # Averages fall back to 0 while their sample lists are empty.
        mean_latency = np.mean(latencies) if latencies else 0
        mean_rating = np.mean(ratings) if ratings else 0

        return {
            'total_requests': request_count,
            'success_rate': success_rate,
            'avg_response_time_ms': mean_latency * 1000,
            'safety_flags': stats['safety_flags'],
            'avg_user_rating': mean_rating,
            'recent_alerts': self.alerts[-5:],
        }

class ProductionChatbot:
    """Production-ready chatbot with monitoring.

    Wraps a model and tokenizer with a keyword safety filter on both the
    user input and the generated text, and reports every request to the
    supplied monitor.
    """

    def __init__(self, model, tokenizer, monitor):
        """Store the collaborators and the list of blocked keywords."""
        self.model = model
        self.tokenizer = tokenizer
        self.monitor = monitor
        self.unsafe_words = ['hate', 'violence', 'harmful']

    def safety_check(self, text):
        """Basic safety filter.

        A keyword is flagged when it appears at the start of a word
        (case-insensitive), so inflections such as "hateful" are caught
        while unrelated words that merely contain the letters — "whatever"
        contains "hate" — are not.

        Args:
            text: The text to screen.

        Returns:
            ``(True, None)`` if no keyword matched, otherwise
            ``(False, word)`` with the first matching entry of
            ``self.unsafe_words``.
        """
        import re

        text_lower = text.lower()
        for word in self.unsafe_words:
            # (?<!\w): the keyword must not be preceded by a word character.
            if re.search(r'(?<!\w)' + re.escape(word.lower()), text_lower):
                return False, word
        return True, None

    def _model_device(self):
        """Return the device of the model's first parameter (CPU if it has none)."""
        parameters = getattr(self.model, 'parameters', None)
        first_param = next(parameters(), None) if callable(parameters) else None
        return first_param.device if first_param is not None else torch.device('cpu')

    def respond(self, user_input):
        """Generate response with monitoring.

        Args:
            user_input: The user's message.

        Returns:
            Dict with ``response``, ``response_time`` (seconds), ``safe``
            and ``status`` (``'success'`` or ``'error'``); on error an
            ``error`` message is included and a fallback response is
            returned. ``safe`` reflects the input check only: an unsafe
            generated response is replaced and flagged on the monitor.
        """
        start_time = time.time()

        try:
            # Safety check
            is_safe, flag = self.safety_check(user_input)
            if not is_safe:
                self.monitor.log_safety_flag(f'input_contains_{flag}')
                response = "I can't respond to that type of message."
            else:
                # Generate response (simplified)
                tokens = self.tokenizer.encode(user_input, add_special_tokens=True, max_length=50)
                # Use the model's own device: a quantized model lives on the
                # CPU even when the notebook-level device is an accelerator.
                input_tensor = torch.tensor([tokens]).to(self._model_device())

                with torch.no_grad():
                    output = self.model(input_tensor)
                    predicted = torch.argmax(output, dim=-1)
                    # Flatten so that pooled (batch, vocab) outputs, whose
                    # argmax is a single id per row, still decode from a list
                    # instead of a bare int.
                    token_ids = predicted[0].reshape(-1).cpu().tolist()
                    response = self.tokenizer.decode(token_ids)

                # Safety check response
                response_safe, flag = self.safety_check(response)
                if not response_safe:
                    self.monitor.log_safety_flag(f'output_contains_{flag}')
                    response = "I apologize, but I need to rephrase my response."

            # Log successful request
            response_time = time.time() - start_time
            self.monitor.log_request(response_time, success=True)

            return {
                'response': response,
                'response_time': response_time,
                'safe': is_safe,
                'status': 'success'
            }

        except Exception as e:
            # Boundary handler: a failure must yield a fallback reply and a
            # failed-request metric rather than crash the serving loop.
            response_time = time.time() - start_time
            self.monitor.log_request(response_time, success=False)

            return {
                'response': "I'm having technical difficulties. Please try again.",
                'response_time': response_time,
                'safe': True,
                'status': 'error',
                'error': str(e)
            }

# Create production system
# The chatbot serves the `optimized_model` produced by the deployment cell above.
monitor = ProductionMonitor()
prod_chatbot = ProductionChatbot(optimized_model, tokenizer, monitor)

# Demo production usage
print("=== Production Chatbot Demo ===")
test_inputs = [
    "Hello, how are you?",
    "What is machine learning?",
    "Tell me about AI",
    "How can you help me?"
]

for user_input in test_inputs:
    # respond() returns a dict; response_time is in seconds, hence the *1000 for ms.
    result = prod_chatbot.respond(user_input)
    print(f"\nUser: {user_input}")
    print(f"Bot: {result['response']}")
    print(f"Time: {result['response_time']*1000:.1f}ms, Status: {result['status']}")
    
    # Simulate user feedback
    # A constant rating stands in for real user feedback in this demo.
    rating = 4  # Simulated rating
    monitor.log_user_rating(rating)

# Show monitoring dashboard
# get_dashboard() aggregates everything logged by the loop above.
dashboard = monitor.get_dashboard()
print(f"\n=== Monitoring Dashboard ===")
print(f"Total Requests: {dashboard['total_requests']}")
print(f"Success Rate: {dashboard['success_rate']:.2%}")
print(f"Avg Response Time: {dashboard['avg_response_time_ms']:.1f}ms")
print(f"Safety Flags: {dashboard['safety_flags']}")
print(f"Avg User Rating: {dashboard['avg_user_rating']:.1f}/5")
# recent_alerts holds at most the last five alerts, so this count is capped at 5.
print(f"Recent Alerts: {len(dashboard['recent_alerts'])}")

## Production Checklist

**Essential considerations** for deploying chatbots in production:

In [None]:
# Production deployment checklist
def print_production_checklist():
    """Print the production deployment checklist to stdout.

    Emits a title banner, five checklist categories with five items each,
    a list of best practices and a closing list of reminders.
    """
    # (category heading, checklist items) pairs, printed in this order.
    sections = (
        ("Model Preparation", (
            "✓ Model trained and validated on diverse data",
            "✓ Fine-tuned for specific domain/use case",
            "✓ Quantized and optimized for inference",
            "✓ Exported to deployment format (TorchScript/ONNX)",
            "✓ Model versioning and artifact management",
        )),
        ("Safety and Ethics", (
            "✓ Content filtering and safety checks implemented",
            "✓ Bias testing and mitigation strategies",
            "✓ Privacy protection measures",
            "✓ Clear user disclaimers about AI limitations",
            "✓ Human escalation mechanisms",
        )),
        ("Infrastructure", (
            "✓ Scalable deployment architecture",
            "✓ Load balancing and auto-scaling",
            "✓ Fallback mechanisms for failures",
            "✓ Rate limiting and abuse prevention",
            "✓ Secure API endpoints",
        )),
        ("Monitoring and Maintenance", (
            "✓ Real-time performance monitoring",
            "✓ Error tracking and alerting",
            "✓ User feedback collection",
            "✓ Regular model retraining pipeline",
            "✓ A/B testing framework",
        )),
        ("Compliance and Documentation", (
            "✓ Data privacy compliance (GDPR, CCPA)",
            "✓ Security audits and penetration testing",
            "✓ API documentation and user guides",
            "✓ Incident response procedures",
            "✓ Regular compliance reviews",
        )),
    )

    best_practices = (
        "• Start with safety-first design principles",
        "• Implement gradual rollouts and canary deployments",
        "• Maintain human oversight and intervention capabilities",
        "• Continuously monitor and improve based on real usage",
        "• Keep models updated with latest safety measures",
    )

    reminders = (
        "• AI chatbots are tools to assist, not replace human judgment",
        "• Transparency about AI capabilities builds user trust",
        "• Regular audits ensure continued safety and fairness",
    )

    print("\n🚀 PRODUCTION DEPLOYMENT CHECKLIST")
    print("=" * 50)

    for heading, entries in sections:
        print(f"\n📋 {heading}:")
        for entry in entries:
            print(f"  {entry}")

    print("\n🎯 Best Practices:")
    for practice in best_practices:
        print(practice)

    print("\n⚠️  Remember:")
    for reminder in reminders:
        print(reminder)

# Print the deployment checklist, then the end-of-course banner.
print_production_checklist()

# The banner is assembled from its lines and written with a single print;
# entries that begin with "\n" keep the blank line in front of them.
print("\n".join((
    "\n" + "=" * 60,
    "🎉 CONGRATULATIONS! 🎉",
    "=" * 60,
    "\nYou have completed the Chatbot-Qoder tutorial series!",
    "\n📚 What you've learned:",
    "• PyTorch fundamentals and neural networks",
    "• Text preprocessing and tokenization",
    "• Rule-based and retrieval-based chatbots",
    "• Sequence models (RNN, LSTM, Seq2Seq)",
    "• Attention mechanisms and transformers",
    "• Generative models and advanced sampling",
    "• Fine-tuning and production deployment",
    "\n🚀 Next steps:",
    "• Experiment with larger pre-trained models",
    "• Explore domain-specific fine-tuning",
    "• Build and deploy your own chatbot",
    "• Contribute to open-source chatbot projects",
    "• Stay updated with latest research in conversational AI",
    "\nThank you for completing this comprehensive journey!",
    "Ready to build amazing conversational AI systems! 🤖✨",
)))