# Lab 14 - Part 3: Monitoring and Logging

## Overview
This notebook covers production monitoring and observability, including:
- Production monitoring system
- Prometheus metrics collection
- System metrics (CPU, memory)
- Neo4j database metrics
- Logging and audit system

**Duration:** 10 minutes  
**Prerequisites:** Parts 1 and 2 completed

## Prerequisites

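Import the required modules and detect which optional libraries (`prometheus_client`, `psutil`, `neo4j`) are installed. When a library is missing, the notebook prints a warning and later cells either simulate the corresponding metrics or skip the section.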

In [None]:
# Standard library imports
import time
import json
import logging
from datetime import datetime, timedelta

# Check for monitoring libraries
# Each optional dependency is imported inside its own try/except so the
# notebook keeps running when a package is missing; the resulting *_AVAILABLE
# flag is what later cells consult to pick the real or the fallback path.

# prometheus_client: when missing, ProductionMonitoringSystem keeps a plain
# dict of simulated metrics instead of registering Prometheus instruments.
try:
    from prometheus_client import Counter, Histogram, Gauge, start_http_server
    PROMETHEUS_AVAILABLE = True
    print("✓ Prometheus client available")
except ImportError:
    PROMETHEUS_AVAILABLE = False
    print("⚠️ Prometheus client not available - metrics will be simulated")

# psutil: when missing, collect_system_metrics() returns random values
# flagged with 'simulated': True instead of real CPU/memory/disk readings.
try:
    import psutil
    PSUTIL_AVAILABLE = True
    print("✓ Psutil library available")
except ImportError:
    PSUTIL_AVAILABLE = False
    print("⚠️ Psutil not available - system metrics will be simulated")

# neo4j driver: when missing, the health-report cell skips connecting.
try:
    from neo4j import GraphDatabase
    NEO4J_AVAILABLE = True
    print("✓ Neo4j driver available")
except ImportError:
    NEO4J_AVAILABLE = False
    print("⚠️ Neo4j driver not available")

## Production Monitoring System

Create a comprehensive monitoring system with Prometheus metrics and health checks.

In [None]:
class ProductionMonitoringSystem:
    """Comprehensive monitoring system for production deployment.

    Combines three sources into a single health report:

    * host metrics read through ``psutil`` (CPU, memory, disk),
    * Neo4j metrics gathered through a caller-supplied driver, and
    * Prometheus instruments that mirror those values.

    Both optional dependencies degrade gracefully. Without
    ``prometheus_client`` a plain ``metrics_data`` dict stands in for the
    instruments; without ``psutil`` random in-range values are returned and
    flagged with ``'simulated': True``.

    The alert thresholds are class attributes so they can be tuned on a
    subclass or on an individual instance.

    NOTE(review): the instruments are created without an explicit registry,
    so constructing a second instance in the same process (for example by
    re-running the notebook cell) presumably fails with a duplicate
    registration error -- confirm and decide whether a dedicated registry
    is wanted.
    """

    # Resource usage (percent) above which a critical alert is raised.
    CPU_ALERT_PERCENT = 80
    MEMORY_ALERT_PERCENT = 85
    DISK_ALERT_PERCENT = 90

    # Probe-query latency limits, in seconds.
    SLOW_QUERY_SECONDS = 0.5      # above this: warning alert
    DEGRADED_QUERY_SECONDS = 1.0  # at or above this: status 'degraded'

    def __init__(self, metrics_port=9090):
        """Create the metric instruments and start the metrics endpoint.

        Args:
            metrics_port: TCP port for the Prometheus HTTP endpoint. Only
                used when ``prometheus_client`` is installed.
        """
        if PROMETHEUS_AVAILABLE:
            # Application-level metrics
            self.request_count = Counter('neo4j_app_requests_total', 'Total app requests', ['method', 'endpoint'])
            self.request_duration = Histogram('neo4j_app_request_duration_seconds', 'Request duration')
            self.active_connections = Gauge('neo4j_app_active_connections', 'Active database connections')
            self.error_count = Counter('neo4j_app_errors_total', 'Total errors', ['error_type'])

            # System metrics
            self.cpu_usage = Gauge('system_cpu_usage_percent', 'CPU usage percentage')
            self.memory_usage = Gauge('system_memory_usage_percent', 'Memory usage percentage')
            self.disk_usage = Gauge('system_disk_usage_percent', 'Disk usage percentage')

            # Neo4j specific metrics
            self.neo4j_query_count = Counter('neo4j_queries_total', 'Total Neo4j queries', ['query_type'])
            self.neo4j_query_duration = Histogram('neo4j_query_duration_seconds', 'Neo4j query duration')
            self.neo4j_connection_errors = Counter('neo4j_connection_errors_total', 'Neo4j connection errors')

            # Health status
            self.service_health = Gauge('service_health_status', 'Service health status (1=healthy, 0=unhealthy)')

            # A failure to bind the port must not prevent the rest of the
            # monitoring system from being used, so it is only reported.
            try:
                start_http_server(metrics_port)
                print(f"✓ Prometheus metrics server started on port {metrics_port}")
            except Exception as e:
                print(f"⚠️ Could not start Prometheus server: {e}")
        else:
            # Simulated metrics used when prometheus_client is missing
            self.metrics_data = {
                'requests_total': 0,
                'errors_total': 0,
                'active_connections': 0,
                'service_health': 1
            }
            print("⚠️ Using simulated metrics - install prometheus-client for production monitoring")

    def collect_system_metrics(self):
        """Collect CPU, memory and disk usage, with simulated fallback.

        Returns:
            dict with ``cpu_percent``, ``memory_percent``, ``disk_percent``
            and an ISO ``timestamp``. The dict additionally carries
            ``'simulated': True`` when psutil is unavailable, or an
            ``'error'`` message (with all percentages set to 0) when
            collection failed.
        """
        try:
            if PSUTIL_AVAILABLE:
                # interval=1 makes psutil sample for one second, so this
                # call blocks for about that long.
                cpu_percent = psutil.cpu_percent(interval=1)
                memory = psutil.virtual_memory()
                disk = psutil.disk_usage('/')
                disk_percent = (disk.used / disk.total) * 100

                if PROMETHEUS_AVAILABLE:
                    self.cpu_usage.set(cpu_percent)
                    self.memory_usage.set(memory.percent)
                    self.disk_usage.set(disk_percent)

                return {
                    'cpu_percent': cpu_percent,
                    'memory_percent': memory.percent,
                    'disk_percent': disk_percent,
                    'timestamp': datetime.now().isoformat()
                }

            # Simulated metrics, all chosen below the alert thresholds
            import random
            return {
                'cpu_percent': random.uniform(15, 45),
                'memory_percent': random.uniform(40, 70),
                'disk_percent': random.uniform(25, 60),
                'timestamp': datetime.now().isoformat(),
                'simulated': True
            }

        except Exception as e:
            print(f"Error collecting system metrics: {e}")
            return {
                'cpu_percent': 0,
                'memory_percent': 0,
                'disk_percent': 0,
                'timestamp': datetime.now().isoformat(),
                'error': str(e)
            }

    @staticmethod
    def _run_count_query(session, query, field):
        """Run a single-row count query and return ``field``, or 0 on failure."""
        try:
            return session.run(query).single()[field]
        except Exception as exc:
            # Only ordinary errors are absorbed here; KeyboardInterrupt and
            # SystemExit still propagate.
            print(f"⚠️ Count query failed ({field}): {exc}")
            return 0

    def collect_neo4j_metrics(self, driver):
        """Collect Neo4j database metrics.

        Args:
            driver: object exposing ``session()`` as a context manager whose
                sessions provide ``run(query)``.

        Returns:
            On success, a dict with ``node_count``, ``relationship_count``,
            ``query_response_time`` (seconds), ``timestamp`` and
            ``database_status`` (``'healthy'`` or ``'degraded'``).
            On failure, ``{'database_status': 'unhealthy', 'error': <text>}``.
        """
        try:
            with driver.session() as session:
                metrics = {
                    'node_count': self._run_count_query(
                        session, "MATCH (n) RETURN count(n) as node_count", 'node_count'),
                    'relationship_count': self._run_count_query(
                        session, "MATCH ()-[r]->() RETURN count(r) as rel_count", 'rel_count'),
                }

                # Time a trivial query as a latency probe. perf_counter is
                # monotonic, so clock adjustments cannot skew the reading.
                started = time.perf_counter()
                session.run("RETURN 1").consume()
                query_time = time.perf_counter() - started

                if PROMETHEUS_AVAILABLE:
                    self.neo4j_query_duration.observe(query_time)

                metrics.update({
                    'query_response_time': query_time,
                    'timestamp': datetime.now().isoformat(),
                    'database_status': 'healthy' if query_time < self.DEGRADED_QUERY_SECONDS else 'degraded'
                })

                return metrics

        except Exception as e:
            print(f"Error collecting Neo4j metrics: {e}")
            if PROMETHEUS_AVAILABLE:
                self.neo4j_connection_errors.inc()
            else:
                self.metrics_data['errors_total'] += 1
            return {'database_status': 'unhealthy', 'error': str(e)}

    @staticmethod
    def _describe_database_problem(neo4j_metrics):
        """Build the alert text for a database that is not 'healthy'."""
        if neo4j_metrics.get('database_status') == 'degraded' and 'error' not in neo4j_metrics:
            latency = neo4j_metrics.get('query_response_time', 0)
            return f"Database degraded: health-check query took {latency:.3f}s"
        return f"Database unhealthy: {neo4j_metrics.get('error', 'Unknown error')}"

    def generate_health_report(self, driver):
        """Generate a comprehensive health report.

        Args:
            driver: passed through to ``collect_neo4j_metrics``.

        Returns:
            dict with ``overall_health`` (``'healthy'``/``'unhealthy'``),
            ``system_metrics``, ``database_metrics``, ``alerts`` (list of
            ``{'severity', 'message'}``) and ``timestamp``. If building the
            report itself fails, a reduced dict with ``overall_health``,
            ``error`` and ``timestamp`` is returned instead.
        """
        try:
            system_metrics = self.collect_system_metrics()
            neo4j_metrics = self.collect_neo4j_metrics(driver)

            alerts = []

            # System health checks
            resource_checks = (
                ('cpu_percent', self.CPU_ALERT_PERCENT, 'CPU'),
                ('memory_percent', self.MEMORY_ALERT_PERCENT, 'memory'),
                ('disk_percent', self.DISK_ALERT_PERCENT, 'disk'),
            )
            for key, limit, label in resource_checks:
                value = system_metrics.get(key, 0)
                if value > limit:
                    alerts.append({
                        'severity': 'critical',
                        'message': f"High {label} usage: {value:.1f}%"
                    })

            # Database health checks
            if neo4j_metrics.get('database_status') != 'healthy':
                alerts.append({
                    'severity': 'critical',
                    'message': self._describe_database_problem(neo4j_metrics)
                })

            response_time = neo4j_metrics.get('query_response_time', 0)
            if response_time > self.SLOW_QUERY_SECONDS:
                alerts.append({
                    'severity': 'warning',
                    'message': f"Slow query response: {response_time:.3f}s"
                })

            # Any critical alert makes the service unhealthy; warnings do not.
            has_critical = any(alert['severity'] == 'critical' for alert in alerts)
            health_status = 0 if has_critical else 1

            if PROMETHEUS_AVAILABLE:
                self.service_health.set(health_status)
            else:
                self.metrics_data['service_health'] = health_status

            return {
                'overall_health': 'healthy' if health_status == 1 else 'unhealthy',
                'system_metrics': system_metrics,
                'database_metrics': neo4j_metrics,
                'alerts': alerts,
                'timestamp': datetime.now().isoformat()
            }

        except Exception as e:
            print(f"Error generating health report: {e}")
            return {
                'overall_health': 'unhealthy',
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            }

# Create the notebook-wide monitoring instance used by the cells below.
# When prometheus_client is available the constructor also tries to start
# the metrics HTTP server on port 9090; otherwise it prints a warning and
# keeps simulated metrics.
monitoring_system = ProductionMonitoringSystem()
print("✓ Production monitoring system initialized")

## Test System Metrics Collection

Collect and display current system metrics.

In [None]:
print("📊 Collecting System Metrics...\n")

metrics = monitoring_system.collect_system_metrics()

print(f"CPU Usage: {metrics['cpu_percent']:.2f}%")
print(f"Memory Usage: {metrics['memory_percent']:.2f}%")
print(f"Disk Usage: {metrics['disk_percent']:.2f}%")
print(f"Timestamp: {metrics['timestamp']}")

if metrics.get('simulated'):
    print("\n⚠️ Note: Metrics are simulated (install psutil for real metrics)")

# collect_system_metrics() never raises: a failure comes back as zeroed
# values plus an 'error' key, so check for it before claiming success.
if metrics.get('error'):
    print(f"\n❌ System metrics collection failed: {metrics['error']}")
else:
    print("\n✅ System metrics collected successfully")

## Logging and Audit System

Implement enterprise logging with audit trail capabilities.

In [None]:
class ProductionLoggingSystem:
    """Enterprise logging and audit system.

    Keeps an in-memory audit trail (``audit_log``, capped at
    ``max_audit_entries``) and emits JSON-formatted records through four
    named loggers: ``insurance_app``, ``security``, ``audit`` and
    ``performance``.
    """

    def __init__(self):
        # Configure logging
        self.setup_logging()
        self.audit_log = []
        self.max_audit_entries = 10000

    def setup_logging(self):
        """Configure root logging and create the specialized loggers."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=[
                logging.StreamHandler()
            ]
        )

        # Create specialized loggers
        self.app_logger = logging.getLogger('insurance_app')
        self.security_logger = logging.getLogger('security')
        self.audit_logger = logging.getLogger('audit')
        self.performance_logger = logging.getLogger('performance')

    @staticmethod
    def _to_json(payload):
        """Serialize ``payload`` for a log line.

        ``default=str`` renders values json cannot encode natively (sets,
        datetimes, ...) as their string form, so one unusual value in
        ``details`` cannot make a logging call raise.
        """
        return json.dumps(payload, default=str)

    def log_user_action(self, user_id: str, action: str, resource: str, details: dict = None):
        """Record a user action in the audit trail and the audit logger.

        Args:
            user_id: identifier of the acting user.
            action: name of the action performed.
            resource: name of the resource acted upon.
            details: optional extra context. ``ip_address`` and
                ``user_agent`` keys, if present, are also copied to the
                top level of the audit entry.
        """
        # Snapshot the caller's dict so later mutation by the caller cannot
        # alter an entry that has already been recorded.
        details = dict(details) if details else {}

        audit_entry = {
            'timestamp': datetime.now().isoformat(),
            'user_id': user_id,
            'action': action,
            'resource': resource,
            'details': details,
            'ip_address': details.get('ip_address'),
            'user_agent': details.get('user_agent')
        }

        self.audit_log.append(audit_entry)

        # Maintain audit log size: drop the oldest entries beyond the cap.
        excess = len(self.audit_log) - self.max_audit_entries
        if excess > 0:
            del self.audit_log[:excess]

        # Emit through the 'audit' logger (a stream handler by default)
        self.audit_logger.info(f"User action: {self._to_json(audit_entry)}")

    def log_security_event(self, event_type: str, details: dict):
        """Log a security-related event at WARNING level.

        Args:
            event_type: short label for the event.
            details: context for the event; its ``severity`` key (default
                ``'medium'``) is copied into the record. ``None`` is
                treated as an empty dict.
        """
        details = details or {}
        security_entry = {
            'timestamp': datetime.now().isoformat(),
            'event_type': event_type,
            'details': details,
            'severity': details.get('severity', 'medium')
        }

        self.security_logger.warning(f"Security event: {self._to_json(security_entry)}")

    def log_performance_metric(self, metric_name: str, value: float, context: dict = None):
        """Log a named performance measurement with optional context."""
        perf_entry = {
            'timestamp': datetime.now().isoformat(),
            'metric': metric_name,
            'value': value,
            'context': context or {}
        }

        self.performance_logger.info(f"Performance: {self._to_json(perf_entry)}")

    def get_audit_trail(self, user_id: str = None, hours: int = 24) -> list:
        """Return recent audit entries, newest first.

        Args:
            user_id: when given, only entries for this user are returned.
            hours: size of the look-back window, counted back from now.

        Returns:
            list of audit entry dicts sorted by timestamp, descending.
        """
        cutoff_time = datetime.now() - timedelta(hours=hours)

        matches = [
            entry for entry in self.audit_log
            if datetime.fromisoformat(entry['timestamp']) >= cutoff_time
            and (user_id is None or entry['user_id'] == user_id)
        ]
        matches.sort(key=lambda entry: entry['timestamp'], reverse=True)
        return matches

# Create the shared logging/audit instance used by the cells below.
# Constructing it calls logging.basicConfig() (INFO level, stream handler)
# and starts with an empty in-memory audit trail.
logging_system = ProductionLoggingSystem()
print("✓ Production logging and audit system configured")

## Test Logging System

Test various logging capabilities including audit trails and security events.

In [None]:
print("📝 Testing Logging System...\n")

# Audit entry for an administrative action, with request context attached
logging_system.log_user_action(
    "admin_001",
    "create_policy",
    "insurance_policy",
    {
        'ip_address': '192.168.1.100',
        'user_agent': 'Mozilla/5.0',
        'policy_id': 'POL-12345',
    },
)
print("✓ User action logged")

# Security event: repeated failed logins from an external address
logging_system.log_security_event(
    "failed_login",
    {
        'severity': 'high',
        'username': 'unknown_user',
        'ip_address': '203.0.113.42',
        'attempts': 3,
    },
)
print("✓ Security event logged")

# Performance sample for a read endpoint
logging_system.log_performance_metric(
    "query_duration",
    0.125,
    {
        'query_type': 'read',
        'endpoint': '/api/policies',
    },
)
print("✓ Performance metric logged")

# Five more audit entries, one per synthetic user, so the trail has
# something to filter in the next cell
for i in range(5):
    logging_system.log_user_action(
        f"user_{i:03d}",
        "view_claims",
        "claims",
        {'ip_address': f'192.168.1.{100+i}'},
    )

print("\n✅ Logging system operational")
print(f"📊 Total audit entries: {len(logging_system.audit_log)}")

## Retrieve Audit Trail

Query and display the audit trail for monitoring user activities.

In [None]:
print("🔍 Retrieving Audit Trail...\n")

# Everything recorded during the last hour, for any user
recent_audit = logging_system.get_audit_trail(hours=1)

print(f"Recent Audit Entries (last 1 hour): {len(recent_audit)}\n")

# Only the first five entries are displayed to keep the output short
for entry in recent_audit[:5]:
    print(
        f"[{entry['timestamp']}]",
        f"  User: {entry['user_id']}",
        f"  Action: {entry['action']}",
        f"  Resource: {entry['resource']}",
        sep="\n",
    )
    if entry.get('details'):
        print(f"  Details: {entry['details']}")
    print()

# Same query narrowed to one user over the default 24-hour window
user_audit = logging_system.get_audit_trail(user_id="admin_001", hours=24)
print(f"\n📋 Audit entries for admin_001: {len(user_audit)}")

## Generate Health Report

Create a comprehensive health report combining system and database metrics.

**Note:** This requires a Neo4j connection. Update the connection details as needed.

In [None]:
import os

print("🏥 Generating Health Report...\n")

# Tracks whether a report was actually produced, so the closing message
# does not claim success when the driver is missing or setup failed.
report_generated = False

if NEO4J_AVAILABLE:
    driver = None
    try:
        # Connect to Neo4j. The defaults match a local development setup and
        # can be overridden through environment variables instead of editing
        # credentials into the notebook.
        driver = GraphDatabase.driver(
            os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
            auth=(
                os.environ.get("NEO4J_USERNAME", "neo4j"),
                os.environ.get("NEO4J_PASSWORD", "password"),
            )
        )

        # Generate health report
        health_report = monitoring_system.generate_health_report(driver)
        report_generated = True

        print(f"Overall Health: {health_report['overall_health'].upper()}")
        print(f"Timestamp: {health_report['timestamp']}")

        # Present only in the reduced report returned when report
        # generation itself failed.
        if health_report.get('error'):
            print(f"\n❌ Report error: {health_report['error']}")

        # System metrics
        if 'system_metrics' in health_report:
            sys_metrics = health_report['system_metrics']
            print(f"\n🖥️  System Metrics:")
            print(f"  CPU: {sys_metrics.get('cpu_percent', 0):.2f}%")
            print(f"  Memory: {sys_metrics.get('memory_percent', 0):.2f}%")
            print(f"  Disk: {sys_metrics.get('disk_percent', 0):.2f}%")

        # Database metrics
        if 'database_metrics' in health_report:
            db_metrics = health_report['database_metrics']
            print(f"\n🗄️  Database Metrics:")
            print(f"  Status: {db_metrics.get('database_status', 'unknown')}")
            print(f"  Response Time: {db_metrics.get('query_response_time', 0)*1000:.2f}ms")
            print(f"  Nodes: {db_metrics.get('node_count', 0):,}")
            print(f"  Relationships: {db_metrics.get('relationship_count', 0):,}")

        # Alerts
        if health_report.get('alerts'):
            print(f"\n🚨 Alerts:")
            for alert in health_report['alerts']:
                severity_icon = "🔴" if alert['severity'] == 'critical' else "🟡"
                print(f"  {severity_icon} {alert['message']}")
        else:
            print(f"\n✅ No active alerts")

    except Exception as e:
        print(f"⚠️ Could not connect to Neo4j: {e}")
        print("Please update connection details or skip this section")
    finally:
        # Release the driver even when reporting fails part-way through.
        if driver is not None:
            driver.close()
else:
    print("⚠️ Neo4j driver not available - install neo4j package")

if report_generated:
    print("\n✅ Health report generated")
else:
    print("\n⚠️ Health report was not generated")

## Key Takeaways

In this notebook, you've:
1. ✅ Implemented a production monitoring system with Prometheus metrics
2. ✅ Created system metrics collection (CPU, memory, disk)
3. ✅ Built Neo4j database metrics monitoring
4. ✅ Configured enterprise logging and audit system
5. ✅ Generated comprehensive health reports

**Monitoring Best Practices:**
- Collect metrics at regular intervals (30-60 seconds)
- Set appropriate alert thresholds for critical metrics
- Maintain audit logs for compliance and security
- Use structured logging (JSON) for easier analysis
- Implement health checks for all critical components
- Monitor both system and application-specific metrics

**Next Steps:** Proceed to notebook 04 for Backup and Disaster Recovery implementation.