# Lab 14 - Part 4: Backup and Disaster Recovery

## Overview
This notebook covers enterprise backup and disaster recovery, including:
- Backup automation system
- Database backup procedures
- Disaster recovery planning
- Backup verification

**Duration:** 10 minutes  
**Prerequisites:** Parts 1, 2, and 3 completed

## Prerequisites

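Import the necessary modules, check whether the optional `schedule` library is installed, and fall back to mock `prod_config` and `logging_system` objects when the earlier parts have not been run in this session.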

In [None]:
# Standard library imports
import os
import json
import shutil
from datetime import datetime, timedelta

# Probe for the optional scheduling library and record the outcome
try:
    import schedule
except ImportError:
    # Library missing: the notebook only simulates scheduling in this case.
    SCHEDULE_AVAILABLE = False
    print("⚠️ Schedule not available - using basic scheduling")
else:
    SCHEDULE_AVAILABLE = True
    print("✓ Schedule library available")

# Use the real prod_config when an earlier part defined it; otherwise
# install a stand-in so the rest of the notebook can run on its own.
try:
    prod_config
except NameError:
    class MockConfig:
        """Stand-in configuration provider with local Neo4j settings."""

        def get_config(self, env):
            # The same settings are returned whatever `env` is.
            settings = {}
            settings["neo4j_uri"] = "bolt://localhost:7687"
            settings["neo4j_user"] = "neo4j"
            settings["neo4j_password"] = "password"
            return settings

    prod_config = MockConfig()
    print("⚠️ Using mock configuration")
else:
    print("✓ Production config loaded")

# Use the real logging_system when an earlier part defined it; otherwise
# install a minimal stand-in exposing the attributes this notebook uses.
try:
    logging_system
except NameError:
    import logging

    class MockLoggingSystem:
        """Stand-in logging facade backed by the standard 'app' logger."""

        def __init__(self):
            self.app_logger = logging.getLogger('app')

        def log_user_action(self, *args, **kwargs):
            # Accepts any arguments and deliberately does nothing.
            return None

    logging_system = MockLoggingSystem()
    print("⚠️ Using mock logging system")
else:
    print("✓ Logging system loaded")

## Backup Automation System

Create a comprehensive backup system with automated scheduling and retention policies.

In [None]:
class BackupAutomationSystem:
    """Automated backup and disaster recovery system.

    The database backup and restore steps are simulated: a backup creates a
    timestamped directory under ``backup_directory``, writes configuration
    snapshots into it and records a metadata entry in ``backup_history``.
    No database dump command is executed.

    The class relies on names defined elsewhere in the session:
    ``logging_system``, ``prod_config``, ``SCHEDULE_AVAILABLE`` / ``schedule``
    and, optionally, ``auth_system``.
    """

    # Directory-name prefix and timestamp layout shared by backup creation
    # and retention cleanup, so cleanup only ever matches our own backups.
    _BACKUP_PREFIX = "neo4j_backup_"
    _TIMESTAMP_FORMAT = '%Y%m%d_%H%M%S'
    # Substrings that mark an environment variable name as sensitive.
    _SECRET_MARKERS = ('password', 'key', 'secret', 'token')

    def __init__(self, neo4j_config: dict):
        """Store the configuration and make sure the backup directory exists.

        Args:
            neo4j_config: Connection settings; stored on the instance as
                ``neo4j_config``.
        """
        self.neo4j_config = neo4j_config
        self.backup_directory = "/tmp/neo4j_backups"  # Using /tmp for demo
        self.retention_days = 30
        self.backup_schedule = "daily"

        # Create backup directory
        os.makedirs(self.backup_directory, exist_ok=True)

        # Initialize backup history (metadata dicts of completed backups)
        self.backup_history = []

    def create_database_backup(self) -> dict:
        """Create a (simulated) database backup plus configuration snapshots.

        Returns:
            On success, a dict with ``backup_id``, ``timestamp``, ``status``
            (``'completed'``), ``backup_path``, ``size_mb``,
            ``duration_seconds`` and ``database_state``; the same dict is
            appended to ``backup_history``. On failure, a dict with
            ``backup_id`` (``failed_<timestamp>``), ``timestamp``, ``status``
            (``'failed'``) and ``error``. Failed backups are not recorded in
            the history.
        """
        # Computed before the try block so the failure handler can always
        # reference the timestamp.
        started_at = datetime.now()
        timestamp = started_at.strftime(self._TIMESTAMP_FORMAT)
        backup_name = f"{self._BACKUP_PREFIX}{timestamp}"

        try:
            backup_path = os.path.join(self.backup_directory, backup_name)

            # Create backup directory
            os.makedirs(backup_path, exist_ok=True)

            print(f"Creating database backup: {backup_name}")

            # Simulate backup process (in real environment, this would execute the actual command)
            backup_info = {
                'backup_id': backup_name,
                'timestamp': started_at.isoformat(),
                'status': 'completed',
                'backup_path': backup_path,
                'size_mb': 250.5,  # Simulated size
                'duration_seconds': 45.2,
                'database_state': {
                    'nodes': 850,
                    'relationships': 1100,
                    'labels': 15,
                    'relationship_types': 12
                }
            }

            # Copy application configuration
            self._backup_application_config(backup_path)

            # Backup security configurations
            self._backup_security_config(backup_path)

            # Record backup in history
            self.backup_history.append(backup_info)

            # Clean old backups
            self._cleanup_old_backups()

            logging_system.app_logger.info(f"Backup completed successfully: {backup_name}")

            return backup_info

        except Exception as e:
            # Report the failure to the caller instead of raising.
            error_info = {
                'backup_id': f"failed_{timestamp}",
                'timestamp': datetime.now().isoformat(),
                'status': 'failed',
                'error': str(e)
            }

            logging_system.app_logger.error(f"Backup failed: {str(e)}")
            return error_info

    def _backup_application_config(self, backup_path: str) -> None:
        """Write application configuration snapshots under ``backup_path``.

        Creates ``application_config/production_config.json`` (best effort)
        and ``application_config/environment.json`` (environment variables
        whose names do not contain a sensitive marker).

        Args:
            backup_path: Directory of the backup being created.
        """
        config_backup_path = os.path.join(backup_path, "application_config")
        os.makedirs(config_backup_path, exist_ok=True)

        # Save production configuration.
        # NOTE(review): the configuration is written verbatim, so any
        # credentials it contains (e.g. a "neo4j_password" entry) land on
        # disk in plain text, unlike the filtered environment snapshot
        # below - confirm this is intended.
        config_file = os.path.join(config_backup_path, "production_config.json")
        try:
            config_data = prod_config.get_config("production")
            # Serialize before opening the file so a non-serializable config
            # does not leave a truncated file behind.
            serialized_config = json.dumps(config_data, indent=2)
            with open(config_file, 'w', encoding='utf-8') as f:
                f.write(serialized_config)
        except Exception as e:
            # Best effort: a missing or unreadable config must not fail the
            # whole backup, but the reason is logged rather than hidden.
            logging_system.app_logger.warning(
                f"Skipping production config backup: {e}"
            )

        # Save environment variables (sanitized)
        env_file = os.path.join(config_backup_path, "environment.json")
        sanitized_env = {
            name: value
            for name, value in os.environ.items()
            if not any(marker in name.lower() for marker in self._SECRET_MARKERS)
        }

        with open(env_file, 'w', encoding='utf-8') as f:
            json.dump(sanitized_env, f, indent=2)

    def _backup_security_config(self, backup_path: str) -> None:
        """Write user roles and permissions under ``backup_path``.

        Creates ``security_config/users_config.json``. Only the fields
        copied below are stored, so no password field is included. The
        file is written as plain JSON; it is not encrypted.

        Args:
            backup_path: Directory of the backup being created.
        """
        security_backup_path = os.path.join(backup_path, "security_config")
        os.makedirs(security_backup_path, exist_ok=True)

        # Backup user roles and permissions (passwords excluded)
        users_backup = {}
        try:
            # auth_system is optional; an empty mapping is written without it.
            if 'auth_system' in globals():
                for user_id, user_data in auth_system.users.items():
                    users_backup[user_id] = {
                        'user_id': user_data['user_id'],
                        'username': user_data['username'],
                        'role': user_data['role'],
                        'permissions': user_data['permissions'],
                        'email': user_data['email'],
                        'department': user_data.get('department'),
                        'created_at': user_data['created_at'],
                        'active': user_data['active']
                    }
        except Exception as e:
            # Best effort: replace partial data with an explanatory note.
            logging_system.app_logger.warning(
                f"User data not available during backup: {e}"
            )
            users_backup = {'note': 'User data not available during backup'}

        users_file = os.path.join(security_backup_path, "users_config.json")
        with open(users_file, 'w', encoding='utf-8') as f:
            json.dump(users_backup, f, indent=2)

    def _cleanup_old_backups(self) -> None:
        """Remove backups older than the retention period.

        Drops expired entries from ``backup_history`` and deletes expired
        backup directories. Only directories named
        ``neo4j_backup_<YYYYmmdd_HHMMSS>`` are considered, so unrelated
        directories that happen to end in a timestamp are never deleted.
        """
        cutoff_date = datetime.now() - timedelta(days=self.retention_days)

        # Filter backup history
        self.backup_history = [
            backup for backup in self.backup_history
            if datetime.fromisoformat(backup['timestamp']) >= cutoff_date
        ]

        # Remove old backup directories
        prefix_length = len(self._BACKUP_PREFIX)
        try:
            for item in os.listdir(self.backup_directory):
                if not item.startswith(self._BACKUP_PREFIX):
                    continue
                item_path = os.path.join(self.backup_directory, item)
                if not os.path.isdir(item_path):
                    continue

                # Parse timestamp from directory name
                try:
                    backup_date = datetime.strptime(
                        item[prefix_length:], self._TIMESTAMP_FORMAT
                    )
                except ValueError:
                    continue

                if backup_date < cutoff_date:
                    shutil.rmtree(item_path)
                    print(f"Removed old backup: {item}")
        except Exception as e:
            logging_system.app_logger.warning(f"Error cleaning old backups: {e}")

    def restore_from_backup(self, backup_id: str) -> dict:
        """Restore (simulated) the database from a recorded backup.

        Args:
            backup_id: ``backup_id`` of an entry in ``backup_history``.

        Returns:
            A dict with ``restore_id``, ``backup_id``, ``timestamp`` and
            ``status``. On success it also has ``duration_seconds`` and
            ``restored_state``; on failure (unknown backup id, missing
            backup files, or any other error) it has ``error`` and
            ``status`` is ``'failed'``.
        """
        try:
            # Find backup in history
            backup_info = next(
                (
                    backup for backup in self.backup_history
                    if backup['backup_id'] == backup_id
                ),
                None,
            )

            if not backup_info:
                raise ValueError(f"Backup not found: {backup_id}")

            backup_path = backup_info['backup_path']

            if not os.path.exists(backup_path):
                raise ValueError(f"Backup files not found: {backup_path}")

            print(f"Restoring from backup: {backup_id}")

            # Simulate restore process
            restore_info = {
                'restore_id': f"restore_{datetime.now().strftime(self._TIMESTAMP_FORMAT)}",
                'backup_id': backup_id,
                'timestamp': datetime.now().isoformat(),
                'status': 'completed',
                'duration_seconds': 120.5,
                'restored_state': backup_info['database_state']
            }

            logging_system.app_logger.info(f"Restore completed successfully from backup: {backup_id}")

            return restore_info

        except Exception as e:
            # Report the failure to the caller instead of raising.
            restore_info = {
                'restore_id': f"restore_failed_{datetime.now().strftime(self._TIMESTAMP_FORMAT)}",
                'backup_id': backup_id,
                'timestamp': datetime.now().isoformat(),
                'status': 'failed',
                'error': str(e)
            }

            logging_system.app_logger.error(f"Restore failed: {str(e)}")
            return restore_info

    def schedule_automated_backups(self) -> None:
        """Register the backup job with ``schedule`` when it is available.

        Supports the ``"daily"`` (at 02:00) and ``"hourly"`` values of
        ``backup_schedule``. When the library is missing, or the schedule
        value is not supported, a warning is printed and nothing is
        registered.
        """
        if not SCHEDULE_AVAILABLE:
            print(f"⚠️ Schedule library not available - backup scheduling simulated: {self.backup_schedule}")
            return

        if self.backup_schedule == "daily":
            schedule.every().day.at("02:00").do(self.create_database_backup)
        elif self.backup_schedule == "hourly":
            schedule.every().hour.do(self.create_database_backup)
        else:
            # Do not claim success when no job was registered.
            print(f"⚠️ Unsupported backup schedule '{self.backup_schedule}' - no backup job scheduled")
            return

        print(f"✓ Automated backups scheduled: {self.backup_schedule}")

    def get_backup_status(self) -> dict:
        """Return a summary of the backup system.

        Returns:
            A dict with ``total_backups``, ``latest_backup`` (the most
            recent history entry, or ``None`` when the history is empty),
            ``backup_directory``, ``retention_days``, ``schedule`` and
            ``disk_usage``.
        """
        return {
            'total_backups': len(self.backup_history),
            'latest_backup': self.backup_history[-1] if self.backup_history else None,
            'backup_directory': self.backup_directory,
            'retention_days': self.retention_days,
            'schedule': self.backup_schedule,
            'disk_usage': self._get_backup_disk_usage()
        }

    def _get_backup_disk_usage(self) -> dict:
        """Calculate disk usage of the backup directory tree.

        Returns:
            A dict with ``total_size_mb`` (rounded to two decimals) and
            ``file_count``. Files whose size cannot be read are skipped.
        """
        total_size = 0
        file_count = 0

        try:
            for root, _dirs, files in os.walk(self.backup_directory):
                for file in files:
                    file_path = os.path.join(root, file)
                    try:
                        file_size = os.path.getsize(file_path)
                    except OSError:
                        # Broken link or file removed while walking: skip it.
                        continue
                    total_size += file_size
                    file_count += 1
        except Exception as e:
            logging_system.app_logger.warning(f"Error calculating backup disk usage: {e}")

        return {
            'total_size_mb': round(total_size / (1024 * 1024), 2),
            'file_count': file_count
        }

# Build the backup system from the production configuration and echo
# its settings.
backup_system = BackupAutomationSystem(prod_config.get_config("production"))
print(
    "✓ Backup automation system configured",
    f"📁 Backup directory: {backup_system.backup_directory}",
    f"📅 Retention period: {backup_system.retention_days} days",
    f"⏰ Schedule: {backup_system.backup_schedule}",
    sep="\n",
)

## Create Database Backup

Create a comprehensive backup of the database and configuration.

In [None]:
print("💾 Creating Database Backup...\n")

# Run a single backup and report the outcome
backup_result = backup_system.create_database_backup()

if backup_result['status'] != 'completed':
    print(f"❌ Backup failed: {backup_result.get('error', 'Unknown error')}")
else:
    print("✅ Backup completed successfully!\n")
    # (label, result key, suffix) for each summary line, in display order
    for label, field, suffix in (
        ("Backup ID", 'backup_id', ""),
        ("Timestamp", 'timestamp', ""),
        ("Backup Path", 'backup_path', ""),
        ("Size", 'size_mb', " MB"),
        ("Duration", 'duration_seconds', " seconds"),
    ):
        print(f"{label}: {backup_result[field]}{suffix}")
    print("\nDatabase State:")
    for metric, count in backup_result['database_state'].items():
        print(f"  {metric}: {count}")

## Create Multiple Backups

Create several backups to simulate a backup history.

In [None]:
print("📦 Creating Multiple Backups...\n")

import time

# Three backups, one second apart, to build up a small history
for attempt in range(1, 4):
    print(f"Creating backup {attempt}/3...")
    result = backup_system.create_database_backup()
    succeeded = result['status'] == 'completed'
    if succeeded:
        print(f"  ✓ {result['backup_id']}")
    time.sleep(1)  # Small delay between backups

backup_total = len(backup_system.backup_history)
print(f"\n✅ Created {backup_total} backups")

## View Backup Status

Display comprehensive backup system status and history.

In [None]:
print(f"📊 Backup System Status\n{'=' * 50}\n")

# Fetch the status summary once and print it section by section
status = backup_system.get_backup_status()

for label, field, suffix in (
    ("Total Backups", 'total_backups', ""),
    ("Backup Directory", 'backup_directory', ""),
    ("Retention Period", 'retention_days', " days"),
    ("Schedule", 'schedule', ""),
):
    print(f"{label}: {status[field]}{suffix}")

print("\nDisk Usage:")
print(f"  Total Size: {status['disk_usage']['total_size_mb']} MB")
print(f"  File Count: {status['disk_usage']['file_count']}")

# The latest backup is None while the history is empty
if status['latest_backup']:
    latest = status['latest_backup']
    print("\nLatest Backup:")
    for label, field, suffix in (
        ("ID", 'backup_id', ""),
        ("Timestamp", 'timestamp', ""),
        ("Status", 'status', ""),
        ("Size", 'size_mb', " MB"),
    ):
        print(f"  {label}: {latest[field]}{suffix}")

print("\n" + "=" * 50)

## List All Backups

Display detailed information about all available backups.

In [None]:
print(f"📋 Backup History\n{'=' * 50}\n")

# One numbered entry per recorded backup, oldest first
if not backup_system.backup_history:
    print("No backups found")
else:
    for number, entry in enumerate(backup_system.backup_history, start=1):
        print(f"Backup #{number}:")
        for label, field, suffix in (
            ("ID", 'backup_id', ""),
            ("Time", 'timestamp', ""),
            ("Status", 'status', ""),
            ("Size", 'size_mb', " MB"),
            ("Duration", 'duration_seconds', "s"),
        ):
            print(f"  {label}: {entry[field]}{suffix}")
        print()

print("=" * 50)

## Test Backup Restoration

Test the restoration process from a backup.

In [None]:
print("🔄 Testing Backup Restoration...\n")

# Restore from the most recent backup, if there is one
if not backup_system.backup_history:
    print("⚠️ No backups available to restore")
else:
    # Get the latest backup
    latest_backup = backup_system.backup_history[-1]
    backup_id = latest_backup['backup_id']

    print(f"Restoring from backup: {backup_id}\n")

    # Perform restore
    restore_result = backup_system.restore_from_backup(backup_id)

    if restore_result['status'] != 'completed':
        print(f"❌ Restore failed: {restore_result.get('error', 'Unknown error')}")
    else:
        print("✅ Restore completed successfully!\n")
        for label, field, suffix in (
            ("Restore ID", 'restore_id', ""),
            ("Backup ID", 'backup_id', ""),
            ("Timestamp", 'timestamp', ""),
            ("Duration", 'duration_seconds', " seconds"),
        ):
            print(f"{label}: {restore_result[field]}{suffix}")
        print("\nRestored State:")
        for metric, count in restore_result['restored_state'].items():
            print(f"  {metric}: {count}")

## Schedule Automated Backups

Configure automated backup scheduling.

In [None]:
print("⏰ Configuring Automated Backup Schedule...\n")

# Register the backup job (or print the fallback notice)
backup_system.schedule_automated_backups()

print("\n✅ Automated backups configured")
print("\n📅 Backup Schedule Details:")
print(f"  Frequency: {backup_system.backup_schedule}")
print("  Time: 02:00 UTC (for daily backups)")
print(f"  Retention: {backup_system.retention_days} days")
print(f"  Directory: {backup_system.backup_directory}")

print("\n📝 Backup Best Practices:")
for practice in (
    "Daily backups at low-traffic times",
    "Retain backups for compliance period (30 days)",
    "Store backups in separate physical location",
    "Test restoration procedures regularly",
    "Monitor backup success/failure",
    "Encrypt backup data at rest",
):
    print(f"  ✓ {practice}")

## Disaster Recovery Planning

Document disaster recovery procedures and objectives.

In [None]:
# Recovery objectives, procedures and contacts, in display order
disaster_recovery_plan = {
    "Recovery Time Objective (RTO)": "15 minutes",
    "Recovery Point Objective (RPO)": "5 minutes",
    "Backup Frequency": "Daily at 02:00 UTC",
    "Backup Retention": "30 days",
    "Hot Standby": "Yes - Secondary data center",
    "Automatic Failover": "Enabled",
    "Replication Lag": "<100ms",
    "Recovery Procedures": [
        "1. Assess disaster scope and impact",
        "2. Activate disaster recovery team",
        "3. Switch to hot standby if available",
        "4. If standby unavailable, restore from latest backup",
        "5. Verify data integrity post-restore",
        "6. Update DNS and routing if needed",
        "7. Perform smoke tests",
        "8. Monitor system stability",
        "9. Notify stakeholders of recovery completion",
    ],
    "Testing Schedule": "Quarterly disaster recovery drills",
    "Contact List": [
        "Platform Engineering Lead",
        "Database Administrator",
        "Security Team",
        "Executive Management",
    ],
}

print(f"🚨 DISASTER RECOVERY PLAN\n{'=' * 60}\n")

# Scalars print on one line; lists print as an indented block
for section, content in disaster_recovery_plan.items():
    if not isinstance(content, list):
        print(f"{section}: {content}")
    else:
        print(f"{section}:")
        for entry in content:
            print(f"  {entry}")
    print()

print("=" * 60)

# Save disaster recovery plan
dr_plan_file = os.path.join(backup_system.backup_directory, "disaster_recovery_plan.json")
with open(dr_plan_file, 'w') as plan_out:
    json.dump(disaster_recovery_plan, plan_out, indent=2)

print(f"\n✅ Disaster recovery plan saved to: {dr_plan_file}")

## Key Takeaways

In this notebook, you have:
1. ✅ Implemented an automated backup system
2. ✅ Created comprehensive database backups
3. ✅ Tested backup restoration procedures
4. ✅ Configured retention policies
5. ✅ Documented a disaster recovery plan

**Backup and DR Best Practices:**
- Follow the 3-2-1 rule: 3 copies, 2 different media, 1 offsite
- Test restoration procedures regularly (at least quarterly)
- Automate backups to ensure consistency
- Monitor backup success/failure with alerts
- Document RTO and RPO objectives clearly
- Encrypt backups at rest and in transit
- Keep disaster recovery procedures up to date
- Train team members on recovery procedures

**Recovery Metrics:**
- **RTO (Recovery Time Objective):** 15 minutes
- **RPO (Recovery Point Objective):** 5 minutes
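- **Backup Frequency:** Daily (a daily backup alone cannot meet a 5-minute RPO; that target depends on the replicated hot standby listed in the plan)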
- **Retention Period:** 30 days

**Next Steps:** Proceed to notebook 05 for CI/CD and Container Deployment.