In [None]:
# Final Summary
# Recaps the tutorial: what was covered, best practices per topic, and next steps.
# This cell only prints; it touches neither Spark nor the table.
#
# Fix: section breaks use "\n" (a real newline). The previous "\\n" form is an
# escaped backslash, which printed the two characters "\n" in the output.
print("🎉 TIME TRAVEL TUTORIAL COMPLETE!")
print("\n✅ What You've Learned:")

accomplishments = [
    "Time travel fundamentals and snapshot concepts",
    "Multiple ways to query historical data (snapshot ID, timestamp)",
    "Rollback operations for data recovery",
    "Schema evolution compatibility with time travel",
    "Real-world analytical use cases",
    "Performance optimization techniques",
    "Snapshot management and cleanup procedures"
]

# Numbered from 1 so the printed list reads like a checklist.
for i, accomplishment in enumerate(accomplishments, 1):
    print(f"   {i}. {accomplishment}")

print("\n💡 TIME TRAVEL BEST PRACTICES:")

# Category heading -> list of recommendations; insertion order is print order.
best_practices = {
    "🔍 Querying": [
        "Use snapshot IDs for better performance when possible",
        "Combine time travel with partition pruning",
        "Cache frequently accessed historical data",
        "Use projection pushdown to minimize data scanning"
    ],
    "🗄️ Snapshot Management": [
        "Define clear retention policies",
        "Automate snapshot cleanup procedures",
        "Monitor metadata growth over time",
        "Balance history needs with storage costs"
    ],
    "🚨 Recovery Planning": [
        "Document recovery procedures",
        "Test rollback operations regularly",
        "Monitor table history for anomalies",
        "Establish RTO/RPO requirements"
    ],
    "⚡ Performance": [
        "Keep metadata size reasonable",
        "Use appropriate file sizes",
        "Consider compaction strategies",
        "Monitor query performance over time"
    ]
}

for category, practices in best_practices.items():
    print(f"\n{category}:")
    for practice in practices:
        print(f"   • {practice}")

print("\n🚀 Next Steps:")
next_steps = [
    "Practice with larger datasets",
    "Implement automated retention policies",
    "Explore advanced analytical patterns",
    "Integrate with your data pipeline"
]

for step in next_steps:
    print(f"   → {step}")

print("\n🎯 Key Takeaway:")
print("   Time travel in Iceberg provides powerful capabilities for")
print("   data recovery, auditing, and historical analysis with")
print("   minimal performance and storage overhead!")

# Clean up
# NOTE(review): nothing is actually released here (no table drop, no spark.stop());
# the messages are informational only.
print("\n🧹 Cleaning up demo environment...")
print("✅ Tutorial complete! Environment ready for your experiments.")

## 9. 🎉 Summary and Best Practices

Time travel tutorial summary and key takeaways.

In [None]:
# Demonstrate snapshot cleanup (expire old snapshots)
# Lists the table's snapshots, then prints (does NOT execute) the maintenance
# procedure calls a production job would run.
#
# Fixes:
#   * "\n" instead of "\\n" so blank lines are printed rather than a literal "\n".
#   * The retain_last example now passes every argument by name; Iceberg stored
#     procedures do not accept a mix of positional and named arguments.
print("🧹 SNAPSHOT CLEANUP DEMONSTRATION")
print("\n📸 Before cleanup - All snapshots:")
spark.sql("""
SELECT 
    snapshot_id,
    committed_at,
    operation
FROM time_travel_lab.customer_orders.snapshots 
ORDER BY committed_at
""").show()

# In production, you would expire old snapshots like this:
print("\n🧹 Snapshot Cleanup Commands (for reference):")
cleanup_commands = [
    "-- Expire snapshots older than 7 days:",
    "CALL spark_catalog.system.expire_snapshots('time_travel_lab.customer_orders', TIMESTAMP '2024-01-08 00:00:00')",
    "",
    # retain_last is a lower bound on how many snapshots survive expiration,
    # not an exact "keep only N".
    "-- Keep at least the last 5 snapshots when expiring:",
    "CALL spark_catalog.system.expire_snapshots(table => 'time_travel_lab.customer_orders', retain_last => 5)",
    "",
    "-- Orphan file cleanup:",
    "CALL spark_catalog.system.remove_orphan_files('time_travel_lab.customer_orders')"
]

for cmd in cleanup_commands:
    print(cmd)

print("\n⚠️ Note: In this demo, we keep all snapshots for learning purposes")
print("\n💡 Production Recommendations:")
prod_recommendations = [
    "Set up automated snapshot cleanup jobs",
    "Define retention policies based on business needs",
    "Monitor storage growth and costs",
    "Test recovery procedures regularly",
    "Document time travel usage patterns"
]

for rec in prod_recommendations:
    print(f"✓ {rec}")

In [None]:
# Performance Analysis
# Prints snapshot count, query-performance tips and file-level storage statistics
# for time_travel_lab.customer_orders.
#
# Fix: "\n" instead of "\\n" so section breaks are real blank lines.
print("⚡ TIME TRAVEL PERFORMANCE CONSIDERATIONS")

# 1. Snapshot Retention
print("\n📸 1. SNAPSHOT MANAGEMENT:")
print("Current snapshot count:")
# COUNT(*) yields exactly one row; read its single aliased column.
snapshot_count = spark.sql("SELECT COUNT(*) as count FROM time_travel_lab.customer_orders.snapshots").collect()[0]['count']
print(f"Total snapshots: {snapshot_count}")

print("\n💡 Snapshot Retention Best Practices:")
retention_tips = [
    "Keep only necessary snapshots for time travel",
    "Use expire_snapshots procedure to clean old snapshots",
    "Balance between history needs and metadata overhead",
    "Consider business and compliance requirements",
    "Monitor metadata size vs data size ratio"
]

for tip in retention_tips:
    print(f"✓ {tip}")

# 2. Query Performance Tips
print("\n🚀 2. QUERY PERFORMANCE TIPS:")
performance_tips = [
    "Use snapshot IDs instead of timestamps when possible",
    "Snapshot queries are faster than timestamp queries",
    "Combine time travel with partition pruning",
    "Use projection pushdown to reduce data scanning",
    "Cache frequently accessed historical snapshots"
]

for tip in performance_tips:
    print(f"✓ {tip}")

# 3. Storage Impact
print("\n💾 3. STORAGE IMPACT:")
# NOTE(review): the `files` metadata table presumably lists only data files of the
# current snapshot, so these totals exclude files kept alive solely by older
# snapshots — confirm, and consider `all_data_files` for the full footprint.
files_info = spark.sql("""
SELECT 
    COUNT(*) as total_files,
    SUM(file_size_in_bytes) / 1024 / 1024 as total_size_mb,
    AVG(file_size_in_bytes) / 1024 / 1024 as avg_file_size_mb
FROM time_travel_lab.customer_orders.files
""")

files_info.show()

print("Storage optimization tips:")
storage_tips = [
    "Iceberg stores only changed data, not full copies",
    "Metadata overhead is minimal compared to data size",
    "File-level deduplication reduces storage costs",
    "Compaction helps optimize file sizes over time"
]

for tip in storage_tips:
    print(f"✓ {tip}")

## 8. ⚡ Performance Considerations

Best practices for time travel performance.

In [None]:
# Use Case 3: Data Recovery and Debugging
# Simulates an accidental DELETE and then restores the table by rolling back to
# the snapshot that was current immediately before the DELETE was committed.
#
# Fixes:
#   * "\n" instead of "\\n" so blank lines are printed rather than a literal "\n".
#   * The recovery target is the parent_id of the newest snapshot (the DELETE's
#     direct predecessor) instead of "the second row when sorted by commit time",
#     which is only the predecessor when history happens to be linear.
print("📊 USE CASE 3: Data Recovery and Debugging")
print("\n🔧 Simulate an accidental data corruption and recovery:")

# Simulate accidental deletion
print("\n⚠️ Simulating accidental data deletion:")
print("Current data before 'accident':")
spark.sql("SELECT COUNT(*) as count FROM customer_orders").show()

# Accidentally delete all pending orders
spark.sql("DELETE FROM customer_orders WHERE status = 'pending'")

print("\n💥 After accidental deletion:")
remaining_data = spark.sql("SELECT * FROM customer_orders ORDER BY order_id")
remaining_data.show()
print(f"Records remaining: {remaining_data.count()}")

# Recovery using time travel
print("\n🚨 EMERGENCY RECOVERY PROCEDURE:")
print("1. Identify the problem")
print("2. Find the last good snapshot")
print("3. Rollback to restore data")

# Newest snapshot first; parent_id links each snapshot to the one it replaced.
recovery_snapshots = spark.sql("""
SELECT snapshot_id, parent_id, committed_at, operation
FROM time_travel_lab.customer_orders.snapshots 
ORDER BY committed_at DESC
""")

print("\n📸 Available snapshots for recovery:")
recovery_snapshots.show()

# The newest snapshot is assumed to be the one written by the DELETE above;
# its parent is the last good state.
snapshots_list = recovery_snapshots.collect()
recovery_snapshot_id = snapshots_list[0]['parent_id'] if snapshots_list else None

if recovery_snapshot_id is not None:
    print(f"\n🔄 Recovering to snapshot: {recovery_snapshot_id}")

    # Rollback to recover
    spark.sql(f"CALL spark_catalog.system.rollback_to_snapshot('time_travel_lab.customer_orders', {recovery_snapshot_id})")

    print("\n✅ DATA RECOVERED!")
    recovered_data = spark.sql("SELECT * FROM customer_orders ORDER BY order_id")
    recovered_data.show()
    print(f"Records after recovery: {recovered_data.count()}")
else:
    # No snapshots at all, or the newest snapshot has no parent.
    print("\n⚠️ No earlier snapshot available - nothing to roll back to")

print("\n💡 Recovery Benefits:")
print("✓ Instant recovery from any point in time")
print("✓ No need for external backups")
print("✓ Minimal downtime")
print("✓ Granular recovery options")

In [None]:
# Use Case 2: Change Data Capture (CDC) Analysis
# Compares the `status` column of every order between the two oldest snapshots
# captured in `all_snapshots` and lists the orders whose status differs.
#
# Fixes:
#   * "\n" instead of "\\n" so blank lines are printed rather than a literal "\n".
#   * Removed the `data1` / `data2` DataFrames: they were built but never used;
#     the comparison query below reads both snapshots itself.
print("📊 USE CASE 2: Change Data Capture Analysis")
print("\n🔍 Analyzing what changed between snapshots:")

if len(all_snapshots) >= 2:
    snapshot1_id = all_snapshots[0]['snapshot_id']
    snapshot2_id = all_snapshots[1]['snapshot_id']

    print(f"\n📸 Comparing Snapshot {snapshot1_id} vs {snapshot2_id}")

    # Find changes
    # Inner join on order_id: rows present in only one snapshot are not reported,
    # and `!=` skips pairs where either status is NULL.
    print("\n🔄 Status changes between snapshots:")
    changes = spark.sql(f"""
    WITH snapshot1 AS (
        SELECT order_id, status as status1
        FROM time_travel_lab.customer_orders
        VERSION AS OF {snapshot1_id}
    ),
    snapshot2 AS (
        SELECT order_id, status as status2
        FROM time_travel_lab.customer_orders
        VERSION AS OF {snapshot2_id}
    )
    SELECT 
        s1.order_id,
        s1.status1 as old_status,
        s2.status2 as new_status,
        'Status Changed' as change_type
    FROM snapshot1 s1
    JOIN snapshot2 s2 ON s1.order_id = s2.order_id
    WHERE s1.status1 != s2.status2
    """)

    changes.show()

print("\n💡 CDC Use Cases:")
print("✓ Track data lineage and audit changes")
print("✓ Build change streams for downstream systems")
print("✓ Identify data quality issues")
print("✓ Monitor business process changes")

In [None]:
# Use Case 1: Point-in-time reporting
# Runs the same aggregate report against each of the first three snapshots in
# `all_snapshots`, treating each snapshot as one "day" of history.
#
# Fix: "\n" instead of "\\n" so blank lines are printed rather than a literal "\n".
print("📊 USE CASE 1: Point-in-time Reporting")
print("\n📅 Generate daily snapshots for reporting:")

# Simulate daily reporting snapshots
for i, snapshot in enumerate(all_snapshots[:3], 1):
    snapshot_id = snapshot['snapshot_id']
    snapshot_time = snapshot['committed_at']

    print(f"\n📊 Daily Report {i} - {snapshot_time}")

    # VERSION AS OF pins the read to this snapshot, so the numbers are
    # reproducible no matter what was written afterwards.
    daily_report = spark.sql(f"""
    SELECT 
        COUNT(*) as total_orders,
        SUM(quantity * unit_price) as total_revenue,
        COUNT(DISTINCT customer_id) as unique_customers,
        AVG(quantity * unit_price) as avg_order_value
    FROM time_travel_lab.customer_orders
    VERSION AS OF {snapshot_id}
    """)

    daily_report.show()

print("\n💡 Benefits:")
print("✓ Consistent reporting across time")
print("✓ Audit trail for financial reports")
print("✓ Compare metrics across different time periods")

## 7. 📊 Analytical Use Cases

Real-world analytical scenarios using time travel.

In [None]:
# Time travel with schema evolution
# Reads the oldest recorded snapshot (taken before customer_email was added) to
# show that historical data stays queryable after a schema change.
#
# Fixes:
#   * "\n" instead of "\\n" so blank lines are printed rather than a literal "\n".
#   * Corrected the explanation: a VERSION AS OF <snapshot id> read uses the
#     schema of that snapshot, so the later-added column is absent from the
#     result rather than shown as NULL.
print("⏰ TIME TRAVEL WITH SCHEMA EVOLUTION")

# Query old snapshot - schema compatibility
if len(snapshot_ids) > 0:
    print(f"\n📸 Querying old snapshot {snapshot_ids[0]} (before schema change):")
    old_data_query = f"""
    SELECT * FROM time_travel_lab.customer_orders
    VERSION AS OF {snapshot_ids[0]}
    ORDER BY order_id
    """
    old_data = spark.sql(old_data_query)
    old_data.show()

    print("\n💡 Notice:")
    print("✓ Old snapshots are read with the schema they were written with (no customer_email column)")
    print("✓ Schema evolution is backward compatible")
    print("✓ You can query any historical snapshot regardless of schema changes")

# Show the evolution of the schema over time
print("\n📋 SCHEMA EVOLUTION HISTORY:")
print("1. Original: order_id, customer_id, product_name, quantity, unit_price, order_date, status")
print("2. Current: + customer_email (added later)")
print("\n✅ Time travel works seamlessly across schema versions!")

In [None]:
# Add a new column to demonstrate schema evolution
# Shows the schema, adds `customer_email`, shows the schema again, then inserts
# one row that populates the new column.
#
# Fix: "\n" instead of "\\n" so blank lines are printed rather than a literal "\n".
#
# NOTE: not idempotent on its own — re-running this cell without recreating the
# table repeats the ALTER TABLE on an existing column and inserts order 1007 again.
print("🔧 SCHEMA EVOLUTION DEMO")
print("\n📋 Current schema:")
spark.sql("DESCRIBE customer_orders").show()

# Add a new column
print("\n➕ Adding new column 'customer_email':")
spark.sql("ALTER TABLE customer_orders ADD COLUMN customer_email string")

print("\n📋 New schema:")
spark.sql("DESCRIBE customer_orders").show()

# Insert data with the new column
# Values are positional, so the e-mail must be last to line up with the new column.
print("\n📝 Inserting data with new column:")
spark.sql("""
INSERT INTO customer_orders VALUES
    (1007, 106, 'Headphones', 1, 199.99, DATE '2024-01-18', 'pending', 'customer106@email.com')
""")

print("\n📊 Current data with new column:")
spark.sql("SELECT * FROM customer_orders ORDER BY order_id").show(truncate=False)

## 6. 🔧 Schema Evolution with Time Travel

Learn how time travel works with schema changes.

In [None]:
# View snapshots after rollback
# Lists every snapshot still recorded for the table to show that a rollback
# leaves the snapshot list untouched.
#
# Fixes:
#   * "\n" instead of "\\n" so a blank line is printed rather than a literal "\n".
#   * Corrected the first key point: rollback_to_snapshot does not write a new
#     snapshot; it makes an existing snapshot current again.
print("📸 SNAPSHOTS AFTER ROLLBACK:")
spark.sql("""
SELECT 
    snapshot_id,
    committed_at,
    operation,
    summary
FROM time_travel_lab.customer_orders.snapshots 
ORDER BY committed_at
""").show(truncate=False)

print("\n💡 Key Points about Rollback:")
print("✓ Rollback does not create a new snapshot - it makes an existing one current again")
print("✓ Original snapshots are still preserved")
print("✓ You can still time-travel to any historical state")
print("✓ Rollback is metadata operation - very fast")
print("✓ No data files are actually deleted")

In [None]:
# Rollback to the state after the Day 2 updates (before the Day 3 orders)
#
# Fixes:
#   * Target snapshot: Day 2 runs two separate UPDATE statements, and each one
#     commits its own snapshot. `snapshot_ids[1]` is therefore the state after
#     only the FIRST update (order 1002 not yet 'delivered'), which contradicts
#     the expectations printed below. `second_snapshot` was captured right after
#     both updates finished, so it is the snapshot this cell actually describes.
#   * "\n" instead of "\\n" so blank lines are printed rather than a literal "\n".
rollback_snapshot_id = second_snapshot['snapshot_id']
print(f"🔄 Rolling back to snapshot: {rollback_snapshot_id}")

rollback_query = f"""
CALL spark_catalog.system.rollback_to_snapshot('time_travel_lab.customer_orders', {rollback_snapshot_id})
"""

spark.sql(rollback_query)
print("✅ Rollback completed!")

# Show data after rollback
print("\n📊 DATA AFTER ROLLBACK:")
rollback_data = spark.sql("SELECT * FROM customer_orders ORDER BY order_id")
rollback_data.show()
print(f"Record count after rollback: {rollback_data.count()}")

# Compare with what we expected
print("\n💡 Notice:")
print("- The new orders (1004-1006) from Day 3 are gone")
print("- We're back to the state after Day 2 updates")
print("- Order 1001 status is 'shipped' (not 'pending')")
print("- Order 1002 status is 'delivered' (not 'shipped')")

In [None]:
# Show current state before rollback
# Prints the live table contents and row count, then the snapshots a rollback
# could target.
#
# Fix: "\n" instead of "\\n" so a blank line is printed rather than a literal "\n".
print("📊 CURRENT STATE (before rollback):")
current_data = spark.sql("SELECT * FROM customer_orders ORDER BY order_id")
current_data.show()
print(f"Current record count: {current_data.count()}")

# Show available snapshots
print("\n📸 Available snapshots for rollback:")
spark.sql("""
SELECT 
    snapshot_id,
    committed_at,
    operation,
    summary
FROM time_travel_lab.customer_orders.snapshots 
ORDER BY committed_at
""").show(truncate=False)

## 5. 🔄 Rollback Operations

Learn how to rollback tables to previous states.

In [None]:
# Method 2: Query by Timestamp
# Reads the table as of the commit time of the second recorded snapshot, then
# attempts a relative-time query ("5 minutes ago").
#
# Fixes:
#   * "\n" instead of "\\n" so blank lines are printed rather than a literal "\n".
#   * The row count is computed in its own statement instead of a triple-quoted
#     f-string nested inside another f-string, which was hard to read and edit.
print("🔍 METHOD 2: Query by Timestamp")

# Get timestamp from second snapshot
if len(all_snapshots) >= 2:
    second_timestamp = all_snapshots[1]['committed_at']
    print(f"\n📅 Data as of timestamp: {second_timestamp}")

    # NOTE(review): the timestamp is formatted via str() on the collected value;
    # presumably the driver and the Spark session share a time zone — confirm.
    timestamp_query = f"""
    SELECT * FROM time_travel_lab.customer_orders
    TIMESTAMP AS OF '{second_timestamp}'
    ORDER BY order_id
    """
    spark.sql(timestamp_query).show()

    count_query = f"""
    SELECT COUNT(*) as count
    FROM time_travel_lab.customer_orders
    TIMESTAMP AS OF '{second_timestamp}'
    """
    count_at_timestamp = spark.sql(count_query).collect()[0]['count']
    print(f"📊 Count at that timestamp: {count_at_timestamp}")

# Method 3: Query with relative time
print("\n🔍 METHOD 3: Query with system functions")
print("\n📅 Data from 5 minutes ago (if available):")

# This would work in a real scenario with longer time gaps
relative_time_query = """
SELECT * FROM time_travel_lab.customer_orders
TIMESTAMP AS OF CURRENT_TIMESTAMP() - INTERVAL 5 MINUTES
ORDER BY order_id
"""

# The failure is part of the demonstration: the table did not exist five minutes
# ago in a fresh run, so the error is caught and shown instead of aborting.
try:
    spark.sql(relative_time_query).show()
except Exception as e:
    print(f"⚠️ Note: {str(e)}")
    print("💡 This is expected in our demo due to short time intervals")

In [None]:
# Method 1: Query by Snapshot ID
# Shows the table as of the first recorded snapshot next to the live table, with
# row counts for both.
#
# Fixes:
#   * "\n" instead of "\\n" so blank lines are printed rather than a literal "\n".
#   * Counts are computed in separate statements instead of f-strings nested
#     inside the print calls.
print("🔍 METHOD 1: Query by Snapshot ID")
print("\n📸 Data at first snapshot (original orders):")

if len(snapshot_ids) > 0:
    first_snapshot_query = f"""
    SELECT * FROM time_travel_lab.customer_orders
    VERSION AS OF {snapshot_ids[0]}
    ORDER BY order_id
    """
    spark.sql(first_snapshot_query).show()

    first_snapshot_count = spark.sql(
        f"SELECT COUNT(*) as count FROM time_travel_lab.customer_orders VERSION AS OF {snapshot_ids[0]}"
    ).collect()[0]['count']
    print(f"📊 Count at first snapshot: {first_snapshot_count}")

# Compare with current data
print("\n📊 Current data for comparison:")
spark.sql("SELECT * FROM customer_orders ORDER BY order_id").show()
current_count = spark.sql("SELECT COUNT(*) as count FROM customer_orders").collect()[0]['count']
print(f"📊 Current count: {current_count}")

## 4. ⏰ Time Travel Queries

Learn different ways to query historical data.

In [None]:
# View table history with more details
# Prints the table's snapshot log (which snapshot became current and when), then
# the data files together with the snapshot that added each of them.
print("📋 DETAILED TABLE HISTORY:")
history_df = spark.sql("""
SELECT 
    made_current_at,
    snapshot_id,
    parent_id,
    is_current_ancestor
FROM time_travel_lab.customer_orders.history
ORDER BY made_current_at
""")

history_df.show()

# View files for each snapshot
# Fix: the `files` metadata table has no snapshot_id column, so the previous
# query failed to resolve it. The `entries` metadata table exposes snapshot_id
# next to a `data_file` struct holding the path, size and record count.
print("\n📁 FILES PER SNAPSHOT:")
files_df = spark.sql("""
SELECT 
    snapshot_id,
    data_file.file_path AS file_path,
    data_file.file_size_in_bytes AS file_size_in_bytes,
    data_file.record_count AS record_count
FROM time_travel_lab.customer_orders.entries
""")

files_df.show(truncate=False)

In [None]:
# List every snapshot of the table, oldest first, and keep the rows and their
# IDs in `all_snapshots` / `snapshot_ids` for the cells that follow.
print("📸 ALL SNAPSHOTS IN ORDER:")
snapshots_df = spark.sql(
    """
SELECT 
    snapshot_id,
    committed_at,
    operation,
    summary
FROM time_travel_lab.customer_orders.snapshots 
ORDER BY committed_at
"""
)

# truncate=False keeps the full `summary` map readable.
snapshots_df.show(truncate=False)

print(f"\n📊 Total snapshots: {snapshots_df.count()}")

# Materialise the rows on the driver; index 0 is the oldest snapshot.
all_snapshots = snapshots_df.collect()
snapshot_ids = list(map(lambda snap: snap['snapshot_id'], all_snapshots))
print(f"🔢 Snapshot IDs: {snapshot_ids}")

## 3. 📸 Snapshot Management

Learn how to view and manage table snapshots.

In [None]:
# Pause so this commit gets a timestamp clearly later than the previous one.
time.sleep(2)

# Day 3: three additional orders arrive.
print("📅 Day 3: New orders added")
spark.sql(
    """
INSERT INTO customer_orders VALUES
    (1004, 104, 'Monitor 4K', 1, 499.99, DATE '2024-01-17', 'pending'),
    (1005, 105, 'Webcam HD', 1, 79.99, DATE '2024-01-17', 'pending'),
    (1006, 101, 'USB-C Hub', 1, 49.99, DATE '2024-01-17', 'shipped')
"""
)

# Remember the newest snapshot (the one this INSERT just committed).
latest_snapshot_rows = spark.sql(
    "SELECT snapshot_id, committed_at FROM time_travel_lab.customer_orders.snapshots ORDER BY committed_at DESC LIMIT 1"
).collect()
third_snapshot = latest_snapshot_rows[0]
print(f"📸 Snapshot 3 ID: {third_snapshot['snapshot_id']}")
print(f"📅 Snapshot 3 Time: {third_snapshot['committed_at']}")

print("\n📊 Current data (latest):")
spark.sql("SELECT * FROM customer_orders ORDER BY order_id").show()

order_total = spark.sql('SELECT COUNT(*) as count FROM customer_orders').collect()[0]['count']
print(f"\n📈 Total orders now: {order_total}")

In [None]:
# Wait briefly so the Day 2 commits carry later timestamps than Day 1.
time.sleep(2)

# Day 2: two orders change status. The updates run as two separate statements.
print("📅 Day 2: Order status updates")
spark.sql(
    """
UPDATE customer_orders 
SET status = 'shipped' 
WHERE order_id = 1001
"""
)

spark.sql(
    """
UPDATE customer_orders 
SET status = 'delivered' 
WHERE order_id = 1002
"""
)

# Capture the newest snapshot, i.e. the table state after both updates.
newest_rows = spark.sql(
    "SELECT snapshot_id, committed_at FROM time_travel_lab.customer_orders.snapshots ORDER BY committed_at DESC LIMIT 1"
).collect()
second_snapshot = newest_rows[0]
print(f"📸 Snapshot 2 ID: {second_snapshot['snapshot_id']}")
print(f"📅 Snapshot 2 Time: {second_snapshot['committed_at']}")

print("\n📊 Updated data:")
spark.sql("SELECT * FROM customer_orders ORDER BY order_id").show()

In [None]:
# Day 1: seed the table with the first three orders.
print("📅 Day 1: Initial orders")
spark.sql(
    """
INSERT INTO customer_orders VALUES
    (1001, 101, 'Laptop Pro', 1, 1299.99, DATE '2024-01-15', 'pending'),
    (1002, 102, 'Wireless Mouse', 2, 29.99, DATE '2024-01-15', 'shipped'),
    (1003, 103, 'Keyboard', 1, 89.99, DATE '2024-01-15', 'pending')
"""
)

# Capture the oldest snapshot (ascending commit time, first row).
oldest_rows = spark.sql(
    "SELECT snapshot_id, committed_at FROM time_travel_lab.customer_orders.snapshots ORDER BY committed_at LIMIT 1"
).collect()
first_snapshot = oldest_rows[0]
print(f"📸 Snapshot 1 ID: {first_snapshot['snapshot_id']}")
print(f"📅 Snapshot 1 Time: {first_snapshot['committed_at']}")

print("\n📊 Current data:")
spark.sql("SELECT * FROM customer_orders ORDER BY order_id").show()

In [None]:
# Prepare a clean workspace: select the demo database and start from an empty
# table so the notebook can be re-run from the top.
for setup_statement in (
    "CREATE DATABASE IF NOT EXISTS time_travel_lab",
    "USE time_travel_lab",
    "DROP TABLE IF EXISTS customer_orders",
):
    spark.sql(setup_statement)

# Iceberg table partitioned by the day of the order date.
spark.sql(
    """
CREATE TABLE customer_orders (
    order_id bigint,
    customer_id bigint,
    product_name string,
    quantity int,
    unit_price decimal(10,2),
    order_date date,
    status string
) USING ICEBERG
PARTITIONED BY (days(order_date))
"""
)

print("✅ Created customer_orders table")
print("📊 Table schema:")
spark.sql("DESCRIBE customer_orders").show()

## 2. 🗄️ Create Sample Data for Time Travel

Create a sample table and insert data over time to demonstrate time travel capabilities.

In [None]:
import os
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
from datetime import datetime, timedelta
import time

# Point both the Spark driver and the workers at the same Python interpreter.
for interpreter_var in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[interpreter_var] = '/opt/conda/bin/python'

# Build (or reuse) a Spark session whose `spark_catalog` is an Iceberg catalog
# of type hadoop, with its warehouse under /opt/spark/work-dir/warehouse.
spark = (
    SparkSession.builder
    .appName("IcebergTimeTravel")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.spark_catalog.type", "hadoop")
    .config("spark.sql.catalog.spark_catalog.warehouse", "/opt/spark/work-dir/warehouse")
    .getOrCreate()
)

# Keep the notebook output readable: only warnings and errors from Spark.
spark.sparkContext.setLogLevel("WARN")

startup_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("⏰ Time Travel Tutorial Environment Ready!")
print(f"📅 Current time: {startup_time}")
print("✅ Spark session initialized with Iceberg support")

# ⏰ Apache Iceberg Time Travel Tutorial

Welcome to the comprehensive Time Travel tutorial! In this notebook, you'll learn:

1. **Time Travel Fundamentals**
2. **Reading Historical Data**
3. **Snapshot Management**
4. **Rollback Operations**
5. **Schema Evolution with Time Travel**
6. **Performance Considerations**
7. **Real-world Use Cases**

## 📋 Prerequisites

- Completed the basic Iceberg tutorial
- Understanding of Iceberg table concepts
- Basic knowledge of Spark SQL

## 1. 🚀 Initialize Environment

Set up Spark with Iceberg for time travel operations.