In [0]:
# ===================================================================
# NOTEBOOK: 05_Optimize_Tables
# PURPOSE: Optimize Delta tables for performance
# AUTHOR: Jose Veliz - Space Cowboy 
# DATE: 2025-10-19
# ===================================================================

from pyspark.sql import SparkSession

# Bind the Spark session explicitly instead of the former no-op `spark = spark`,
# which raised NameError on any kernel where `spark` was not pre-injected.
# getOrCreate() hands back the already-active session when one exists and only
# builds a new one otherwise, so the later cells keep using the same session.
spark = SparkSession.builder.getOrCreate()

# Start-of-run banner.
print(" Starting Delta table optimization")
print(" Space Cowboy optimizing for lightspeed! ")
print("=" * 60)

In [0]:
# ===================================================================
# OPTIMIZE BRONZE LAYER
# ===================================================================

# Bronze layer: compact the table, then report how many files and bytes remain.
bronze_rule = "-" * 60
print("\n OPTIMIZING BRONZE LAYER...")
print(bronze_rule)

# Z-ORDER uses `symbol` only; per the pipeline's layout, `date` is the
# partition column and is therefore left out of the Z-ORDER clause.
print("Running OPTIMIZE on bronze_stock_prices...")
spark.sql("OPTIMIZE bronze_stock_prices ZORDER BY (symbol)")

# Read the table detail after OPTIMIZE and keep the two columns we report.
bronze_detail = spark.sql("DESCRIBE DETAIL bronze_stock_prices")
bronze_stats = bronze_detail.select("numFiles", "sizeInBytes").collect()[0]
bronze_file_count = bronze_stats["numFiles"]
bronze_size_bytes = bronze_stats["sizeInBytes"]

print(" Bronze table optimized!")
print(f"   Files: {bronze_file_count}")
print(f"   Size: {bronze_size_bytes:,} bytes")
print(bronze_rule)

In [0]:
# ===================================================================
# OPTIMIZE SILVER LAYER
# ===================================================================

# Silver layer: same routine as the other layers -- OPTIMIZE first, then
# print the file count and byte size taken from DESCRIBE DETAIL.
silver_divider = "-" * 60
print("\n OPTIMIZING SILVER LAYER...")
print(silver_divider)

# Only `symbol` is Z-ORDERed; year/month are the partition columns according
# to the pipeline's layout, so they are not listed here.
silver_optimize_sql = "OPTIMIZE silver_stock_prices ZORDER BY (symbol)"
print("Running OPTIMIZE on silver_stock_prices...")
spark.sql(silver_optimize_sql)

# Collect the single detail row for the table and keep it as `silver_stats`.
silver_detail_sql = "DESCRIBE DETAIL silver_stock_prices"
silver_rows = spark.sql(silver_detail_sql).select("numFiles", "sizeInBytes").collect()
silver_stats = silver_rows[0]

print(" Silver table optimized!")
print("   Files: {}".format(silver_stats["numFiles"]))
print("   Size: {:,} bytes".format(silver_stats["sizeInBytes"]))
print(silver_divider)

In [0]:
# ===================================================================
# OPTIMIZE GOLD LAYER
# ===================================================================

def _optimize_gold_table(table_name, zorder_column):
    """Compact one gold Delta table and print its resulting footprint.

    Runs ``OPTIMIZE <table_name> ZORDER BY (<zorder_column>)``, then reads
    ``numFiles`` and ``sizeInBytes`` from ``DESCRIBE DETAIL`` and prints them.

    Args:
        table_name: Name of the table to optimize. It is interpolated into the
            SQL text, so pass trusted identifiers only.
        zorder_column: Column used in the ZORDER BY clause.

    Returns:
        The first row collected from DESCRIBE DETAIL, restricted to the
        ``numFiles`` and ``sizeInBytes`` columns.
    """
    spark.sql(f"OPTIMIZE {table_name} ZORDER BY ({zorder_column})")
    detail_row = spark.sql(f"DESCRIBE DETAIL {table_name}").select(
        "numFiles", "sizeInBytes"
    ).collect()[0]
    print(f"   Optimized! Files: {detail_row['numFiles']}, Size: {detail_row['sizeInBytes']:,} bytes")
    return detail_row


print("\n OPTIMIZING GOLD LAYER...")
print("-" * 60)

# gold_daily_summary: no partitions, so it can be Z-ORDERed by date.
print("1️⃣ Running OPTIMIZE on gold_daily_summary...")
gold1_stats = _optimize_gold_table("gold_daily_summary", "date")

# gold_stock_performance: no partitions, Z-ORDER by symbol.
print("\n2️⃣ Running OPTIMIZE on gold_stock_performance...")
gold2_stats = _optimize_gold_table("gold_stock_performance", "symbol")

# gold_top_performers: partitioned by date, so Z-ORDER by symbol only.
print("\n3️⃣ Running OPTIMIZE on gold_top_performers...")
gold3_stats = _optimize_gold_table("gold_top_performers", "symbol")

print("-" * 60)

In [0]:
# ===================================================================
# VACUUM OLD FILES (Clean up)
# ===================================================================

print("\n CLEANING UP OLD FILES...")
print("-" * 60)
print("Note: VACUUM removes old file versions to save space")
print("Retention period: 7 days (default)")

# Retention actually passed to VACUUM: 7 days, matching the message above.
# The previous `RETAIN 0 HOURS` contradicted that message. With Delta's
# retention safety check enabled it raises (and the error was swallowed, so
# nothing was cleaned); with the check disabled it removes every file not in
# the current snapshot, which breaks time travel and can break readers or
# writers that are still using those files.
VACUUM_RETENTION_HOURS = 7 * 24

# Tables to vacuum, in bronze -> silver -> gold order.
tables = [
    "bronze_stock_prices",
    "silver_stock_prices",
    "gold_daily_summary",
    "gold_stock_performance",
    "gold_top_performers"
]

# Best effort: one failing table must not stop the rest, but failures are
# remembered so the closing message does not claim a clean run.
vacuum_failures = []

for table in tables:
    print(f"\nVacuuming {table}...")
    try:
        spark.sql(f"VACUUM {table} RETAIN {VACUUM_RETENTION_HOURS} HOURS")
        print(f"   {table} vacuumed")
    except Exception as e:
        vacuum_failures.append(table)
        # Truncate the error text to keep the cell output readable.
        print(f"   Vacuum failed for {table}: {str(e)[:100]}")

if vacuum_failures:
    print(f"\n Cleanup finished with {len(vacuum_failures)} failure(s): {', '.join(vacuum_failures)}")
else:
    print("\n Cleanup complete!")
print("-" * 60)

In [0]:
# ===================================================================
# OPTIMIZATION SUMMARY
# ===================================================================

# Closing report: a banner, a short reminder of what OPTIMIZE/ZORDER do, and a
# per-table line with the file count and size read from DESCRIBE DETAIL.
summary_rule = "=" * 60
print("\n" + summary_rule)
print(" ALL TABLES OPTIMIZED! ")
print(summary_rule)

print("\n PERFORMANCE IMPROVEMENTS:")
for benefit_line in (
    "   ✅ OPTIMIZE: Compacts small files into larger ones",
    "   ✅ ZORDER: Co-locates related data for faster queries",
    "   ✅ Result: Queries will run 2-10x faster!",
):
    print(benefit_line)

print("\n📊 FINAL TABLE STATUS:")
print("-" * 60)

# Tables listed in the final status, in bronze -> silver -> gold order.
reported_tables = (
    "bronze_stock_prices",
    "silver_stock_prices",
    "gold_daily_summary",
    "gold_stock_performance",
    "gold_top_performers",
)

for table in reported_tables:
    detail_df = spark.sql(f"DESCRIBE DETAIL {table}")
    stats = detail_df.select("name", "numFiles", "sizeInBytes").collect()[0]
    table_label = stats["name"]
    file_count = stats["numFiles"]
    size_bytes = stats["sizeInBytes"]
    # Fixed-width columns keep the per-table lines aligned.
    print(f"   {table_label:<30} Files: {file_count:>3}  Size: {size_bytes:>12,} bytes")

print("-" * 60)
print("\n🚀 Space Cowboy's pipeline is OPTIMIZED and ready to fly! ⚡")
print(summary_rule)

