<a href="https://colab.research.google.com/github/kareemullah123456789/big_data_advanced/blob/main/advance_pyspark_unedited.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Advanced PySpark Tutorial: Performance Optimization
# Topics: Partitioning, Joins, Caching, File Formats, Memory, Catalyst Optimizer

!pip install pyspark pyngrok delta-spark

import time
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark import StorageLevel
from pyspark.sql.window import Window
from delta import configure_spark_with_delta_pip
import pandas as pd

# Session-level settings applied to the builder before the session is created:
# Adaptive Query Execution (plus its partition-coalescing and skew-join
# switches), the Kryo serializer, and Arrow-backed pandas conversion.
# NOTE(review): the Delta Lake quickstart also sets `spark.sql.extensions` and
# `spark.sql.catalog.spark_catalog`; neither is configured here — confirm
# whether Delta SQL features are needed.
_spark_settings = {
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
    "spark.sql.adaptive.skewJoin.enabled": "true",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.sql.execution.arrow.pyspark.enabled": "true",
}

builder = SparkSession.builder.appName("AdvancedPySparkTutorial")
for _setting_name, _setting_value in _spark_settings.items():
    builder = builder.config(_setting_name, _setting_value)

# The Delta helper receives the builder; the session is obtained from its result.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

print("🚀 Advanced PySpark Tutorial Started!")
print(f"Spark Version: {spark.version}")

# =============================================================================
# 1. PARTITIONING STRATEGIES 🔄
# =============================================================================

def demo_partitioning_strategies():
    """Time a small workload under several partitioning strategies.

    Builds a 500K-row DataFrame and times a filter/groupBy (or filter) under
    default partitioning, ``repartition(20)``, ``coalesce(8)``, hash
    partitioning by ``category`` and range partitioning by ``id``. It then
    prints per-partition row counts for a deliberately skewed dataset before
    and after salting.

    Returns:
        dict: wall-clock seconds keyed by 'default', 'repartition',
        'coalesce', 'hash' and 'range'.
    """
    # `from pyspark.sql.functions import *` at the top of this file rebinds
    # sum/max/min to Spark column functions, so the plain-Python aggregation
    # in this function has to call the builtins explicitly.
    import builtins

    print("\n" + "="*60)
    print("🔄 PARTITIONING STRATEGIES DEMO")
    print("="*60)

    # Create sample datasets
    print("📊 Creating sample datasets...")

    # Large dataset for partitioning tests
    large_data = [(i, f"user_{i%1000}", i * 10, i % 50, f"2023-{(i%12)+1:02d}-01")
                  for i in range(1, 500001)]  # 500K records

    df_large = spark.createDataFrame(large_data,
                                   ["id", "user_name", "amount", "category", "date"])

    print(f"✅ Created dataset with {df_large.count():,} records")
    print(f"🔢 Current partitions: {df_large.rdd.getNumPartitions()}")

    def measure_operation(name, operation):
        """Run ``operation()``, print its duration, return (result, seconds)."""
        start = time.perf_counter()
        result = operation()
        duration = time.perf_counter() - start
        print(f"   {name}: {duration:.3f}s")
        return result, duration

    print("\n🎯 Testing Different Partitioning Strategies:")

    # 1. Default partitioning performance
    print("\n1️⃣ Default Partitioning:")
    _, time_default = measure_operation(
        "Filter + GroupBy",
        lambda: df_large.filter(col("amount") > 1000000).groupBy("category").count().count()
    )

    # 2. Repartition vs Coalesce
    print("\n2️⃣ Repartition vs Coalesce:")

    # Repartition (full shuffle)
    df_repartitioned = df_large.repartition(20)
    _, time_repartition = measure_operation(
        "After repartition(20)",
        lambda: df_repartitioned.filter(col("amount") > 1000000).groupBy("category").count().count()
    )

    # Coalesce (minimize shuffle)
    df_coalesced = df_large.coalesce(8)
    _, time_coalesce = measure_operation(
        "After coalesce(8)",
        lambda: df_coalesced.filter(col("amount") > 1000000).groupBy("category").count().count()
    )

    # 3. Hash Partitioning
    print("\n3️⃣ Hash Partitioning by Category:")
    df_hash_partitioned = df_large.repartition(col("category"))
    _, time_hash = measure_operation(
        "Hash partitioned GroupBy",
        lambda: df_hash_partitioned.groupBy("category").count().count()
    )

    # 4. Range Partitioning
    print("\n4️⃣ Range Partitioning by ID:")
    df_range_partitioned = df_large.repartitionByRange(10, col("id"))
    _, time_range = measure_operation(
        "Range partitioned filter",
        lambda: df_range_partitioned.filter((col("id") >= 100000) & (col("id") <= 200000)).count()
    )

    # 5. Partition Skew Detection
    print("\n5️⃣ Partition Skew Analysis:")

    # Skewed data: ids 1..400000 (80%) all land in category 1; the remaining
    # 100000 ids (20%) are spread round-robin over categories 2-5. Every id
    # appears exactly once.
    skewed_data = (
        [(i, f"user_{i}", i * 10, 1, "2023-01-01") for i in range(1, 400001)]
        + [(i, f"user_{i}", i * 10, 2 + i % 4, "2023-01-01") for i in range(400001, 500001)]
    )

    df_skewed = spark.createDataFrame(skewed_data, ["id", "user_name", "amount", "category", "date"])

    def analyze_partition_skew(df, name):
        """Print per-partition row counts and the largest/smallest ratio."""
        partition_sizes = df.rdd.mapPartitions(
            lambda rows: [builtins.sum(1 for _ in rows)]
        ).collect()
        print(f"   {name} partition sizes: {partition_sizes}")

        if not partition_sizes:
            print("   Skew ratio: n/a (no partitions)")
            return

        smallest = builtins.min(partition_sizes)
        largest = builtins.max(partition_sizes)
        # An empty partition makes the ratio unbounded; float("inf") keeps the
        # value numeric so the '.2f' format spec still applies.
        skew_ratio = largest / smallest if smallest > 0 else float("inf")
        print(f"   Skew ratio: {skew_ratio:.2f}")

    analyze_partition_skew(df_skewed, "Skewed data")

    # Fix skew with salting technique: a random 0-9 salt splits each category
    # across several partitioning keys.
    df_salted = df_skewed.withColumn("salt", (rand() * 10).cast("int")) \
                         .repartition(col("category"), col("salt"))

    analyze_partition_skew(df_salted, "After salting")

    return {
        'default': time_default,
        'repartition': time_repartition,
        'coalesce': time_coalesce,
        'hash': time_hash,
        'range': time_range
    }

# =============================================================================
# 2. JOIN OPTIMIZATIONS 🔗
# =============================================================================

def demo_join_optimizations():
    """Time several join strategies on synthetic fact/dimension tables.

    Covers an unhinted join against a small table, an explicit ``broadcast``
    hint, a sort-merge (shuffle) join, a join of inputs repartitioned on the
    join key, a skewed join with and without salting, and the four join types
    inner/left/right/outer.

    Returns:
        dict: wall-clock seconds keyed by 'broadcast', 'explicit_broadcast',
        'shuffle', 'bucketed', 'skewed' and 'salted', plus 'join_types',
        a dict of seconds per join type.
    """
    print("\n" + "="*60)
    print("🔗 JOIN OPTIMIZATIONS DEMO")
    print("="*60)

    # Create datasets for join demos
    print("📊 Creating datasets for join optimization...")

    # Large fact table; product_id is a string key "product_0".."product_999"
    fact_data = [(i, f"product_{i%1000}", i * 100, i % 50) for i in range(1, 100001)]
    df_large = spark.createDataFrame(fact_data, ["order_id", "product_id", "amount", "customer_id"])

    # Small dimension table (perfect for broadcast)
    dim_data = [(i, f"Customer_{i}", f"City_{i%10}") for i in range(1, 51)]
    df_small = spark.createDataFrame(dim_data, ["customer_id", "customer_name", "city"])

    # Medium table for shuffle join demo. Its product_id uses the same
    # "product_<n>" string format and 0..999 range as the fact table so the
    # join on product_id actually finds matches.
    medium_data = [(f"product_{i}", f"Product_{i}", f"Category_{i%20}") for i in range(1000)]
    df_medium = spark.createDataFrame(medium_data, ["product_id", "product_name", "category"])

    print(f"✅ Large table: {df_large.count():,} records")
    print(f"✅ Small table: {df_small.count():,} records")
    print(f"✅ Medium table: {df_medium.count():,} records")

    def measure_join(name, join_operation):
        """Count the rows of ``join_operation()``, print and return seconds."""
        start = time.time()
        result = join_operation().count()
        duration = time.time() - start
        print(f"   {name}: {duration:.3f}s ({result:,} results)")
        return duration

    print("\n🎯 Testing Different Join Strategies:")

    # 1. Broadcast Join (automatic)
    print("\n1️⃣ Broadcast Join (Small Table):")
    time_broadcast = measure_join(
        "Auto broadcast join",
        lambda: df_large.join(df_small, "customer_id")
    )

    # 2. Explicit Broadcast Hint
    print("\n2️⃣ Explicit Broadcast Hint:")
    time_explicit_broadcast = measure_join(
        "Explicit broadcast hint",
        lambda: df_large.join(broadcast(df_small), "customer_id")
    )

    # 3. Shuffle Join (larger tables)
    print("\n3️⃣ Shuffle Join (Medium Tables):")
    # Request a sort-merge join explicitly so this measurement involves a
    # shuffle even when Spark could otherwise broadcast the 1,000-row table.
    time_shuffle = measure_join(
        "Shuffle join",
        lambda: df_large.join(df_medium.hint("merge"), "product_id")
    )

    # 4. Bucketed Join Optimization
    print("\n4️⃣ Bucketed Join Setup:")

    # Save as bucketed tables (in production, you'd write to persistent storage)
    print("   Creating bucketed tables...")

    # Nothing is written to storage here: both inputs are only repartitioned
    # in memory into 8 partitions on the join key.
    df_large_bucketed = df_large.repartition(8, col("customer_id"))
    df_small_bucketed = df_small.repartition(8, col("customer_id"))

    time_bucketed = measure_join(
        "Pre-bucketed join",
        lambda: df_large_bucketed.join(df_small_bucketed, "customer_id")
    )

    # 5. Join with Data Skew Handling
    print("\n5️⃣ Handling Join Skew:")

    # Create skewed join scenario
    skewed_large = df_large.filter(col("customer_id") <= 10)  # Most data joins to few keys

    # Traditional join (will be skewed)
    time_skewed = measure_join(
        "Skewed join (traditional)",
        lambda: skewed_large.join(df_small, "customer_id")
    )

    # Skew handling with salting. The large side gets a random salt in
    # [0, num_salts); the small side is replicated once per salt value, so
    # both sides must agree on num_salts.
    num_salts = 3

    df_salted = skewed_large.withColumn("salt", (rand() * num_salts).cast("int")) \
                           .withColumn("salted_key", concat(col("customer_id"), lit("_"), col("salt")))

    df_small_expanded = df_small.withColumn("salt", explode(array([lit(i) for i in range(num_salts)]))) \
                              .withColumn("salted_key", concat(col("customer_id"), lit("_"), col("salt")))

    time_salted = measure_join(
        "Skew handled with salting",
        lambda: df_salted.join(df_small_expanded, "salted_key")
    )

    # 6. Different Join Types Performance
    print("\n6️⃣ Join Type Comparison:")

    join_types = ["inner", "left", "right", "outer"]
    join_times = {}

    for join_type in join_types:
        join_times[join_type] = measure_join(
            f"{join_type.capitalize()} join",
            # Bind the loop variable as a default so each lambda keeps its own type
            lambda jt=join_type: df_large.join(df_small, "customer_id", jt)
        )

    return {
        'broadcast': time_broadcast,
        'explicit_broadcast': time_explicit_broadcast,
        'shuffle': time_shuffle,
        'bucketed': time_bucketed,
        'skewed': time_skewed,
        'salted': time_salted,
        'join_types': join_times
    }

# =============================================================================
# 3. ADVANCED CACHING STRATEGIES 💾
# =============================================================================

def demo_advanced_caching():
    """Compare storage levels and walk through cache-management patterns.

    For each available storage level, persists a 200,000-row DataFrame and
    times the caching count and a follow-up filtered count. Then demonstrates
    cache reuse across several operations, caching several samples at once,
    and unpersisting once the data is no longer needed.

    Returns:
        dict: storage-level name -> {'cache': seconds, 'use': seconds}.
        Levels that this PySpark version does not define are omitted.
    """
    print("\n" + "="*60)
    print("💾 ADVANCED CACHING STRATEGIES")
    print("="*60)

    # Create test dataset
    data = [(i, f"user_{i%1000}", i * 10.5, i % 20, f"2023-{(i%12)+1:02d}")
            for i in range(1, 200001)]
    df = spark.createDataFrame(data, ["id", "name", "amount", "category", "month"])

    print(f"📊 Created dataset: {df.count():,} records")

    def test_storage_level(storage_level, name):
        """Persist ``df`` at ``storage_level``; return (cache_s, use_s)."""
        df_cached = df.persist(storage_level)
        try:
            start = time.time()
            # First operation (caches data)
            df_cached.count()
            cache_time = time.time() - start

            start = time.time()
            # Second operation (uses cache)
            df_cached.filter(col("amount") > 1000).count()
            use_time = time.time() - start

            # Memory usage (simplified)
            cached_info = "Cached" if df_cached.is_cached else "Not cached"

            print(f"   {name:20}: Cache={cache_time:.3f}s, Use={use_time:.3f}s, Status={cached_info}")
        finally:
            # Release the cache even if one of the timed actions fails, so the
            # next storage level starts from an uncached DataFrame.
            df_cached.unpersist()
        return cache_time, use_time

    print("\n🎯 Storage Level Comparison:")

    storage_results = {}

    # Levels are looked up by name because the *_SER constants are not defined
    # by current PySpark releases; referencing them as attributes would raise
    # AttributeError before any level is tested.
    level_names = [
        "MEMORY_ONLY",
        "MEMORY_AND_DISK",
        "MEMORY_ONLY_SER",
        "MEMORY_AND_DISK_SER",
        "DISK_ONLY",
    ]

    for name in level_names:
        storage_level = getattr(StorageLevel, name, None)
        if storage_level is None:
            print(f"   {name:20}: Skipped (not defined by this PySpark version)")
            continue
        cache_time, use_time = test_storage_level(storage_level, name)
        storage_results[name] = {'cache': cache_time, 'use': use_time}

    print("\n💡 Storage Level Recommendations:")
    print("   • MEMORY_ONLY: Fast, but limited by memory size")
    print("   • MEMORY_AND_DISK: Balanced, most commonly used")
    print("   • *_SER: More memory efficient, slight CPU overhead")
    print("   • DISK_ONLY: Unlimited size, slowest access")

    # Cache Management Strategy
    print("\n🧠 Cache Management Strategies:")

    # 1. Cache Reuse Pattern
    print("\n1️⃣ Cache Reuse Analysis:")

    df_reuse = df.persist(StorageLevel.MEMORY_AND_DISK)

    operations = [
        ("Count", lambda: df_reuse.count()),
        ("Filter count", lambda: df_reuse.filter(col("amount") > 1000).count()),
        ("GroupBy", lambda: df_reuse.groupBy("category").sum("amount").count()),
        ("Window function", lambda: df_reuse.withColumn("rank",
                                   row_number().over(Window.partitionBy("category").orderBy("amount"))).count())
    ]

    try:
        for op_name, operation in operations:
            start = time.time()
            operation()
            duration = time.time() - start
            print(f"   {op_name:15}: {duration:.3f}s")
    finally:
        df_reuse.unpersist()

    # 2. Memory Pressure Simulation
    print("\n2️⃣ Memory Pressure Handling:")

    # Cache multiple DataFrames to simulate memory pressure
    dfs_cached = []
    try:
        for i in range(3):
            df_temp = df.sample(0.3).persist(StorageLevel.MEMORY_ONLY)
            dfs_cached.append(df_temp)
            df_temp.count()  # Trigger caching
            print(f"   Cached DF {i+1}: {df_temp.is_cached}")
    finally:
        # Clean up
        for df_temp in dfs_cached:
            df_temp.unpersist()

    # 3. Optimal Unpersist Timing
    print("\n3️⃣ Optimal Unpersist Strategy:")

    df_timed = df.persist()
    try:
        df_timed.count()  # Cache

        # Use cache multiple times
        for i in range(3):
            df_timed.filter(col("category") == i).count()

        print("   ✅ Used cache for multiple operations")
    finally:
        # Unpersist when no longer needed
        df_timed.unpersist()
    print("   🧹 Unpersisted to free memory")

    return storage_results

# =============================================================================
# 4. FILE FORMAT OPTIMIZATION 📁
# =============================================================================

def demo_file_format_optimization():
    """Benchmark JSON/CSV/Parquet round-trips and a few Parquet read patterns.

    Writes a 100,000-row DataFrame under /tmp in each format and times the
    write and a read-plus-count. Then times Parquet reads with a two-column
    selection, with no selection, with a filter on ``amount``, and against a
    copy written with ``partitionBy("category")``.

    Returns:
        dict: 'formats' maps each format name to {'write', 'read'} seconds;
        'projection', 'full', 'filter' and 'partitioned' hold the seconds of
        the corresponding Parquet read.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("📁 FILE FORMAT OPTIMIZATION")
    print(divider)

    # Synthetic rows whose columns repeat with different periods
    column_names = ["id", "name", "amount", "category", "date", "country", "product"]
    rows = [
        (
            i,
            f"user_{i%1000}",
            i * 10.5,
            i % 20,
            f"2023-{(i%12)+1:02d}",
            f"country_{i%50}",
            f"product_{i%100}",
        )
        for i in range(1, 100001)
    ]
    df = spark.createDataFrame(rows, column_names)

    print(f"📊 Created dataset: {df.count():,} records")

    # Round-trip each format: timed write, then timed read + count
    print("\n🎯 File Format Performance:")

    format_times = {}
    for fmt in ("json", "csv", "parquet"):
        target_path = f"/tmp/test_data_{fmt}"

        began = time.time()
        df.write.mode("overwrite").format(fmt).save(target_path)
        write_time = time.time() - began

        began = time.time()
        spark.read.format(fmt).load(target_path).count()
        read_time = time.time() - began

        format_times[fmt] = {'write': write_time, 'read': read_time}
        print(f"   {fmt.upper():8}: Write={write_time:.3f}s, Read={read_time:.3f}s")

    print("\n⚡ Parquet Optimization Techniques:")
    parquet_path = "/tmp/test_data_parquet"

    # 1. Two selected columns versus the whole table
    print("\n1️⃣ Column Projection:")
    began = time.time()
    spark.read.parquet(parquet_path).select("id", "amount").count()
    time_projection = time.time() - began
    print(f"   Select 2 columns: {time_projection:.3f}s")

    began = time.time()
    spark.read.parquet(parquet_path).count()
    time_full = time.time() - began
    print(f"   Select all columns: {time_full:.3f}s")

    # 2. Filter applied directly on the Parquet read
    print("\n2️⃣ Predicate Pushdown:")
    began = time.time()
    spark.read.parquet(parquet_path).filter(col("amount") > 50000).count()
    time_filter = time.time() - began
    print(f"   With filter: {time_filter:.3f}s")

    # 3. Copy partitioned by category, read back with a filter on that column
    print("\n3️⃣ Dataset Partitioning:")
    partitioned_path = "/tmp/partitioned_data"
    df.write.mode("overwrite").partitionBy("category").parquet(partitioned_path)

    began = time.time()
    spark.read.parquet(partitioned_path).filter(col("category") == 5).count()
    time_partitioned = time.time() - began
    print(f"   Partitioned read: {time_partitioned:.3f}s")

    timings = {'formats': format_times}
    timings['projection'] = time_projection
    timings['full'] = time_full
    timings['filter'] = time_filter
    timings['partitioned'] = time_partitioned
    return timings

# =============================================================================
# 5. MEMORY MANAGEMENT 🧠
# =============================================================================

def demo_memory_management():
    """Print memory-related settings and time a cache and a collect.

    Reports the explicitly-set values of the main memory settings, caches a
    10,000-row DataFrame with a long text column at MEMORY_ONLY, times the
    caching count and an RDD map + collect, then creates a few sampled
    DataFrames.

    Returns:
        dict: {'cache_time': seconds, 'collect_time': seconds}.
    """
    print("\n" + "="*60)
    print("🧠 MEMORY MANAGEMENT")
    print("="*60)

    # Get current memory configuration
    conf = spark.sparkContext.getConf()

    print("📊 Current Memory Configuration:")
    # Keys of the unified memory manager; the legacy *.memoryFraction names
    # are not the settings current Spark documents for memory sizing.
    memory_configs = [
        "spark.driver.memory",
        "spark.executor.memory",
        "spark.memory.fraction",
        "spark.memory.storageFraction",
        "spark.sql.execution.arrow.maxRecordsPerBatch"
    ]

    for config in memory_configs:
        # SparkConf.get falls back to the supplied default for unset keys
        value = conf.get(config, "Not set")
        print(f"   {config:45}: {value}")

    # Memory usage analysis
    print("\n🔍 Memory Usage Analysis:")

    # Create memory-intensive operations: each row carries a 100-word string
    large_data = [(i, f"user_{i}", " ".join([f"word_{j}" for j in range(100)]))
                  for i in range(1, 10001)]

    df_memory = spark.createDataFrame(large_data, ["id", "name", "description"])

    # Test different memory patterns
    print("\n1️⃣ Memory-intensive Operations:")

    # Cache and monitor
    df_memory.persist(StorageLevel.MEMORY_ONLY)
    try:
        start = time.time()
        df_memory.count()
        cache_time = time.time() - start
        print(f"   Cache operation: {cache_time:.3f}s")

        # Memory pressure test: pull (id, description length) pairs to the driver
        start = time.time()
        df_memory.rdd.map(lambda x: (x[0], len(x[2]))).collect()
        collect_time = time.time() - start
        print(f"   Collect operation: {collect_time:.3f}s")
    finally:
        # Release the cached data even if one of the timed actions fails
        df_memory.unpersist()

    # Garbage collection impact
    print("\n2️⃣ GC Impact Analysis:")

    # Create objects that stress GC
    for i in range(3):
        temp_df = df_memory.sample(0.5)
        temp_df.count()
        print(f"   Iteration {i+1}: Created temporary DataFrame")

    return {
        'cache_time': cache_time,
        'collect_time': collect_time
    }

# =============================================================================
# 6. CATALYST OPTIMIZER DEEP DIVE ⚡
# =============================================================================

def demo_catalyst_optimizer():
    """Print query plans that illustrate Catalyst optimizer behavior.

    Builds a 50,000-row DataFrame and prints plans for: a filter/select/
    aggregate query, a filtered Parquet read, a column-pruned Parquet read,
    a projection with constant expressions, a join with a small table, and
    a SQL query with CBO switched on. Finally prints the current AQE settings.

    Side effects:
        Writes Parquet data to /tmp/catalyst_demo, registers the temp views
        ``sales_data`` and ``categories``, and sets
        ``spark.sql.cbo.enabled=true`` on the session (not restored).

    Returns:
        bool: always True.
    """
    print("\n" + "="*60)
    print("⚡ CATALYST OPTIMIZER DEEP DIVE")
    print("="*60)

    # Create test data
    data = [(i, f"user_{i%100}", i * 10, i % 20, f"2023-{(i%12)+1:02d}")
            for i in range(1, 50001)]
    df = spark.createDataFrame(data, ["id", "name", "amount", "category", "date"])

    print(f"📊 Dataset: {df.count():,} records")

    # 1. Logical vs Physical Plans
    print("\n1️⃣ Query Plan Analysis:")

    query = df.filter(col("amount") > 1000) \
              .select("name", "amount", "category") \
              .groupBy("category") \
              .avg("amount")

    # mode="extended" is the mode that prints the logical plans (followed by
    # the physical plan); mode="simple" prints the physical plan only.
    print("\n📋 Logical Plan:")
    query.explain(mode="extended")

    print("\n🔧 Physical Plan:")
    query.explain(mode="simple")

    # 2. Predicate Pushdown Demonstration
    print("\n2️⃣ Predicate Pushdown:")

    # Save as parquet to demonstrate pushdown
    df.write.mode("overwrite").parquet("/tmp/catalyst_demo")

    # Query with filter - should show pushdown in plan
    filtered_query = spark.read.parquet("/tmp/catalyst_demo") \
                          .filter(col("amount") > 5000) \
                          .select("name", "amount")

    print("🔍 Plan with predicate pushdown:")
    filtered_query.explain()

    # 3. Column Pruning
    print("\n3️⃣ Column Pruning:")

    pruned_query = spark.read.parquet("/tmp/catalyst_demo") \
                        .select("name", "amount")  # Only these columns read

    print("🔍 Plan with column pruning:")
    pruned_query.explain()

    # 4. Constant Folding
    print("\n4️⃣ Constant Folding:")

    # Catalyst will optimize constant expressions
    constant_query = df.select(
        col("name"),
        (lit(10) + lit(20)).alias("constant_sum"),  # Will be folded to 30
        (col("amount") * 2).alias("doubled_amount")
    )

    print("🔍 Plan with constant folding:")
    constant_query.explain()

    # 5. Join Optimization
    print("\n5️⃣ Join Optimization:")

    # Small table for broadcast join
    small_data = [(i, f"Category_{i}") for i in range(1, 21)]
    df_small = spark.createDataFrame(small_data, ["category", "category_name"])

    # Catalyst should automatically choose broadcast join
    join_query = df.join(df_small, "category")

    print("🔍 Automatic join optimization:")
    join_query.explain()

    # 6. Cost-Based Optimization (CBO)
    print("\n6️⃣ Cost-Based Optimization:")

    # Enable CBO for this session. No statistics are collected here, so the
    # plan below reflects whatever estimates Spark has for the temp views.
    spark.sql("SET spark.sql.cbo.enabled=true")

    # Create temporary view for SQL analysis
    df.createOrReplaceTempView("sales_data")
    df_small.createOrReplaceTempView("categories")

    cbo_query = spark.sql("""
        SELECT c.category_name, AVG(s.amount) as avg_amount
        FROM sales_data s
        JOIN categories c ON s.category = c.category
        WHERE s.amount > 2000
        GROUP BY c.category_name
        ORDER BY avg_amount DESC
    """)

    print("🔍 Cost-based optimized plan:")
    cbo_query.explain()

    # 7. Adaptive Query Execution (AQE)
    print("\n7️⃣ Adaptive Query Execution:")

    print("🔧 AQE Configuration:")
    aqe_configs = [
        "spark.sql.adaptive.enabled",
        "spark.sql.adaptive.coalescePartitions.enabled",
        "spark.sql.adaptive.skewJoin.enabled"
    ]

    for config in aqe_configs:
        value = spark.conf.get(config, "Not set")
        print(f"   {config:40}: {value}")

    return True

# =============================================================================
# RUN ALL DEMOS
# =============================================================================

def run_all_demos():
    """Run the six demos in order, chart selected timings, and return them.

    Returns:
        dict: each demo's return value keyed by 'partitioning', 'joins',
        'caching', 'formats', 'memory' and 'catalyst'.
    """
    print("🎬 Starting Advanced PySpark Tutorial")
    print("=" * 80)

    # Demos run strictly in this order; each result is stored under its key.
    results = {}
    results['partitioning'] = demo_partitioning_strategies()
    results['joins'] = demo_join_optimizations()
    results['caching'] = demo_advanced_caching()
    results['formats'] = demo_file_format_optimization()
    results['memory'] = demo_memory_management()
    results['catalyst'] = demo_catalyst_optimizer()

    section_rule = "=" * 60
    print("\n" + section_rule)
    print("📊 PERFORMANCE SUMMARY")
    print(section_rule)

    def draw_bars(ax, labels, heights, title, y_label, rotate_labels=True):
        """Draw one bar chart on ``ax`` with the given title and y-axis label."""
        ax.bar(labels, heights)
        ax.set_title(title)
        ax.set_ylabel(y_label)
        if rotate_labels:
            ax.tick_params(axis='x', rotation=45)

    # 2x2 grid: partitioning, joins, storage levels, file formats
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    partition_timings = results['partitioning']
    draw_bars(
        axes[0, 0],
        partition_timings.keys(),
        partition_timings.values(),
        'Partitioning Strategies Performance',
        'Time (seconds)',
    )

    # 'join_types' holds a nested dict rather than a single timing, so it is left out
    join_timings = {
        label: seconds
        for label, seconds in results['joins'].items()
        if label != 'join_types'
    }
    draw_bars(
        axes[0, 1],
        join_timings.keys(),
        join_timings.values(),
        'Join Optimization Performance',
        'Time (seconds)',
    )

    # Storage levels are compared on their 'use' timing
    storage_timings = results['caching']
    access_times = [entry['use'] for entry in storage_timings.values()]
    draw_bars(
        axes[1, 0],
        list(storage_timings.keys()),
        access_times,
        'Storage Level Performance',
        'Access Time (seconds)',
    )

    # File formats are compared on read time; these tick labels are not rotated
    per_format = results['formats']['formats']
    format_labels = list(per_format.keys())
    format_reads = [entry['read'] for entry in per_format.values()]
    draw_bars(
        axes[1, 1],
        format_labels,
        format_reads,
        'File Format Read Performance',
        'Read Time (seconds)',
        rotate_labels=False,
    )

    plt.tight_layout()
    plt.show()

    # Key takeaways
    print("\n🎯 KEY TAKEAWAYS:")
    takeaways = (
        "1. Choose partitioning strategy based on access patterns",
        "2. Use broadcast joins for small tables",
        "3. MEMORY_AND_DISK is usually the best storage level",
        "4. Parquet with partitioning offers best performance",
        "5. Monitor memory usage and GC impact",
        "6. Catalyst optimizer works best with proper statistics",
    )
    for takeaway in takeaways:
        print(takeaway)

    return results

# =============================================================================
# INTERACTIVE EXPLORATION GUIDE
# =============================================================================

def create_interactive_guide():
    """Print a topic-by-topic list of what to inspect in the Spark UI."""
    ruler = "=" * 60
    print(f"\n{ruler}")
    print("🎮 INTERACTIVE EXPLORATION GUIDE")
    print(ruler)

    # Emitted verbatim, including the leading newline and the trailing
    # indented line before the closing quotes.
    guide_text = """
🔍 WHAT TO EXPLORE IN SPARK UI:

1️⃣ PARTITIONING ANALYSIS:
   • Jobs tab → Look for shuffle operations
   • Stages tab → Check task distribution
   • Look for: Uneven task execution times (skew)

2️⃣ JOIN OPTIMIZATION:
   • SQL tab → Check join strategies in plans
   • Look for: "BroadcastHashJoin" vs "SortMergeJoin"
   • Stages tab → Shuffle read/write volumes

3️⃣ CACHE EFFECTIVENESS:
   • Storage tab → See cached DataFrames
   • SQL tab → "InMemoryTableScan" in plans
   • Jobs tab → Compare execution times

4️⃣ FILE FORMAT IMPACT:
   • SQL tab → "FileScan" operations
   • Look for: Predicate pushdown, column pruning
   • Stages tab → Input size differences

5️⃣ MEMORY MANAGEMENT:
   • Executors tab → Memory usage patterns
   • Look for: GC time, storage memory usage
   • Storage tab → Spill to disk indicators

6️⃣ CATALYST OPTIMIZATIONS:
   • SQL tab → Physical vs logical plans
   • Look for: Constant folding, predicate pushdown
   • Check: Join reordering, projection pushdown
    """
    print(guide_text)

def create_troubleshooting_guide():
    """Print five common Spark performance problems with suggested fixes."""
    ruler = "=" * 60
    print(f"\n{ruler}")
    print("🔧 TROUBLESHOOTING GUIDE")
    print(ruler)

    # Emitted verbatim, including the leading newline and the trailing
    # indented line before the closing quotes.
    troubleshooting_text = """
❌ COMMON ISSUES & SOLUTIONS:

1️⃣ SLOW JOINS:
   Problem: Large shuffle operations
   Solutions:
   • Use broadcast() for small tables (<200MB)
   • Implement bucketing for repeated joins
   • Use salting for skewed joins
   • Check join keys for nulls

2️⃣ OUT OF MEMORY ERRORS:
   Problem: Executor memory exhaustion
   Solutions:
   • Increase executor memory
   • Use MEMORY_AND_DISK storage level
   • Reduce partition size with repartition()
   • Avoid collect() on large datasets

3️⃣ PARTITION SKEW:
   Problem: Few tasks take much longer
   Solutions:
   • Add salt columns for better distribution
   • Use repartitionByRange() for sorted data
   • Check for hot keys in your data
   • Consider custom partitioners

4️⃣ CACHE NOT HELPING:
   Problem: Cache overhead > benefit
   Solutions:
   • Only cache reused DataFrames
   • Choose appropriate storage level
   • Unpersist when no longer needed
   • Consider serialized storage levels

5️⃣ SLOW FILE READS:
   Problem: Inefficient file formats
   Solutions:
   • Use Parquet instead of JSON/CSV
   • Implement dataset partitioning
   • Enable predicate pushdown
   • Optimize file sizes (128MB-1GB)
    """
    print(troubleshooting_text)

def create_optimization_checklist():
    """Print a performance checklist grouped into seven topic areas."""
    ruler = "=" * 60
    print(f"\n{ruler}")
    print("✅ OPTIMIZATION CHECKLIST")
    print(ruler)

    # Emitted verbatim, including the leading newline and the trailing
    # indented line before the closing quotes.
    checklist_text = """
🚀 PERFORMANCE OPTIMIZATION CHECKLIST:

📊 DATA LAYOUT:
   ☐ Use Parquet format for analytical workloads
   ☐ Partition datasets by frequently filtered columns
   ☐ Optimize file sizes (avoid small files)
   ☐ Use appropriate compression (snappy/gzip)

🔄 PARTITIONING:
   ☐ Right-size partitions (128MB-1GB each)
   ☐ Use hash partitioning for joins
   ☐ Use range partitioning for ordered operations
   ☐ Avoid excessive repartitioning

🔗 JOINS:
   ☐ Broadcast small tables (<200MB)
   ☐ Use bucketing for repeated joins
   ☐ Handle data skew with salting
   ☐ Choose appropriate join types

💾 CACHING:
   ☐ Cache frequently accessed DataFrames
   ☐ Use MEMORY_AND_DISK for reliability
   ☐ Unpersist when no longer needed
   ☐ Monitor memory usage

⚡ CATALYST OPTIMIZATION:
   ☐ Enable adaptive query execution
   ☐ Use column pruning (select only needed columns)
   ☐ Push filters as early as possible
   ☐ Generate table statistics for CBO

🧠 MEMORY MANAGEMENT:
   ☐ Right-size executor memory
   ☐ Monitor GC overhead
   ☐ Use serialized storage when memory-constrained
   ☐ Avoid large broadcast variables

🔍 MONITORING:
   ☐ Regularly check Spark UI
   ☐ Monitor task execution times
   ☐ Watch for shuffle spill
   ☐ Track cache hit rates
    """
    print(checklist_text)

def _run_etl_scenario():
    """Run Scenario 1 (ETL pipeline) and return the elapsed aggregation time in seconds."""
    print("\n📦 Scenario 1: ETL Pipeline Optimization")

    # Simulate daily sales data processing
    daily_sales = [(i, f"2023-{(i%12)+1:02d}-{(i%28)+1:02d}",
                   f"product_{i%1000}", i * 10.5, f"store_{i%100}")
                  for i in range(1, 500001)]

    df_sales = spark.createDataFrame(
        daily_sales,
        ["transaction_id", "date", "product_id", "amount", "store_id"])

    # ETL steps with optimization
    print("🔧 Optimized ETL Steps:")

    # Step 1: Partition by date for time-series queries
    df_partitioned = df_sales.repartition(col("date"))

    # Step 2: Cache intermediate results (reused by the three aggregations below)
    df_enriched = df_partitioned.withColumn("year_month", substring("date", 1, 7)) \
                                .persist(StorageLevel.MEMORY_AND_DISK)

    # try/finally so the cached data is released even if an action fails;
    # otherwise it would stay pinned on the shared Spark session.
    try:
        # Step 3: Multiple aggregations using cache
        start = time.time()

        monthly_sales = df_enriched.groupBy("year_month").sum("amount")
        store_sales = df_enriched.groupBy("store_id").sum("amount")
        product_sales = df_enriched.groupBy("product_id").sum("amount")

        # Trigger computations (transformations above are lazy)
        monthly_count = monthly_sales.count()
        store_count = store_sales.count()
        product_count = product_sales.count()

        etl_time = time.time() - start
        print(f"   ✅ ETL completed in {etl_time:.3f}s")
        print(f"   📊 Generated {monthly_count} monthly, {store_count} store, {product_count} product aggregates")
    finally:
        df_enriched.unpersist()

    return etl_time


def _run_feature_engineering_scenario():
    """Run Scenario 2 (window-function features) and return the elapsed time in seconds."""
    print("\n🤖 Scenario 2: ML Feature Engineering")

    # Simulate user behavior data
    user_data = [(i, f"user_{i%10000}", i % 100, (i * 7) % 24, i % 7)
                 for i in range(1, 1000001)]

    df_users = spark.createDataFrame(
        user_data,
        ["session_id", "user_id", "page_views", "hour", "day_of_week"])

    # Feature engineering with window functions
    start = time.time()

    # Cache raw data for multiple feature computations.
    # NOTE: only one action (the count below) runs against it in this demo,
    # so the cache is populated but not re-read here.
    df_users.persist(StorageLevel.MEMORY_AND_DISK)

    try:
        # Per-user window ordered by session id
        user_window = Window.partitionBy("user_id").orderBy("session_id")
        # Current row plus the two preceding rows within the same user
        trailing_window = user_window.rowsBetween(-2, 0)

        df_features = df_users \
            .withColumn("prev_page_views", lag("page_views").over(user_window)) \
            .withColumn("session_rank", row_number().over(user_window)) \
            .withColumn("avg_page_views", avg("page_views").over(trailing_window))

        feature_count = df_features.count()
        feature_time = time.time() - start

        print(f"   ✅ Feature engineering completed in {feature_time:.3f}s")
        print(f"   🎯 Generated features for {feature_count:,} sessions")
    finally:
        # Release the cache even when feature computation fails.
        df_users.unpersist()

    return feature_time


def _run_dashboard_scenario():
    """Run Scenario 3 (dashboard aggregations) and return the elapsed time in seconds."""
    print("\n📈 Scenario 3: Real-time Analytics")

    # Simulate streaming-like batch processing.
    # Read the clock once so every event is offset from the same base second
    # (previously time.time() was re-evaluated for each of the 100K rows).
    base_timestamp = int(time.time())
    batch_data = [(i, f"event_{i%50}", base_timestamp + i, i % 10)
                  for i in range(1, 100001)]

    df_events = spark.createDataFrame(
        batch_data,
        ["event_id", "event_type", "timestamp", "user_segment"])

    # Real-time aggregations
    start = time.time()

    # Events per (hour, event_type)
    hourly_events = df_events \
        .withColumn("hour", from_unixtime("timestamp", "yyyy-MM-dd HH")) \
        .groupBy("hour", "event_type") \
        .count()

    # Event volume and event-type variety per user segment
    segment_metrics = df_events.groupBy("user_segment") \
        .agg(count("*").alias("event_count"),
             countDistinct("event_type").alias("unique_events"))

    # Trigger both jobs. These actions run one after the other on the driver,
    # not in parallel.
    hourly_count = hourly_events.count()
    segment_count = segment_metrics.count()

    dashboard_time = time.time() - start

    print(f"   ✅ Dashboard metrics computed in {dashboard_time:.3f}s")
    print(f"   📊 {hourly_count} hourly metrics, {segment_count} segment metrics")

    return dashboard_time


def demo_real_world_scenarios():
    """Demonstrate real-world optimization scenarios.

    Runs three self-contained demos against the module-level ``spark``
    session, printing progress and timings for each:

    1. ETL pipeline: repartition by date, persist, three aggregations.
    2. ML feature engineering: lag / row_number / rolling average windows.
    3. Dashboard analytics: hourly and per-segment aggregations.

    Returns:
        dict: elapsed wall-clock seconds keyed by ``'etl_time'``,
        ``'feature_time'`` and ``'dashboard_time'``.
    """
    print("\n" + "="*60)
    print("🌍 REAL-WORLD SCENARIOS")
    print("="*60)

    etl_time = _run_etl_scenario()
    feature_time = _run_feature_engineering_scenario()
    dashboard_time = _run_dashboard_scenario()

    return {
        'etl_time': etl_time,
        'feature_time': feature_time,
        'dashboard_time': dashboard_time
    }

# =============================================================================
# MAIN EXECUTION
# =============================================================================

if __name__ == "__main__":
    # Setup ngrok for Spark UI (best-effort: the tutorial still runs without it)
    try:
        import os

        from pyngrok import ngrok

        # SECURITY: never commit the auth token to source control. It is read
        # from the NGROK_AUTHTOKEN environment variable instead. The token
        # that was previously hard-coded here is exposed and should be revoked
        # in the ngrok dashboard.
        ngrok_token = os.environ.get("NGROK_AUTHTOKEN")
        if ngrok_token:
            ngrok.set_auth_token(ngrok_token)

        # 4040 is the port the Spark UI is tunnelled from.
        public_url = ngrok.connect(4040)
        print(f"🌐 Spark UI available at: {public_url}")
    except Exception as exc:
        # Catch Exception rather than using a bare except so Ctrl-C and
        # SystemExit still propagate; report why the tunnel was skipped.
        print("⚠️  Ngrok not available, use localhost:4040 for Spark UI")
        print(f"   Reason: {exc}")

    # Run all demos
    demo_results = run_all_demos()

    # Create guides
    create_interactive_guide()
    create_troubleshooting_guide()
    create_optimization_checklist()

    # Real-world scenarios
    scenario_results = demo_real_world_scenarios()

    # Final summary
    print("\n" + "="*80)
    print("🎓 ADVANCED PYSPARK TUTORIAL COMPLETED!")
    print("="*80)

    print("""
📚 TOPICS COVERED:
✅ Partitioning Strategies (repartition vs coalesce, custom partitioning)
✅ Join Optimizations (broadcast, bucketing, skew handling)
✅ Advanced Caching (storage levels, memory management)
✅ File Format Optimization (Parquet, partitioning, compression)
✅ Memory Management (GC tuning, memory fractions)
✅ Catalyst Optimizer (predicate pushdown, CBO, AQE)

🎯 NEXT STEPS:
1. Practice these techniques with your own datasets
2. Monitor performance using Spark UI
3. Apply optimization patterns to production workloads
4. Explore streaming and MLlib for advanced use cases

🔗 Keep the Spark UI open to explore execution plans and performance metrics!
    """)

    # Keep session alive for UI exploration
    print("\n💡 Session kept alive for UI exploration...")
    print("   Run spark.stop() when finished")

Collecting pyngrok
  Downloading pyngrok-7.2.12-py3-none-any.whl.metadata (9.4 kB)
Collecting delta-spark
  Downloading delta_spark-4.0.0-py3-none-any.whl.metadata (1.9 kB)
Collecting pyspark
  Downloading pyspark-4.0.0.tar.gz (434.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.1/434.1 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.9 (from pyspark)
  Downloading py4j-0.10.9.9-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading pyngrok-7.2.12-py3-none-any.whl (26 kB)
Downloading delta_spark-4.0.0-py3-none-any.whl (39 kB)
Downloading py4j-0.10.9.9-py2.py3-none-any.whl (203 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.0/203.0 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-4.0.0-py2.py3-none-any.whl si

PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.