# Worker Optimization Analysis

Analyze the most recent runs logged in `logs.bronze_run_summary` for a single source to understand which worker count performs best for that source.

In [None]:
# Parameters
# SOURCE_NAME is matched against the `source` column of logs.bronze_run_summary;
# LOOKBACK_RUNS caps how many of the newest runs for that source are analyzed.
SOURCE_NAME = "anva_meeus"  # Change to your source name
LOOKBACK_RUNS = 10  # Number of recent runs to analyze (newest first)

In [None]:
# Module fabric.bootstrap
# ---------------------
# This cell enables a flexible module loading strategy:
#
# PRODUCTION (default): The `Files/code` directory is empty. This function does nothing,
# and Python imports all modules from the stable, versioned Wheel in the Environment.
#
# DEVELOPMENT / HOTFIX: To bypass the 15-20 minute Fabric publish cycle for urgent fixes,
# upload individual .py files to `Files/code` in the Lakehouse. This function prepends
# that path to sys.path, so Python finds the override files first. All other modules
# continue to load from the Wheel - only the uploaded files are replaced.
#
# Usage: Keep `Files/code` empty for production stability. Use it only for rapid
# iteration during development or emergency hotfixes.
#
# NOTE(review): this cell presumably has to run before any other `modules.*` import
# in the notebook so that overrides are picked up - confirm against fabric_bootstrap.

from modules.fabric_bootstrap import ensure_module_path
ensure_module_path()  # Now Python can find the rest (later cells import more of `modules`)

In [None]:
from modules.spark_session import get_or_create_spark_session
import pyspark.sql.functions as F

# Session handle used by every later cell (table reads and the worker optimizer calls).
spark = get_or_create_spark_session(app_name="Worker_Optimization_Analysis")

## 1. Historical Runs Overview

In [None]:
# Get recent runs for this source.
# Sorting newest-first *before* limit() is what makes this "the last N runs".
df_history = (
    spark.table("logs.bronze_run_summary")
    .filter(F.col("source") == SOURCE_NAME)
    .orderBy(F.col("run_start").desc())
    .limit(LOOKBACK_RUNS)
)

# Calculate throughput and a human-friendly row count (millions) per run.
df_analysis = df_history.select(
    F.col("run_id"),
    F.col("run_start").cast("timestamp").alias("run_time"),
    F.col("workers"),
    F.col("total_rows"),
    F.col("duration_seconds"),
    F.col("efficiency_pct"),
    # Guarded division: a run logged with duration_seconds = 0 gets a NULL
    # throughput instead of raising a divide-by-zero error when
    # spark.sql.ansi.enabled is on. With ANSI mode off the plain division
    # already produced NULL, so results are unchanged there.
    F.when(
        F.col("duration_seconds") != 0,
        F.col("total_rows") / F.col("duration_seconds"),
    ).alias("throughput_rows_per_sec"),
    F.round((F.col("total_rows") / 1000000.0), 2).alias("total_rows_millions"),
).orderBy(F.col("run_time").desc())

print(f"\nüìä Last {LOOKBACK_RUNS} runs for source: {SOURCE_NAME}\n")
df_analysis.show(LOOKBACK_RUNS, truncate=False)

## 2. Average Performance per Worker Count

In [None]:
# Aggregate metrics per worker count (one output row per distinct `workers` value).
df_by_workers = (
    df_analysis
    .groupBy("workers")
    .agg(
        F.count("*").alias("num_runs"),
        F.round(F.avg("throughput_rows_per_sec"), 0).alias("avg_throughput"),
        F.round(F.avg("efficiency_pct"), 1).alias("avg_efficiency_pct"),
        F.round(F.avg("duration_seconds"), 0).alias("avg_duration_sec"),
        F.round(F.avg("total_rows_millions"), 2).alias("avg_rows_millions")
    )
    .orderBy("workers")
)

print("\n‚ö° Performance comparison by worker count:\n")
df_by_workers.show(truncate=False)

# Find best configuration.
# - Rows whose average is NULL are excluded so a "best" row always carries a
#   number that can be formatted.
# - `workers` is a secondary sort key so ties resolve deterministically
#   (fewest workers wins).
# - first() returns None when nothing qualifies (e.g. no runs for this source),
#   so both results are checked before being subscripted.
best_throughput = (
    df_by_workers
    .filter(F.col("avg_throughput").isNotNull())
    .orderBy(F.col("avg_throughput").desc(), F.col("workers").asc())
    .first()
)
best_efficiency = (
    df_by_workers
    .filter(F.col("avg_efficiency_pct").isNotNull())
    .orderBy(F.col("avg_efficiency_pct").desc(), F.col("workers").asc())
    .first()
)

if best_throughput is not None:
    print(f"\nüèÜ Best throughput: {best_throughput['workers']} workers ‚Üí {best_throughput['avg_throughput']:.0f} rows/sec")
else:
    print("\nNo runs with a measurable throughput - cannot determine the best throughput configuration")

if best_efficiency is not None:
    print(f"üèÜ Best efficiency: {best_efficiency['workers']} workers ‚Üí {best_efficiency['avg_efficiency_pct']:.1f}%")
else:
    print("No runs with a recorded efficiency - cannot determine the best efficiency configuration")

## 3. Throughput vs Efficiency Trade-off

In [None]:
# Calculate speedup relative to 1 worker.
# first() returns None when no analyzed run used exactly one worker.
baseline = df_by_workers.filter(F.col("workers") == 1).first()

if baseline is None:
    print("\n‚ö†Ô∏è  No baseline (1 worker) data available for speedup calculation")
elif not baseline["avg_throughput"]:
    # The baseline row exists but its average throughput is NULL or 0.
    # Dividing by it would give an all-NULL speedup column (or fail outright),
    # and formatting None with ':.0f' raises TypeError.
    print("\n1-worker baseline has no usable throughput (NULL or 0) - skipping speedup calculation")
else:
    baseline_throughput = baseline["avg_throughput"]

    df_speedup = (
        df_by_workers
        .withColumn(
            "speedup_vs_1worker",
            # Ratio of each worker count's average throughput to the 1-worker average.
            F.round(F.col("avg_throughput") / baseline_throughput, 2)
        )
        .select(
            "workers",
            "num_runs",
            "avg_throughput",
            "speedup_vs_1worker",
            "avg_efficiency_pct",
            "avg_duration_sec"
        )
        .orderBy("workers")
    )

    print(f"\nüìà Speedup analysis (baseline: 1 worker = {baseline_throughput:.0f} rows/sec):\n")
    df_speedup.show(truncate=False)

## 4. Worker Optimizer Recommendation

Run the worker optimizer against the current run history to see what it would recommend in each optimization mode (throughput and efficiency).

In [None]:
from modules.worker_utils import choose_worker_profile_from_history

print("\nü§ñ Worker Optimizer Recommendations:\n")

# Throughput optimization (Fabric/serverless): every tuning knob is passed explicitly.
_throughput_kwargs = {
    "spark": spark,
    "source_name": SOURCE_NAME,
    "summary_table": "logs.bronze_run_summary",
    "default_workers": 8,
    "min_workers": 2,
    "max_workers_cap": 12,
    "lookback_runs": 3,
    "optimize_for": "throughput",
    "debug": True,
}
rec_throughput = choose_worker_profile_from_history(**_throughput_kwargs)

print(f"\n‚úì Throughput mode (Fabric): {rec_throughput} workers\n")
print(70 * "-")

# Efficiency optimization (on-prem cluster): only the mode is overridden here,
# all other settings come from the function's own defaults.
_efficiency_kwargs = {
    "spark": spark,
    "source_name": SOURCE_NAME,
    "optimize_for": "efficiency",
    "debug": True,
}
rec_efficiency = choose_worker_profile_from_history(**_efficiency_kwargs)

print(f"\n‚úì Efficiency mode (Cluster): {rec_efficiency} workers\n")

## 5. Volume-based Analysis

Understand how data volume affects optimal worker count.

In [None]:
# Bucket every run by its row count; the first threshold that matches wins.
# NOTE(review): a NULL total_rows fails every comparison and therefore lands in
# the "Large" bucket via otherwise() - confirm that is acceptable.
_row_count = F.col("total_rows")
_volume_bucket = (
    F.when(_row_count < 100_000, "Tiny (<100k)")
    .when(_row_count < 1_000_000, "Small (<1M)")
    .when(_row_count < 10_000_000, "Medium (<10M)")
    .otherwise("Large (‚â•10M)")
)

_runs_with_bucket = df_analysis.withColumn("volume_category", _volume_bucket)

# Average throughput and efficiency for each (volume bucket, worker count) pair.
_per_bucket_metrics = [
    F.count("*").alias("runs"),
    F.round(F.avg("throughput_rows_per_sec"), 0).alias("avg_throughput"),
    F.round(F.avg("efficiency_pct"), 1).alias("avg_efficiency"),
]

df_volume_analysis = (
    _runs_with_bucket
    .groupBy("volume_category", "workers")
    .agg(*_per_bucket_metrics)
    .orderBy("volume_category", "workers")
)

print("\nüì¶ Performance by data volume and worker count:\n")
df_volume_analysis.show(50, truncate=False)

## 6. Detailed Run Timeline

In [None]:
# Timeline view
df_timeline = (
    df_analysis
    .select(
        F.date_format("run_time", "yyyy-MM-dd HH:mm").alias("run_date"),
        "workers",
        "total_rows_millions",
        "duration_seconds",
        F.round("throughput_rows_per_sec", 0).alias("throughput"),
        F.round("efficiency_pct", 1).alias("efficiency")
    )
    .orderBy(F.col("run_date").desc())
)

print(f"\nüìÖ Timeline of last {LOOKBACK_RUNS} runs:\n")
df_timeline.show(LOOKBACK_RUNS, truncate=False)

## 7. Recommendations Summary

In [None]:
print("\n" + "=" * 80)
print("üìã OPTIMIZATION SUMMARY")
print("=" * 80)


def _fmt_metric(value, spec, suffix=""):
    """Return `value` formatted with `spec` plus `suffix`, or 'n/a' when it is None."""
    return "n/a" if value is None else f"{value:{spec}}{suffix}"


# Collect once (at most LOOKBACK_RUNS rows) instead of running separate
# first() and count() jobs; df_analysis is ordered newest-first, so element 0
# is the latest run. The list is empty when the source has no logged runs.
_analyzed_runs = df_analysis.collect()
latest = _analyzed_runs[0] if _analyzed_runs else None

print(f"\nSource: {SOURCE_NAME}")
print(f"Data analyzed: Last {len(_analyzed_runs)} runs")

if latest is None:
    print("\nNo runs found for this source - no latest-run details available.")
else:
    # Any of these metrics can be NULL in the log table; print 'n/a' rather
    # than crashing on a None format.
    print("\nLatest run:")
    print(f"  - Workers: {latest['workers']}")
    print(f"  - Rows: {_fmt_metric(latest['total_rows_millions'], '.2f', 'M')}")
    print(f"  - Duration: {_fmt_metric(latest['duration_seconds'], '.0f', 's')}")
    print(f"  - Throughput: {_fmt_metric(latest['throughput_rows_per_sec'], '.0f', ' rows/s')}")
    print(f"  - Efficiency: {_fmt_metric(latest['efficiency_pct'], '.1f', '%')}")

print(f"\nüéØ Optimizer Recommendations:")
print(f"  - Fabric (throughput):  {rec_throughput} workers")
print(f"  - Cluster (efficiency): {rec_efficiency} workers")

print(f"\nüìä Historical Best:")
if best_throughput is not None:
    print(f"  - Best throughput: {best_throughput['workers']} workers ({_fmt_metric(best_throughput['avg_throughput'], '.0f', ' rows/s')})")
else:
    print("  - Best throughput: n/a")
if best_efficiency is not None:
    print(f"  - Best efficiency: {best_efficiency['workers']} workers ({_fmt_metric(best_efficiency['avg_efficiency_pct'], '.1f', '%')})")
else:
    print("  - Best efficiency: n/a")

# Calculate expected improvement when the optimizer suggests a different
# worker count than the latest run used.
if latest is not None and latest['workers'] != rec_throughput:
    current_throughput = latest['throughput_rows_per_sec']
    target_row = df_by_workers.filter(F.col("workers") == rec_throughput).first()
    expected_throughput = target_row['avg_throughput'] if target_row is not None else None

    # A relative gain needs a non-zero current throughput and a historical
    # average for the recommended worker count; otherwise skip the estimate.
    if current_throughput and expected_throughput is not None:
        improvement = ((expected_throughput - current_throughput) / current_throughput) * 100

        print(f"\nüí° Potential Improvement:")
        print(f"   Switching from {latest['workers']} to {rec_throughput} workers:")
        print(f"   Expected throughput gain: {improvement:+.1f}%")
        print(f"   ({current_throughput:.0f} ‚Üí {expected_throughput:.0f} rows/s)")

print("\n" + "=" * 80)