# BenchBox Cost Analysis

This notebook provides comprehensive **cost tracking, estimation, and optimization** tools for cloud data warehouse benchmarking. Understanding the cost implications of your platform choices and workload patterns is critical for making informed decisions.

## What You'll Learn

- **Track actual costs** from benchmark runs on cloud platforms
- **Estimate costs** for different scale factors and workloads
- **Project costs** for production-scale usage
- **Optimize spending** with platform-specific recommendations
- **Set budget alerts** and cost thresholds
- **Calculate ROI** for platform migrations

## Supported Platforms

- **Databricks**: DBU consumption + compute costs
- **BigQuery**: On-demand vs reserved slots, per-TB pricing
- **Snowflake**: Credit consumption, warehouse sizing, storage
- **Redshift**: Node hours, concurrency scaling, storage
- **Local**: DuckDB and SQLite (infrastructure costs only)

## Prerequisites

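Run benchmarks on cloud platforms using the platform-specific notebooks first. This notebook reads the resulting `*.json` files from `./benchmark_results/<platform>/` by default; change `results_dirs` in the configuration cell if your results live elsewhere.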

## Expected Runtime

Cost analysis runs quickly on existing results:
- Data loading: **10-30 seconds**
- Analysis and projections: **30-60 seconds**
- Complete notebook: **1-2 minutes**

## 1. Installation & Setup

In [None]:
# Install BenchBox if not already installed
# !pip install benchbox

import json
import os
from datetime import datetime
from pathlib import Path
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from benchbox import __version__

# Print the installed BenchBox version and the current local time (minute resolution).
print(f"BenchBox version: {__version__}")
print(f"Analysis date: {datetime.now().strftime('%Y-%m-%d %H:%M')}")

In [None]:
# Cost Configuration
# **IMPORTANT**: Update these with your actual pricing
#
# One settings dict for the whole notebook: where each platform's result JSON
# files live, the per-platform rates, the budget limits and the output directory.
# NOTE(review): only the budget entries say "USD" explicitly; the other monetary
# values are presumably USD as well — confirm.

config = {
    # Results directories (platform key -> directory scanned for *.json result files)
    "results_dirs": {
        "databricks": "./benchmark_results/databricks",
        "bigquery": "./benchmark_results/bigquery",
        "snowflake": "./benchmark_results/snowflake",
        "redshift": "./benchmark_results/redshift",
    },
    # Databricks pricing
    "databricks": {
        "dbu_rate": 8.0,  # DBUs consumed per hour by cluster
        "dbu_cost": 0.40,  # Cost per DBU (Standard tier)
        "compute_cost": 0.15,  # EC2/compute cost per hour
        # Not referenced elsewhere in the visible notebook.
        "storage_per_tb_month": 0.20,  # Delta Lake storage
    },
    # BigQuery pricing
    "bigquery": {
        "on_demand_per_tb": 5.00,  # On-demand per TB processed
        # The three entries below are not referenced elsewhere in the visible notebook.
        "slots_per_hour": 0.04,  # Flat-rate slots (per 100 slots)
        "active_storage_per_tb": 0.02,  # Active storage per TB/month
        "long_term_storage_per_tb": 0.01,  # >90 days inactive
    },
    # Snowflake pricing
    "snowflake": {
        "credit_cost": 2.00,  # Enterprise edition
        # NOTE(review): the key says per TB, but 0.023 looks like a per-GB monthly
        # rate — confirm the unit. Not referenced elsewhere in the visible notebook.
        "storage_per_tb_month": 0.023,  # Storage cost
        "warehouse_credits": {  # Credits per hour by size
            "X-Small": 1,
            "Small": 2,
            "Medium": 4,
            "Large": 8,
            "X-Large": 16,
            "2X-Large": 32,
            "3X-Large": 64,
            "4X-Large": 128,
        },
    },
    # Redshift pricing
    "redshift": {
        "node_types": {
            "ra3.xlplus": 1.086,  # On-demand per node per hour
            "ra3.4xlarge": 3.26,
            "ra3.16xlarge": 13.04,
            "dc2.large": 0.25,
            "dc2.8xlarge": 4.80,
        },
        # NOTE(review): same unit question as the Snowflake storage rate (per TB vs
        # per GB) — confirm. The two entries below are not referenced elsewhere in
        # the visible notebook.
        "storage_per_tb_month": 0.024,  # RA3 managed storage
        "concurrency_scaling_per_second": 0.000003,
    },
    # Budget settings
    "budget": {
        "monthly_limit": 1000.00,  # Total monthly budget (USD)
        "daily_limit": 50.00,  # Daily spending limit (not referenced elsewhere in the visible notebook)
        "alert_threshold": 0.80,  # Alert at 80% of budget
    },
    # Output directory (charts, CSV exports and the JSON report are written here)
    "output_dir": "./cost_analysis",
}

# Create output directory
os.makedirs(config["output_dir"], exist_ok=True)

print("‚úÖ Configuration loaded")
print(f"üí∞ Monthly budget: ${config['budget']['monthly_limit']:.2f}")
print(f"‚ö†Ô∏è  Alert threshold: {config['budget']['alert_threshold'] * 100:.0f}%")

## 2. Load Benchmark Results with Cost Data

In [None]:
def load_results_with_metadata(platform: str, results_dir: str) -> List[Dict]:
    """Load every benchmark result JSON file for one platform.

    Files matching ``*.json`` directly inside ``results_dir`` are read in sorted
    path order. Each parsed result is tagged with three extra keys:

    * ``_file``: the file name
    * ``_file_time``: the file's modification time as a naive local ``datetime``
    * ``_platform``: the ``platform`` argument

    Loading is best-effort: a file that cannot be read, is not valid JSON, or
    whose top-level value is not a JSON object is reported on stdout and skipped.

    Args:
        platform: Platform label stored on every loaded result.
        results_dir: Directory containing the result files.

    Returns:
        List of result dictionaries with metadata; empty when the directory
        does not exist or holds no loadable results.
    """
    results_path = Path(results_dir)

    if not results_path.exists():
        print(f"‚ö†Ô∏è  No results found for {platform}")
        return []

    all_results = []

    for json_file in sorted(results_path.glob("*.json")):
        try:
            # JSON text is UTF-8; do not depend on the platform's default encoding.
            with open(json_file, encoding="utf-8") as f:
                result = json.load(f)
            file_time = datetime.fromtimestamp(json_file.stat().st_mtime)
        except (OSError, ValueError, OverflowError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"‚ö†Ô∏è  Error loading {json_file.name}: {e}")
            continue

        if not isinstance(result, dict):
            # Metadata keys can only be attached to a JSON object.
            print(
                f"‚ö†Ô∏è  Error loading {json_file.name}: "
                f"expected a JSON object, got {type(result).__name__}"
            )
            continue

        # Add file metadata
        result["_file"] = json_file.name
        result["_file_time"] = file_time
        result["_platform"] = platform

        all_results.append(result)

    return all_results


# Collect results per platform; platforms without any loadable run are left out
all_platform_results = {}

for platform, results_dir in config["results_dirs"].items():
    results = load_results_with_metadata(platform, results_dir)
    if not results:
        continue
    all_platform_results[platform] = results
    print(f"‚úÖ {platform.capitalize()}: {len(results)} benchmark runs")

total_runs = sum(map(len, all_platform_results.values()))
print(f"\nüìä Total benchmark runs: {total_runs}")

## 3. Calculate Costs per Benchmark Run

In [None]:
def calculate_databricks_cost(result: Dict, pricing: Dict) -> Dict:
    """Estimate the cost of one Databricks benchmark run.

    Runtime is the summed ``execution_time_ms`` of the queries flagged
    ``success``. DBU cost is hours x ``dbu_rate`` x ``dbu_cost``; compute cost is
    hours x ``compute_cost``.

    Args:
        result: Benchmark result; ``query_results`` is read if present.
        pricing: Needs ``dbu_rate``, ``dbu_cost`` and ``compute_cost``.

    Returns:
        Dict with ``total_cost``, ``dbu_cost``, ``compute_cost``,
        ``dbu_consumption`` and ``total_hours``.
    """
    succeeded = [q for q in result.get("query_results", []) if q.get("success", False)]
    elapsed_ms = sum(q.get("execution_time_ms", 0) for q in succeeded)
    hours = elapsed_ms / (1000 * 60 * 60)

    dbus_used = hours * pricing["dbu_rate"]
    dbu_charge = dbus_used * pricing["dbu_cost"]
    infra_charge = hours * pricing["compute_cost"]

    return {
        "total_cost": dbu_charge + infra_charge,
        "dbu_cost": dbu_charge,
        "compute_cost": infra_charge,
        "dbu_consumption": dbus_used,
        "total_hours": hours,
    }


def calculate_bigquery_cost(result: Dict, pricing: Dict) -> Dict:
    """Estimate the on-demand BigQuery cost of one benchmark run.

    Uses ``total_bytes_processed`` from the result when it is non-zero.
    Otherwise the byte count is approximated as
    ``scale_factor * 1e9 * 0.2 * number_of_queries`` (scale factor defaults to
    0.1), i.e. 1 GB per unit of scale factor with each query assumed to scan
    about 20% of the data.

    Args:
        result: Benchmark result dictionary.
        pricing: Needs ``on_demand_per_tb``.

    Returns:
        Dict with ``total_cost``, ``query_cost``, ``tb_processed`` and
        ``bytes_processed``.
    """
    bytes_scanned = result.get("total_bytes_processed", 0)

    if bytes_scanned == 0:
        # No measured byte count available: fall back to a rough estimate
        # (TPC-H SF 1 ≈ 1 GB, typical query scans ~20% of the data).
        scale = result.get("scale_factor", 0.1)
        query_count = len(result.get("query_results", []))
        bytes_scanned = scale * 1e9 * 0.2 * query_count

    terabytes = bytes_scanned / 1e12
    on_demand_charge = terabytes * pricing["on_demand_per_tb"]

    return {
        "total_cost": on_demand_charge,
        "query_cost": on_demand_charge,
        "tb_processed": terabytes,
        "bytes_processed": bytes_scanned,
    }


def calculate_snowflake_cost(result: Dict, pricing: Dict) -> Dict:
    """Estimate the Snowflake compute cost of one benchmark run.

    Runtime is the summed ``execution_time_ms`` of the queries flagged
    ``success``. Credits are hours x the hourly credit rate of the warehouse
    size; the size defaults to ``"X-Small"`` and a size missing from
    ``warehouse_credits`` is billed at 1 credit per hour.

    Args:
        result: Benchmark result; ``query_results`` and ``warehouse_size`` are read.
        pricing: Needs ``warehouse_credits`` and ``credit_cost``.

    Returns:
        Dict with ``total_cost``, ``compute_cost``, ``credits_consumed``,
        ``total_hours`` and ``warehouse_size``.
    """
    succeeded = [q for q in result.get("query_results", []) if q.get("success", False)]
    elapsed_ms = sum(q.get("execution_time_ms", 0) for q in succeeded)
    hours = elapsed_ms / (1000 * 60 * 60)

    # Warehouse size comes from the result; X-Small is assumed when absent
    size = result.get("warehouse_size", "X-Small")
    hourly_credits = pricing["warehouse_credits"].get(size, 1)

    credits_used = hours * hourly_credits
    charge = credits_used * pricing["credit_cost"]

    return {
        "total_cost": charge,
        "compute_cost": charge,
        "credits_consumed": credits_used,
        "total_hours": hours,
        "warehouse_size": size,
    }


def calculate_redshift_cost(result: Dict, pricing: Dict) -> Dict:
    """Estimate the Redshift compute cost of one benchmark run.

    Runtime is the summed ``execution_time_ms`` of the queries flagged
    ``success``. Cost is hours x node count x hourly node rate. The node type
    defaults to ``"ra3.xlplus"``, the node count to 2, and a node type missing
    from ``node_types`` is billed at 1.0 per node-hour.

    Args:
        result: Benchmark result; ``query_results``, ``node_type`` and
            ``node_count`` are read.
        pricing: Needs ``node_types``.

    Returns:
        Dict with ``total_cost``, ``compute_cost``, ``total_hours``,
        ``node_type`` and ``node_count``.
    """
    succeeded = [q for q in result.get("query_results", []) if q.get("success", False)]
    elapsed_ms = sum(q.get("execution_time_ms", 0) for q in succeeded)
    hours = elapsed_ms / (1000 * 60 * 60)

    # Cluster shape comes from the result, with defaults when absent
    kind = result.get("node_type", "ra3.xlplus")
    nodes = result.get("node_count", 2)
    hourly_rate = pricing["node_types"].get(kind, 1.0)

    charge = hours * nodes * hourly_rate

    return {
        "total_cost": charge,
        "compute_cost": charge,
        "total_hours": hours,
        "node_type": kind,
        "node_count": nodes,
    }


# Dispatch table: platform key -> function that prices one benchmark result
cost_calculators = dict(
    databricks=calculate_databricks_cost,
    bigquery=calculate_bigquery_cost,
    snowflake=calculate_snowflake_cost,
    redshift=calculate_redshift_cost,
)

cost_records = []

for platform, results in all_platform_results.items():
    # Platforms without a calculator are skipped entirely
    calculator = cost_calculators.get(platform)
    if calculator is None:
        continue

    platform_config = config.get(platform, {})

    for result in results:
        # Best-effort: a run that cannot be priced is reported and left out
        try:
            cost_data = calculator(result, platform_config)
            query_results = result.get("query_results", [])

            record = {
                "platform": platform,
                "timestamp": result["_file_time"],
                "benchmark": result.get("benchmark_name", "unknown"),
                "scale_factor": result.get("scale_factor", 0),
                "num_queries": len(query_results),
                "successful_queries": len([qr for qr in query_results if qr.get("success", False)]),
            }
            # Platform-specific cost fields go last, as extra columns
            record.update(cost_data)
            cost_records.append(record)

        except Exception as e:
            print(f"‚ö†Ô∏è  Error calculating cost for {platform}: {e}")

df_costs = pd.DataFrame(cost_records)

if df_costs.empty:
    print("‚ö†Ô∏è  No cost data available")
else:
    print(f"üìä Calculated costs for {len(df_costs)} benchmark runs")
    print("\nTotal spending by platform:")
    print(df_costs.groupby("platform")["total_cost"].sum().map("${:.2f}".format))

## 4. Cost Breakdown Analysis

In [None]:
if len(df_costs) == 0:
    print("‚ö†Ô∏è  Insufficient data for cost analysis")
else:
    # Per-platform summary statistics; named aggregation gives flat
    # "<column>_<statistic>" column names directly
    cost_summary = (
        df_costs.groupby("platform")
        .agg(
            total_cost_sum=("total_cost", "sum"),
            total_cost_mean=("total_cost", "mean"),
            total_cost_median=("total_cost", "median"),
            total_cost_min=("total_cost", "min"),
            total_cost_max=("total_cost", "max"),
            num_queries_sum=("num_queries", "sum"),
            successful_queries_sum=("successful_queries", "sum"),
        )
        .round(4)
    )

    # Cost per successful query
    cost_summary["cost_per_query"] = (
        cost_summary["total_cost_sum"].div(cost_summary["successful_queries_sum"]).round(6)
    )

    print("üí∞ Cost Summary by Platform\n")
    print(cost_summary)

    print("\nüìä Cost Metrics:")
    for platform, row in cost_summary.iterrows():
        total, per_query = row["total_cost_sum"], row["cost_per_query"]
        print(f"  {platform.capitalize()}: ${total:.2f} total, ${per_query:.6f} per query")

In [None]:
# Visualization: Cost breakdown
# Draws two panels from df_costs (total cost per platform, cost per successful
# query per platform) and saves the figure to <output_dir>/cost_breakdown.png.
if len(df_costs) > 0:
    # Colors are looked up by platform name. The two panels are sorted by
    # different metrics, so assigning colors by bar position would give the
    # same platform a different color in each panel.
    platform_colors = {
        "databricks": "#FF3621",
        "bigquery": "#4285F4",
        "snowflake": "#29B5E8",
        "redshift": "#CC0000",
    }
    default_color = "#888888"  # any platform without a dedicated color

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Plot 1: Total cost by platform (most expensive first)
    ax1 = axes[0]
    platform_totals = df_costs.groupby("platform")["total_cost"].sum().sort_values(ascending=False)

    bars = ax1.bar(
        range(len(platform_totals)),
        platform_totals.values,
        color=[platform_colors.get(p, default_color) for p in platform_totals.index],
        alpha=0.8,
    )
    ax1.set_xticks(range(len(platform_totals)))
    ax1.set_xticklabels([p.capitalize() for p in platform_totals.index], rotation=45, ha="right")
    ax1.set_ylabel("Total Cost (USD)", fontsize=11)
    ax1.set_title("Total Benchmark Costs by Platform", fontsize=12, fontweight="bold")
    ax1.grid(axis="y", alpha=0.3)

    # Add value labels slightly above each bar (offset = 2% of the tallest bar)
    for bar, value in zip(bars, platform_totals.values):
        ax1.text(
            bar.get_x() + bar.get_width() / 2,
            value + max(platform_totals.values) * 0.02,
            f"${value:.2f}",
            ha="center",
            va="bottom",
            fontsize=10,
            fontweight="bold",
        )

    # Plot 2: Cost per successful query (cheapest first)
    ax2 = axes[1]
    cost_per_query = (
        df_costs.groupby("platform")["total_cost"].sum() / df_costs.groupby("platform")["successful_queries"].sum()
    ).sort_values()

    bars = ax2.barh(
        range(len(cost_per_query)),
        cost_per_query.values,
        color=[platform_colors.get(p, default_color) for p in cost_per_query.index],
        alpha=0.8,
    )
    ax2.set_yticks(range(len(cost_per_query)))
    ax2.set_yticklabels([p.capitalize() for p in cost_per_query.index])
    ax2.set_xlabel("Cost per Query (USD)", fontsize=11)
    ax2.set_title("Cost Efficiency by Platform\n(Lower is Better)", fontsize=12, fontweight="bold")
    ax2.grid(axis="x", alpha=0.3)

    # Add value labels just right of each bar (offset = 2% of the longest bar)
    for bar, value in zip(bars, cost_per_query.values):
        ax2.text(
            value + max(cost_per_query.values) * 0.02,
            bar.get_y() + bar.get_height() / 2,
            f"${value:.6f}",
            va="center",
            fontsize=10,
            fontweight="bold",
        )

    plt.tight_layout()
    plt.savefig(f"{config['output_dir']}/cost_breakdown.png", dpi=150, bbox_inches="tight")
    plt.show()

    print(f"üíæ Saved: {config['output_dir']}/cost_breakdown.png")

## 5. Cost Projections and Extrapolation

Project costs for different usage scenarios based on benchmark data.

In [None]:
def project_monthly_cost(platform: str, cost_per_query: float, queries_per_day: int) -> Dict:
    """Extrapolate a per-query cost to daily, monthly and yearly spend.

    A month is taken as 30 days and a year as 12 such months.

    Args:
        platform: Platform name, copied into the result.
        cost_per_query: Average cost of a single query.
        queries_per_day: Expected number of queries per day.

    Returns:
        Dict with ``platform``, ``queries_per_day``, ``cost_per_query``,
        ``daily_cost``, ``monthly_cost`` and ``yearly_cost``.
    """
    projection = {
        "platform": platform,
        "queries_per_day": queries_per_day,
        "cost_per_query": cost_per_query,
    }
    # Each horizon is derived from the previous one
    projection["daily_cost"] = cost_per_query * queries_per_day
    projection["monthly_cost"] = projection["daily_cost"] * 30
    projection["yearly_cost"] = projection["monthly_cost"] * 12
    return projection


# Usage scenarios: display name plus assumed daily query volume
scenarios = [
    {"name": name, "queries_per_day": volume}
    for name, volume in (
        ("Light (100 queries/day)", 100),
        ("Moderate (500 queries/day)", 500),
        ("Heavy (1,000 queries/day)", 1000),
        ("Enterprise (5,000 queries/day)", 5000),
    )
]

if len(df_costs) == 0:
    print("‚ö†Ô∏è  Insufficient data for projections")
else:
    # Cost per successful query does not depend on the scenario, so it is
    # worked out once per platform (in first-appearance order)
    avg_cost_by_platform = {}
    for platform in df_costs["platform"].unique():
        platform_data = df_costs[df_costs["platform"] == platform]
        avg_cost_by_platform[platform] = (
            platform_data["total_cost"].sum() / platform_data["successful_queries"].sum()
        )

    projections = []

    for scenario in scenarios:
        print(f"\nüìä {scenario['name']} Usage Scenario")
        print("=" * 70)

        for platform, avg_cost_per_query in avg_cost_by_platform.items():
            projection = project_monthly_cost(platform, avg_cost_per_query, scenario["queries_per_day"])
            projection["scenario"] = scenario["name"]
            projections.append(projection)

            print(f"  {platform.capitalize()}:")
            for label, key in (("Daily", "daily_cost"), ("Monthly", "monthly_cost"), ("Yearly", "yearly_cost")):
                print(f"    {label}: ${projection[key]:.2f}")

    # One row per (scenario, platform) pair, scenario-major
    df_projections = pd.DataFrame(projections)

In [None]:
# Visualization: Cost projections across scenarios
# Grouped bar chart: one cluster per usage scenario, one bar per platform,
# saved to <output_dir>/cost_projections.png.
# df_projections only exists when the projection cell had data, hence the
# locals() check.
if len(df_costs) > 0 and "df_projections" in locals():
    fig, ax = plt.subplots(figsize=(12, 6))

    platforms = df_projections["platform"].unique()
    scenario_names = [s["name"] for s in scenarios]
    x = np.arange(len(scenario_names))
    # Each cluster occupies 0.8 of its slot, split evenly between platforms
    width = 0.8 / len(platforms)

    colors = {"databricks": "#FF3621", "bigquery": "#4285F4", "snowflake": "#29B5E8", "redshift": "#CC0000"}

    for i, platform in enumerate(platforms):
        platform_data = df_projections[df_projections["platform"] == platform]
        # Relies on this platform's rows being in the same order as `scenarios`
        # (the order in which the projections were appended)
        monthly_costs = platform_data["monthly_cost"].values

        # Shift each platform's bars by i * width inside the cluster
        bars = ax.bar(
            x + i * width,
            monthly_costs,
            width,
            label=platform.capitalize(),
            color=colors.get(platform, "#888888"),
            alpha=0.8,
        )

    ax.set_xlabel("Usage Scenario", fontsize=12)
    ax.set_ylabel("Projected Monthly Cost (USD)", fontsize=12)
    ax.set_title("Monthly Cost Projections by Platform and Usage", fontsize=14, fontweight="bold", pad=15)
    # Center each tick under its cluster of bars
    ax.set_xticks(x + width * (len(platforms) - 1) / 2)
    ax.set_xticklabels(scenario_names, rotation=15, ha="right")
    ax.legend(loc="upper left")
    ax.grid(axis="y", alpha=0.3)

    # Add budget line if configured
    if config["budget"]["monthly_limit"] > 0:
        ax.axhline(
            config["budget"]["monthly_limit"],
            color="red",
            linestyle="--",
            linewidth=2,
            alpha=0.7,
            label=f"Budget Limit (${config['budget']['monthly_limit']:.0f})",
        )
        # Rebuild the legend so it also lists the budget line
        ax.legend(loc="upper left")

    plt.tight_layout()
    plt.savefig(f"{config['output_dir']}/cost_projections.png", dpi=150, bbox_inches="tight")
    plt.show()

    print(f"üíæ Saved: {config['output_dir']}/cost_projections.png")

## 6. Budget Tracking and Alerts

In [None]:
def check_budget_status(df_costs: pd.DataFrame, budget_config: Dict) -> Dict:
    """Compare this calendar month's spend with the configured monthly budget.

    Rows whose ``timestamp`` is on or after the first day of the current month
    (local time) are counted; if the frame has no ``timestamp`` column, every
    row is counted.

    Args:
        df_costs: Cost records with a ``total_cost`` column.
        budget_config: Needs ``monthly_limit`` and ``alert_threshold`` (a
            fraction of the limit).

    Returns:
        Dict with ``total_spend``, ``budget_limit``, ``remaining``,
        ``spend_pct``, ``alert_level`` (``"OK"``, ``"WARNING"`` or
        ``"CRITICAL"``) and ``days_in_month`` (days elapsed so far this month,
        counting today).
    """
    now = datetime.now()
    month_start = now.replace(day=1, hour=0, minute=0, second=0, microsecond=0)

    # Restrict to the current month when timestamps are available
    in_scope = df_costs
    if "timestamp" in df_costs.columns:
        in_scope = df_costs[df_costs["timestamp"] >= month_start]

    spent = in_scope["total_cost"].sum()
    limit = budget_config["monthly_limit"]
    threshold = budget_config["alert_threshold"]

    # A non-positive limit reports 0% used
    if limit > 0:
        used_pct = (spent / limit) * 100
    else:
        used_pct = 0

    if used_pct >= 100:
        level = "CRITICAL"
    elif used_pct >= threshold * 100:
        level = "WARNING"
    else:
        level = "OK"

    return {
        "total_spend": spent,
        "budget_limit": limit,
        "remaining": limit - spent,
        "spend_pct": used_pct,
        "alert_level": level,
        "days_in_month": now.day,
    }


if len(df_costs) == 0:
    print("‚ö†Ô∏è  No cost data available for budget tracking")
else:
    budget_status = check_budget_status(df_costs, config["budget"])
    alert_level = budget_status["alert_level"]
    spend_pct = budget_status["spend_pct"]

    print("üí∞ Budget Status\n")
    print(f"Monthly Limit: ${budget_status['budget_limit']:.2f}")
    print(f"Current Spend: ${budget_status['total_spend']:.2f} ({spend_pct:.1f}%)")
    print(f"Remaining: ${budget_status['remaining']:.2f}")
    print(f"Days Elapsed: {budget_status['days_in_month']}")

    # Marker shown next to each alert level
    alert_emoji = {"OK": "‚úÖ", "WARNING": "‚ö†Ô∏è", "CRITICAL": "üö®"}
    print(f"\nAlert Level: {alert_emoji[alert_level]} {alert_level}")

    # Extra detail for the two non-OK levels
    if alert_level == "CRITICAL":
        print("\nüö® CRITICAL: Budget exceeded!")
        print(f"   You've exceeded your budget by ${-budget_status['remaining']:.2f}.")
    elif alert_level == "WARNING":
        print("\n‚ö†Ô∏è  Warning: Approaching budget limit!")
        print(f"   You've used {spend_pct:.1f}% of your monthly budget.")

## 7. Cost Optimization Recommendations

In [None]:
def generate_cost_optimization_recommendations(df_costs: pd.DataFrame, config: Dict) -> List[str]:
    """Build printable cost-saving tips, grouped by platform.

    Platforms are handled in order of first appearance in ``df_costs``. Each
    gets a header line with its total spend; the four known platforms
    (databricks, bigquery, snowflake, redshift) also get a few statistics and
    fixed suggestions. Any other platform gets the header only.

    Args:
        df_costs: Cost records with ``platform`` and ``total_cost`` columns plus
            the platform-specific cost columns.
        config: Not read by this function.

    Returns:
        List of lines ready to print.
    """
    bullet = "  ‚Ä¢ "
    lines = []

    for platform in df_costs["platform"].unique():
        rows = df_costs[df_costs["platform"] == platform]
        available = rows.columns
        spent = rows["total_cost"].sum()

        lines.append(f"\n**{platform.capitalize()}** (${spent:.2f} spent)")

        if platform == "databricks":
            mean_dbus = rows["dbu_consumption"].mean()
            lines.extend(
                [
                    f"{bullet}Average DBU consumption: {mean_dbus:.2f} per run",
                    f"{bullet}Consider: Use Photon for up to 3x faster queries (fewer DBUs)",
                    f"{bullet}Consider: Enable auto-termination to avoid idle cluster costs",
                    f"{bullet}Consider: Use Spot instances for non-critical workloads (60-70% savings)",
                ]
            )

        elif platform == "bigquery":
            # Statistic falls back to 0 when the column is absent
            mean_tb = rows["tb_processed"].mean() if "tb_processed" in available else 0
            lines.extend(
                [
                    f"{bullet}Average data processed: {mean_tb:.4f} TB per run",
                    f"{bullet}Consider: Partition tables by date to reduce data scanned",
                    f"{bullet}Consider: Use clustering for frequently filtered columns",
                    f"{bullet}Consider: Switch to flat-rate pricing if processing >400TB/month",
                ]
            )

        elif platform == "snowflake":
            # Most frequent warehouse size across this platform's runs
            if "warehouse_size" in available:
                typical_size = rows["warehouse_size"].mode()[0]
            else:
                typical_size = "X-Small"
            mean_credits = rows["credits_consumed"].mean() if "credits_consumed" in available else 0
            lines.extend(
                [
                    f"{bullet}Warehouse size: {typical_size}",
                    f"{bullet}Average credits: {mean_credits:.4f} per run",
                    f"{bullet}Consider: Enable auto-suspend (1 minute idle recommended)",
                    f"{bullet}Consider: Use result caching for repeated queries (free)",
                    f"{bullet}Consider: Right-size warehouse (try smaller size for light workloads)",
                ]
            )

        elif platform == "redshift":
            # Most frequent node type across this platform's runs
            if "node_type" in available:
                typical_node = rows["node_type"].mode()[0]
            else:
                typical_node = "unknown"
            lines.extend(
                [
                    f"{bullet}Node type: {typical_node}",
                    f"{bullet}Consider: Use reserved instances for predictable workloads (40-75% savings)",
                    f"{bullet}Consider: Pause clusters when not in use",
                    f"{bullet}Consider: RA3 nodes with managed storage for cost flexibility",
                ]
            )

    return lines


if len(df_costs) > 0:
    print("üí° Cost Optimization Recommendations\n")
    recommendations = generate_cost_optimization_recommendations(df_costs, config)

    # Platform-specific lines first
    for rec in recommendations:
        print(rec)

    # Then the fixed, platform-independent tips
    print("\nüéØ General Recommendations:")
    for tip in (
        "Use local platforms (DuckDB) for development and testing",
        "Monitor query performance to identify optimization opportunities",
        "Set up automated budget alerts in your cloud provider",
        "Review and optimize expensive queries regularly",
        "Consider multi-cloud strategy for cost arbitrage",
    ):
        print(f"  ‚Ä¢ {tip}")

## 8. Export Cost Reports

In [None]:
# Export comprehensive cost report: detailed CSV, projections CSV (when
# available) and a JSON summary, all stamped with the same timestamp
if len(df_costs) == 0:
    print("‚ö†Ô∏è  No data to export")
else:
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # 1. Detailed per-run costs
    costs_file = f"{config['output_dir']}/cost_details_{timestamp}.csv"
    df_costs.to_csv(costs_file, index=False)
    print(f"‚úÖ Exported cost details: {costs_file}")

    # 2. Projections, only if the projection cell produced them
    if "df_projections" in locals():
        projections_file = f"{config['output_dir']}/cost_projections_{timestamp}.csv"
        df_projections.to_csv(projections_file, index=False)
        print(f"‚úÖ Exported projections: {projections_file}")

    # 3. Summary report; budget status and recommendations are optional
    report = {
        "timestamp": timestamp,
        "platforms": list(df_costs["platform"].unique()),
        "total_runs": len(df_costs),
        "total_cost": float(df_costs["total_cost"].sum()),
        "platform_costs": df_costs.groupby("platform")["total_cost"].sum().to_dict(),
    }
    report["budget_status"] = budget_status if "budget_status" in locals() else None
    report["recommendations"] = recommendations if "recommendations" in locals() else []

    report_file = f"{config['output_dir']}/cost_report_{timestamp}.json"
    with open(report_file, "w") as f:
        # default=str stringifies anything json cannot encode natively
        f.write(json.dumps(report, indent=2, default=str))

    print(f"‚úÖ Exported cost report: {report_file}")
    print(f"\nüìÅ All reports saved to: {config['output_dir']}")

## 9. Summary

In [None]:
# Final summary: overall totals, each platform's share of spend, the cheapest
# platform per successful query, and the budget status when it was computed.
if len(df_costs) > 0:
    print("=" * 70)
    print("üí∞ COST ANALYSIS SUMMARY")
    print("=" * 70)

    # Computed once; reused for every platform's percentage share below
    grand_total = df_costs["total_cost"].sum()

    print(f"\nTotal Benchmark Runs: {len(df_costs)}")
    print(f"Total Cost: ${grand_total:.2f}")
    print(f"Average Cost per Run: ${df_costs['total_cost'].mean():.4f}")

    # Per-platform sums of the two columns needed below
    platform_sums = df_costs.groupby("platform")[["total_cost", "successful_queries"]].sum()

    print("\nüí∞ Platform Costs:")
    for platform, cost in platform_sums["total_cost"].sort_values(ascending=False).items():
        # Guard against a zero grand total (would otherwise print nan%)
        pct = (cost / grand_total) * 100 if grand_total > 0 else 0.0
        print(f"  {platform.capitalize()}: ${cost:.2f} ({pct:.1f}%)")

    # Most cost-effective platform: total cost / successful queries, computed
    # from grouped sums rather than groupby().apply(), which is deprecated for
    # functions that receive the grouping column.
    cost_per_query = (platform_sums["total_cost"] / platform_sums["successful_queries"]).sort_values()
    # A platform with zero successful queries yields inf/NaN; it cannot be
    # ranked, so it is excluded from the comparison.
    cost_per_query = cost_per_query[np.isfinite(cost_per_query)]

    if len(cost_per_query) > 0:
        print(f"\nüèÜ Most Cost-Effective: {cost_per_query.index[0].capitalize()}")
        print(f"   ${cost_per_query.iloc[0]:.6f} per query")

    # budget_status only exists if the budget cell ran with data
    if "budget_status" in locals():
        print(f"\nüìä Budget Status: {budget_status['alert_level']}")
        print(f"   {budget_status['spend_pct']:.1f}% of monthly budget used")

    print(f"\nüìÅ Reports exported to: {config['output_dir']}")
    print("\n" + "=" * 70)
else:
    print("‚ö†Ô∏è  No cost data available for summary")