In [None]:
import os
import duckdb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyarrow.parquet as pq

from setup_plot import setup_local, get_colors, get_markers


# Input Parquet files scanned by the DuckDB queries in this file.
SET1 = "ldms_set1.parquet"
SET2 = "ldms_set2.parquet"

# Column that is averaged (mean utilization) and fed into the temporal-imbalance
# expression by the build_* functions.
METRIC_COL = "nersc_ldms_dcgm_gpu_utilization"

# Written by build_job_metrics_mean_utilization(); read by
# build_job_metrics_ti_all_gpuutil().
JOB_METRICS_OUT = "job_metrics.parquet"
# Written by build_job_metrics_ti_all_gpuutil(); read by plot_three_bands().
OUT_JOB_ALL     = "job_metrics_ti_all.parquet" 

def ti_expr_count_star(col: str, alias: str) -> str:
    """Build the SQL select-list item that computes temporal imbalance.

    The generated aggregate expression evaluates
    ``1.0 - SUM(col) / (COUNT(*) * MAX(col))`` over the enclosing GROUP BY
    group and clamps the result to ``[0, 1]``. It yields ``0.0`` when
    ``MAX(col)`` is NULL or non-positive, or when ``COUNT(*)`` is
    non-positive, which also keeps the division from hitting a zero
    denominator.

    Args:
        col: Column name (or SQL expression) interpolated verbatim into the
            aggregates; it is not quoted or escaped here.
        alias: Name given to the resulting column via ``AS``.

    Returns:
        A SQL fragment of the form ``CASE ... END AS <alias>``.
    """
    peak = f"MAX({col})"
    imbalance = f"1.0 - SUM({col}) / (COUNT(*) * {peak})"

    fragments = [
        "CASE ",
        # Guard: no usable peak value (or an empty group) means no imbalance.
        f" WHEN {peak} IS NULL OR {peak} <= 0 OR COUNT(*) <= 0 THEN 0.0 ",
        " ELSE ",
        "   CASE ",
        # Clamp the raw value into the closed interval [0, 1].
        f"     WHEN {imbalance} < 0 THEN 0.0 ",
        f"     WHEN {imbalance} > 1 THEN 1.0 ",
        f"     ELSE {imbalance} ",
        "   END ",
        f"END AS {alias}",
    ]
    return "".join(fragments)

def build_job_metrics_mean_utilization():
    """Write per-job mean GPU utilization to ``JOB_METRICS_OUT``.

    Scans ``SET1`` with DuckDB, averages ``METRIC_COL`` per
    ``(JobID, hostname, gpu_id)`` group, then averages those per-GPU means
    per ``JobID``. The result (columns ``JobID`` cast to VARCHAR and
    ``mean_utilization``) is written as a Snappy-compressed Parquet file.

    Returns:
        None. The only effect is the Parquet file written to
        ``JOB_METRICS_OUT``.
    """
    query = f"""
    COPY (
      SELECT
        JobID,
        AVG(gpu_mean) AS mean_utilization
      FROM (
        SELECT
          JobID::VARCHAR AS JobID,
          hostname,
          gpu_id,
          AVG({METRIC_COL}) AS gpu_mean
        FROM parquet_scan('{SET1}')
        GROUP BY JobID, hostname, gpu_id
      )
      GROUP BY JobID
    )
    TO '{JOB_METRICS_OUT}' (FORMAT PARQUET, COMPRESSION 'SNAPPY');
    """

    con = duckdb.connect()
    try:
        con.execute("PRAGMA memory_limit='15GB';")
        con.execute(f"PRAGMA threads={os.cpu_count() or 1};")
        con.execute(query)
    finally:
        # Release the connection even when a statement fails (e.g. a missing
        # input file); previously an exception skipped close().
        con.close()

def build_job_metrics_ti_all_gpuutil():
    """Join per-job temporal-imbalance columns onto the job metrics table.

    For each of ``SET1`` and ``SET2`` the query computes the temporal
    imbalance of ``METRIC_COL`` per ``(JobID, hostname, gpu_id)`` group (via
    ``ti_expr_count_star``) and keeps the maximum per ``JobID``. Both per-job
    results are LEFT JOINed onto the rows read from ``JOB_METRICS_OUT``, so
    every job in that file is kept and jobs absent from a set get NULL in
    that set's column. The result is written to ``OUT_JOB_ALL`` as
    Snappy-compressed Parquet.

    ``JOB_METRICS_OUT`` must already exist, since it is read here; in this
    file it is produced by ``build_job_metrics_mean_utilization``.

    Returns:
        None. The only effect is the Parquet file written to ``OUT_JOB_ALL``.
    """
    # Output column names: the SET1 column has no suffix, the SET2 one "_s2".
    ti_s1 = f"ti__{METRIC_COL}"
    ti_s2 = f"ti__{METRIC_COL}_s2"

    query = f"""
    COPY (
      WITH
      s1_gpu AS (
        SELECT
          JobID::VARCHAR AS JobID,
          hostname,
          gpu_id,
          {ti_expr_count_star(METRIC_COL, ti_s1)}
        FROM parquet_scan('{SET1}')
        GROUP BY JobID, hostname, gpu_id
      ),
      s1_job AS (
        SELECT JobID, MAX({ti_s1}) AS {ti_s1}
        FROM s1_gpu
        GROUP BY JobID
      ),
      s2_gpu AS (
        SELECT
          JobID::VARCHAR AS JobID,
          hostname,
          gpu_id,
          {ti_expr_count_star(METRIC_COL, ti_s2)}
        FROM parquet_scan('{SET2}')
        GROUP BY JobID, hostname, gpu_id
      ),
      s2_job AS (
        SELECT JobID, MAX({ti_s2}) AS {ti_s2}
        FROM s2_gpu
        GROUP BY JobID
      )
      SELECT
        jm.*,
        s1_job.* EXCLUDE (JobID),
        s2_job.* EXCLUDE (JobID)
      FROM parquet_scan('{JOB_METRICS_OUT}') AS jm
      LEFT JOIN s1_job USING (JobID)
      LEFT JOIN s2_job USING (JobID)
    )
    TO '{OUT_JOB_ALL}' (FORMAT PARQUET, COMPRESSION 'SNAPPY');
    """

    con = duckdb.connect()
    try:
        con.execute("PRAGMA memory_limit='15GB';")
        con.execute(f"PRAGMA threads={os.cpu_count() or 1};")
        con.execute(query)
    finally:
        # Release the connection even when a statement fails; previously an
        # exception skipped close().
        con.close()

def pick_ti_col(ti_file: str) -> str:
    """Return the name of the temporal-imbalance column stored in *ti_file*.

    Only the Parquet schema is inspected. Candidates are tried in order:

    1. ``ti__<METRIC_COL>``
    2. ``ti__<METRIC_COL>_s2``
    3. the first column whose name starts with ``ti__`` and contains
       ``gpu_utilization``

    Args:
        ti_file: Path to the Parquet file to inspect.

    Returns:
        The first matching column name.

    Raises:
        ValueError: If no column matches any of the candidates. (The previous
            implementation leaked a bare ``StopIteration`` from ``next()``
            here, which carries no message.)
    """
    names = pq.ParquetFile(ti_file).schema_arrow.names

    base = f"ti__{METRIC_COL}"
    for candidate in (base, f"{base}_s2"):
        if candidate in names:
            return candidate

    fallback = next(
        (c for c in names if c.startswith("ti__") and "gpu_utilization" in c),
        None,
    )
    if fallback is None:
        raise ValueError(
            f"no temporal-imbalance column found in {ti_file!r}; "
            f"available columns: {list(names)}"
        )
    return fallback

def plot_one_band(df: pd.DataFrame, mask: pd.Series, xlabel: str, legend_label: str):
    """Plot a histogram and cumulative-percentage curve for one band of jobs.

    Rows of *df* selected by *mask* are binned on their
    ``temporal_imbalance`` value into 20 equal-width bins over ``[0, 1]``.
    The bars are drawn on the left axis and the cumulative percentage on a
    twin right axis, then the figure is shown.

    Args:
        df: Frame with a ``temporal_imbalance`` column.
        mask: Boolean selector applied with ``df.loc[mask]``.
        xlabel: Label for the x axis.
        legend_label: Legend label for the cumulative curve.

    Returns:
        Array of 20 cumulative percentages, one per bin. It is all zeros when
        no selected value falls inside ``[0, 1]``.
    """
    values = df.loc[mask, "temporal_imbalance"]

    # Bin over the fixed [0, 1] range rather than the data's own min..max, so
    # the CDF points (and the returned CDF) refer to the same bins as the bars
    # drawn below. Previously the two binnings could differ.
    hist_values, bin_edges = np.histogram(values, bins=20, range=(0, 1), density=False)
    cumulative_hist = np.cumsum(hist_values)
    total = cumulative_hist[-1]
    # Avoid dividing by zero when the band is empty.
    cdf = (cumulative_hist / total * 100.0) if total > 0 else cumulative_hist.astype(float)

    setup_local()
    colors, markers = get_colors(), get_markers()

    fig, ax1 = plt.subplots()
    # Reuse the edges computed above so bars and CDF share identical bins.
    ax1.hist(values, bins=bin_edges,
             color=colors[2], edgecolor="black", alpha=0.7, label="Number of Jobs")
    ax1.set_xlabel(xlabel, fontsize=21)
    ax1.set_ylabel("Number of jobs", fontsize=21)
    ax1.set_xlim(0, 1)
    ax1.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
    ax1.set_yticks([0, 2000, 4000, 6000, 8000, 10000])
    ax1.tick_params(axis="x", labelsize=20)
    ax1.tick_params(axis="y", labelsize=20)

    ax2 = ax1.twinx()
    # One CDF point per bin, placed at the bin center.
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    ax2.plot(bin_centers, cdf,
             color=colors[0], marker=markers[2],
             label=legend_label, linewidth=2, clip_on=False)
    ax2.set_ylabel("Cumulative percentage (%)", fontsize=21)
    ax2.set_ylim(0, 100)
    ax2.tick_params(axis="y", labelsize=20)
    ax2.grid(axis="y", linestyle="--", alpha=0.7)
    ax2.set_yticks([0, 20, 40, 60, 80, 100])

    plt.legend(loc="upper left", fontsize=18)
    plt.tight_layout()
    plt.show()
    return cdf

def plot_three_bands():
    """Plot temporal-imbalance distributions for three mean-utilization bands.

    Loads ``JobID``, ``mean_utilization`` and the temporal-imbalance column
    (chosen by ``pick_ti_col``) from ``OUT_JOB_ALL``, coerces both metrics to
    numeric, drops rows where either is missing, and calls ``plot_one_band``
    once per band of ``mean_utilization``:

    * ``0 <= mean <= 30``
    * ``30 < mean < 70``
    * ``70 <= mean <= 100``

    Returns:
        A 3-tuple with the value returned by ``plot_one_band`` for each band,
        in the order listed above.
    """
    ti_col = pick_ti_col(OUT_JOB_ALL)

    df = pd.read_parquet(
        OUT_JOB_ALL, columns=["JobID", "mean_utilization", ti_col]
    ).rename(columns={ti_col: "temporal_imbalance"})

    metric_cols = ["mean_utilization", "temporal_imbalance"]
    for col in metric_cols:
        # Non-numeric entries become NaN and are dropped just below.
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df = df.dropna(subset=metric_cols).copy()

    mean_util = df["mean_utilization"]
    # Same legend text for every band; the middle band previously used a
    # differently capitalized label ("Jobs").
    legend_label = "CDF (number of jobs)"

    # (row selector, range text shown in the x-axis label), in plotting order.
    bands = (
        ((mean_util >= 0) & (mean_util <= 30), "0%–30%"),
        ((mean_util > 30) & (mean_util < 70), "31%–69%"),
        ((mean_util >= 70) & (mean_util <= 100), "70%–100%"),
    )

    cdf_0_30, cdf_31_69, cdf_70_100 = [
        plot_one_band(
            df,
            band_mask,
            f"Temporal imb. (of GPU_UTIL)\n(mean of GPU_UTIL: {band_text})",
            legend_label,
        )
        for band_mask, band_text in bands
    ]
    return cdf_0_30, cdf_31_69, cdf_70_100

# Run the pipeline in dependency order: the mean-utilization file must exist
# before the temporal-imbalance step reads it, and plotting reads the joined
# output of that second step.
build_job_metrics_mean_utilization()
build_job_metrics_ti_all_gpuutil()
cdf_0_30, cdf_31_69, cdf_70_100 = plot_three_bands()
# Bare expression: has no effect in a plain script; presumably kept so the
# three CDF arrays are echoed as the output of the notebook cell.
cdf_0_30, cdf_31_69, cdf_70_100
