In [None]:
import os
import duckdb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow.parquet as pq
from collections import defaultdict
from pathlib import Path

from setup_plot import setup_local, get_colors

# Input Parquet sample tables. SET1 feeds every builder below; SET2 is only
# scanned for the "_s2" temporal-imbalance column in build_job_metrics_and_ti.
SET1 = "ldms_set1.parquet"
SET2 = "ldms_set2.parquet"
# Text file of hostnames (parsed by load_nodes_80gb); hosts listed there are
# assigned HBM_80 instead of HBM_40 as peak memory bandwidth.
NODES_80GB_FILE = "nodes_80gb.txt"

# Output Parquet files written by the builder functions and read back by
# plot_ti_violins.
JOB_MEANS_ALL_OUT = "job_means_all.parquet"
JOB_METRICS_OUT   = "job_metrics.parquet"
JOB_METRICS_TI    = "job_metrics_ti_all.parquet"
LABELS_FP64ONLY   = "job_label_fractions_fp64only.parquet"

# Column whose per-job mean and temporal imbalance are computed.
METRIC_COL = "nersc_ldms_dcgm_gpu_utilization"
# A pipe counts as "active" for a job in plot_ti_violins when its per-job mean
# activity is strictly greater than this threshold.
THR = 0.005

DCGM_PREFIX = "nersc_ldms_dcgm_"
# Activity counters required by build_fp64only_labels (four FP pipes + DRAM).
ACTIVE_COLS = [
    f"{DCGM_PREFIX}fp16_active",
    f"{DCGM_PREFIX}fp32_active",
    f"{DCGM_PREFIX}fp64_active",
    f"{DCGM_PREFIX}tensor_active",
    f"{DCGM_PREFIX}dram_active",
]
# Roofline peaks used by build_fp64only_labels: activity fractions are scaled
# by these to obtain achieved throughput / bandwidth.
# NOTE(review): units are presumably FLOP/s and bytes/s, and the values look
# like NVIDIA A100 40GB/80GB datasheet figures -- confirm.
PEAK_FLOPS_FP64_VECTOR = 9.7e12
HBM_40 = 1.555e12
HBM_80 = 2.039e12
# Number of rows per batch requested from ParquetFile.iter_batches.
LABEL_BATCH = 1_000_000

def build_job_means_all():
    """Write per-job mean FP-pipe activity to ``JOB_MEANS_ALL_OUT``.

    The aggregation is two-level so that every GPU of a job carries the same
    weight regardless of how many samples it contributed:

    1. average each counter per ``(JobID, hostname, gpu_id)`` over ``SET1``;
    2. average those per-GPU means per ``JobID``.

    ``JobID`` is cast to ``VARCHAR`` in the output. The DuckDB connection is
    always closed, including when one of the statements raises.
    """
    counters = [
        "nersc_ldms_dcgm_fp16_active",
        "nersc_ldms_dcgm_fp32_active",
        "nersc_ldms_dcgm_fp64_active",
        "nersc_ldms_dcgm_tensor_active",
    ]
    # The same "avg(col) AS col" list serves both levels: the inner query
    # aliases each per-GPU mean back to the counter name, so the outer query
    # can average a column of the same name.
    avg_cols = ",\n       ".join(f"avg({c}) AS {c}" for c in counters)

    con = duckdb.connect()
    try:
        con.execute("PRAGMA memory_limit='15GB';")
        con.execute(f"PRAGMA threads={os.cpu_count() or 1};")

        con.execute(f"""
        COPY (
            SELECT
                JobID,
                {avg_cols}
            FROM (
                SELECT
                    JobID::VARCHAR AS JobID,
                    hostname,
                    gpu_id,
                    {avg_cols}
                FROM parquet_scan('{SET1}')
                GROUP BY JobID, hostname, gpu_id
            )
            GROUP BY JobID
        )
        TO '{JOB_MEANS_ALL_OUT}' (FORMAT PARQUET, COMPRESSION 'SNAPPY');
        """)
    finally:
        # Release the connection even if a PRAGMA or the COPY fails.
        con.close()

def ti_expr_count_star(col: str, alias: str) -> str:
    """Return a SQL select-list item computing temporal imbalance of *col*.

    The generated aggregate expression evaluates, per group, to
    ``1 - SUM(col) / (COUNT(*) * MAX(col))`` clamped into ``[0, 1]``, and to
    ``0.0`` when ``MAX(col)`` is NULL or not positive (or the group is empty).
    The expression is aliased as *alias*. Both arguments are interpolated
    verbatim, so they must already be valid SQL identifiers.
    """
    peak = f"MAX({col})"
    imbalance = f"1.0 - SUM({col}) / (COUNT(*) * {peak})"
    # Fragment boundaries (and their padding) are kept exactly so the emitted
    # SQL text stays unchanged.
    fragments = (
        "CASE ",
        f" WHEN {peak} IS NULL OR {peak} <= 0 OR COUNT(*) <= 0 THEN 0.0 ",
        " ELSE ",
        "   CASE ",
        f"     WHEN {imbalance} < 0 THEN 0.0 ",
        f"     WHEN {imbalance} > 1 THEN 1.0 ",
        f"     ELSE {imbalance} ",
        "   END ",
        f"END AS {alias}",
    )
    return "".join(fragments)

def build_job_metrics_and_ti():
    """Write per-job mean utilization and temporal-imbalance tables.

    Two Parquet files are produced, in this order:

    * ``JOB_METRICS_OUT`` -- per ``JobID``, ``mean_utilization``: the mean over
      GPUs of each GPU's mean ``METRIC_COL`` in ``SET1``.
    * ``JOB_METRICS_TI`` -- the rows of ``JOB_METRICS_OUT`` left-joined with
      two temporal-imbalance columns, ``ti__<METRIC_COL>`` (from ``SET1``) and
      ``ti__<METRIC_COL>_s2`` (from ``SET2``). Temporal imbalance is computed
      per ``(JobID, hostname, gpu_id)`` by ``ti_expr_count_star`` and reduced
      to the job level with ``MAX``. Jobs missing from a set get NULL there.

    Both statements run on a single DuckDB connection that is closed in a
    ``finally`` block, so a failing query no longer leaks the connection.
    """
    ti_s1 = f"ti__{METRIC_COL}"
    ti_s2 = f"ti__{METRIC_COL}_s2"

    con = duckdb.connect()
    try:
        con.execute("PRAGMA memory_limit='15GB';")
        con.execute(f"PRAGMA threads={os.cpu_count() or 1};")

        # Step 1: per-GPU mean first, then mean of those means per job.
        con.execute(f"""
        COPY (
          SELECT
            JobID,
            AVG(gpu_mean) AS mean_utilization
          FROM (
            SELECT
              JobID::VARCHAR AS JobID,
              hostname,
              gpu_id,
              AVG({METRIC_COL}) AS gpu_mean
            FROM parquet_scan('{SET1}')
            GROUP BY JobID, hostname, gpu_id
          )
          GROUP BY JobID
        )
        TO '{JOB_METRICS_OUT}' (FORMAT PARQUET, COMPRESSION 'SNAPPY');
        """)

        # Step 2: reads back the file written by step 1, so it must run after
        # the COPY above has completed.
        con.execute(f"""
        COPY (
          WITH
          s1_gpu AS (
            SELECT
              JobID::VARCHAR AS JobID,
              hostname,
              gpu_id,
              {ti_expr_count_star(METRIC_COL, ti_s1)}
            FROM parquet_scan('{SET1}')
            GROUP BY JobID, hostname, gpu_id
          ),
          s1_job AS (
            SELECT JobID, MAX({ti_s1}) AS {ti_s1}
            FROM s1_gpu
            GROUP BY JobID
          ),
          s2_gpu AS (
            SELECT
              JobID::VARCHAR AS JobID,
              hostname,
              gpu_id,
              {ti_expr_count_star(METRIC_COL, ti_s2)}
            FROM parquet_scan('{SET2}')
            GROUP BY JobID, hostname, gpu_id
          ),
          s2_job AS (
            SELECT JobID, MAX({ti_s2}) AS {ti_s2}
            FROM s2_gpu
            GROUP BY JobID
          )
          SELECT
            jm.*,
            s1_job.* EXCLUDE (JobID),
            s2_job.* EXCLUDE (JobID)
          FROM parquet_scan('{JOB_METRICS_OUT}') AS jm
          LEFT JOIN s1_job USING (JobID)
          LEFT JOIN s2_job USING (JobID)
        )
        TO '{JOB_METRICS_TI}' (FORMAT PARQUET, COMPRESSION 'SNAPPY');
        """)
    finally:
        con.close()

def load_nodes_80gb(path: str) -> set[str]:
    """Read a hostname list from a text file.

    Hostnames may be separated by whitespace and/or commas, several per line.
    A ``#`` starts a comment that runs to the end of the line; blank and
    comment-only lines are ignored. Undecodable bytes are replaced rather
    than raising.

    Args:
        path: Location of the node list file.

    Returns:
        The set of hostnames found in the file.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"Missing {p.resolve()}")

    nodes: set[str] = set()
    for line in p.read_text(encoding="utf-8", errors="replace").splitlines():
        # Cut the line at the first '#' so that the words of a trailing
        # comment ("nid001  # 80GB rack") are not mistaken for hostnames.
        payload = line.split("#", 1)[0]
        # str.split() with no argument already drops empty tokens and
        # surrounding whitespace.
        nodes.update(payload.replace(",", " ").split())
    return nodes

def build_fp64only_labels():
    """Label SET1 samples with an FP64 roofline test and write per-job time fractions.

    ``SET1`` is streamed in batches of ``LABEL_BATCH`` rows. For each sample:

    * the row is dropped if any activity counter exceeds 1.0, or if all four
      FP pipe counters (fp16/fp32/fp64/tensor) are exactly 0;
    * a row with any missing counter is labelled ``"Other"``;
    * otherwise, with ``achieved_hbm = dram_active * peak_hbm`` (``HBM_80``
      for hosts listed in ``NODES_80GB_FILE``, else ``HBM_40``):
        - ``achieved_hbm <= 0`` with some pipe active -> ``"Compute-intensive"``;
        - else arithmetic intensity
          ``fp64_active * PEAK_FLOPS_FP64_VECTOR / achieved_hbm`` is compared
          with the ridge point ``PEAK_FLOPS_FP64_VECTOR / peak_hbm``: below
          the ridge is ``"Memory-intensive"``, otherwise ``"Compute-intensive"``.

    Time is attributed per ``(JobID, hostname, gpu_id)`` stream: the interval
    between two consecutive kept samples is credited to the label of the
    *earlier* one. Stream state persists across batches. Intervals that are
    not strictly positive are ignored.

    Output (``LABELS_FP64ONLY``): one row per job with accumulated time, the
    three label fractions, and the number of kept samples. ``time_seconds``
    divides the accumulated ``ts_ns`` differences by 1e9, i.e. it assumes
    ``ts_ns`` is in nanoseconds (TODO confirm against the data source).

    Raises:
        FileNotFoundError: If the 80 GB node list is missing.
        KeyError: If ``SET1`` lacks any required column.
    """
    nodes80 = load_nodes_80gb(NODES_80GB_FILE)

    need_cols = ["JobID", "hostname", "gpu_id", "ts_ns"] + ACTIVE_COLS
    pf = pq.ParquetFile(SET1)
    present = set(pf.schema_arrow.names)
    missing = [c for c in need_cols if c not in present]
    if missing:
        raise KeyError(f"Missing required columns in {SET1}: {missing}")

    # (JobID, hostname, gpu_id) -> (timestamp, label) of the last kept sample.
    state = {}
    job_time = defaultdict(lambda: {"Compute-intensive": 0, "Memory-intensive": 0, "Other": 0})
    job_total = defaultdict(int)
    job_samples = defaultdict(int)

    for batch in pf.iter_batches(columns=need_cols, batch_size=LABEL_BATCH):
        df = batch.to_pandas()
        if df.empty:
            continue

        job = df["JobID"].astype(str)
        host = df["hostname"].astype(str)
        gpu = pd.to_numeric(df["gpu_id"], errors="coerce")
        ts  = pd.to_numeric(df["ts_ns"], errors="coerce")

        fp16 = pd.to_numeric(df[f"{DCGM_PREFIX}fp16_active"], errors="coerce")
        fp32 = pd.to_numeric(df[f"{DCGM_PREFIX}fp32_active"], errors="coerce")
        fp64 = pd.to_numeric(df[f"{DCGM_PREFIX}fp64_active"], errors="coerce")
        tens = pd.to_numeric(df[f"{DCGM_PREFIX}tensor_active"], errors="coerce")
        dram = pd.to_numeric(df[f"{DCGM_PREFIX}dram_active"], errors="coerce")

        # NaN compares False in both filters below, so rows with missing
        # counters survive the filter and end up labelled "Other".
        missing_any = fp16.isna() | fp32.isna() | fp64.isna() | tens.isna() | dram.isna()
        bad_gt1 = (fp16 > 1.0) | (fp32 > 1.0) | (fp64 > 1.0) | (tens > 1.0) | (dram > 1.0)
        all_fp_zero = (fp16.eq(0.0)) & (fp32.eq(0.0)) & (fp64.eq(0.0)) & (tens.eq(0.0))

        keep = ~(bad_gt1 | all_fp_zero)
        if not keep.any():
            continue

        job = job.loc[keep].to_numpy()
        host = host.loc[keep].to_numpy()
        gpu = gpu.loc[keep].to_numpy()
        ts  = ts.loc[keep].to_numpy()

        fp16 = fp16.loc[keep].to_numpy(dtype=float, copy=False)
        fp32 = fp32.loc[keep].to_numpy(dtype=float, copy=False)
        fp64 = fp64.loc[keep].to_numpy(dtype=float, copy=False)
        tens = tens.loc[keep].to_numpy(dtype=float, copy=False)
        dram = dram.loc[keep].to_numpy(dtype=float, copy=False)
        missing_any = missing_any.loc[keep].to_numpy(dtype=bool, copy=False)

        # Per-sample peak bandwidth depends on which GPU variant the host has.
        is80 = np.fromiter((h in nodes80 for h in host), dtype=bool, count=len(host))
        peak_hbm = np.where(is80, HBM_80, HBM_40)
        achieved_hbm = dram * peak_hbm

        ridge_fp64 = PEAK_FLOPS_FP64_VECTOR / peak_hbm
        achieved_fp64 = fp64 * PEAK_FLOPS_FP64_VECTOR

        # Arithmetic intensity; left as NaN where there is no memory traffic
        # to divide by.
        ai_fp64 = np.full(len(job), np.nan, dtype=float)
        np.divide(achieved_fp64, achieved_hbm, out=ai_fp64, where=(achieved_hbm > 0))

        any_compute = (fp16 > 0) | (fp32 > 0) | (fp64 > 0) | (tens > 0)
        eligible = ~missing_any

        labels = np.full(len(job), "Other", dtype=object)

        # Compute activity with no measured DRAM traffic: intensity is
        # effectively infinite.
        mask_inf = eligible & any_compute & (achieved_hbm <= 0)
        labels[mask_inf] = "Compute-intensive"

        mask_pos = eligible & (achieved_hbm > 0) & np.isfinite(ai_fp64) & np.isfinite(ridge_fp64)
        mem = mask_pos & (ai_fp64 < ridge_fp64)
        comp = mask_pos & ~mem
        labels[mem] = "Memory-intensive"
        labels[comp] = "Compute-intensive"

        for j, h, g, t, lab in zip(job, host, gpu, ts, labels):
            job_samples[j] += 1
            # Samples without a usable GPU id or timestamp are counted but
            # cannot take part in interval accounting.
            if not np.isfinite(g) or not np.isfinite(t):
                continue
            key = (j, h, int(g))
            t = int(t)

            if key in state:
                last_ts, last_lab = state[key]
                dt = t - last_ts
                if dt > 0:
                    # Credit the elapsed interval to the previous sample's label.
                    job_time[j][last_lab] += dt
                    job_total[j] += dt

            state[key] = (t, lab)

    out_cols = [
        "JobID",
        "time_seconds",
        "frac_time_compute_fp64only",
        "frac_time_memory_fp64only",
        "frac_time_other_fp64only",
        "sample_count",
    ]
    rows = []
    for j, tot in job_total.items():
        if tot <= 0:
            continue
        c = job_time[j]["Compute-intensive"]
        m = job_time[j]["Memory-intensive"]
        o = job_time[j]["Other"]
        rows.append({
            "JobID": j,
            "time_seconds": float(tot) / 1e9,
            "frac_time_compute_fp64only": c / tot,
            "frac_time_memory_fp64only":  m / tot,
            "frac_time_other_fp64only":   o / tot,
            "sample_count": int(job_samples.get(j, 0)),
        })

    # Passing the columns explicitly keeps the schema when no job accumulated
    # any time; without it an empty frame has no "JobID" column and
    # sort_values raises KeyError.
    out = pd.DataFrame(rows, columns=out_cols).sort_values("JobID")
    out.to_parquet(LABELS_FP64ONLY, index=False)

def _style_ti_axis(ax, title, title_x, **xtick_params):
    """Apply the cosmetics shared by both temporal-imbalance violin panels."""
    # Recolor every line already drawn on the axis (the inner quartile
    # markers from seaborn) so they stand out against the filled violins.
    for line in ax.lines:
        line.set_color("white")
        line.set_linewidth(2)
    ax.set_ylabel("Temporal imb.\n(of GPU_UTIL)", fontsize=23)
    ax.set_ylim(0, 1.0)
    ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
    ax.tick_params(axis="y", labelsize=22)
    ax.set_xlabel("")
    ax.tick_params(axis="x", **xtick_params)
    ax.set_title(title, fontsize=24, pad=16, x=title_x)
    ax.grid(axis="y", linestyle="--", alpha=0.7)


def plot_ti_violins():
    """Draw two violin plots of per-job temporal imbalance of ``METRIC_COL``.

    Left panel: jobs grouped by which FP pipes are "active" (per-job mean
    activity from ``JOB_MEANS_ALL_OUT`` strictly above ``THR``). Only the five
    combinations listed in ``pipe_combos`` are shown; every one of them
    requires fp16 to be inactive, and jobs with any other combination are
    dropped.

    Right panel: jobs classified from ``LABELS_FP64ONLY`` as compute-bound or
    memory-bound, whichever time fraction is strictly larger; ties (and rows
    where the comparison is undefined) are dropped.

    The temporal-imbalance values come from the ``ti__<METRIC_COL>`` column
    of ``JOB_METRICS_TI``. Jobs without a finite value are excluded. The
    figure is shown with ``plt.show()``; nothing is returned or saved.
    """
    ti_col = f"ti__{METRIC_COL}"
    # Read once; both panels use the same two columns.
    ti = pd.read_parquet(JOB_METRICS_TI, columns=["JobID", ti_col])

    # ---- Left panel -------------------------------------------------------
    pipes = ("fp16", "fp32", "fp64", "tensor")
    dfm = pd.read_parquet(
        JOB_MEANS_ALL_OUT,
        columns=["JobID"] + [f"{DCGM_PREFIX}{p}_active" for p in pipes],
    ).rename(columns={"JobID": "jobid"})
    for p in pipes:
        # NaN means compare False, i.e. the pipe counts as inactive.
        dfm[f"{p}_active"] = pd.to_numeric(dfm[f"{DCGM_PREFIX}{p}_active"], errors="coerce") > THR

    left_ti = ti.rename(columns={"JobID": "jobid", ti_col: "temporal_imbalance"})
    left_base = dfm.merge(left_ti, on="jobid", how="inner").dropna(subset=["temporal_imbalance"])

    # Category -> required (fp16, fp32, fp64, tensor) activity pattern.
    # Insertion order is also the plotting order.
    pipe_combos = {
        "Only FP32":   (False, True,  False, False),
        "FP32+Tensor": (False, True,  False, True),
        "Only FP64":   (False, False, True,  False),
        "FP64+Tensor": (False, False, True,  True),
        "Only Tensor": (False, False, False, True),
    }
    order_left = list(pipe_combos)

    frames = []
    for name, pattern in pipe_combos.items():
        mask = pd.Series(True, index=left_base.index)
        for p, wanted in zip(pipes, pattern):
            mask &= left_base[f"{p}_active"] == wanted
        frames.append(left_base[mask].assign(category=name))
    combined_left = pd.concat(frames, ignore_index=True)

    # ---- Right panel ------------------------------------------------------
    lab = pd.read_parquet(
        LABELS_FP64ONLY,
        columns=["JobID", "frac_time_compute_fp64only", "frac_time_memory_fp64only"],
    )
    lab["JobID"] = lab["JobID"].astype(str)
    frac_compute = lab["frac_time_compute_fp64only"]
    frac_memory = lab["frac_time_memory_fp64only"]
    lab["class"] = np.where(
        frac_compute > frac_memory, "Compute-bound",
        np.where(frac_memory > frac_compute, "Memory-bound", "Other"),
    )
    lab = lab[lab["class"].isin(["Compute-bound", "Memory-bound"])]

    ti_right = ti.rename(columns={ti_col: "ti_gputil"})
    ti_right["JobID"] = ti_right["JobID"].astype(str)

    df_right = lab.merge(ti_right, on="JobID", how="inner")
    df_right["ti_gputil"] = pd.to_numeric(df_right["ti_gputil"], errors="coerce")
    df_right = df_right[np.isfinite(df_right["ti_gputil"])]

    # ---- Plot -------------------------------------------------------------
    setup_local()
    colors = get_colors()
    _, (ax_left, ax_right) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), sharey=False)

    # All left-panel violins share one color, so pass it as `color` rather
    # than as a one-element palette (palette without hue is deprecated in
    # recent seaborn and a too-short palette triggers a cycling warning).
    sns.violinplot(
        ax=ax_left,
        x="category", y="temporal_imbalance",
        data=combined_left, order=order_left,
        common_norm=True, inner="quartile", cut=0, linewidth=1,
        color=colors[2],
    )
    _style_ti_axis(
        ax_left,
        "Temporal imb. (of GPU_UTIL)\nby FP pipe combinations",
        0.45,
        labelsize=20, rotation=90,
    )

    sns.violinplot(
        ax=ax_right,
        x="class", y="ti_gputil", data=df_right,
        order=["Compute-bound", "Memory-bound"],
        common_norm=True, inner="quartile", cut=0, linewidth=1,
        palette=[colors[4], colors[5]],
    )
    _style_ti_axis(
        ax_right,
        "Temporal imb. (of GPU_UTIL)\n(Compute- vs Memory-bound)",
        0.50,
        labelsize=22,
    )

    plt.tight_layout()
    plt.show()

# Run the pipeline. The three builders write the Parquet files that
# plot_ti_violins reads back, so the plot step has to come last.
for _step in (
    build_job_means_all,
    build_job_metrics_and_ti,
    build_fp64only_labels,
    plot_ti_violins,
):
    _step()
