# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import duckdb
from setup_plot import setup_local, get_colors

from pathlib import Path
from dataclasses import dataclass
from collections import defaultdict

import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.compute as pc
import pyarrow.parquet as pq

# Inputs read by the pipeline.
SET1_PARQUET = "ldms_set1.parquet"
NODES_80GB_FILE = "nodes_80gb.txt"

# Parquet artifacts produced / consumed by the stages below.
JOB_MEANS_ALL_OUT = "job_means_all.parquet"
SI_GPU_PATH = "si_jobs_set1_ldms_parts/nersc_ldms_dcgm_gpu_utilization.parquet"
LABELS_FP64ONLY = "job_label_fractions_fp64only.parquet"

# Pipe-activity counters that get averaged per job.
COUNTERS = [
    f"nersc_ldms_dcgm_{pipe}_active"
    for pipe in ("fp16", "fp32", "fp64", "tensor")
]

def avg_expr(cols):
    """Return a SQL select-list fragment with one ``avg(col) AS col`` per column.

    Entries are separated by a comma, a newline and seven spaces so the
    fragment lines up when interpolated into a multi-line query. An empty
    *cols* yields an empty string.
    """
    separator = ",\n       "
    return separator.join(f"avg({name}) AS {name}" for name in cols)

# Per-job means of the pipe-activity counters, written to JOB_MEANS_ALL_OUT.
# The inner query averages each counter per (JobID, hostname, gpu_id); the
# outer query then averages those per-GPU means per job, so every GPU carries
# equal weight regardless of how many samples it contributed.
con = duckdb.connect()
try:
    con.execute("PRAGMA memory_limit='15GB';")
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to a single thread instead of emitting "threads=None".
    con.execute(f"PRAGMA threads={os.cpu_count() or 1};")

    con.execute(f"""
COPY (
    SELECT
        JobID,
        {avg_expr(COUNTERS)}
    FROM (
        SELECT
            JobID::VARCHAR AS JobID,
            hostname,
            gpu_id,
            {avg_expr(COUNTERS)}
        FROM parquet_scan('{SET1_PARQUET}')
        GROUP BY JobID, hostname, gpu_id
    )
    GROUP BY JobID
)
TO '{JOB_MEANS_ALL_OUT}' (FORMAT PARQUET, COMPRESSION 'SNAPPY');
""")
finally:
    # Release the connection even when one of the statements above fails.
    con.close()


# Column whose spatial imbalance is computed.
METRIC = "nersc_ldms_dcgm_gpu_utilization"

# Sampling tick and aggregation window, in seconds.
SAMPLE_SEC = 10
WINDOW_SEC = 60
# Explicit check instead of `assert`, which is stripped under `python -O`.
if WINDOW_SEC % SAMPLE_SEC != 0:
    raise ValueError(
        f"WINDOW_SEC ({WINDOW_SEC}) must be a multiple of SAMPLE_SEC ({SAMPLE_SEC})"
    )
TICKS_PER_WINDOW = WINDOW_SEC // SAMPLE_SEC
# One sampling tick expressed in nanoseconds (compared against ts_ns deltas).
SNS = SAMPLE_SEC * 1_000_000_000

# Rows per record batch when scanning the parquet input.
BATCH_SIZE = 500_000

# Output directories for the per-window and per-job SI tables.
OUT_DIR_WIN = Path("si_windows_set1_ldms_parts")
OUT_DIR_JOB = Path("si_jobs_set1_ldms_parts")
OUT_DIR_WIN.mkdir(parents=True, exist_ok=True)
OUT_DIR_JOB.mkdir(parents=True, exist_ok=True)

@dataclass
class JobAnchor:
    """Per-job reference values produced by ``build_job_anchors``."""

    # Smallest ts_ns value seen for the job; window indices are measured from it.
    ts0: int
    # Number of distinct (hostname, gpu_id) pairs observed for the job.
    g_ldms: int

def build_job_anchors(parquet_path: str) -> dict:
    """Scan *parquet_path* once and return a ``JobAnchor`` per multi-GPU job.

    For every non-null JobID (cast to string) the scan records the smallest
    ``ts_ns`` and the set of distinct ``(hostname, gpu_id)`` pairs. Jobs
    observed on a single GPU are left out of the result.

    Returns:
        dict mapping JobID (str) to ``JobAnchor(ts0, g_ldms)``.
    """
    dataset = ds.dataset(parquet_path, format="parquet")
    batches = dataset.scanner(
        columns=["JobID", "hostname", "gpu_id", "ts_ns"],
        batch_size=BATCH_SIZE,
        use_threads=True,
    ).to_batches()

    first_ts = {}
    gpus_by_job = defaultdict(set)

    for batch in batches:
        job_ids = pc.cast(batch["JobID"], pa.string()).to_pylist()
        hosts = batch["hostname"].to_pylist()
        gpu_ids = pc.cast(batch["gpu_id"], pa.int64()).to_numpy()
        stamps = pc.cast(batch["ts_ns"], pa.int64()).to_numpy()

        for job_id, hostname, gpu_id, stamp in zip(job_ids, hosts, gpu_ids, stamps):
            if job_id is None:
                continue
            # Track the earliest timestamp per job.
            if job_id not in first_ts or stamp < first_ts[job_id]:
                first_ts[job_id] = int(stamp)
            gpus_by_job[job_id].add((hostname, int(gpu_id)))

    # Spatial imbalance needs at least two GPUs, so single-GPU jobs are dropped.
    return {
        job_id: JobAnchor(ts0=int(t0), g_ldms=len(gpus_by_job[job_id]))
        for job_id, t0 in first_ts.items()
        if len(gpus_by_job[job_id]) > 1
    }

def _window_spatial_imbalance(sum_tc: float, max_tc: float, g_ldms: int) -> float:
    """Return ``1 - sum_tc / (max_tc * g_ldms)`` clamped to [0, 1].

    A window whose maximum per-GPU total is not positive (or a job with no
    GPUs) is reported as 0.0.
    """
    if max_tc <= 0 or g_ldms <= 0:
        return 0.0
    si = 1.0 - (sum_tc / (max_tc * g_ldms))
    if si < 0.0:
        return 0.0
    if si > 1.0:
        return 1.0
    return si


def compute_gpuutil_si(parquet_path: str, anchors: dict) -> None:
    """Compute per-window and per-job spatial imbalance (SI) of ``METRIC``.

    Samples of anchored jobs are assigned to a tick (``SNS`` nanoseconds,
    measured from the job's ``ts0`` and rounded to the nearest tick) and then
    to a window of ``TICKS_PER_WINDOW`` ticks. Within each (job, window) the
    metric is summed per ``(hostname, gpu_id)``; the window SI is
    ``1 - sum / (max * g_ldms)`` clamped to [0, 1], where ``sum`` and ``max``
    are taken over those per-GPU totals and ``g_ldms`` comes from the anchor.

    Two parquet files named ``f"{METRIC}.parquet"`` are written: the window
    table under ``OUT_DIR_WIN`` and the per-job mean SI under ``OUT_DIR_JOB``.
    Jobs for which no window has a positive maximum are left out of the job
    table. Job rows are emitted in sorted JobID order so repeated runs produce
    the same file.

    Args:
        parquet_path: parquet input with JobID, hostname, gpu_id, ts_ns and
            the ``METRIC`` column.
        anchors: mapping JobID (str) -> ``JobAnchor``; rows of other jobs are
            ignored.

    Raises:
        RuntimeError: if no sample contributes to any window, or if no job
            survives the job-level filter.
    """
    ds_in = ds.dataset(parquet_path, format="parquet")
    cols = ["JobID", "hostname", "gpu_id", "ts_ns", METRIC]
    scanner = ds_in.scanner(columns=cols, batch_size=BATCH_SIZE, use_threads=True)

    half_tick = SNS // 2  # added before the floor division to round to nearest tick
    tc_per_gpu = defaultdict(float)
    seen_jobs = set()

    for rb in scanner.to_batches():
        jid = pc.cast(rb["JobID"], pa.string()).to_pylist()
        host = rb["hostname"].to_pylist()
        gid = pc.cast(rb["gpu_id"], pa.int64()).to_numpy()
        ts = pc.cast(rb["ts_ns"], pa.int64()).to_numpy()
        # zero_copy_only=False lets nulls come through as NaN, skipped below.
        val = pc.cast(rb[METRIC], pa.float64()).to_numpy(zero_copy_only=False)

        for j, h, g, t, v in zip(jid, host, gid, ts, val):
            if j is None or np.isnan(v):
                continue
            a = anchors.get(j)
            if a is None:
                continue

            dt = t - a.ts0
            if dt < 0:
                continue

            k = int((dt + half_tick) // SNS)
            widx = k // TICKS_PER_WINDOW

            tc_per_gpu[(j, widx, (h, int(g)))] += float(v)
            seen_jobs.add(j)

    if not tc_per_gpu:
        raise RuntimeError("No present windows for GPU_UTIL.")

    # Collapse per-GPU totals into per-window sum and max.
    sum_tc_by_win = defaultdict(float)
    max_tc_by_win = defaultdict(float)

    for (j, widx, _gpu_key), tc in tc_per_gpu.items():
        key = (j, widx)
        sum_tc_by_win[key] += tc
        if tc > max_tc_by_win[key]:
            max_tc_by_win[key] = tc

    # Build the window rows and, in the same pass, the per-job accumulators
    # (this replaces a second, row-wise walk over the window DataFrame).
    win_rows = []
    w_present_by_job = defaultdict(int)
    si_sum_by_job = defaultdict(float)
    active_jobs = set()

    for (j, widx), sum_tc in sum_tc_by_win.items():
        max_tc = max_tc_by_win[(j, widx)]
        g_ldms = anchors[j].g_ldms
        si_w = _window_spatial_imbalance(sum_tc, max_tc, g_ldms)

        win_rows.append((j, METRIC, widx, float(sum_tc), float(max_tc), int(g_ldms), float(si_w)))

        w_present_by_job[j] += 1
        si_sum_by_job[j] += si_w
        if max_tc > 0:
            active_jobs.add(j)

    win_df = pd.DataFrame(
        win_rows,
        columns=["JobID", "metric", "window_idx", "sum_TC", "max_TC", "g_ldms", "SI_jw"]
    )
    pq.write_table(pa.Table.from_pandas(win_df, preserve_index=False),
                   (OUT_DIR_WIN / f"{METRIC}.parquet").as_posix(),
                   compression="snappy")

    job_rows = []
    # Sorted so the output row order does not depend on set/hash ordering.
    for j in sorted(seen_jobs):
        w_present = w_present_by_job.get(j, 0)
        if w_present == 0:
            continue
        if j not in active_jobs:
            continue
        si_mean = si_sum_by_job[j] / float(w_present)
        job_rows.append((j, METRIC, int(anchors[j].g_ldms), int(w_present), float(si_mean)))

    if not job_rows:
        raise RuntimeError("No jobs passed SI filters.")

    job_df = pd.DataFrame(job_rows, columns=["JobID", "metric", "g_ldms", "w_present", "SI_mean"])
    pq.write_table(pa.Table.from_pandas(job_df, preserve_index=False),
                   (OUT_DIR_JOB / f"{METRIC}.parquet").as_posix(),
                   compression="snappy")

# Run the spatial-imbalance stage: first collect per-job anchors (earliest
# timestamp and GPU count), then compute and write the SI tables. The second
# scan needs the anchors, so an empty result is treated as a hard error.
anchors = build_job_anchors(SET1_PARQUET)
if not anchors:
    raise RuntimeError("No multi-GPU jobs found in set1.")
compute_gpuutil_si(SET1_PARQUET, anchors)


DCGM_PREFIX = "nersc_ldms_dcgm_"
# Activity columns needed for labelling: four compute pipes plus DRAM.
ACTIVE_COLS = [
    f"{DCGM_PREFIX}{unit}_active"
    for unit in ("fp16", "fp32", "fp64", "tensor", "dram")
]

# Peak FP64 rate used for the roofline ridge point
# (presumably FLOP/s of the vector FP64 pipe - TODO confirm).
PEAK_FLOPS_FP64_VECTOR = 9.7e12
# Peak memory bandwidth for hosts outside / inside the 80 GB node list
# (presumably bytes/s - TODO confirm).
HBM_40 = 1.555e12
HBM_80 = 2.039e12

# Rows per record batch in the labelling pass.
LABEL_BATCH = 1_000_000

def load_nodes_80gb(path: str) -> set[str]:
    """Read a node-list file and return the set of hostnames it contains.

    Hostnames may be separated by commas and/or whitespace, several per line.
    Blank lines are ignored. A ``#`` at the start of a token begins a comment
    that runs to the end of the line, so both full-line comments and trailing
    notes such as ``nid001 # spare node`` are skipped in full.

    Args:
        path: location of the text file.

    Returns:
        Set of hostname tokens.

    Raises:
        FileNotFoundError: if *path* does not exist.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"Missing {p.resolve()}")

    nodes: set[str] = set()
    for line in p.read_text(encoding="utf-8", errors="replace").splitlines():
        for tok in line.replace(",", " ").split():
            if tok.startswith("#"):
                # Everything after a comment marker is commentary, not hosts.
                break
            nodes.add(tok)
    return nodes

# ---------------------------------------------------------------------------
# Label every kept sample as Compute-intensive / Memory-intensive / Other with
# an FP64-only roofline test, then report per job the fraction of time spent
# under each label. Output: LABELS_FP64ONLY.
# ---------------------------------------------------------------------------
nodes80 = load_nodes_80gb(NODES_80GB_FILE)

need_cols = ["JobID", "hostname", "gpu_id", "ts_ns"] + ACTIVE_COLS
pf = pq.ParquetFile(SET1_PARQUET)
present = set(pf.schema_arrow.names)
missing = [c for c in need_cols if c not in present]
if missing:
    raise KeyError(f"Missing required columns in {SET1_PARQUET}: {missing}")

# (JobID, hostname, gpu_id) -> (timestamp, label) of the last kept sample.
state = {}
job_time = defaultdict(lambda: {"Compute-intensive": 0, "Memory-intensive": 0, "Other": 0})
job_total = defaultdict(int)
job_samples = defaultdict(int)

for batch in pf.iter_batches(columns=need_cols, batch_size=LABEL_BATCH):
    df = batch.to_pandas()
    if df.empty:
        continue

    # Stringify JobID with the same Arrow cast used when the SI anchors are
    # built, so IDs match across outputs; nulls stay None and are dropped
    # below instead of turning into a bogus job named "None"/"nan".
    job = pd.Series(
        pc.cast(batch["JobID"], pa.string()).to_pylist(),
        index=df.index,
        dtype=object,
    )
    has_job = job.notna()
    host = df["hostname"].astype(str)
    gpu = pd.to_numeric(df["gpu_id"], errors="coerce")
    ts  = pd.to_numeric(df["ts_ns"], errors="coerce")

    fp16 = pd.to_numeric(df[f"{DCGM_PREFIX}fp16_active"], errors="coerce")
    fp32 = pd.to_numeric(df[f"{DCGM_PREFIX}fp32_active"], errors="coerce")
    fp64 = pd.to_numeric(df[f"{DCGM_PREFIX}fp64_active"], errors="coerce")
    tens = pd.to_numeric(df[f"{DCGM_PREFIX}tensor_active"], errors="coerce")
    dram = pd.to_numeric(df[f"{DCGM_PREFIX}dram_active"], errors="coerce")

    missing_any = fp16.isna() | fp32.isna() | fp64.isna() | tens.isna() | dram.isna()
    bad_gt1 = (fp16 > 1.0) | (fp32 > 1.0) | (fp64 > 1.0) | (tens > 1.0) | (dram > 1.0)
    all_fp_zero = (fp16.eq(0.0)) & (fp32.eq(0.0)) & (fp64.eq(0.0)) & (tens.eq(0.0))

    # Drop samples with an out-of-range counter or with every compute pipe at
    # exactly zero. Rows with missing counters are kept and labelled "Other".
    keep = has_job & ~(bad_gt1 | all_fp_zero)
    if not keep.any():
        continue

    job = job.loc[keep].to_numpy()
    host = host.loc[keep].to_numpy()
    gpu = gpu.loc[keep].to_numpy()
    ts  = ts.loc[keep].to_numpy()

    fp16 = fp16.loc[keep].to_numpy(dtype=float, copy=False)
    fp32 = fp32.loc[keep].to_numpy(dtype=float, copy=False)
    fp64 = fp64.loc[keep].to_numpy(dtype=float, copy=False)
    tens = tens.loc[keep].to_numpy(dtype=float, copy=False)
    dram = dram.loc[keep].to_numpy(dtype=float, copy=False)
    missing_any = missing_any.loc[keep].to_numpy(dtype=bool, copy=False)

    # Peak bandwidth depends on whether the host is in the 80 GB node list.
    is80 = np.fromiter((h in nodes80 for h in host), dtype=bool, count=len(host))
    peak_hbm = np.where(is80, HBM_80, HBM_40)
    achieved_hbm = dram * peak_hbm

    ridge_fp64 = PEAK_FLOPS_FP64_VECTOR / peak_hbm
    achieved_fp64 = fp64 * PEAK_FLOPS_FP64_VECTOR

    # Arithmetic intensity = achieved FP64 rate / achieved bandwidth; left as
    # NaN where there is no memory traffic.
    ai_fp64 = np.full(len(job), np.nan, dtype=float)
    np.divide(achieved_fp64, achieved_hbm, out=ai_fp64, where=(achieved_hbm > 0))

    any_compute = (fp16 > 0) | (fp32 > 0) | (fp64 > 0) | (tens > 0)
    eligible = ~missing_any

    labels = np.full(len(job), "Other", dtype=object)

    # Compute activity with no memory traffic counts as compute-intensive.
    mask_inf = eligible & any_compute & (achieved_hbm <= 0)
    labels[mask_inf] = "Compute-intensive"

    # Otherwise compare the arithmetic intensity with the roofline ridge.
    mask_pos = eligible & (achieved_hbm > 0) & np.isfinite(ai_fp64) & np.isfinite(ridge_fp64)
    mem = mask_pos & (ai_fp64 < ridge_fp64)
    comp = mask_pos & ~mem
    labels[mem] = "Memory-intensive"
    labels[comp] = "Compute-intensive"

    for j, h, g, t, lab in zip(job, host, gpu, ts, labels):
        job_samples[j] += 1
        if not np.isfinite(g) or not np.isfinite(t):
            continue
        key = (j, h, int(g))
        t = int(t)

        if key in state:
            # The interval since the previous kept sample of this GPU is
            # credited to that previous sample's label. Non-positive gaps
            # (duplicate or out-of-order rows) contribute no time.
            last_ts, last_lab = state[key]
            dt = t - last_ts
            if dt > 0:
                job_time[j][last_lab] += dt
                job_total[j] += dt

        state[key] = (t, lab)

rows = []
for j, tot in job_total.items():
    if tot <= 0:
        continue
    c = job_time[j]["Compute-intensive"]
    m = job_time[j]["Memory-intensive"]
    o = job_time[j]["Other"]
    rows.append({
        "JobID": j,
        "time_seconds": float(tot) / 1e9,
        "frac_time_compute_fp64only": c / tot,
        "frac_time_memory_fp64only":  m / tot,
        "frac_time_other_fp64only":   o / tot,
        "sample_count": int(job_samples.get(j, 0)),
    })

# Name the columns explicitly so an empty `rows` still yields a frame with a
# "JobID" column instead of failing in sort_values with KeyError.
out_cols = [
    "JobID",
    "time_seconds",
    "frac_time_compute_fp64only",
    "frac_time_memory_fp64only",
    "frac_time_other_fp64only",
    "sample_count",
]
out = pd.DataFrame(rows, columns=out_cols).sort_values("JobID")
out.to_parquet(LABELS_FP64ONLY, index=False)

# Inputs for the two violin panels.
MEANS = "job_means_all.parquet"
SI_GPU = "si_jobs_set1_ldms_parts/nersc_ldms_dcgm_gpu_utilization.parquet"
LABELS_FP64ONLY = "job_label_fractions_fp64only.parquet"

# A pipe counts as "active" for a job when its mean activity exceeds THR.
THR = 0.005

cols_means = ["JobID"] + [
    f"nersc_ldms_dcgm_{pipe}_active" for pipe in ("fp16", "fp32", "fp64", "tensor")
]
dfm = pd.read_parquet(MEANS, columns=cols_means).rename(columns={"JobID": "jobid"})
for pipe in ("fp16", "fp32", "fp64", "tensor"):
    pipe_mean = pd.to_numeric(dfm[f"nersc_ldms_dcgm_{pipe}_active"], errors="coerce")
    dfm[f"{pipe}_active"] = pipe_mean > THR

# ---- left panel: SI by combination of active FP pipes ----
df_si = pd.read_parquet(SI_GPU, columns=["JobID", "SI_mean"]).rename(
    columns={"JobID": "jobid", "SI_mean": "spatial_imbalance"}
)
left_base = dfm.merge(df_si, on="jobid", how="inner").dropna(subset=["spatial_imbalance"])


def _pipe_category(base, label, *, fp32, fp64, tensor):
    """Copy of the rows of *base* with FP16 inactive and the requested
    FP32 / FP64 / tensor activity pattern, tagged with ``category = label``."""
    wanted = {"fp32_active": fp32, "fp64_active": fp64, "tensor_active": tensor}
    mask = ~base["fp16_active"]
    for col, active in wanted.items():
        mask = mask & (base[col] if active else ~base[col])
    picked = base[mask].copy()
    picked["category"] = label
    return picked


c_fp32_only = _pipe_category(left_base, "Only FP32",   fp32=True,  fp64=False, tensor=False)
c_fp32_tnsr = _pipe_category(left_base, "FP32+Tensor", fp32=True,  fp64=False, tensor=True)
c_fp64_only = _pipe_category(left_base, "Only FP64",   fp32=False, fp64=True,  tensor=False)
c_fp64_tnsr = _pipe_category(left_base, "FP64+Tensor", fp32=False, fp64=True,  tensor=True)
c_tnsr_only = _pipe_category(left_base, "Only Tensor", fp32=False, fp64=False, tensor=True)

order_left = ["Only FP32", "FP32+Tensor", "Only FP64", "FP64+Tensor", "Only Tensor"]
combined_left = pd.concat(
    [c_fp32_only, c_fp32_tnsr, c_fp64_only, c_fp64_tnsr, c_tnsr_only],
    ignore_index=True,
)

# ---- right panel: SI for compute-bound vs memory-bound jobs ----
lab = pd.read_parquet(
    LABELS_FP64ONLY,
    columns=["JobID", "frac_time_compute_fp64only", "frac_time_memory_fp64only"],
)
compute_wins = lab["frac_time_compute_fp64only"] > lab["frac_time_memory_fp64only"]
memory_wins = lab["frac_time_memory_fp64only"] > lab["frac_time_compute_fp64only"]
# Ties (and NaN fractions) fall through to "Other" and are filtered out next.
lab["class"] = np.where(
    compute_wins, "Compute-bound", np.where(memory_wins, "Memory-bound", "Other")
)
lab = lab[lab["class"].isin(["Compute-bound", "Memory-bound"])].copy()

df_si_right = pd.read_parquet(SI_GPU, columns=["JobID", "SI_mean"]).rename(
    columns={"SI_mean": "si_gpuutil"}
)
df_right = lab.merge(df_si_right, on="JobID", how="inner")
df_right["si_gpuutil"] = pd.to_numeric(df_right["si_gpuutil"], errors="coerce")
df_right = df_right[np.isfinite(df_right["si_gpuutil"])].copy()

# Two side-by-side violin panels of job-level spatial imbalance:
# left by FP pipe combination, right by compute- vs memory-bound class.
setup_local()
colors = get_colors()

fig, (ax_left, ax_right) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), sharey=False)

# seaborn >= 0.13 (required here for `density_norm`) deprecates `palette`
# without `hue`; the documented replacement is to map `hue` to the x variable
# and suppress the legend. The left palette repeats one colour per category so
# seaborn does not warn about cycling a too-short palette.
sns.violinplot(
    ax=ax_left,
    data=combined_left,
    x="category", y="spatial_imbalance",
    hue="category", order=order_left, hue_order=order_left,
    palette=[colors[2]] * len(order_left), legend=False,
    common_norm=True, density_norm="area",
    inner="quartile", cut=0, linewidth=1,
)

order_right = ["Compute-bound", "Memory-bound"]
sns.violinplot(
    ax=ax_right,
    data=df_right,
    x="class", y="si_gpuutil",
    hue="class", order=order_right, hue_order=order_right,
    palette=[colors[4], colors[5]], legend=False,
    common_norm=True, inner="quartile", cut=0, linewidth=1,
)

# Styling shared by both panels.
for ax in (ax_left, ax_right):
    # Draw the quartile lines in white so they stand out on the violins.
    for ln in ax.lines:
        ln.set_color("white")
        ln.set_linewidth(2)
    ax.set_ylabel("Spatial imb.\n(of GPU_UTIL)", fontsize=23)
    ax.set_ylim(0, 1.0)
    ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
    ax.tick_params(axis="y", labelsize=22)
    ax.set_xlabel("")
    ax.grid(axis="y", linestyle="--", alpha=0.7)

# Panel-specific tick labels and titles.
ax_left.tick_params(axis="x", labelsize=20, rotation=90)
ax_left.set_title("Spatial imb. (of GPU_UTIL)\nby FP pipe combinations", fontsize=24, pad=16, x=0.45)

ax_right.tick_params(axis="x", labelsize=22)
ax_right.set_title("Spatial imb. (of GPU_UTIL)\n(Compute- vs Memory-bound)", fontsize=24, pad=16, x=0.50)

plt.tight_layout()
plt.show()
