# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import ceil, log2

from setup_plot import setup_global, setup_local, get_colors, get_markers

import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
from collections import defaultdict

# Input files read by this script.
PARQUET_CLEAN   = "ldms_set1.parquet"   # Parquet dataset scanned for per-sample GPU utilization
SLURM_CSV       = "slurm.csv"           # CSV read for per-(JobID, step) Slurm metadata

# Name of the utilization column that is aggregated per (JobID, step, hostname, gpu_id).
UTIL_COLUMN     = "nersc_ldms_dcgm_gpu_utilization"
# Number of rows requested per record batch from the Arrow dataset scanner.
BATCH_SIZE      = 500_000
# NOTE(review): not referenced anywhere in the visible code; presumably left
# over from the step that wrote the Parquet file. Confirm before removing.
ROW_GROUP_SIZE  = 500_000

def _norm_step_series_to_alloc(s: pd.Series) -> pd.Series:
    """Normalise a column of Slurm step identifiers to lower-case text.

    Every value is stringified, stripped of surrounding whitespace and
    lower-cased.  Values that carry no step information are replaced by the
    literal ``"alloc"``:

    * real missing values (``None``, ``NaN``, ``pd.NA``),
    * empty / whitespace-only strings,
    * the textual forms of missing values (``"nan"``, ``"none"``, ``"<NA>"``),
      compared case-insensitively.

    Parameters
    ----------
    s : pd.Series
        Step identifiers of any dtype.

    Returns
    -------
    pd.Series
        Series with the same index as ``s`` holding the normalised strings.
    """
    # Capture genuine missing values *before* stringifying: their text form
    # depends on the dtype ("None", "nan", "<NA>"), and some string dtypes
    # keep them missing through astype(str), so matching on text alone
    # is not reliable.
    missing = s.isna().to_numpy()

    text = s.astype(str).str.strip().str.lower()

    # isin() always yields a plain boolean result (missing -> False), so the
    # combination with `missing` below never produces an NA-valued mask.
    placeholder = text.isin(["", "nan", "none", "<na>"]).to_numpy()

    return text.mask(missing | placeholder, "alloc")

# ---------------------------------------------------------------------------
# Stream the Parquet dataset and reduce it to one row per
# (JobID, step, hostname, gpu_id) with:
#   total_timestamps      - number of non-missing utilization samples
#   zero_util_timestamps  - number of samples whose utilization is exactly 0
#   total_utilization     - sum of the utilization samples
#
# Each record batch is aggregated with a vectorised pandas groupby instead of
# a per-row Python loop; the small per-batch partial results are then summed.
# Missing utilization samples are ignored (they are not counted and do not
# contribute to the sum) rather than aborting the whole scan.
# ---------------------------------------------------------------------------
_KEY_COLS = ["JobID", "step", "hostname", "gpu_id"]
_GPU_DF_COLS = _KEY_COLS + [
    "total_timestamps",
    "zero_util_timestamps",
    "total_utilization",
]

dataset = ds.dataset(PARQUET_CLEAN, format="parquet")
scanner = dataset.scanner(
    columns=["JobID", "step", "hostname", "gpu_id", UTIL_COLUMN],
    batch_size=BATCH_SIZE,
    use_threads=True,
)

partials = []
for batch in scanner.to_batches():
    if batch.num_rows == 0:
        continue

    tbl = pa.Table.from_batches([batch])
    frame = pd.DataFrame({
        "JobID": tbl["JobID"].cast(pa.string()).to_pandas(),
        # Use the shared normaliser so the keys built here agree with the
        # ones produced for the Slurm table.
        "step": _norm_step_series_to_alloc(
            tbl["step"].cast(pa.string()).to_pandas()
        ),
        "hostname": tbl["hostname"].to_pandas(),
        "gpu_id": tbl["gpu_id"].to_pandas().astype("int64"),
        "util": tbl[UTIL_COLUMN].to_pandas().astype("float64"),
    })
    frame["is_zero"] = frame["util"].eq(0)

    partials.append(
        frame.groupby(
            _KEY_COLS,
            sort=False,       # keep first-seen order, like dict insertion order
            dropna=False,     # do not silently drop rows with a missing key
            observed=True,    # never expand unused categorical combinations
            as_index=False,
        ).agg(
            total_timestamps=("util", "count"),
            zero_util_timestamps=("is_zero", "sum"),
            total_utilization=("util", "sum"),
        )
    )

if partials:
    # A key can appear in many batches: sum the per-batch partial results.
    gpu_df = (
        pd.concat(partials, ignore_index=True)
          .groupby(_KEY_COLS, sort=False, dropna=False, observed=True,
                   as_index=False)
          .sum()
    )
else:
    gpu_df = pd.DataFrame(columns=_GPU_DF_COLS)

gpu_df = gpu_df[_GPU_DF_COLS]
del partials

# ---------------------------------------------------------------------------
# Attach Slurm metadata (start/end time, node count, account, partition,
# submit line) to every per-GPU row, joining on the (JobID, step) pair.
# ---------------------------------------------------------------------------

# Both sides of the join use plain strings for JobID and the normalised step.
gpu_df["JobID"] = gpu_df["JobID"].astype(str)
gpu_df["step"]  = _norm_step_series_to_alloc(gpu_df["step"])

use_cols = ["JobID", "step", "Start", "End", "NNodes",
            "Account", "Partition", "SubmitLine"]
slurm = pd.read_csv(
    SLURM_CSV,
    usecols=use_cols,
    # Read the join keys as text exactly as written in the file.  Letting
    # pandas infer the dtype can turn them into floats (e.g. "123.0", "0.0")
    # whenever the column has blanks, which would break the string join.
    dtype={"JobID": str, "step": str},
    low_memory=False,
)

slurm["JobID"] = slurm["JobID"].astype(str)
slurm["step"]  = _norm_step_series_to_alloc(slurm["step"])

# Keep only the Slurm rows for (JobID, step) pairs that have GPU samples.
pairs = gpu_df[["JobID", "step"]].drop_duplicates()
slurm = pairs.merge(slurm, on=["JobID", "step"], how="left")


def _slurm_local_to_utc(col: pd.Series) -> pd.Series:
    """Parse naive US/Pacific wall-clock strings and return UTC timestamps.

    Unparseable values become ``NaT``.  Wall-clock times that occur twice on
    the autumn DST change are ambiguous and also become ``NaT``; times that do
    not exist on the spring DST change are shifted forward to the first valid
    instant.  Without these options ``tz_localize`` raises and aborts the run.
    """
    return (
        pd.to_datetime(col, errors="coerce")
          .dt.tz_localize("US/Pacific", ambiguous="NaT",
                          nonexistent="shift_forward")
          .dt.tz_convert("UTC")
    )


slurm["start_time"] = _slurm_local_to_utc(slurm["Start"])
slurm["end_time"]   = _slurm_local_to_utc(slurm["End"])

# Collapse to exactly one metadata row per (JobID, step): widest time window,
# largest node count, and the first seen value of the descriptive fields.
slurm_meta = (
    slurm.groupby(["JobID", "step"], as_index=False)
         .agg(
             start_time=("start_time", "min"),
             end_time  =("end_time",   "max"),
             NNodes    =("NNodes",     "max"),
             Account   =("Account",    "first"),
             Partition =("Partition",  "first"),
             SubmitLine=("SubmitLine", "first")
         )
)

# validate= makes pandas raise if slurm_meta is not unique per (JobID, step).
gpu_df = gpu_df.merge(slurm_meta, on=["JobID", "step"], how="left", validate="many_to_one")

def agg_job(sub: pd.DataFrame) -> pd.Series:
    """Summarise all per-GPU rows of one job as a single record.

    Parameters
    ----------
    sub : pd.DataFrame
        Rows of one job.  The columns read are ``hostname``, ``gpu_id``,
        ``step``, ``total_timestamps``, ``total_utilization``,
        ``zero_util_timestamps``, ``start_time``, ``end_time``, ``Account``,
        ``Partition`` and ``SubmitLine``.

    Returns
    -------
    pd.Series
        ``start_time`` / ``end_time`` (earliest start, latest end),
        ``nnodes`` (distinct hostnames), ``ngpus`` (distinct
        ``(hostname, gpu_id)`` pairs with at least one sample),
        ``zero_util_percentage`` (zero samples / all samples, as a fraction),
        ``mean_utilization`` (unweighted mean of the per-GPU means),
        ``Account`` / ``Partition`` / ``SubmitLine`` (taken from the first
        row), ``step_duration`` (seconds, summed over steps),
        ``step_duration_hours`` and ``step_gpu_hours`` (step duration
        weighted by the number of rows in that step).
    """
    # Fold the steps together so that each physical GPU is one row.
    gpu_totals = sub.groupby(["hostname", "gpu_id"], as_index=False).agg(
        total_ts=("total_timestamps", "sum"),
        sum_util=("total_utilization", "sum"),
        zero_ts=("zero_util_timestamps", "sum"),
    )
    gpu_totals = gpu_totals.loc[gpu_totals["total_ts"] > 0]

    ngpus = int(len(gpu_totals))
    sample_count = gpu_totals["total_ts"].sum()

    if ngpus > 0:
        per_gpu_mean = gpu_totals["sum_util"] / gpu_totals["total_ts"]
        mean_util = float(per_gpu_mean.mean())
    else:
        mean_util = np.nan

    if sample_count > 0:
        zero_pct = float(gpu_totals["zero_ts"].sum() / sample_count)
    else:
        zero_pct = np.nan

    # Time window covered by each step of the job.
    step_windows = sub.groupby("step", as_index=False).agg(
        step_start=("start_time", "min"),
        step_end=("end_time", "max"),
    )
    if step_windows.empty:
        total_step_sec = 0.0
        total_step_gpu_sec = 0.0
    else:
        elapsed = step_windows["step_end"] - step_windows["step_start"]
        # Negative spans (end before start) count as zero.
        step_sec = elapsed.dt.total_seconds().clip(lower=0)
        # Each row of `sub` within a step stands for one GPU of that step.
        rows_per_step = sub.groupby("step").size()
        gpus_in_step = step_windows["step"].map(rows_per_step)
        total_step_sec = float(step_sec.sum())
        total_step_gpu_sec = float((step_sec * gpus_in_step).sum())

    summary = {
        "start_time": sub["start_time"].min(),
        "end_time": sub["end_time"].max(),
        "nnodes": int(sub["hostname"].nunique()),
        "ngpus": ngpus,
        "zero_util_percentage": zero_pct,
        "mean_utilization": mean_util,
        "Account": sub["Account"].iloc[0],
        "Partition": sub["Partition"].iloc[0],
        "SubmitLine": sub["SubmitLine"].iloc[0],
        "step_duration": total_step_sec,
        "step_duration_hours": total_step_sec / 3600.0,
        "step_gpu_hours": total_step_gpu_sec / 3600.0,
    }
    return pd.Series(summary)

# One summary row per job, in order of first appearance in gpu_df.
job_df = gpu_df.groupby("JobID", sort=False).apply(agg_job).reset_index()

# Wall-clock duration of the whole job and the GPU-hours derived from it.
wall_clock = job_df["end_time"] - job_df["start_time"]
job_df["duration"] = wall_clock.dt.total_seconds()
job_df["duration_hours"] = job_df["duration"] / 3600.0
job_df["gpu_hours"] = job_df["duration_hours"] * job_df["ngpus"]

# Guard: abort if any job has a mean utilization below 1.0.
low_util = job_df["mean_utilization"] < 1.0
viol = job_df.loc[low_util, "JobID"]
if len(viol) > 0:
    raise RuntimeError(f"{len(viol)} jobs slipped below 1% utilisation ")

# Fix the column order of the final per-job table.
job_columns = [
    "JobID", "start_time", "end_time",
    "zero_util_percentage", "mean_utilization",
    "nnodes", "ngpus",
    "duration", "duration_hours", "gpu_hours",
    "step_duration", "step_duration_hours", "step_gpu_hours",
    "Account", "Partition", "SubmitLine",
]
job_df = job_df[job_columns]

# ---------------------------------------------------------------------------
# Histogram input: number of jobs per power-of-two job-size bin.
# Bins are right-inclusive, (1, 2], (2, 4], (4, 8], ..., except the first one,
# which is closed on both sides ([1, 2]) so that single-GPU jobs are counted.
# ---------------------------------------------------------------------------
gputil_all = (
    job_df[["JobID", "ngpus", "duration_hours"]]
      .dropna(subset=["ngpus", "duration_hours"])
      .copy()
)
gputil_all["ngpus"] = gputil_all["ngpus"].astype(int)

ng = gputil_all["ngpus"].to_numpy()
ng = ng[ng > 0]
if ng.size < 2:
    raise ValueError("Need at least two positive ngpus values for a log-x histogram.")

# Smallest power of two that covers the largest job; at least 1 so that there
# is always one bin, even when every job uses a single GPU.
pmax = max(1, int(ceil(log2(ng.max()))))
top_edge = float(2 ** pmax)
edges = 2.0 ** np.arange(0, pmax + 1)

# With right=True, np.digitize returns 0 for values equal to edges[0], which
# would become bin -1 after the shift.  Clip so that ngpus == 1 lands in the
# first bin instead of being discarded.
# NOTE(review): the y-axis break used by the bar plot further down is
# hard-coded; re-check it once single-GPU jobs are part of the first bar.
idx = np.digitize(ng, bins=edges, right=True) - 1
idx = np.clip(idx, 0, len(edges) - 2)
counts = np.bincount(idx, minlength=len(edges) - 1).astype(float)

# Bar geometry: each bar spans its bin from the left edge.
left   = edges[:-1]
widths = np.diff(edges)
vals   = counts

# Height of the second-tallest bar; used to scale the lower panel of the
# broken-axis plot.
if vals.size >= 2:
    h_2nd = float(np.partition(vals, -2)[-2])
else:
    h_2nd = float(vals.max()) if vals.size else 1.0

def nice_up(x, base):
    """Round ``x`` up to the nearest multiple of ``base``, returned as an int."""
    multiples_needed = np.ceil(x / base)
    return int(multiples_needed * base)

# ---------------------------------------------------------------------------
# Bar plot of job counts per job-size bin, drawn on two stacked panels that
# share the x axis.  Both panels draw the same bars and differ only in their
# y range, which produces a broken y axis.
# ---------------------------------------------------------------------------

# Upper limit of the lower panel: 110% of the second-tallest bar, rounded up
# to a multiple of step_low (step_low itself is at least 50).
step_low = max(50, nice_up(max(1.0, h_2nd / 10), 10))
ylow_max = max(10, nice_up(1.10 * max(h_2nd, 1.0), step_low))

# setup_local / get_colors come from the project's setup_plot module.
# NOTE(review): presumably they apply plot styling and return a palette;
# confirm against setup_plot.
setup_local()
colors = get_colors()

fig, (ax_top, ax_bot) = plt.subplots(2, 1, sharex=True)

# Same bars on both panels; only the y limits set below differ.
bar_kw = dict(color=colors[2], edgecolor="black", align="edge")
ax_top.bar(left, vals, width=widths, **bar_kw)
ax_bot.bar(left, vals, width=widths, **bar_kw)

ax_bot.set_ylim(0, ylow_max)

# Hide the spines and x ticks that face the gap between the two panels.
ax_top.spines["bottom"].set_visible(False)
ax_bot.spines["top"].set_visible(False)
ax_top.tick_params(axis="x", which="both", bottom=False, top=False,
                   labelbottom=False, labeltop=False)

# Slanted line markers placed in axes coordinates at the left edge of the
# break: the bottom-left corner of the top panel and the top-left corner of
# the bottom panel.  clip_on=False lets them extend outside the axes.
d = 0.5
kwargs = dict(marker=[(-1, -d), (1, d)], markersize=12,
              linestyle="none", color="k", mec="k", mew=1, clip_on=False)
ax_top.plot([0], [0], transform=ax_top.transAxes, **kwargs)
ax_bot.plot([0], [1], transform=ax_bot.transAxes, **kwargs)

# NOTE(review): the top panel's ticks and limits are hard-coded rather than
# derived from `vals`; presumably tuned to one specific dataset.  Re-check
# whenever the input data changes.
ax_top.set_yticks([62000, 64000, 66000])
ax_top.set_ylim(60000, 66000)

# Log-2 x axis spanning from 1 GPU to the top bin edge.
for ax in (ax_top, ax_bot):
    ax.set_xscale("log", base=2)
    ax.set_xlim(1, top_edge)

# One tick per bin edge, labelled with the integer GPU count.
pow2_ticks = edges
ax_bot.set_xticks(pow2_ticks)
ax_bot.set_xticklabels([f"{int(t)}" for t in pow2_ticks],
                       rotation=90, ha="right", va="center",
                       rotation_mode="anchor", fontsize=20)
ax_bot.tick_params(axis="x", pad=2)
# NOTE(review): these ticks are hard-coded as well and are set after
# set_ylim(0, ylow_max); matplotlib may widen the y limits to include ticks
# that lie above ylow_max.  Confirm this is intended.
ax_bot.set_yticks([0, 2000, 4000, 6000])

for ax in (ax_top, ax_bot):
    ax.yaxis.grid(True, linestyle="--", alpha=0.7)
    ax.tick_params(axis="y", labelsize=20)

# Shared y label and title are positioned manually relative to the figure.
fig.supylabel("Number of jobs", fontsize=21, y=0.55, x=0.07)
ax_bot.set_xlabel("Job size (number of GPUs)", fontsize=21, labelpad=10)
plt.suptitle("Distribution of jobs by job size", fontsize=23, y=0.9, x=0.58)
plt.tight_layout()
plt.show()

# ---------------------------------------------------------------------------
# Box plot: job duration (hours, log y axis) grouped by job-size bin.
# ---------------------------------------------------------------------------
size_and_duration = job_df[["JobID", "ngpus", "duration_hours"]]
gputil_all = size_and_duration.dropna(subset=["ngpus", "duration_hours"]).copy()

gputil_all["ngpus"] = gputil_all["ngpus"].astype(int)
gputil_all["duration_hours"] = gputil_all["duration_hours"].astype(float)

# Half-open bins [lo, hi); the labels show the inclusive integer range.
bins = [0, 5, 9, 17, 33, 65, 129, 257, 513, 1025, 2049, 4097, 8193]
bin_labels = [f'[{lo}, {hi - 1}]' for lo, hi in zip(bins[:-1], bins[1:])]
gputil_all["bin"] = np.digitize(gputil_all["ngpus"], bins, right=False) - 1

# One Series of durations per bin (possibly empty), in bin order.
boxplot_data = []
for bin_no in range(len(bins) - 1):
    in_bin = gputil_all["bin"] == bin_no
    boxplot_data.append(gputil_all.loc[in_bin, "duration_hours"].dropna())

setup_local()
colors  = get_colors()
markers = get_markers()

fig, ax1 = plt.subplots()

mean_style = {
    "marker": markers[1],
    "markerfacecolor": colors[5],
    "markeredgecolor": colors[5],
}
# Whiskers mark the 5th and 95th percentiles; means are drawn as markers.
boxplot_stats = ax1.boxplot(
    boxplot_data,
    labels=bin_labels,
    patch_artist=True,
    whis=[5, 95],
    showfliers=True,
    showmeans=True,
    boxprops={"facecolor": colors[2]},
    meanprops=mean_style,
)

ax1.set_xlabel('Job size (number of GPUs)', fontsize=21)
ax1.set_ylabel('Job duration (hours)', fontsize=21)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax1.tick_params(axis='x', rotation=90, labelsize=20)
ax1.tick_params(axis='y', labelsize=20)
# Switch to the log scale before placing the decade ticks.
ax1.set_yscale('log')
ax1.set_yticks([0.01, 0.1, 1, 10, 100])

ax1.set_title("Distribution of duration by job size", fontsize=23, pad=16)
fig.tight_layout()
plt.show()

# ---------------------------------------------------------------------------
# Box plot: per-job mean GPU utilization grouped by power-of-two job-size bin,
# drawn on a log-2 x axis.  Bins are right-inclusive, (1, 2], (2, 4], ...,
# except the first one, which is closed on both sides ([1, 2]) so that
# single-GPU jobs are included.
# ---------------------------------------------------------------------------
gputil_all = (
    job_df[["JobID", "ngpus", "mean_utilization"]]
      .dropna(subset=["ngpus"])
      .copy()
)
gputil_all["ngpus"] = gputil_all["ngpus"].astype(int)

colors  = get_colors()
markers = get_markers()

ng = gputil_all["ngpus"].to_numpy()
ng = ng[ng > 0]
if ng.size < 2:
    raise ValueError("Need at least two positive ngpus values for a log-x histogram.")

# Smallest power of two that covers the largest job; at least 1 so that there
# is always one bin, even when every job uses a single GPU.
pmax = max(1, int(ceil(log2(ng.max()))))
edges = 2.0 ** np.arange(0, pmax + 1)
nbins = len(edges) - 1

gputil_all = gputil_all.loc[gputil_all["ngpus"] > 0].copy()
# With right=True, np.digitize returns 0 for values equal to edges[0], which
# would become bin -1 after the shift.  Clip so that ngpus == 1 lands in the
# first bin instead of being discarded.
raw_bin = np.digitize(gputil_all["ngpus"].to_numpy(), bins=edges, right=True) - 1
gputil_all["gh_bin"] = np.clip(raw_bin, 0, nbins - 1)

boxplot_data = [
    gputil_all.loc[gputil_all["gh_bin"] == i, "mean_utilization"].dropna().to_numpy()
    for i in range(nbins)
]

# Boxes sit at the geometric centre of their bin (the visual centre on a log
# axis) and are 60% of the bin's linear width.
centers = np.sqrt(edges[:-1] * edges[1:])
bin_widths = (edges[1:] - edges[:-1]) * 0.6

# Only bins that contain data get a box.
positions_nonempty, widths_nonempty, data_nonempty = [], [], []
for i, arr in enumerate(boxplot_data):
    if arr.size > 0:
        positions_nonempty.append(centers[i])
        widths_nonempty.append(bin_widths[i])
        data_nonempty.append(arr)

setup_local()
fig, ax1 = plt.subplots()

# manage_ticks=False keeps boxplot from replacing the x ticks set below.
# Whiskers mark the 5th and 95th percentiles; means are drawn as markers.
ax1.boxplot(
    data_nonempty,
    positions=positions_nonempty,
    widths=widths_nonempty,
    manage_ticks=False,
    patch_artist=True,
    whis=[5, 95],
    showfliers=True,
    showmeans=True,
    boxprops=dict(facecolor=colors[2]),
    meanprops=dict(marker=markers[1], markerfacecolor=colors[5], markeredgecolor=colors[5]),
)

ax1.set_xscale('log', base=2)
ax1.set_xlim(edges[0], edges[-1])
ax1.set_xticks(edges)
ax1.set_xticklabels([f"{int(e)}" for e in edges], rotation=90, ha="right", va="center", rotation_mode="anchor")

ax1.set_xlabel('Job size (number of GPUs)', fontsize=21)
ax1.set_ylabel('Mean (of GPU_UTIL) (%)', fontsize=21)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax1.tick_params(axis='x', rotation=90, labelsize=20)
ax1.tick_params(axis='y', labelsize=20)
ax1.set_ylim(0)
ax1.set_yticks([0, 20, 40, 60, 80, 100])

plt.title("Distribution of GPU_UTIL by job size", fontsize=23, pad=16)
plt.tight_layout()
plt.show()
