In [None]:
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.compute as pc
import pyarrow.parquet as pq

import pandas as pd
import numpy as np
from collections import defaultdict

# Cell purpose: stream the cleaned LDMS parquet file once, accumulate the
# utilization samples per (JobID, hostname, gpu_id), and write one row per job
# holding the unweighted mean of its per-GPU mean utilization.
PARQUET_CLEAN   = "ldms_set1.parquet"
JOB_METRICS_OUT = "job_metrics.parquet"
UTIL_COLUMN     = "nersc_ldms_dcgm_gpu_utilization"
BATCH_SIZE      = 500_000
ROW_GROUP_SIZE  = 500_000

# Running aggregates keyed by (JobID, hostname, gpu_id):
#   total_ts  -> number of non-NaN utilization samples seen
#   sum_util  -> sum of those samples
stats = defaultdict(lambda: {"total_ts": 0, "sum_util": 0.0})

# Only the four columns needed for the aggregation are read, in record batches
# of BATCH_SIZE rows.
dataset = ds.dataset(PARQUET_CLEAN, format="parquet")
scanner = dataset.scanner(
    columns=["JobID", "hostname", "gpu_id", UTIL_COLUMN],
    batch_size=BATCH_SIZE,
    use_threads=True
)

for rb in scanner.to_batches():
    jid  = pc.cast(rb["JobID"], pa.string()).to_pylist()
    host = rb["hostname"].to_pylist()
    # NOTE(review): to_numpy() is called with its default zero-copy setting here,
    # which presumably requires gpu_id to contain no nulls — confirm.
    gpu  = pc.cast(rb["gpu_id"], pa.int64()).to_numpy()
    # zero_copy_only=False allows a copy to be made for the utilization column;
    # missing samples are then skipped through the np.isnan check below.
    util = pc.cast(rb[UTIL_COLUMN], pa.float64()).to_numpy(zero_copy_only=False)

    for j, h, g, u in zip(jid, host, gpu, util):
        # Rows without a job id or without a utilization value do not count.
        if j is None or np.isnan(u):
            continue
        key = (str(j), h, int(g))
        d = stats[key]
        d["total_ts"] += 1
        d["sum_util"] += float(u)

# One row per (JobID, hostname, gpu_id) with its sample count and sample sum.
per_gpu = pd.DataFrame.from_records(
    [(j, h, g, v["total_ts"], v["sum_util"]) for (j, h, g), v in stats.items()],
    columns=["JobID", "hostname", "gpu_id", "total_timestamps", "total_utilization"]
)
del stats  # the dict is no longer needed once the frame exists

# Defensive filter: every key in `stats` was incremented at least once when it
# was created, so this guards the division below rather than dropping rows.
per_gpu = per_gpu[per_gpu["total_timestamps"] > 0].copy()
per_gpu["gpu_mean"] = per_gpu["total_utilization"] / per_gpu["total_timestamps"]

# Job-level value = plain mean of the per-GPU means (each GPU weighs the same,
# regardless of how many samples it contributed).
job_df = (
    per_gpu.groupby("JobID", as_index=False)
           .agg(mean_utilization=("gpu_mean", "mean"))
)

pq.write_table(
    pa.Table.from_pandas(job_df, preserve_index=False),
    JOB_METRICS_OUT,
    compression="snappy",
    row_group_size=ROW_GROUP_SIZE,
)
# Notebook display of the first rows.
job_df.head()


In [None]:
from pathlib import Path
from dataclasses import dataclass
from collections import defaultdict

import numpy as np
import pandas as pd

import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.compute as pc
import pyarrow.parquet as pq

# Input parquet file and the metric column whose spatial imbalance is computed.
SET1   = "ldms_set1.parquet"
METRIC = "nersc_ldms_dcgm_gpu_utilization"

# Sampling period and window length, in seconds. A window must hold a whole
# number of sampling ticks, which the assert enforces.
SAMPLE_SEC = 10
WINDOW_SEC = 60
assert WINDOW_SEC % SAMPLE_SEC == 0
TICKS_PER_WINDOW = WINDOW_SEC // SAMPLE_SEC

# Sampling period scaled by 1e9; used to bucket differences of the `ts_ns`
# column (presumably nanosecond timestamps — confirm against the data).
SNS = SAMPLE_SEC * 1_000_000_000
BATCH_SIZE = 500_000

# Output directories for the per-window and per-job result files; created up
# front so the writers below can assume they exist.
OUT_DIR_WIN_S1 = Path("si_windows_set1_ldms_parts")
OUT_DIR_JOB_S1 = Path("si_jobs_set1_ldms_parts")
OUT_DIR_WIN_S1.mkdir(parents=True, exist_ok=True)
OUT_DIR_JOB_S1.mkdir(parents=True, exist_ok=True)

@dataclass
class JobAnchor:
    """Per-job reference values produced by ``build_job_anchors``."""

    # Smallest `ts_ns` value observed for the job; window indices are measured
    # relative to it.
    ts0: int
    # Number of distinct (hostname, gpu_id) pairs observed for the job.
    g_ldms: int

def build_job_anchors(parquet_path: str) -> dict:
    """Scan the parquet file and build one ``JobAnchor`` per multi-GPU job.

    Args:
        parquet_path: Path of the parquet dataset to scan. Only the columns
            ``JobID``, ``hostname``, ``gpu_id`` and ``ts_ns`` are read.

    Returns:
        A dict mapping JobID (as a string) to a ``JobAnchor`` holding the
        job's earliest ``ts_ns`` and the number of distinct
        (hostname, gpu_id) pairs seen for it. Jobs with a single observed
        GPU are left out, because spatial imbalance is only defined across
        several GPUs. Rows whose JobID is null are ignored.
    """
    source = ds.dataset(parquet_path, format="parquet")
    batches = source.scanner(
        columns=["JobID", "hostname", "gpu_id", "ts_ns"],
        batch_size=BATCH_SIZE,
        use_threads=True,
    ).to_batches()

    earliest_ts = {}                  # JobID -> smallest ts_ns seen so far
    gpus_by_job = defaultdict(set)    # JobID -> {(hostname, gpu_id), ...}

    for batch in batches:
        job_ids = pc.cast(batch["JobID"], pa.string()).to_pylist()
        hosts = batch["hostname"].to_pylist()
        gpu_ids = pc.cast(batch["gpu_id"], pa.int64()).to_numpy()
        stamps = pc.cast(batch["ts_ns"], pa.int64()).to_numpy()

        for job, hostname, gpu, stamp in zip(job_ids, hosts, gpu_ids, stamps):
            if job is None:
                continue
            if job not in earliest_ts or stamp < earliest_ts[job]:
                earliest_ts[job] = int(stamp)
            gpus_by_job[job].add((hostname, int(gpu)))

    # Keep only jobs observed on more than one GPU.
    return {
        job: JobAnchor(ts0=int(first_ts), g_ldms=len(gpus_by_job[job]))
        for job, first_ts in earliest_ts.items()
        if len(gpus_by_job[job]) > 1
    }

def compute_gpuutil_si(parquet_path: str, anchors: dict) -> None:
    """Compute the spatial imbalance (SI) of ``METRIC`` per window and per job.

    The file is scanned once. Every non-NaN sample of a job present in
    ``anchors`` is assigned to a window index relative to the job's ``ts0``
    (samples earlier than ``ts0`` are skipped) and added to the running total
    of its (job, window, GPU) cell. For each (job, window) the imbalance is

        SI = 1 - sum_TC / (max_TC * g_ldms)

    clamped to [0, 1], and 0 when ``max_TC`` or ``g_ldms`` is not positive.
    The per-job value is the mean SI over the windows present for that job;
    jobs whose windows all have ``max_TC <= 0`` are not reported.

    Args:
        parquet_path: Path of the parquet dataset to scan.
        anchors: Mapping JobID -> ``JobAnchor`` as returned by
            ``build_job_anchors``.

    Returns:
        None. Results are written to ``OUT_DIR_WIN_S1/<METRIC>.parquet``
        (per window) and ``OUT_DIR_JOB_S1/<METRIC>.parquet`` (per job). A
        message is printed, and nothing further is written, when there are
        no windows or no job passes the filters.
    """
    half_tick = SNS // 2
    batches = ds.dataset(parquet_path, format="parquet").scanner(
        columns=["JobID", "hostname", "gpu_id", "ts_ns", METRIC],
        batch_size=BATCH_SIZE,
        use_threads=True,
    ).to_batches()

    # (job, window index, (hostname, gpu_id)) -> summed metric value.
    gpu_window_total = defaultdict(float)
    seen_jobs = set()

    for batch in batches:
        job_ids = pc.cast(batch["JobID"], pa.string()).to_pylist()
        hosts = batch["hostname"].to_pylist()
        gpu_ids = pc.cast(batch["gpu_id"], pa.int64()).to_numpy()
        stamps = pc.cast(batch["ts_ns"], pa.int64()).to_numpy()
        values = pc.cast(batch[METRIC], pa.float64()).to_numpy(zero_copy_only=False)

        for job, hostname, gpu, stamp, value in zip(job_ids, hosts, gpu_ids, stamps, values):
            if job is None or np.isnan(value):
                continue
            anchor = anchors.get(job)
            if anchor is None:
                continue

            elapsed = stamp - anchor.ts0
            if elapsed < 0:
                continue

            # Round to the nearest sampling tick, then map the tick to its window.
            tick = int((elapsed + half_tick) // SNS)
            window = tick // TICKS_PER_WINDOW

            gpu_window_total[(job, window, (hostname, int(gpu)))] += float(value)
            seen_jobs.add(job)

    if not gpu_window_total:
        print("no present windows.")
        return

    # (job, window) -> [sum over GPUs, max over GPUs]; the max starts at 0.0.
    window_stats = {}
    for (job, window, _gpu_key), total in gpu_window_total.items():
        entry = window_stats.setdefault((job, window), [0.0, 0.0])
        entry[0] += total
        if total > entry[1]:
            entry[1] = total

    win_rows = []
    for (job, window), (window_sum, window_max) in window_stats.items():
        gpu_count = anchors[job].g_ldms

        si = 0.0
        if window_max > 0 and gpu_count > 0:
            si = 1.0 - (window_sum / (window_max * gpu_count))
            if si < 0.0:
                si = 0.0
            if si > 1.0:
                si = 1.0

        win_rows.append(
            (job, METRIC, window, float(window_sum), float(window_max), int(gpu_count), float(si))
        )

    win_df = pd.DataFrame(
        win_rows,
        columns=["JobID", "metric", "window_idx", "sum_TC", "max_TC", "g_ldms", "SI_jw"],
    )
    win_table = pa.Table.from_pandas(win_df, preserve_index=False)
    pq.write_table(win_table, OUT_DIR_WIN_S1 / f"{METRIC}.parquet", compression="snappy")

    # Per-job roll-up of the window rows just written.
    windows_present = defaultdict(int)
    si_total = defaultdict(float)
    active_jobs = set()
    for job, _metric, _window, _sum_tc, window_max, _g_ldms, si in win_rows:
        windows_present[job] += 1
        si_total[job] += si
        if window_max > 0:
            active_jobs.add(job)

    job_rows = []
    for job in seen_jobs:
        n_windows = windows_present.get(job, 0)
        if n_windows == 0 or job not in active_jobs:
            continue
        mean_si = si_total[job] / float(n_windows)
        job_rows.append((job, METRIC, int(anchors[job].g_ldms), int(n_windows), float(mean_si)))

    if not job_rows:
        print("no jobs passed the filters.")
        return

    job_df = pd.DataFrame(job_rows, columns=["JobID", "metric", "g_ldms", "w_present", "SI_mean"])
    job_table = pa.Table.from_pandas(job_df, preserve_index=False)
    pq.write_table(job_table, OUT_DIR_JOB_S1 / f"{METRIC}.parquet", compression="snappy")

# Driver: first pass collects the per-job anchors (multi-GPU jobs only), second
# pass computes and writes the spatial-imbalance tables. An empty anchor dict
# means there is nothing to compute, so fail loudly instead of writing nothing.
anchors = build_job_anchors(SET1)
if not anchors:
    raise RuntimeError("No multi-GPU jobs found.")
compute_gpuutil_si(SET1, anchors)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from setup_plot import setup_local, get_colors, get_markers

# Inputs for the plots: the per-job mean utilization table and the per-job
# spatial-imbalance table for the GPU utilization metric.
JOB_METRICS = "job_metrics.parquet"
SI_PATH = "si_jobs_set1_ldms_parts/nersc_ldms_dcgm_gpu_utilization.parquet"

def load_si(si_path):
    """Load the per-job spatial-imbalance table.

    Args:
        si_path: Path of a parquet file with at least the columns ``JobID``
            and ``SI_mean``.

    Returns:
        A DataFrame with two columns: ``JobID`` (converted to ``str``) and
        ``spatial_imbalance_gpu_utilization`` (the renamed ``SI_mean``).
    """
    raw = pd.read_parquet(si_path)
    selected = raw[["JobID", "SI_mean"]]
    renamed = selected.rename(columns={"SI_mean": "spatial_imbalance_gpu_utilization"})
    # JobID is normalised to str so it can be merged with the job-metrics table.
    return renamed.assign(JobID=renamed["JobID"].astype(str))

def load_job_metrics(job_metrics_path):
    """Load the per-job mean utilization table.

    Args:
        job_metrics_path: Path of a parquet file containing the columns
            ``JobID`` and ``mean_utilization``; only those two are read.

    Returns:
        A new DataFrame with ``JobID`` converted to ``str`` and
        ``mean_utilization`` unchanged.
    """
    wanted = ["JobID", "mean_utilization"]
    metrics = pd.read_parquet(job_metrics_path, columns=wanted)
    # assign() returns a fresh frame, so the loaded one is never mutated.
    return metrics.assign(JobID=metrics["JobID"].astype(str))

def compute_hist_cdf_low(series):
    """Histogram and cumulative-percentage curve on fixed [0, 1] bins.

    NaN values are dropped and the rest clipped to [0, 1] before counting
    into 20 equal-width bins spanning [0, 1].

    Args:
        series: pandas Series of numeric values.

    Returns:
        Tuple ``(hist_values, bin_edges, cdf)`` where ``hist_values`` holds
        the 20 bin counts, ``bin_edges`` the 21 edges, and ``cdf`` the
        cumulative count expressed in percent of the total (all zeros, as
        floats, when there is nothing to count).
    """
    cleaned = series.dropna().clip(0, 1)
    counts, edges = np.histogram(cleaned, bins=20, range=(0, 1), density=False)
    running = np.cumsum(counts)
    total = running[-1]
    if total > 0:
        cdf = running / total * 100.0
    else:
        # Empty input: keep the zero curve but as floats, like the normal case.
        cdf = running.astype(float)
    return counts, edges, cdf

def compute_hist_cdf_mid_high(series):
    """Histogram and cumulative-percentage curve for the mid/high bands.

    NaN values are dropped and the rest clipped to [0, 1], then counted into
    20 equal-width bins fixed on [0, 1]. Pinning the range matters: the
    plotting helpers draw their bars with ``bins=20, range=(0, 1)``, so the
    CDF must use the same edges to line up with the bars. Letting
    ``np.histogram`` derive the range from the data (as before) shifted the
    curve whenever the data did not span exactly [0, 1], and raised on NaN.

    Args:
        series: pandas Series of numeric values.

    Returns:
        Tuple ``(hist_values, bin_edges, cdf)``: the 20 bin counts, the 21
        bin edges from 0 to 1, and the cumulative count in percent of the
        total (all zeros, as floats, when there is nothing to count).
    """
    cleaned = series.dropna().clip(0, 1)
    hist_values, bin_edges = np.histogram(cleaned, bins=20, range=(0, 1), density=False)
    cumulative_hist = np.cumsum(hist_values)
    total = cumulative_hist[-1]
    cdf = cumulative_hist / total * 100 if total > 0 else cumulative_hist.astype(float)
    return hist_values, bin_edges, cdf

def plot_band_0_30(df, bin_edges, cdf):
    """Plot the spatial-imbalance histogram and CDF for the 0%–30% band.

    The left axis shows a 20-bin histogram over [0, 1] of
    ``df["spatial_imbalance_gpu_utilization"]``; the right (twin) axis shows
    ``cdf`` plotted at the centres of ``bin_edges``. The job-count axis is
    fixed to 0–10000 and the percentage axis to 0–100.

    Args:
        df: DataFrame with a ``spatial_imbalance_gpu_utilization`` column.
        bin_edges: Bin edges matching ``cdf`` (one more entry than ``cdf``).
        cdf: Cumulative percentages, one per bin.

    Returns:
        ``cdf``, unchanged, so the caller can keep it for display.
    """
    # Project plotting helpers from setup_plot; presumably they configure the
    # matplotlib style and provide the colour/marker palettes — confirm there.
    setup_local()
    colors = get_colors()
    markers = get_markers()

    fig, ax1 = plt.subplots()
    ax1.hist(df["spatial_imbalance_gpu_utilization"], bins=20, range=(0, 1),
             edgecolor="black", alpha=0.7, color=colors[2])
    ax1.set_xlabel("Spatial imb. (of GPU_UTIL)\n(mean of GPU_UTIL: 0%–30%)", fontsize=21)
    ax1.set_ylabel("Number of jobs", fontsize=21)
    ax1.set_xlim(0)
    ax1.set_ylim(0, 10000)
    ax1.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
    ax1.set_yticks([0, 2000, 4000, 6000, 8000, 10000])
    ax1.tick_params(axis="x", labelsize=20)
    ax1.tick_params(axis="y", labelsize=20)

    # Second y-axis sharing the x-axis, used for the cumulative percentage.
    ax2 = ax1.twinx()
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2.0
    ax2.plot(bin_centers, cdf, color=colors[0], marker=markers[2],
             label="CDF (number of Jobs)", linewidth=2, clip_on=False)
    ax2.set_ylabel("Cumulative percentage (%)", fontsize=21)
    ax2.set_ylim(0, 100)
    ax2.set_yticks([0, 20, 40, 60, 80, 100])
    ax2.tick_params(axis="x", labelsize=20)
    ax2.tick_params(axis="y", labelsize=20)
    ax2.grid(axis="y", linestyle="--", alpha=0.7)

    plt.legend(loc="best", fontsize=18, framealpha=0.5)
    plt.tight_layout()
    plt.show()
    # Release the figure once shown, as the other band plots do, so repeated
    # calls do not accumulate open figures in pyplot.
    plt.close()
    return cdf

def plot_band_31_69(df, bin_edges, cdf):
    """Plot the spatial-imbalance histogram and CDF for the 31%–69% band.

    The left axis shows a 20-bin histogram over [0, 1] of
    ``df["spatial_imbalance_gpu_utilization"]``; the right (twin) axis shows
    ``cdf`` plotted at the centres of ``bin_edges``. The job-count axis is
    fixed to 0–10000 and the percentage axis to 0–100. The figure is closed
    after being shown.

    Args:
        df: DataFrame with a ``spatial_imbalance_gpu_utilization`` column.
        bin_edges: Bin edges matching ``cdf`` (one more entry than ``cdf``).
        cdf: Cumulative percentages, one per bin.

    Returns:
        ``cdf``, unchanged, so the caller can keep it for display.
    """
    # Project plotting helpers from setup_plot; presumably they configure the
    # matplotlib style and provide the colour/marker palettes — confirm there.
    setup_local()
    colors, markers = get_colors(), get_markers()

    fig, ax1 = plt.subplots()
    ax1.hist(df["spatial_imbalance_gpu_utilization"], bins=20, range=(0, 1),
             edgecolor="black", alpha=0.7, color=colors[2])
    # The x-label is set once; an earlier "Temporal Imbalance" label that was
    # immediately overwritten has been dropped.
    ax1.set_ylabel("Number of jobs", fontsize=21)
    ax1.set_xlim(0)
    ax1.set_xlabel("Spatial imb. (of GPU_UTIL)\n(mean of GPU_UTIL: 31%–69%)", fontsize=21)
    ax1.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
    ax1.set_yticks([0, 2000, 4000, 6000, 8000, 10000])
    ax1.set_ylim(0, 10000)
    ax1.tick_params(axis="x", labelsize=20)
    ax1.tick_params(axis="y", labelsize=20)
    ax1.grid(axis="y", linestyle="--", alpha=0.7)

    # Second y-axis sharing the x-axis, used for the cumulative percentage.
    ax2 = ax1.twinx()
    ax2.plot((bin_edges[:-1] + bin_edges[1:]) / 2, cdf, color=colors[0], marker=markers[2],
             label="CDF (number of jobs)", linewidth=2, clip_on=False)
    ax2.set_ylabel("Cumulative percentage (%)", fontsize=21)
    ax2.grid(axis="y", linestyle="--", alpha=0.7)
    ax2.tick_params(axis="x", labelsize=20)
    ax2.tick_params(axis="y", labelsize=20)
    ax2.set_yticks([0, 20, 40, 60, 80, 100])
    ax2.set_ylim(0, 100)

    plt.legend(loc="best", fontsize=18, framealpha=0.5)
    plt.tight_layout()
    plt.show()
    plt.close()
    return cdf

def plot_band_70_100(df, bin_edges, cdf):
    """Plot the spatial-imbalance histogram and CDF for the 70%–100% band.

    The left axis shows a 20-bin histogram over [0, 1] of
    ``df["spatial_imbalance_gpu_utilization"]``; the right (twin) axis shows
    ``cdf`` plotted at the centres of ``bin_edges``. The job-count axis is
    fixed to 0–10000 and the percentage axis to 0–100. The figure is closed
    after being shown.

    Args:
        df: DataFrame with a ``spatial_imbalance_gpu_utilization`` column.
        bin_edges: Bin edges matching ``cdf`` (one more entry than ``cdf``).
        cdf: Cumulative percentages, one per bin.

    Returns:
        ``cdf``, unchanged, so the caller can keep it for display.
    """
    # Project plotting helpers from setup_plot; presumably they configure the
    # matplotlib style and provide the colour/marker palettes — confirm there.
    setup_local()
    colors, markers = get_colors(), get_markers()

    fig, ax1 = plt.subplots()
    ax1.hist(df["spatial_imbalance_gpu_utilization"], bins=20, range=(0, 1),
             edgecolor="black", alpha=0.7, color=colors[2])
    # The x-label is set once; an earlier "Temporal Imbalance" label that was
    # immediately overwritten has been dropped. The y-label uses the same font
    # size as the other two band plots.
    ax1.set_ylabel("Number of jobs", fontsize=21)
    ax1.set_xlim(0)
    ax1.set_xlabel("Spatial imb. (of GPU_UTIL)\n(mean of GPU_UTIL: 70%–100%)", fontsize=21)
    ax1.set_yticks([0, 2000, 4000, 6000, 8000, 10000])
    ax1.set_ylim(0, 10000)
    ax1.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
    ax1.tick_params(axis="x", labelsize=20)
    ax1.tick_params(axis="y", labelsize=20)
    ax1.grid(axis="y", linestyle="--", alpha=0.7)

    # Second y-axis sharing the x-axis, used for the cumulative percentage.
    ax2 = ax1.twinx()
    ax2.plot((bin_edges[:-1] + bin_edges[1:]) / 2, cdf, color=colors[0], marker=markers[2],
             label="CDF (Number of Jobs)", linewidth=2, clip_on=False)
    ax2.set_ylabel("Cumulative percentage (%)", fontsize=21)
    ax2.grid(axis="y", linestyle="--", alpha=0.7)
    ax2.tick_params(axis="x", labelsize=20)
    ax2.tick_params(axis="y", labelsize=20)
    ax2.set_yticks([0, 20, 40, 60, 80, 100])
    ax2.set_ylim(0, 100)

    plt.legend(loc="center right", fontsize=18)
    plt.tight_layout()
    plt.show()
    plt.close()
    return cdf

# Join spatial imbalance with mean utilization per job. The inner merge keeps
# only jobs present in both tables.
si = load_si(SI_PATH)
jm = load_job_metrics(JOB_METRICS)
gputil_spatial = si.merge(jm, on="JobID", how="inner")

# 0–30: jobs with 0 <= mean_utilization <= 30 (both bounds inclusive)
gputil_spatial_30 = gputil_spatial[(gputil_spatial["mean_utilization"] >= 0) &
                                   (gputil_spatial["mean_utilization"] <= 30)].copy()
_, bin_edges_30, cdf_30 = compute_hist_cdf_low(gputil_spatial_30["spatial_imbalance_gpu_utilization"])
cdf_0_30 = plot_band_0_30(gputil_spatial_30, bin_edges_30, cdf_30)

# 31–69: jobs with 30 < mean_utilization < 70 (both bounds exclusive, so
# fractional values such as 30.5 or 69.9 fall in this band)
gputil_spatial_30_70 = gputil_spatial.query("30 < mean_utilization < 70").copy()
_, bin_edges_30_70, cdf_30_70 = compute_hist_cdf_mid_high(gputil_spatial_30_70["spatial_imbalance_gpu_utilization"])
cdf_31_69 = plot_band_31_69(gputil_spatial_30_70, bin_edges_30_70, cdf_30_70)

# 70–100: jobs with 70 <= mean_utilization <= 100 (both bounds inclusive)
gputil_spatial_70 = gputil_spatial.query("70 <= mean_utilization <= 100").copy()
_, bin_edges_70, cdf_70 = compute_hist_cdf_mid_high(gputil_spatial_70["spatial_imbalance_gpu_utilization"])
cdf_70_100 = plot_band_70_100(gputil_spatial_70, bin_edges_70, cdf_70)

# Notebook display of the three cumulative-percentage arrays.
cdf_0_30, cdf_31_69, cdf_70_100
