In [None]:
import pandas
import matplotlib.pyplot as plt
import numpy as np
import glob
import json

In [None]:
from scipy.ndimage import uniform_filter1d

In [None]:
# Base directory that every run-specific path in this notebook is appended to.
# NOTE(review): absolute, user-specific location; presumably a local checkout of
# the particleflow model outputs -- adjust when running elsewhere.
hf_path = "/scratch/persistent/joosep/huggingface/particleflow/"

In [None]:
# Per-step training-loss CSV exports of the three runs compared below
# (order matters: df1, df2, df3).
train_loss_csvs = (
    "/clic/clusters/v2.3.0/pyg-clic_20250204_081614_352844/pyg-clic_20250204_081614_352844_runs_train.csv",
    "/clic/clusters/v2.3.0/pyg-clic_20250211_145811_219129/pyg-clic_20250211_145811_219129_runs_train.csv",
    "/clic/clusters/v2.3.0/pyg-clic_20250130_214007_333962/pyg-clic_20250130_214007_333962_runs_train.csv",
)
df1, df2, df3 = (pandas.read_csv(hf_path + rel_path) for rel_path in train_loss_csvs)

# Time elapsed since the first logged row of each run, converted from seconds to hours.
for frame in (df1, df2, df3):
    frame["RelTime"] = (frame["Wall time"] - frame["Wall time"][0]) / 3600

# Express every loss relative to the final smoothed loss of the third run.
# normval must be computed before df3["Value"] itself is rescaled.
time_window = 20
normval = uniform_filter1d(df3["Value"], time_window)[-1]
for frame in (df1, df2, df3):
    frame["Value"] = frame["Value"] / normval

In [None]:
plt.figure(figsize=(6,3))

# One (frame, legend label) pair per run; each run is drawn twice in the same
# colour: the raw curve faintly and its moving average on top.
train_curves = [
    (df1, "GNNLSH"),
    (df2, "full transformer"),
    (df3, "full transformer+FlashAttention"),
]
for frame, run_label in train_curves:
    hours = frame["RelTime"]
    (raw_line,) = plt.plot(hours, frame["Value"], alpha=0.2)
    smoothed = uniform_filter1d(frame["Value"], time_window)
    plt.plot(hours, smoothed, color=raw_line.get_color(), label=run_label)

plt.xlim(-1,75)
plt.ylim(0.5, 4)

plt.xlabel("Training time [hours]")
plt.ylabel("Relative training loss")
plt.legend(loc="best", frameon=False)
plt.savefig("step_train_loss.pdf", bbox_inches="tight")

In [None]:
def load_history(path, min_epoch=None, max_epoch=None):
    """Load per-epoch loss histories from JSON files into one DataFrame.

    Every file matched by ``path`` must contain a JSON object with ``"train"``
    and ``"valid"`` sub-objects. Their entries are flattened into columns named
    ``train_<key>`` and ``valid_<key>``. The epoch number of a file is the text
    between the last underscore of its path and the following dot
    (e.g. ``.../epoch_12.json`` -> 12).

    Args:
        path: Glob pattern selecting the history files.
        min_epoch: First epoch to include. ``None`` selects the smallest epoch
            found on disk. An explicit ``0`` is honoured.
        max_epoch: Last epoch to include (inclusive). ``None`` selects the
            largest epoch found on disk. An explicit ``0`` is honoured.

    Returns:
        pandas.DataFrame with one row per epoch in ascending epoch order. The
        index is the default 0-based integer index, so row 0 is ``min_epoch``.

    Raises:
        ValueError: If a bound has to be inferred but no file matches ``path``.
        KeyError: If an epoch inside the requested range has no file.
    """
    by_epoch = {}
    for fi in glob.glob(path):
        # Context manager so the file handle is closed deterministically.
        with open(fi, encoding="utf-8") as fh:
            data = json.load(fh)
        epoch = int(fi.split("_")[-1].split(".")[0])
        by_epoch[epoch] = {
            f"{split}_{key}": value
            for split in ("train", "valid")
            for key, value in data[split].items()
        }

    if not by_epoch and (min_epoch is None or max_epoch is None):
        raise ValueError(f"no history files match {path!r}")

    # Compare against None explicitly: epoch 0 is a legitimate bound.
    if max_epoch is None:
        max_epoch = max(by_epoch)
    if min_epoch is None:
        min_epoch = min(by_epoch)

    wanted = range(min_epoch, max_epoch + 1)
    missing = [epoch for epoch in wanted if epoch not in by_epoch]
    if missing:
        raise KeyError(f"no history file for epoch(s) {missing} matching {path!r}")

    return pandas.DataFrame([by_epoch[epoch] for epoch in wanted])

In [None]:
# Epoch-level loss history of the v2.2.0 run, restricted to epochs up to and including 5.
history_glob = hf_path + "clic/clusters/v2.2.0/pyg-clic_20250106_193536_269746/history/epoch_*.json"
history = load_history(history_glob, max_epoch=5)
# history = load_history(hf_path + "clic/clusters/v2.3.0/pyg-clic_20250130_214007_333962/history/epoch_*.json", max_epoch=10)

In [None]:
# Element-wise sum of the five per-target validation regression losses,
# giving one combined regression loss per row of `history`.
regression_targets = ["energy", "pt", "eta", "sin_phi", "cos_phi"]
reg_total = 0
for target in regression_targets:
    reg_total = reg_total + history[f"valid_Regression_{target}"].values
val_reg_loss = np.array(reg_total)

In [None]:
plt.figure(figsize=(6,3))

# (values, marker, legend label) for every validation-loss component; the
# classification and regression terms are scaled up to share the axis with
# the total loss.
valid_curves = [
    (history["valid_Total"], "s", "total"),
    (history["valid_Classification_binary"], "v", "primary classification"),
    (100*history["valid_Classification"], "^", "PID classification x100"),
    (10*val_reg_loss, "o", "regression x10"),
]
for values, marker_style, curve_label in valid_curves:
    plt.plot(history.index, values, marker=marker_style, label=curve_label)

plt.legend(loc="best", frameon=False)
plt.ylim(1,3)
# Rows are 0-indexed; label the ticks with 1-based epoch numbers.
plt.xticks(range(0,5), range(1,6))
plt.xlabel("Training epoch")
plt.ylabel("Validation loss")
#plt.yscale("log")
plt.savefig("epoch_valid_loss.pdf", bbox_inches="tight")

In [None]:
import os
os.environ["KERAS_BACKEND"] = "torch"

import tensorflow as tf
from tensorflow.core.util import event_pb2
import pandas
import matplotlib.pyplot as plt

In [None]:
def tb_to_df(path, tag="epoch/loss_Total"):
    """Extract one scalar series from a TensorBoard event file.

    The file is read record by record, each record is parsed as an ``Event``
    protobuf, and every summary value whose tag equals ``tag`` contributes one
    row to the result.

    Args:
        path: Path of the ``events.out.tfevents.*`` file to read.
        tag: Summary tag to select. Defaults to ``"epoch/loss_Total"``, the
            tag this function was originally hard-wired to.

    Returns:
        pandas.DataFrame with the columns ``wall_time`` (the event's
        ``wall_time`` field) and ``step_loss`` (the value's ``simple_value``),
        in file order.
    """
    wall_times = []
    step_losses = []
    for record in tf.data.TFRecordDataset(path):
        event = event_pb2.Event.FromString(record.numpy())
        for val in event.summary.value:
            # NOTE(review): only `simple_value` is read; scalars a writer stored
            # in the `tensor` field would not be picked up here -- confirm that
            # the files at hand use `simple_value`.
            if val.tag == tag:
                wall_times.append(event.wall_time)
                step_losses.append(val.simple_value)

    return pandas.DataFrame({"wall_time": wall_times, "step_loss": step_losses})

In [None]:
# TensorBoard validation event files of the three runs compared in the
# large-batch study (order matters: df1, df2, df3).
valid_event_files = (
    "/clic/clusters/v2.3.0/pyg-clic_20250130_214007_333962/runs/valid/events.out.tfevents.1738266016.gpu1.local.3836159.1",
    "/clic/clusters/v2.3.0/largebatch_study_gpus4_notscaledLR0.0001_epochs30_bsm256_adamw_a100_cu124_fulldataset_pyg-clic-v230_20250219_055135_172489/runs/valid/events.out.tfevents.1739962531.workergpu047.299220.1",
    "/clic/clusters/v2.3.0/largebatch_study_gpus4_linearscaledLR0.0004_epochs30_bsm256_adamw_a100_cu124_fulldataset_pyg-clic-v230_20250217_082738_406721/runs/valid/events.out.tfevents.1739799057.workergpu041.2328094.1",
)
df1, df2, df3 = (tb_to_df(hf_path + rel_path) for rel_path in valid_event_files)

# Shift every run so that its first logged event sits at time zero.
for frame in (df1, df2, df3):
    frame["wall_time"] -= frame["wall_time"].values[0]

In [None]:
plt.figure(figsize=(6,3))

# All curves are shown relative to the best validation loss of the default run (df1).
lowest_val = np.min(df1["step_loss"])
largebatch_curves = (
    (df1, "default"),
    (df2, "GPU x4"),
    (df3, "GPU x4, lr x4"),
)
for frame, run_label in largebatch_curves:
    elapsed_hours = frame["wall_time"]/3600
    relative_loss = frame["step_loss"]/lowest_val
    plt.plot(elapsed_hours, relative_loss, label=run_label)
# Reference line at the default run's minimum.
plt.axhline(1.0, color="black", ls="--", lw=1)

plt.xlim(-1,75)
plt.ylim(0.95, 1.25)
plt.xlabel("Training time [hours]")
plt.ylabel("Relative validation loss")
plt.legend(loc="best", frameon=False)
plt.savefig("largebatch_valid_loss.pdf", bbox_inches="tight")