In [None]:
import os
# Select the "torch" Keras backend through the environment. This is done in its
# own cell ahead of the tensorflow import, presumably so the setting is already
# in place when Keras gets loaded.
# NOTE(review): this notebook only uses tf.data and the event protobuf --
# confirm the variable is actually required here.
os.environ["KERAS_BACKEND"] = "torch"

In [None]:
import tensorflow as tf
from tensorflow.core.util import event_pb2
import pandas
import matplotlib.pyplot as plt

In [None]:
from scipy.ndimage import uniform_filter1d

In [None]:
# Window size passed to uniform_filter1d when the loss curves are smoothed in
# the plotting cell. Despite the name it counts logged points, not seconds;
# with a size of 1 the filter returns the curve unchanged.
time_window = 1

In [None]:
def tb_to_df(path, tag="epoch/loss_Total"):
    """Read one scalar series from a TensorBoard event file into a DataFrame.

    Each record of the file is parsed as an ``Event`` protobuf; every summary
    value whose tag equals ``tag`` contributes one row to the result.

    Args:
        path: Location of the event file, handed to
            ``tf.data.TFRecordDataset``.
        tag: Summary tag to extract. Defaults to ``"epoch/loss_Total"``,
            the only tag this function used to read.

    Returns:
        pandas.DataFrame with two columns, one row per matching value in
        file order:
            wall_time -- ``wall_time`` of the event carrying the value
            loss      -- ``simple_value`` of the summary value
        The frame has no rows when nothing in the file matches ``tag``.
    """
    rows = []
    for record in tf.data.TFRecordDataset(path):
        event = event_pb2.Event.FromString(record.numpy())
        for val in event.summary.value:
            if val.tag == tag:
                rows.append((event.wall_time, val.simple_value))

    # Build the frame in a single call rather than growing an empty one.
    return pandas.DataFrame(rows, columns=["wall_time", "loss"])

In [None]:
# Validation log of the run labelled "default" in the comparison plot.
df1 = tb_to_df(
    "/scratch/persistent/joosep/huggingface/particleflow/clic/clusters/v2.3.0/pyg-clic_20250130_214007_333962/runs/valid/events.out.tfevents.1738266016.gpu1.local.3836159.1"
)
# Shift the time axis so that the first logged point sits at zero.
df1["wall_time"] = df1["wall_time"] - df1["wall_time"].values[0]

In [None]:
# Validation log of the run labelled "4 GPUs" in the comparison plot.
df2 = tb_to_df(
    "/scratch/persistent/joosep/huggingface/particleflow/clic/clusters/v2.3.0/largebatch_study_gpus4_notscaledLR0.0001_epochs30_bsm256_adamw_a100_cu124_fulldataset_pyg-clic-v230_20250219_055135_172489/runs/valid/events.out.tfevents.1739962531.workergpu047.299220.1"
)
# Shift the time axis so that the first logged point sits at zero.
df2["wall_time"] = df2["wall_time"] - df2["wall_time"].values[0]

In [None]:
# Validation log of the run labelled "4 GPUs, learning rate x4" in the
# comparison plot.
df3 = tb_to_df(
    "/scratch/persistent/joosep/huggingface/particleflow/clic/clusters/v2.3.0/largebatch_study_gpus4_linearscaledLR0.0004_epochs30_bsm256_adamw_a100_cu124_fulldataset_pyg-clic-v230_20250217_082738_406721/runs/valid/events.out.tfevents.1739799057.workergpu041.2328094.1"
)
# Shift the time axis so that the first logged point sits at zero.
df3["wall_time"] = df3["wall_time"] - df3["wall_time"].values[0]

In [None]:
# Validation log of the run labelled "4 GPUs, learning rate x4, weight decay x3"
# in the comparison plot.
df4 = tb_to_df(
    "/scratch/persistent/joosep/huggingface/particleflow/clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/runs/valid/events.out.tfevents.1741957152.workergpu072.934288.1"
)
# Shift the time axis so that the first logged point sits at zero.
df4["wall_time"] = df4["wall_time"] - df4["wall_time"].values[0]

In [None]:
# Normalization reference: the smallest loss logged for df1 (the run labelled
# "default" in the plot). Every curve in the plotting cell is divided by this
# value, so 1.0 on the y axis corresponds to the default run's best loss.
best_loss = df1["loss"].min()

In [None]:
# Loss curves of the four runs, each divided by the default run's best loss
# and drawn against elapsed time since the run's first logged point.
curves = [
    (df1, "default"),
    (df2, "4 GPUs"),
    (df3, "4 GPUs, learning rate x4"),
    (df4, "4 GPUs, learning rate x4, weight decay x3"),
]

plt.figure(figsize=(6, 3))
for run, run_label in curves:
    hours = run["wall_time"] / 3600  # x axis is labelled in hours below
    relative_loss = uniform_filter1d(run["loss"] / best_loss, time_window)
    plt.plot(hours, relative_loss, label=run_label)

plt.ylim(0.9, 1.4)
plt.legend(loc="best", frameon=False)
# Dashed guide at 1.0, i.e. at the best loss of the default run.
plt.axhline(1.0, color="black", ls="--", lw=0.5)
plt.xlabel("Training time [hours]")
plt.ylabel("Relative validation loss")
plt.savefig("loss_largebatch.pdf", bbox_inches="tight")