In [9]:
import os
import wandb
import pandas as pd
from tqdm import tqdm

In [10]:
# Client for the Weights & Biases public API; used below to list the project's runs.
api = wandb.Api()

In [11]:
# W&B entity and project whose runs are exported by the cells below.
entity = "compositional-generalization-ut"
project = "experiment-27"

# Collection of runs under "<entity>/<project>".
runs = api.runs(f"{entity}/{project}")

## Light run

In [7]:
# Dump the latest logged value of every validation metric, for each run that
# is not currently running, into one CSV. Values are read from the run summary
# (no call to run.scan_history()), so no per-step history is fetched here.

max_steps = 32_000  # not used in this cell; kept so the name stays defined

out_dir = "data/exp27"
# DataFrame.to_csv does not create missing parent directories.
os.makedirs(out_dir, exist_ok=True)

summary = {}

for run in tqdm(runs):
    # Skip runs that are still in progress.
    if run.state == "running":
        continue

    # get run config
    run_config = run.config
    run_name = run.name
    task = run_config["data"]["task"]
    n_embd = run_config["model_hparams"]["n_embd"]
    train_size = run_config["data"]["train_size"]
    seed = run_config["training"]["seed"]

    # One loss and one accuracy column per test set listed in the run config.
    metrics = (
        [
            # "train_loss",
            # "val_loss",
            "trainer/global_step",
        ]
        + [f"val_loss/{ds}" for ds in run_config["data"]["test"]]
        + [f"val_acc/{ds}" for ds in run_config["data"]["test"]]
    )

    # NOTE: keyed by run name, so two runs sharing a name overwrite each other.
    summary[run_name] = {
        "name": run_name,
        "task": task,
        "n_embd": n_embd,
        "train_size": train_size,
        "seed": seed,
    }

    # Private wandb attribute; presumably the summary as a plain dict -- TODO confirm.
    metrics_summ_dict = run.summary._json_dict

    # add latest metrics to summary
    for metric in metrics:
        # A run that never logged this metric gets an empty cell instead of
        # raising KeyError and aborting the whole export.
        summary[run_name][metric] = metrics_summ_dict.get(metric)


# make df from summary (one row per run, positional index) and save to csv
summary_df = pd.DataFrame(summary).T.reset_index(drop=True)
summary_df.to_csv(f"{out_dir}/summary_light.csv")

100%|██████████| 226/226 [00:12<00:00, 18.29it/s]


## Full run

In [12]:
# Dump the full validation history of every run that is not currently running
# to its own CSV (via run.scan_history()), and collect the last value logged
# at or before `max_steps` for each metric into a summary CSV.

max_steps = 32_000

out_dir = "data/exp27"
# DataFrame.to_csv does not create missing parent directories.
os.makedirs(out_dir, exist_ok=True)

summary = {}

for run in tqdm(runs):

    # Skip runs that are still in progress.
    if run.state == "running":
        continue

    # get run config
    run_config = run.config
    run_name = run.name
    run_csv = f"{out_dir}/{run_name}.csv"

    task = run_config["data"]["task"]
    n_embd = run_config["model_hparams"]["n_embd"]
    train_size = run_config["data"]["train_size"]
    seed = run_config["training"]["seed"]

    # One loss and one accuracy column per test set listed in the run config.
    metrics = (
        [
            # "train_loss",
            # "val_loss",
            "trainer/global_step",
        ]
        + [f"val_loss/{ds}" for ds in run_config["data"]["test"]]
        + [f"val_acc/{ds}" for ds in run_config["data"]["test"]]
    )

    if os.path.isfile(run_csv):
        # History was already downloaded on an earlier execution: reuse it
        # rather than skipping the run, so that summary.csv still covers every
        # run when this cell is re-run (e.g. after a network failure).
        df = pd.read_csv(run_csv, index_col=0)
    else:
        # Materialize the scan once. Iterating the scan object separately for
        # each metric walks the remote history again for every metric.
        rows = list(run.scan_history(keys=metrics))
        history = {metric: [row[metric] for row in rows] for metric in metrics}

        # make df from history
        df = pd.DataFrame(history)

        # save to csv for run
        df.to_csv(run_csv)

    # limit to max_steps
    df = df[df["trainer/global_step"] <= max_steps]

    # NOTE: keyed by run name, so two runs sharing a name overwrite each other.
    summary[run_name] = {
        "name": run_name,
        "task": task,
        "n_embd": n_embd,
        "train_size": train_size,
        "seed": seed,
    }

    # add latest metrics to summary
    for metric in metrics:
        # No rows at or below max_steps (or a cached CSV without this column)
        # yields an empty cell instead of an IndexError/KeyError that would
        # discard the summary after a long download.
        has_value = metric in df.columns and not df.empty
        summary[run_name][metric] = df[metric].iloc[-1] if has_value else None


# make df from summary and save to csv
summary_df = pd.DataFrame(summary).T
summary_df.to_csv(f"{out_dir}/summary.csv")

[34m[1mwandb[0m: Network error (ReadTimeout), entering retry loop.
100%|██████████| 226/226 [2:44:40<00:00, 43.72s/it]
