In [22]:
import wandb
import pandas as pd
from collections import defaultdict
import pathlib
from wandb.apis.public import Run as apiRun
import json
import asyncio
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from datetime import datetime

## 1. Fixed variables

In [23]:
# Location of the W&B project, in the "<entity>/<project>" form.
entity = "KowalskiTeam"
project = "Pruning"
path = "/".join((entity, project))

# Client used for every W&B query in this notebook.
api = wandb.Api()

## 2. Filters and settings
Variables that can be changed to adjust the behavior of the queries.

* job_types - names used to identify the experiments
* only_last_checkpoint - list of schedulers for which only the last checkpoint should be considered
* aggregation - which columns should be aggregated
* columns_to_drop - columns that should be dropped

The query below returns only the "pruning_results" runs, which contain the final results of the grouped runs as table artifacts.

In [24]:
# --- W&B selection ---------------------------------------------------------
model = "efficientnetv2s"  # resnet18_cifar, efficientnetv2s, resnet18_imagenet1k
dataset = "cifar100"  # cifar10, cifar100, imagenet1k
job_types = ["General_17-05-2024"]

# --- DataFrame post-processing ----------------------------------------------
# Every listed metric is summarised by its mean and standard deviation.
aggregation = {
    metric: ["mean", "std"]
    for metric in ("top1_accuracy", "top5_accuracy", "total_epoch")
}
columns_to_drop = ["repeat", "top5_accuracy"]
only_last_checkpoint = ["iterative", "logarithmic"]

Query filters for Wandb data, MongoDB syntax:

In [25]:
# Filter document for the W&B runs query, written as ordered (field, condition) pairs.
query_filters = dict(
    [
        ("config.dataset.name", dataset),
        ("config.model", model),
        ("state", "finished"),
        ("jobType", {"$in": job_types}),  # any of the selected job types
        ("display_name", "pruning_results"),
    ]
)

## 3. Downloading artifacts

In [26]:
# Fetch every run of the project that matches the filter document.
runs = api.runs(path, filters=query_filters)

print(f"Found {len(runs)} runs")

Found 121 runs


In [27]:
# Shared worker pool for the blocking W&B calls (at most 20 running at once).
executor = ThreadPoolExecutor(max_workers=20)


async def download_artifact(run):
    """Download the ``<group>_pruning_results:v0`` artifact that belongs to ``run``.

    The artifact is looked up as ``<entity>/<project>/<group>_pruning_results:v0``.
    Both the lookup and the download are blocking calls, so they are handed to
    ``executor`` to let several downloads overlap. Nothing is fetched when the
    directory ``artifacts/<group>_pruning_results:v0`` already exists.

    Args:
        run: Object exposing ``group``, ``entity`` and ``project`` attributes.
    """
    artifact_name = f"{run.group}_pruning_results:v0"

    # Skip if already downloaded
    if pathlib.Path(f"artifacts/{artifact_name}").exists():
        return

    # Ask for the loop that is executing this coroutine instead of reading a
    # module-level `loop` variable: the function then works no matter which
    # cell ran first or which event loop drives it.
    running_loop = asyncio.get_running_loop()

    artifact = await running_loop.run_in_executor(
        executor, api.artifact, f"{run.entity}/{run.project}/{artifact_name}"
    )
    await running_loop.run_in_executor(executor, artifact.download)


# Get an event loop
loop = asyncio.get_event_loop()

# Create tasks for each run
tasks = [download_artifact(run) for run in runs]

# Wait for all tasks to complete
await asyncio.gather(*tasks)
print("All artifacts downloaded")

All artifacts downloaded


## 4. Creating CSV files with aggregated data

In [28]:
def set_type(row: pd.Series) -> pd.Series:
    """Refine the ``type`` label of a row whose scheduler is ``"manual"``.

    Rows with any other ``type`` are returned untouched. For manual rows the
    label is derived from ``pruning.scheduler.pruning_steps``:

    * more than one step, first entries of steps 0 and 1 equal -> ``manual_constant``
    * more than one step, first entries differ                 -> ``manual_geometric``
    * exactly one step                                          -> ``manual_one_shot``
    * no steps                                                  -> label left as ``manual``

    Defined once, outside the run loop, because it does not depend on the run.
    """
    if row["type"] != "manual":
        return row

    pruning_steps = row["pruning.scheduler.pruning_steps"]
    if len(pruning_steps) > 1:
        if pruning_steps[0][0] == pruning_steps[1][0]:
            row["type"] = "manual_constant"
        else:
            row["type"] = "manual_geometric"
    elif len(pruning_steps) == 1:
        row["type"] = "manual_one_shot"

    return row


# One list of per-run result frames for every scheduler name.
dataframes = defaultdict(list)

# Metrics that are dropped from the table cannot be aggregated. The filtered
# spec gets its own name so the `aggregation` setting is not overwritten and
# this cell gives the same result however often it is re-run.
active_aggregation = {
    metric: funcs for metric, funcs in aggregation.items() if metric not in columns_to_drop
}

for run in runs:
    run: apiRun
    group: str = run.group
    config: dict = run.config
    summary: dict = run.summary

    scheduler_name: str = config["pruning"]["scheduler"]["name"]
    scheduler_end: float = config["pruning"]["scheduler"]["end"]
    iterations: int = summary["iterations"]
    base_top1: float = summary.get("base_top1_accuracy", None)

    if base_top1 is None:
        print(f"Warning: base_top1 is None for '{group}'")

    # open json file, it represents wandb.Table
    table_path = f"artifacts/{group}_pruning_results:v0/pruning_results.table.json"
    with open(table_path, encoding="utf-8") as f:
        json_dict = json.load(f)

    # create dataframe from json
    df = pd.DataFrame(json_dict["data"], columns=json_dict["columns"])

    # Tables without a `total_epoch` column get a NaN placeholder: unlike
    # `None` it keeps the column numeric, so mean/std are well defined (NaN).
    if "total_epoch" not in df.columns:
        df["total_epoch"] = float("nan")

    # drop columns
    df = df.drop(columns_to_drop, axis=1, errors="ignore")

    # aggregate and round
    df = df.groupby(["pruned_precent"]).agg(active_aggregation)
    df = df.round(4)

    # join aggregated columns, e.g. ("top1_accuracy", "mean") -> "top1_accuracy_mean"
    df.columns = df.columns.map("_".join)
    df = df.reset_index()

    # get only last checkpoint for given scheduler: keep the row(s) whose
    # pruned percentage is closest to the scheduler's end value (in percent)
    if scheduler_name in only_last_checkpoint:
        distance = (df["pruned_precent"] - scheduler_end * 100).abs()
        # .copy() so the column assignments below act on an independent frame
        df = df[distance == distance.min()].copy()

    # add run config to dataframe (one flattened row, nested keys joined by ".")
    normalized_config = pd.json_normalize(config)
    normalized_config.insert(0, "type", normalized_config["pruning.scheduler.name"])
    normalized_config = normalized_config.apply(set_type, axis=1)

    # The frame has exactly one row; take it as a Series of config values.
    config_series = normalized_config.iloc[0]
    for key, value in config_series.items():
        # Containers are stored as JSON text so they fit into a single cell.
        if isinstance(value, (list, dict)):
            value = json.dumps(value)
        df[key] = value

    # additional columns
    df["group"] = group
    df["iterations"] = iterations
    df["base_top1"] = base_top1

    dataframes[scheduler_name].append(df)

# Get the current date and time
date = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Create a directory to store the csv files.
# A dedicated name is used on purpose: rebinding `path` here would clobber the
# "<entity>/<project>" string that the W&B query cell passes to `api.runs`,
# so re-running that cell after this one would query the wrong location.
output_dir = Path(f"csvs/{dataset}_{date}")
output_dir.mkdir(parents=True, exist_ok=True)

# save to csv: one file per scheduler name
for name in dataframes:
    combined = pd.concat(dataframes[name])

    # move the "type" column to the front
    if "type" in combined.columns:
        type_col = combined.pop("type")
        combined.insert(0, "type", type_col)

    # sort by type, then pruned_precent (both ascending), then by
    # top1_accuracy_mean (descending)
    combined = combined.sort_values(
        by=["type", "pruned_precent", "top1_accuracy_mean"], ascending=[True, True, False]
    )

    # Keep the combined frame available under the scheduler name.
    dataframes[name] = combined

    combined.to_csv(output_dir / f"pruning_results_{dataset}_{name}_{date}.csv", index=False)
    print(f"Saved {name} to csv")

Saved constant to csv
Saved iterative to csv
Saved one-shot to csv
