# Report and plots `fig:mnist-like__trade-off`

In [None]:
import os
import re
import tqdm
import copy

import torch
import numpy as np
import pandas as pd

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
import warnings

# Silence every warning category from here on.
warnings.simplefilter("ignore")

File-based caching, and snapshot loading.

In [None]:
from cplxpaper.auto import auto

from cplxpaper.auto.utils import file_cache
from cplxpaper.auto.utils import load_stage_snapshot

Experiment manifest loader and completion checker.

In [None]:
from cplxpaper.auto.utils import load_manifest, verify_experiment

Fetch any one of the given keys from a dict, prioritizing from left to right

In [None]:
def dict_get_one(d, *keys):
    """Return the value under the first of `keys` that is present in `d`.

    Parameters
    ----------
    d : mapping
        The container to look the keys up in.
    *keys
        The candidate keys in the order of decreasing priority.

    Returns
    -------
    The value `d[k]` for the leftmost `k` in `keys` with `k in d`.

    Raises
    ------
    KeyError
        If none of the keys is in `d`, or if no keys were given.
    """
    for k in keys:
        if k in d:
            return d[k]

    # name the candidates, so that a failed lookup can be diagnosed
    raise KeyError(f"None of the keys {keys!r} was found.")

Select a device

In [None]:
# The device that `evaluate_experiment` scores the models on.
device_ = torch.device("cpu")

Load the model, stored in the given snapshot.

In [None]:
def load_model(snapshot, errors="ignore"):
    """Rebuild the model stored in a stage snapshot.

    Parameters
    ----------
    snapshot : dict
        The snapshot, expected to have `options`, `stage` and `model` keys.
    errors : str, default="ignore"
        What to do with a snapshot that lacks any of the expected keys:
        "raise" raises, "ignore" returns a bare `torch.nn.Module`.

    Returns
    -------
    model : torch.nn.Module
        The model with the state loaded from the snapshot, residing on cpu.

    Raises
    ------
    ValueError
        If `errors` is not one of the allowed values, or if the snapshot is
        incomplete and `errors` is "raise".
    """
    if errors != "ignore" and errors != "raise":
        raise ValueError("`errors` must be either 'ignore' or 'raise'.")

    required = ("options", "stage", "model")
    if not all(key in snapshot for key in required):
        if errors == "raise":
            raise ValueError("Bad snapshot.")

        # an empty placeholder module stands in for the missing model
        return torch.nn.Module()

    # the `stage` entry is a pair, of which only the settings are needed
    _, stage_settings = snapshot["stage"]
    model_spec = snapshot["options"]["model"]

    model = auto.get_model(model_spec, **stage_settings["model"])
    model.to(device=torch.device("cpu"))
    model.load_state_dict(snapshot["model"])
    return model

Load models from an experiment:
* load the models at the end of `dense` and `fine-tune` stages
* recover the model that existed just before the `fine-tune` stage
from `sparsify` and the sparsity threshold, specified in the experiment.

In [None]:
def load_experiment(folder):
    """Load the manifest and the three models of interest of an experiment.

    Parameters
    ----------
    folder : str
        The folder of the experiment with the stage snapshots.

    Returns
    -------
    options : dict
        The manifest of the experiment, as returned by `load_manifest`.
    models : dict
        The models keyed by "dense", "post-fine-tune" and "pre-fine-tune",
        in this order.
    """
    options = load_manifest(folder)

    # the model at the end of the `dense` stage
    dense = load_model(load_stage_snapshot("dense", folder))

    # the model at the end of the `fine-tune` stage
    fine_tune_snapshot = load_stage_snapshot("fine-tune", folder)
    post_fine_tune = load_model(fine_tune_snapshot)

    # get the weights and masks from `sparsify` at the prescribed threshold
    sparsified = load_model(load_stage_snapshot("sparsify", folder))
    state_dict, _masks = auto.state_dict_with_masks(
        sparsified, hard=True, threshold=options["threshold"])

    # ... and deploy them onto a fresh copy of the `fine-tune` model, which
    #  recovers the model just before the fine-tuning.
    pre_fine_tune = load_model(fine_tune_snapshot)
    pre_fine_tune.load_state_dict(state_dict, strict=False)

    return options, {
        "dense": dense,
        "post-fine-tune": post_fine_tune,
        "pre-fine-tune": pre_fine_tune,
    }

Evaluate every experiment from the grid of experiments.
* calls `evaluate_experiment(...)` defined below.

In [None]:
from cplxpaper.auto.parameter_grid import reconstruct_grid

def evaluate_grid(grid):
    """Evaluate every complete experiment found in a grid folder.

    An experiment is a non-hidden `<name>.json` file directly inside the
    folder, paired with the `<name>` path next to it, which has to pass
    `verify_experiment`.

    Parameters
    ----------
    grid : str
        Path to the folder of the grid.

    Returns
    -------
    full_grid : dict
        The grid recovered from the manifests by `reconstruct_grid`, or
        an empty dict if no experiment was evaluated.
    results : list of tuple
        A tuple per experiment: its path, the flattened manifest, and the
        rest of the output of `evaluate_experiment`.
    """
    root = os.path.abspath(os.path.normpath(grid))

    # only the immediate contents of the folder are of interest
    root, _, filenames = next(os.walk(root))

    results = []
    for filename in tqdm.tqdm(filenames, desc="analyzing grid"):
        name, ext = os.path.splitext(filename)
        if name.startswith(".") or ext != ".json":
            continue

        experiment = os.path.join(root, name)
        if verify_experiment(experiment):
            results.append((experiment, *evaluate_experiment(experiment)))

    if not results:
        return {}, []

    # compute the grid and flatten the manifests
    experiments, options, *results = zip(*results)
    full_grid, flat_options = reconstruct_grid(options)

    return full_grid, [*zip(experiments, flat_options, *results)]

Decide on the target folder and computation cache.

In [None]:
# The name of the report: it determines the folder for the figures, and
#  the name of the cache file used by `evaluate_experiment`.
report_name = "figure__mnist-like__trade-off"

# Absolute path to the folder, which the figures are saved into. The relative
#  part is resolved against the current working directory.
report_target = os.path.normpath(os.path.abspath(os.path.join(
    "../../assets", report_name
)))

# make sure the folder exists
os.makedirs(report_target, exist_ok=True)

<br>

## Report-specific procedures

A dirty hack to avoid loading the same dataset over and over.

In [None]:
import pickle
from functools import lru_cache

@lru_cache(maxsize=None)
def _get_datasets(key):
    """Load the datasets from a pickled specification, memoizing the result.

    The specification is passed as pickled bytes, since `lru_cache` needs
    hashable arguments. The bytes are produced by `get_datasets` in this
    very notebook, so unpickling them is not a concern here.
    """
    specification = pickle.loads(key)
    return auto.get_datasets(specification)


def get_datasets(datasets):
    """Get the datasets for the specification, reusing ones already loaded.

    The specification is pickled into bytes, which serve as the key for
    the cache of `_get_datasets`.
    """
    key = pickle.dumps(datasets)
    return _get_datasets(key)

Create the test feed for later evaluation.

In [None]:
from cplxpaper.mnist.performance import MNISTBasePerformance

def get_scorer(options, **kwargs):
    """Create the scorer on the `test` feed of an experiment.

    Parameters
    ----------
    options : dict
        The manifest of the experiment. Its `datasets`, `features`, `feeds`
        and `threshold` entries are used.
    **kwargs
        Passed to `auto.get_feeds` as a dict in its second argument.

    Returns
    -------
    scorer : MNISTBasePerformance
        The scorer on the `test` feed with the threshold of the experiment.
    """
    datasets = get_datasets(options["datasets"])
    features = options["features"]

    # only the `test` feed is needed for the evaluation
    test_only = {"test": options["feeds"]["test"]}
    feeds = auto.get_feeds(datasets, kwargs, features, test_only)

    return MNISTBasePerformance(feeds["test"], threshold=options["threshold"])

Evaluate a single experiment:
* load models: at the end of `dense`, just prior to `fine-tune`, just after `fine-tune`
* get each model's compression rate and accuracy on `test`

In [None]:
from cplxpaper.mnist.performance import MNISTBasePerformance

@file_cache(f"./cache__{report_name}.pk")
def evaluate_experiment(folder):
    """Score the models of a single experiment on its `test` feed.

    NOTE(review): the unconditional `assert False` at the top aborts every
    call that reaches the body with the folder as the message, so nothing
    below it runs unless assertions are disabled. Presumably this is
    a deliberate guard, which allows only the results already stored by
    `file_cache` to be used, and forbids recomputation -- confirm, and
    remove the assertion to evaluate new experiments.

    Parameters
    ----------
    folder : str
        The folder of the experiment.

    Returns
    -------
    options : dict
        The manifest of the experiment.
    scores : list of tuple
        The `(name, score)` pairs for every model from `load_experiment`.
    """
    assert False, folder
    device = torch.device(device_)

    # get the models and the scorer
    options, models = load_experiment(folder)
    scorer = get_scorer(options, device=device)

    # score each model on device
    scores = []
    for name, model in models.items():
        model.to(device)
        scores.append((name, scorer(model.eval())))
        # move the model back to cpu once it has been scored
        model.cpu()

    return options, scores

To debug
```python
@file_cache(f"./cache__{report_name}.pk")
def evaluate_experiment(folder):
    print(folder)
    assert False
```

Extract the score from the scorers' output.

In [None]:
def get_score(score):
    """Extract the performance metrics and the compression rate.

    Parameters
    ----------
    score : dict
        The output of the scorer for each model, keyed by the name of
        the model. Must have "pre-fine-tune" and "post-fine-tune" entries
        with identical `sparsity`.

    Returns
    -------
    result : dict
        The metric of every model (`pooled_average_precision` if the scorer
        reported it, `accuracy` otherwise) under the name of the model, and
        the compression rate under "compression".
    """
    # something is horribly wrong if this fails...
    assert score["pre-fine-tune"]["sparsity"] == score["post-fine-tune"]["sparsity"]

    result = {}
    for name, values in score.items():
        result[name] = dict_get_one(
            values, "pooled_average_precision", "accuracy")

    # each sparsity entry is a pair, summed component-wise over all entries:
    #  the number of zeros and the total number of parameters.
    sparsity = score["pre-fine-tune"]["sparsity"]
    n_zer, n_par = map(sum, zip(*sparsity.values()))

    result["compression"] = n_par / (n_par - n_zer)
    return result

<br>

## Read grids

Grids

In [None]:
# A manual switch between the two sets of grids: flip the condition to use
#  the legacy grids. `PREFIX` is prepended to the names of the saved figures.
if False:
    PREFIX = "legacy__"
    grids = [
        "./grids_joint/legacy__mnist-like__00",
        "./grids_joint/legacy__mnist-like__01",
        "./grids_joint/legacy__mnist-like__02",
        "./grids_joint/legacy__mnist-like__03",
        "./grids_joint/legacy__mnist-like__04",
    ]

else:
    PREFIX = ""
    grids = [
        "./grids/mnist-like__real-vs-cplx__00",
        "./grids/mnist-like__real-vs-cplx__01",
        "./grids/mnist-like__real-vs-cplx__02",
        "./grids/mnist-like__real-vs-cplx__03",
#         "./grids/mnist-like__real-vs-cplx__04",
    ]

Evaluate several grids and join them

In [None]:
from collections import defaultdict

# Evaluate the grids one by one: the per-experiment results are pooled in
#  `output`, and the values of every grid field are united in `joint_grid`.
output, joint_grid = [], defaultdict(set)
for grid in grids:
    # NOTE: `grid` is rebound here from the folder to the recovered grid
    grid, results = evaluate_grid(grid)
    output.extend(results)
    for k, v in grid.items():
        joint_grid[k].update(v)

Alter the recovered grid

In [None]:
# Keep the fields of the joint grid, the names of which contain none of
#  the fragments listed below (this is a substring test).
grid = set(field for field in joint_grid
           if not any(map(field.__contains__, {
                # service fields
                "__name__", "__timestamp__", "__version__", "device",

                # ignore global model class settings
                "model__cls",

                # upcast is a service variable, which only complex models have
                #  and it is usually mirrored in `features` settings.
                "__upcast"
            })))

# These fields are needed by the cells below, so add them unconditionally.
#  The first one is also dropped by the `model__cls` fragment above.
grid.update({
    "stages__sparsify__model__cls",
    "threshold"  # ensure threshold is included
})

<br>

## Build the report

Index by the experiment **grid--folder** and prepare fields

In [None]:
# split the results into the paths, the flat manifests and everything else
experiments, options, *rest = zip(*output)

# experiment paths are absolute! Strip the longest common parent folder from
#  them. The prefix is a filesystem path, hence it must be matched literally
#  and not as a regular expression (the default of `regex` depends on
#  the version of pandas).
master_index = pd.Index(experiments, name="experiment", dtype=str)
master_index = master_index.str.replace(
    os.path.commonpath(experiments) + os.sep, "", regex=False)

# split the rest at the last separator into the grid and the experiment:
#  `n` is passed by keyword, since it is keyword-only in pandas 2.0 and later.
master_index = master_index.str.rsplit(os.sep, n=1, expand=True)
master_index.rename(["grid", "experiment"], inplace=True)

Gradually construct the table of options

In [None]:
# An empty table indexed by the experiments; the cells below join columns to it.
parameters = pd.DataFrame(index=master_index)

Assign proper tags to models

In [None]:
# The model-related fields are summarized by `get_model_tag`, so drop them
#  from the list of fields, that are copied verbatim later.
grid = [k for k in grid if not k.startswith((
    "model__",
    "stages__sparsify__model__"
))]

def get_model_tag(opt):
    """Get the name, the kind and the method of the model of an experiment.

    Parameters
    ----------
    opt : dict
        The flat options of the experiment. Must have the class of the model
        as a string under `stages__sparsify__model__cls`. The optional flags
        `model__double` and `model__half` refine the kind.

    Returns
    -------
    tag : dict
        The name of the model without the method suffix under "model",
        the kind under "kind" ("R" or "C", with "*2" or "/2" appended for
        a real `double` or a complex `half` model, respectively), and
        the method under "method" ("VD" or "ARD").

    Raises
    ------
    ValueError
        If the class is neither real nor complex, or if its name does not
        end with a known method.
    """
    # extract the class name, relative to the `models` subpackage
    cls = opt["stages__sparsify__model__cls"]
    cls = re.sub(r"^<class '.*?\.models\.(.*?)'>$", r"\1", cls)

    # get the model kind: real/complex
    if cls.startswith("real."):
        kind, cls = "R", cls[len("real."):]
    elif cls.startswith("complex."):
        kind, cls = "C", cls[len("complex."):]
    else:
        raise ValueError("Unknown model type.")

    # handle real `double` and cplx `half`
    if kind == "R" and opt.get("model__double", False):
        kind = kind + "*2"
    elif kind == "C" and opt.get("model__half", False):
        kind = kind + "/2"

    # get method
    if cls.endswith("VD"):
        method, cls = "VD", cls[:-len("VD")]
    elif cls.endswith("ARD"):
        method, cls = "ARD", cls[:-len("ARD")]
    else:
        raise ValueError("Unknown Bayesian method.")

    return {"model": cls, "kind": kind, "method": method}

# add the `model`, `kind` and `method` columns to the table
parameters = parameters.join(pd.DataFrame([
    *map(get_model_tag, options)
], index=master_index))

Deal with features

In [None]:
# the feature-related fields are summarized by `get_features`
grid = [k for k in grid if not k.startswith("features__")]

def get_features(opt):
    """Get the short name of the input features of an experiment.

    Parameters
    ----------
    opt : dict
        The flat options of the experiment. Must have the class of
        the features as a string under `features__cls`.

    Returns
    -------
    tag : dict
        Either "fft" or "raw" under the key "features".

    Raises
    ------
    ValueError
        If the class of the features is not recognized.
    """
    # extract the lowercase class name, relative to the `feeds` module
    cls = opt["features__cls"]
    cls = re.sub(r"^<class '.*?\.feeds\.(.*?)'>$", r"\1", cls).lower()

    if cls == "feedfourierfeatures":
        features = "fft"

    elif cls == "feedrawfeatures":
        features = "raw"

    else:
        raise ValueError("Unknown input features.")

    return {"features": features}

# add the `features` column to the table
parameters = parameters.join(pd.DataFrame([
    *map(get_features, options)
], index=master_index))

Handle dataset family

In [None]:
# the dataset-related fields are summarized by `get_dataset`
grid = [k for k in grid if not k.startswith("datasets__")]

def get_dataset(opt):
    """Get the name of the dataset family of an experiment.

    Parameters
    ----------
    opt : dict
        The flat options of the experiment. Must have the class of the test
        dataset as a string under `datasets__musicnet-test-128__cls`, or
        under `datasets__test__cls`, with the former taking priority.

    Returns
    -------
    tag : dict
        The lowercase name of the class of the dataset within the `dataset`
        module of `mnist` or `musicnet`, with every "_test" removed, under
        the key "dataset".
    """
    cls = dict_get_one(opt, "datasets__musicnet-test-128__cls", "datasets__test__cls")
    assert cls is not None

    # extract the lowercase class name, relative to the `dataset` module
    cls = re.sub(r"^<class '.*?\.(?:mnist|musicnet)\.dataset\.(.*?)'>$", r"\1", cls).lower()

    return {"dataset": cls.replace("_test", "")}

# add the `dataset` column to the table
parameters = parameters.join(pd.DataFrame([
    *map(get_dataset, options)
], index=master_index))

Other fields' preprocessing.

In [None]:
# a placeholder: no other fields need preprocessing at the moment
pass

Only the essential experiment parameters should have remained by now.

In [None]:
# copy the values of the remaining grid fields verbatim into the table
parameters = parameters.join(pd.DataFrame([
    {g: opt[g] for g in grid} for opt in options
], index=master_index))

# show the fields, that were copied
grid

Now collect the metrics. We need:
* **accuracy** performance on `dense`, `pre-fine-tune` and `post-fine-tune`
* **compression rate** from a `fine-tune` stage

In [None]:
# the scores must be the only output left besides the paths and the options
scores, *tail = rest
assert not tail

# each score is a list of `(name, output)` pairs, hence the `dict`
metrics = pd.DataFrame([
    get_score(dict(score)) for score in scores
], index=master_index)

Join the tables and rename unfortunate columns.

In [None]:
# The main table: the parameters and the metrics of each experiment, with
#  the KL-divergence coefficient of the `sparsify` stage renamed to `kl_div`.
df_main = parameters.join(metrics).rename(columns={
    "stages__sparsify__objective__kl_div": "kl_div"
})

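Group by the fields that identify a single plot. The `kl_div` coefficient varies within a plot, while `model` and `kind` are drawn together on the same plot:
* `dataset`, `method`, `features` and `threshold`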

In [None]:
# show the candidate fields for grouping: everything but `kl_div`
print([f for f in parameters.columns if "kl_div" not in f])

# the fields, each combination of which gets a separate plot
fields = [
    'dataset',
    'method',
#     'model',  # models are plotted together
    'features',
#     'kind',  # use kind for joint plotting
    'threshold',
]
grouper = df_main.groupby(fields)

A service plotting function to darken the specified colour

In [None]:
from matplotlib.ticker import FormatStrFormatter, FuncFormatter


def darker(color, a=0.5):
    """Scale the lightness of a colour by the factor `a`.

    The colour is converted to HLS, its lightness is multiplied by `a` and
    clipped to [0, 1], and the result is converted back to an RGB triplet.
    Hence `a` below one darkens the colour, and `a` above one lightens it.

    Adapted from this stackoverflow question_.
    .. _question: https://stackoverflow.com/questions/37765197/
    """
    from colorsys import hls_to_rgb, rgb_to_hls
    from matplotlib.colors import to_rgb

    hue, lightness, saturation = rgb_to_hls(*to_rgb(color))

    # rescale the lightness and keep it within the valid range
    lightness = min(a * lightness, 1)
    lightness = max(0, lightness)

    return hls_to_rgb(hue, lightness, saturation)

Model colour coding scheme:
* fft and raw features are never mixed

In [None]:
# The colour of each (kind, model) pair. The two kinds listed under the same
#  colour belong to different features (fft | raw), and so never share a plot.
kind_model_color = {
    # tab10 colours are paired! use this to keep similar models distinguishable
    ("R*2", "SimpleConvModel"): "C0",
    ("R", "SimpleConvModel"): "C0",
    ("C", "SimpleConvModel"): "C1",
    ("C/2", "SimpleConvModel"): "C1",

    ("R*2", "TwoLayerDenseModel"): "C2",
    ("R", "TwoLayerDenseModel"): "C2",
    ("C", "TwoLayerDenseModel"): "C3",
    ("C/2", "TwoLayerDenseModel"): "C3",
}

y-axis limits for clearer picture

In [None]:
# the (lower, upper) limits of the accuracy axis for each dataset
ylim_pairs = dict(
    mnist=(0.93, 0.995),
    kmnist=(0.65, 0.945),
    fashionmnist=(0.80, 0.90),
    emnist_letters=(0.75, 0.91),
)

The common trade-off plotting procedure

In [None]:
from matplotlib.ticker import FormatStrFormatter, FuncFormatter
from matplotlib.collections import LineCollection

def plot_performance_compression_plot(param, data):
    """Produce the performance compression plot.

    Parameters
    ----------
    param : dict
        The values of the grouping fields. Must have the `method`, `dataset`,
        `features` and `threshold` keys, which are used in the title and in
        the filename. The `dataset` must be a key in `ylim_pairs`.
    data : pandas.DataFrame
        The experiments to plot with the `kind`, `model`, `dense`,
        `compression`, `pre-fine-tune` and `post-fine-tune` columns. Every
        (kind, model) pair must be a key in `kind_model_color`.

    Returns
    -------
    filename : str
        The path inside `report_target` to save the figure to, without
        the extension.
    fig : matplotlib.figure.Figure
        The figure with the plot.

    Things tried
    ------------
    Tried saturation contrasting (poor), used quiver (arrow heads are confusing)
    superimposed on to scatter (not good), using marker styles (bad), good idea
    was to swap C1 and C2 above so that related models (that have similar performance)
    have contrasting colours. used quiver alone (poor). Finally decided to use plain
    lines. Hopefully this conveys that pre/post fine-tune may differ.
    """
    # use the passed `param`, and not some global variable set by the caller
    filename = (
        PREFIX + "{method}__{dataset}__{features}__{threshold}"
    ).format(**param)

    title = "Trade-off on {dataset} ({features}) by {method} ($\\tau = {threshold}$)".format(**param)
    fig, ax = plt.subplots(1, 1, figsize=(8, 3.5), dpi=300)

    ax.set_title(title)
    ax.set_xscale("log")
    ax.set_ylabel("accuracy")
    ax.set_xlabel("compression")
    ax.xaxis.set_major_formatter(FuncFormatter(lambda x, p: f"$\\times${int(x):d}"))

    ax.set_ylim(ylim_pairs[param["dataset"]])
    ax.set_xlim(1, 1.5e3)

    # grid and the adequacy zone
    ax.grid(axis='x', which="major", c="k", alpha=0.1, zorder=-20)
    ax.axvspan(50, 500, color="k", alpha=0.05, zorder=-10)

    # draw the scatter plot of compression-accuracy pairs
    for (kind, model), df in data.groupby(["kind", "model"]):
        label = f"{kind} {model}"
        color = kind_model_color[kind, model]

        # draw the `dense` min-max band and median
        ax.axhline(
            df["dense"].median(), color=darker(color, 1.5),
            alpha=0.75, lw=1, zorder=-10)
        ax.axhspan(
            df["dense"].min(), df["dense"].max(), color=darker(color, 1.7),
            alpha=0.15, lw=0, zorder=-15)

        # performance jump using line collection and final endpoint scatter:
        #  a vertical segment from `pre-` to `post-fine-tune` at each compression
        c = df['compression']
        z, a = df['post-fine-tune'], df['pre-fine-tune']
        ax.add_collection(LineCollection(
            np.array([*zip(zip(c, a), zip(c, z))]),
            colors=[darker(color, 0.5)], lw=1, alpha=0.5, zorder=+5
        ))
        ax.scatter(c, z, c="k", edgecolor=[color], lw=1, s=5,
                   marker="o", label=label, alpha=1.0, zorder=+10)

    ax.legend(ncol=2, loc="lower left")
    return os.path.join(report_target, filename), fig

Plot for all groups.

In [None]:
# Make a plot for every group and save it as pdf into the report folder.
for key, df in tqdm.tqdm(grouper, desc="populating plots"):
    # the grouping fields are constant within the group
    df = df.drop(columns=fields)
    # map every grouping field to its value in this group
    group = dict(zip(fields, key))

    filename, fig = plot_performance_compression_plot(group, df)
    # make the background of the figure opaque
    fig.patch.set_alpha(1.0)
    fig.savefig(filename + ".pdf", dpi=300)

#     plt.show()
    # close the figure to release its resources
    plt.close()
    # break

What is inside?

In [None]:
# Take a look at the experiments with compression between x90 and x200.
cmp = df_main.compression

df = df_main.loc[(90 <= cmp) & (cmp <= 200)]
df = df.sort_values(["method", "kl_div"])

# display the table
df

In [None]:
# the paths of the selected experiments, relative to the common parent folder
[os.path.join(*exp) for exp in df.index]

<br>

In [None]:
# NOTE(review): presumably a guard, which stops a "run all cells" here -- confirm
assert False

<br>