# Report and plots `fig:mnist-like__method_comparison`

Evaluation of a single experiment:

* load models: at the end of `dense`, just prior to `fine-tune`, just after `fine-tune`
  * the model that existed just before the `fine-tune` stage is recovered from `sparsify`
  and the sparsity threshold, specified in the experiment
* get each model's compression rate and accuracy on `test`

In [None]:
import os
import tqdm

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

import warnings
warnings.simplefilter("ignore")

Fetch any one of the given keys from a dict, prioritizing from left to right

In [None]:
from cplxpaper.auto.reports.utils import dict_get_one

Load the results, pickled sequentially.

In [None]:
from cplxpaper.auto.reports.utils import restore

Load the report constructed on the grid of experiments.

In [None]:
from cplxpaper.auto.parameter_grid import reconstruct_grid

def build_report(filename):
    """Load a stored report and rebuild the experiment grid from its records.

    Parameters
    ----------
    filename : str
        Location of the report, passed to `restore` as is.

    Returns
    -------
    full_grid : dict
        The grid, as returned by `reconstruct_grid`. An empty dict is returned
        if the report has no records.

    results : list of tuple
        A `(experiment, flat_options, *rest)` tuple for every record, where
        `flat_options` are the options processed by `reconstruct_grid`.
        An empty list is returned if the report has no records.
    """
    # materialize the records first: unpacking `zip(*records)` of an empty
    #  report fails with a ValueError before emptiness can be checked
    records = [*tqdm.tqdm(restore(filename), desc="analyzing report data")]
    if not records:
        return {}, []

    # every record is a `(worker, result)` pair, the workers are not needed
    _, results = zip(*records)

    # compute the grid and flatten the manifests
    experiments, options, *results = zip(*results)
    full_grid, flat_options = reconstruct_grid(options)

    return full_grid, [*zip(experiments, flat_options, *results)]

Decide on the target folder and computation cache.

In [None]:
# the name of the report is also the name of its folder in the assets
report_name = "figure__mnist-like__method_comparison"

# resolve the folder against the current working directory
report_target = os.path.join("../../assets", report_name)
report_target = os.path.normpath(os.path.abspath(report_target))

# the figures are saved here, so the folder has to exist
os.makedirs(report_target, exist_ok=True)

Extract the score from the scorers' output.

In [None]:
def get_score(score):
    """Collect the per-stage metric and the compression rate of an experiment.

    Parameters
    ----------
    score : dict
        The scorers' output keyed by stage. It must have `pre-fine-tune` and
        `post-fine-tune` stages, each with a `sparsity` dict, the values of
        which are pairs of counts (presumably zeros and all parameters).

    Returns
    -------
    result : dict
        For every stage the value fetched by `dict_get_one` from the keys
        `pooled_average_precision` and `accuracy`, and the `compression`
        rate `n_par / (n_par - n_zer)` pooled over the sparsity entries.
    """
    sparsity = score["pre-fine-tune"]["sparsity"]

    # fine-tuning must leave the sparsity intact, otherwise something
    #  is horribly wrong
    assert sparsity == score["post-fine-tune"]["sparsity"]

    result = {}
    for stage, values in score.items():
        result[stage] = dict_get_one(
            values, "pooled_average_precision", "accuracy")

    # pool the counts over all entries of the sparsity dict
    n_zer, n_par = map(sum, zip(*sparsity.values()))
    result["compression"] = n_par / (n_par - n_zer)
    return result

<br>

## Build the table

In [None]:
# Prefix of the names of the saved figures. It also selects the reports and the
#  comparable pairs of model kinds: the cells below recognize "appendix__",
#  "appendix__cmp__" and "legacy__".
PREFIX = "appendix__"  # "appendix__cmp__"  # "legacy__"

Grids

```python
reports = [
    # "legacy__"
    "./grids_joint/report__trade-off.pk",             # VD R/C only experiments (compatible)
    # "appendix__" and "appendix__cmp__"
    "./grids-20200213/report__trade-off.pk",          # full experiment on all datasets
    
    # unused
    "./grids-20200213/grids-20200210__trade-off.pk",  # full experiment on fashionmnist only
]

comparable_pairs = {
    "raw": ("R"  , "C/2"),  # (raw) based on the number of parameters (see VC dim of complex hyperplane)
    "fft": ("R*2", "C"  ),  # (fft) ditto
#     "raw": ("R"  , "C"  ),  # (fft, raw) just for the sake of it
#     "fft": ("R"  , "C"  ),  # 
}
```

Incomparable models in the appendix

In [None]:
if PREFIX == "appendix__":
    # the same kinds, `R` against `C`, are paired for both types of features
    reports = ["./grids-20200213/report__trade-off.pk"]
    comparable_pairs = dict(raw=("R", "C"), fft=("R", "C"))

Comparable models in the appendix

In [None]:
if PREFIX == "appendix__cmp__":
    # `R` is paired with `C/2` on raw features, and `R*2` with `C` on fft
    reports = ["./grids-20200213/report__trade-off.pk"]
    comparable_pairs = dict(raw=("R", "C/2"), fft=("R*2", "C"))

R-C plots in the main text with VD experiment

In [None]:
if PREFIX == "legacy__":
    # R-C comparison: `R` against `C` for both types of features
    reports = ["./grids_joint/report__trade-off.pk"]
    comparable_pairs = dict(raw=("R", "C"), fft=("R", "C"))

Evaluate several grids and join them

In [None]:
from collections import defaultdict

output = []
joint_grid = defaultdict(set)
for report in reports:
    grid, results = build_report(report)
    output += results

    # pool the values of every field over all reports
    for field, values in grid.items():
        joint_grid[field] |= set(values)

Alter the recovered grid

In [None]:
# keep the fields, the names of which contain none of the substrings below
grid = {
    field for field in joint_grid
    if all(token not in field for token in (
        # service fields
        "__name__", "__timestamp__", "__version__", "device",

        # global model class settings are ignored
        "model__cls",

        # upcast is a service variable, which only complex models have
        #  and it is usually mirrored in `features` settings.
        "__upcast",
    ))
}

# these fields are needed even if they were filtered out or are absent
grid |= {
    "stages__sparsify__model__cls",
    "threshold",  # ensure threshold is included
}

<br>

Index by the experiment **grid--folder** and prepare fields

In [None]:
# every record is an `(experiment, options, *rest)` tuple: split into columns
experiments, options, *rest = zip(*output)

# experiment paths are absolute! Replace their common prefix by `*`. The prefix
#  is a literal string and not a pattern, hence the explicit `regex=False`
#  (the default of `regex` depends on the version of pandas).
master_index = pd.Index(experiments, name="experiment", dtype=str)
master_index = master_index.str.replace(
    os.path.commonpath(experiments), "*", regex=False)

# split at the last `/` into the grid folder and the name of the experiment.
#  `n` and `expand` are passed by keyword, since they are keyword-only
#  arguments in pandas 2.0 and later.
master_index = master_index.str.rsplit("/", n=1, expand=True)
master_index.rename(["grid", "experiment"], inplace=True)

Gradually construct the table of options

In [None]:
# start with a table without columns, indexed by the grid and the experiment
parameters = pd.DataFrame(index=master_index)

Assign proper tags to models

In [None]:
from cplxpaper.auto.reports.utils import get_model_tag

# the model settings are left out of the grid, the tags are joined instead
grid = [
    field for field in grid
    if not field.startswith("model__")
    and not field.startswith("stages__sparsify__model__")
]

parameters = parameters.join(
    pd.DataFrame([get_model_tag(opt) for opt in options], index=master_index)
)

Deal with features

In [None]:
from cplxpaper.auto.reports.utils import get_features_tag

# the features settings are left out of the grid, the tags are joined instead
grid = [field for field in grid if not field.startswith("features__")]

parameters = parameters.join(
    pd.DataFrame([get_features_tag(opt) for opt in options], index=master_index)
)

Handle dataset family

In [None]:
from cplxpaper.auto.reports.utils import get_dataset_tag

# the dataset settings are left out of the grid, the tags are joined instead
grid = [field for field in grid if not field.startswith("datasets__")]

parameters = parameters.join(
    pd.DataFrame([get_dataset_tag(opt) for opt in options], index=master_index)
)

Other fields' preprocessing.

In [None]:
# a placeholder: no other fields are preprocessed
pass

Only the essential experiment parameters should have remained by now.

In [None]:
# take the values of the fields still in the grid from every experiment's options
parameters = parameters.join(
    pd.DataFrame(
        [dict((name, opt[name]) for name in grid) for opt in options],
        index=master_index,
    )
)

# show the fields that were taken
grid

Now collect the metrics. We need:
* **accuracy** performance on `dense`, `pre-fine-tune` and `post-fine-tune`
* **compression rate** from a `fine-tune` stage

In [None]:
# the scores must be the only data left besides the experiments and the options
scores, *tail = rest
assert not tail

# scores on `test` are preferred to the scores on `test-256`
metrics = pd.DataFrame(
    [*map(get_score, (
        dict_get_one(score, "test", "test-256") for score in scores
    ))],
    index=master_index,
)

Join the tables and rename unfortunate columns.

In [None]:
# put the parameters and the metrics of every experiment side by side
df_main = parameters.join(metrics)

# give the KL-divergence coefficient a shorter name
df_main = df_main.rename(
    columns={"stages__sparsify__objective__kl_div": "kl_div"})

Group by all fields except for `kl_div` coefficient:
* `model`, `kind`, `method`, `dataset` and `threshold`

In [None]:
# show the candidates: every parameter except for the `kl_div` coefficient
print([col for col in parameters.columns if "kl_div" not in col])

# `method` and `kind` are left out: they are drawn together on the same plot
fields = ["dataset", "model", "features", "threshold"]
grouper = df_main.groupby(fields)

A service plotting function to darken the specified colour

In [None]:
from matplotlib.ticker import FormatStrFormatter, FuncFormatter


def darker(color, a=0.5):
    """Scale the lightness of a colour, adapted from this stackoverflow question_.

    The colour is converted to HLS, its lightness is multiplied by `a` and
    clipped to [0, 1], hence `a` above one makes the colour lighter. The
    result is an RGB tuple.

    .. _question: https://stackoverflow.com/questions/37765197/
    """
    from colorsys import hls_to_rgb, rgb_to_hls
    from matplotlib.colors import to_rgb

    hue, lightness, saturation = rgb_to_hls(*to_rgb(color))

    # keep the scaled lightness within the valid range
    lightness = max(0, min(a * lightness, 1))
    return hls_to_rgb(hue, lightness, saturation)

The common trade-off plotting procedure

In [None]:
from matplotlib.ticker import FormatStrFormatter, FuncFormatter
from matplotlib.collections import LineCollection

def plot_performance_compression_plot(group, data):
    """Produce the performance compression plot.

    Parameters
    ----------
    group : dict
        The key of the plotted group. Its `dataset`, `model`, `features` and
        `threshold` fields are put in the title, and the `(dataset, model)`
        pair picks the limits of the y-axis from `ylim_pairs`.

    data : pandas.DataFrame
        The experiments of the group with columns `kind`, `method`, `dense`,
        `pre-fine-tune`, `post-fine-tune` and `compression`.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure with the plot. It is not closed here, so the caller is
        expected to save and close it.

    Raises
    ------
    KeyError
        If `ylim_pairs` has no limits for the `(dataset, model)` pair, or
        `kind_method_color` has no colour for a `(kind, method)` pair
        found in `data`.

    Things tried
    ------------
    Tried saturation contrasting (poor), used quiver (arrow heads are confusing)
    superimposed on to scatter (not good), using marker styles (bad), good idea
    was to swap C1 and C2 above so that related models (that have similar performance)
    have contrasting colours. used quiver alone (poor). Finally decided to use plain
    lines. Hopefully this conveys that the pre/post fine-tune performance may differ.
    """
    title = "Trade-off on {dataset} for {model} ({features}) ($\\tau = {threshold}$)".format(**group)
    fig, ax = plt.subplots(1, 1, figsize=(8, 5), dpi=300)

    ax.set_title(title)
    ax.set_xscale("log")
    ax.set_ylabel("accuracy")
    ax.set_xlabel("compression")

    # show the compression rate as a multiplier, e.g. `x100`
    ax.xaxis.set_major_formatter(FuncFormatter(lambda x, p: f"$\\times${int(x):d}"))

    ax.set_ylim(ylim_pairs[group["dataset"], group["model"]])
    ax.set_xlim(1, 1.5e3)

    # grid and the adequacy zone
    ax.grid(axis='x', which="major", c="k", alpha=0.1, zorder=-20)
    ax.axvspan(50, 500, color="k", alpha=0.05, zorder=-10)

    # draw the scatter plot of compression-accuracy pairs
    for (kind, method), df in data.groupby(["kind", "method"]):
        label = f"{kind} {method}"
        color = kind_method_color[kind, method]

        # draw the `dense` min-max band and median in the background
        ax.axhline(
            df["dense"].median(), color=darker(color, 1.5),
            alpha=0.75, lw=1, zorder=-10)
        ax.axhspan(
            df["dense"].min(), df["dense"].max(), color=darker(color, 1.7),
            alpha=0.15, lw=0, zorder=-15)

        # performance jump using line collection and final endpoint scatter:
        #  a vertical segment from the pre- to the post-fine-tune performance
        #  at the compression rate of each experiment
        c = df['compression']
        z, a = df['post-fine-tune'], df['pre-fine-tune']
        ax.add_collection(LineCollection(
            np.array([*zip(zip(c, a), zip(c, z))]),
            colors=[darker(color, 0.5)], lw=1, alpha=0.125, zorder=+5
        ))
        ax.scatter(c, z, c="k", edgecolor=[color], lw=1, s=5,
                   marker="o", label=label, alpha=1.0, zorder=+10)

    ax.legend(ncol=2, loc="lower left")
    return fig

Method colour coding scheme:
* fft and raw features are never mixed

In [None]:
# Colour of each `(kind, method)` pair. The kinds `R*2` (fft) and `R` (raw)
#  share one colour, and so do `C` (fft) and `C/2` (raw): fft and raw features
#  are never put on the same plot. The colours of different methods differ
#  to keep similar models distinguishable.
kind_method_color = {
    (kind, method): colour
    for method, real, cplx in [("VD", "C0", "C2"), ("ARD", "C1", "C3")]
    for kind, colour in [("R*2", real), ("R", real), ("C", cplx), ("C/2", cplx)]
}

y-axis limits for clearer picture

In [None]:
# Limits of the y-axis for every `(dataset, model)` pair: each row below lists
#  the dataset, the limits for the dense model and the limits for the conv model
ylim_pairs = {
    (dataset, model): limits
    for dataset, dense, conv in [
        ("emnist_letters", (0.75, 0.87), (0.87, 0.91)),
        ("fashionmnist", (0.81, 0.87), (0.84, 0.90)),
        ("kmnist", (0.70, 0.875), (0.85, 0.945)),
        ("mnist", (0.93, 0.98), (0.975, 0.995)),
    ]
    for model, limits in [
        ("TwoLayerDenseModel", dense),
        ("SimpleConvModel", conv),
    ]
}

Plot for all groups.

In [None]:
for key, df in tqdm.tqdm(grouper, desc="populating plots"):
    # the key lists the values of the fields in the order of grouping
    group = dict(zip(fields, key))
    df = df.drop(columns=fields)

    # comparability filter: keep the kinds paired for the features of the group
    pair = comparable_pairs[group["features"]]
    df = df.loc[df["kind"].apply(lambda kind: kind in pair)]

    fig = plot_performance_compression_plot(group, df)
    fig.patch.set_alpha(1.0)

    # one pdf per group, named after the prefix and the key of the group
    template = PREFIX + "{model}__{dataset}__{features}__{threshold}"
    filename = template.format(**group)
    fig.savefig(os.path.join(report_target, filename + ".pdf"), dpi=300)

    # release the figure instead of showing it
    plt.close()

<br>

In [None]:
# NOTE(review): presumably halts a top-to-bottom run of the notebook here,
#  before the exploratory cells below -- confirm
assert False

<br>

What is inside?

In [None]:
# experiments with the conv model on raw features of mnist
df_main[
    (df_main["dataset"] == "mnist")
    & (df_main["model"] == "SimpleConvModel")
    & (df_main["features"] == "raw")
].sort_values(["kind", "model", "method", "kl_div"])

In [None]:
# the average compression rate within every (model, kind, features, method, kl_div) group
df = df_main.groupby(["model", "kind", "features", "method", "kl_div"])['compression'].mean()

In [None]:
# Split the average compression by the kind of the model. The grouping is done
#  by the name of the index level and not by a one-element list: in pandas 2.0
#  and later the latter yields one-element tuples as keys, e.g. `("C",)`, and
#  the lookups such as `mod["C"]` would fail.
mod = {k: df.xs(k, level="kind") for k, _ in df.groupby(level="kind")}

In [None]:
# ratio of the average compression of `C` to that of `R*2` on `fft` features,
#  plotted separately for every (model, method) pair
ratio_fft = mod["C"].loc[:, 'fft'] / mod["R*2"].loc[:, 'fft']
ratio_fft.groupby(["model", "method"]).plot()
plt.legend()

In [None]:
# ratio of the average compression of `C/2` to that of `R` on `raw` features,
#  plotted separately for every (model, method) pair
ratio_raw = mod["C/2"].loc[:, 'raw'] / mod["R"].loc[:, 'raw']
ratio_raw.groupby(["model", "method"]).plot()
plt.legend()

In [None]:
# ratio of the average compression of `C` to that of `R` on `raw` features,
#  plotted separately for every (model, method) pair
ratio_raw = mod["C"].loc[:, 'raw'] / mod["R"].loc[:, 'raw']
ratio_raw.groupby(["model", "method"]).plot()
plt.legend()

In [None]:
# ratio of the average compression of `C` to that of `R` on `fft` features,
#  plotted separately for every (model, method) pair
ratio_fft = mod["C"].loc[:, 'fft'] / mod["R"].loc[:, 'fft']
ratio_fft.groupby(["model", "method"]).plot()
plt.legend()

In [None]:
# Averages over the VD experiments with the conv model on mnist, which have
#  `kl_div` strictly between 0.01 and 0.1. Only the numeric columns are
#  averaged: the table has text columns, e.g. `dataset`, and in pandas 2.0 and
#  later `.mean()` fails on them unless `numeric_only=True` is given.
df = (df_main.loc[(
    df_main.dataset=="mnist"
)&(
    df_main.model=="SimpleConvModel"
)&(
    df_main.method=="VD"
)&(
    df_main.kl_div > .01
)&(
    df_main.kl_div < .1
)]\
.groupby(["kl_div", "model", "method", "features", "kind"]).mean(numeric_only=True)\
.sort_values(["kl_div", "model", "method", "features", "kind"])
)

In [None]:
# display the current `df` table
df

In [None]:
# Averages within every (kl_div, model, method, features, kind) group. Only the
#  numeric columns are averaged: in pandas 2.0 and later `.mean()` fails on
#  text columns, e.g. `dataset`, unless `numeric_only=True` is given.
df = df_main.groupby(["kl_div", "model", "method", "features", "kind"]).mean(numeric_only=True)

In [None]:
# Averages over the VD experiments with the conv model on mnist, which have
#  `kl_div` above 0.1. Only the numeric columns are averaged: in pandas 2.0 and
#  later `.mean()` fails on text columns, e.g. `dataset`, unless
#  `numeric_only=True` is given.
(df_main.loc[(
    df_main.dataset=="mnist"
)&(
    df_main.model=="SimpleConvModel"
)&(
    df_main.method=="VD"
)&(
    df_main.kl_div > .1
)]\
.groupby(["kl_div", "model", "method", "features", "kind"]).mean(numeric_only=True)\
.sort_values(["kl_div", "model", "method", "features", "kind"])
)

In [None]:
cmp = df_main["compression"]

# experiments with the compression rate between 90 and 200, both inclusive,
#  ordered by the method and then by the `kl_div` coefficient
df = df_main.loc[(cmp >= 90) & (cmp <= 200)].sort_values(["method", "kl_div"])

df

In [None]:
# join the levels of the index of every row in `df` into a single path
[os.path.join(*exp) for exp in df.index]

<br>