# Analyze optimization settings

Import Python modules:

In [1]:
import altair as alt

import pandas as pd

Read the data:

In [2]:
# Path to the aggregated fit results analyzed in this notebook.
csv_file = "results/opt_settings/aggregated_fit_results.csv"
print(f"Reading results from {csv_file}")

# Load the results table; each row holds the metrics and settings of one fit.
data = pd.read_csv(filepath_or_buffer=csv_file)

# Bare expression so the notebook renders the loaded table.
data

Reading results from results/opt_settings/aggregated_fit_results.csv


Unnamed: 0,loss,time,corr_prob_escape,corr_mut_escape,noise,sitefirst,collapse,reg_escape_weight,reg_spread_weight
0,19.327,1892.100,0.99715,0.87362,exact,sitefirst,collapsed,0.0,0.00
1,71.593,793.360,0.99351,0.80293,exact,sitefirst,collapsed,0.0,0.05
2,123.900,423.450,0.99066,0.74527,exact,sitefirst,collapsed,0.0,0.25
3,147.790,418.940,0.99052,0.74454,exact,sitefirst,collapsed,0.0,0.50
4,182.760,407.990,0.99024,0.73962,exact,sitefirst,collapsed,0.0,1.00
...,...,...,...,...,...,...,...,...,...
235,1210.400,79.251,0.98328,0.58208,noisy,nosite,uncollapsed,0.1,0.00
236,1213.400,100.810,0.98370,0.59669,noisy,nosite,uncollapsed,0.1,0.05
237,1225.000,69.333,0.98460,0.68272,noisy,nosite,uncollapsed,0.1,0.25
238,1238.800,78.187,0.98522,0.62480,noisy,nosite,uncollapsed,0.1,0.50


Define the columns that have:

 1. evaluation metrics
 2. datasets
 3. optimization parameters

In [3]:
# Columns holding evaluation metrics of each fit.
metrics = ["loss", "time", "corr_prob_escape", "corr_mut_escape"]
# Columns identifying which dataset was fit.
datasets = ["noise"]
# Every remaining column is treated as an optimization parameter.
params = [col for col in data.columns if not (col in datasets or col in metrics)]
# Sanity check: the three groups together account for exactly the columns in `data`
# (this fails if a name listed in `metrics` or `datasets` is missing from the table).
assert set(data.columns) == {*metrics, *datasets, *params}

## Fitting a site model first is beneficial
First check if it's beneficial to fit a site model first.
To determine this, we plot the difference in each metric between using the site model and **not** using a site model.

In terms of model fitting, the site model is essentially universally better: 

 - The loss is always either better (more negative) for the site model or about the same.
 - The correlations with the true values are nearly always better (larger) for the site model.
 - Fitting a site model first does usually increase the fitting time, but the increases are only large for the exact data without regularization on escape values, which isn't really the most plausible dataset.

In [4]:
# Compare fits made with vs. without first fitting a site model.
#
# For every metric, pair up the fits that share the same dataset and the same
# remaining parameters, and compute (value with "sitefirst") - (value with
# "nosite"). Positive numbers therefore mean the metric is larger when a site
# model is fit first.

# "sitefirst" is the variable being compared, so it is no longer one of the
# grouping parameters. Build a new list instead of calling `params.remove(...)`
# so that re-running this cell does not raise `ValueError`.
params = [p for p in params if p != "sitefirst"]
index_cols = datasets + params

dfs = []
for metric in metrics:
    df = (
        # one row per dataset/parameter combination, one column per "sitefirst" value
        data.pivot_table(index=index_cols, columns="sitefirst", values=metric)
        .assign(diff=lambda wide: wide["sitefirst"] - wide["nosite"])
        .drop(columns=["nosite", "sitefirst"])
        # name the difference column after the metric so the frames can be joined
        .rename(columns={"diff": metric})
    )
    dfs.append(df)

# Join the per-metric differences side by side, then reshape to long format
# (one row per dataset/parameter combination and metric) for faceted plotting.
sitemodel_df = (
    pd.concat(dfs, axis=1)
    .reset_index()
    .melt(
        id_vars=index_cols,
        var_name="metric",
        value_name="increase from site first",
    )
)

assert len(datasets) == 1, "plot only works for one dataset"

# Box plot of the differences: one facet per metric, one box per dataset value.
# Each facet gets its own y-scale because the metrics have very different ranges.
sitemodel_chart = (
    alt.Chart(sitemodel_df)
    .encode(
        x=datasets[0],
        y="increase from site first",
        tooltip=params,
    )
    .mark_boxplot()
    .properties(width=75, height=200)
    .facet(column="metric")
    .resolve_scale(y="independent")
)

sitemodel_chart

Given the above, we will continue analyzing only the case where a site model is fit first.
We also drop loss: all the subsequent analyses compare models with different parameters, so loss isn't a meaningful metric:

In [5]:
# Keep only the fits where a site model was fit first, then drop the now-constant
# "sitefirst" column along with "loss".
data = (
    data[data["sitefirst"] == "sitefirst"]
    .drop(["sitefirst", "loss"], axis="columns")
    .reset_index(drop=True)
)
# "loss" is no longer a column of `data`, so stop treating it as a metric.
_ = metrics.remove("loss")

## Collapsing identical variants is probably helpful and at least doesn't hurt
Now check if it's helpful to collapse identical variants.
The results below show that it is probably helpful, and at least not deleterious.
It definitely shortens the time. The correlations with the actual values seem about equivalent with or without collapsing.
So overall, collapsing seems to reduce fitting time without coming at a cost to accuracy.

In [6]:
# Compare fits made with vs. without collapsing identical variants.
#
# For every metric, pair up the fits that share the same dataset and the same
# remaining parameters, and compute (value with "collapsed") - (value with
# "uncollapsed"). Positive numbers therefore mean the metric is larger when
# variants are collapsed.

# "collapse" is the variable being compared, so it is no longer one of the
# grouping parameters. Build a new list instead of calling `params.remove(...)`
# so that re-running this cell does not raise `ValueError`.
params = [p for p in params if p != "collapse"]
index_cols = datasets + params

dfs = []
for metric in metrics:
    df = (
        # one row per dataset/parameter combination, one column per "collapse" value
        data.pivot_table(index=index_cols, columns="collapse", values=metric)
        .assign(diff=lambda wide: wide["collapsed"] - wide["uncollapsed"])
        .drop(columns=["collapsed", "uncollapsed"])
        # name the difference column after the metric so the frames can be joined
        .rename(columns={"diff": metric})
    )
    dfs.append(df)

# Join the per-metric differences side by side, then reshape to long format
# (one row per dataset/parameter combination and metric) for faceted plotting.
collapse_df = (
    pd.concat(dfs, axis=1)
    .reset_index()
    .melt(
        id_vars=index_cols,
        var_name="metric",
        value_name="increase from collapsing",
    )
)

assert len(datasets) == 1, "plot only works for one dataset"

# Box plot of the differences: one facet per metric, one box per dataset value.
# Each facet gets its own y-scale because the metrics have very different ranges.
collapse_chart = (
    alt.Chart(collapse_df)
    .encode(
        x=datasets[0],
        y="increase from collapsing",
        tooltip=params,
    )
    .mark_boxplot()
    .properties(width=75, height=200)
    .facet(column="metric")
    .resolve_scale(y="independent")
)

collapse_chart

Given the above, we will continue with just the collapsed variants:

In [7]:
# Keep only the fits made on collapsed variants, then drop the now-constant
# "collapse" column.
data = (
    data[data["collapse"] == "collapsed"]
    .drop("collapse", axis="columns")
    .reset_index(drop=True)
)

## Look at how other parameters affect accuracy and fitting time

In [8]:
# Reshape to fully long format in two steps so every row holds a single
# (parameter, parameter_value, metric, metric_value) observation:
#   1. unpivot the parameter columns, keeping dataset and metric columns as ids;
#   2. unpivot the metric columns, keeping dataset and parameter columns as ids.
params_data = (
    data.melt(
        id_vars=[*datasets, *metrics],
        value_vars=params,
        var_name="parameter",
        value_name="parameter_value",
    )
    .pipe(
        pd.melt,
        id_vars=datasets + ["parameter", "parameter_value"],
        value_vars=metrics,
        var_name="metric",
        value_name="metric_value",
    )
)

# Show the first rows of the reshaped table.
params_data.head()

Unnamed: 0,noise,parameter,parameter_value,metric,metric_value
0,exact,reg_escape_weight,0.0,time,1892.1
1,exact,reg_escape_weight,0.0,time,793.36
2,exact,reg_escape_weight,0.0,time,423.45
3,exact,reg_escape_weight,0.0,time,418.94
4,exact,reg_escape_weight,0.0,time,407.99


In [9]:
# Scatter plots of each metric (rows) against each parameter's value (columns),
# with points colored by the "noise" column. Axes are resolved independently per
# facet, and the y-axis neither includes zero nor is rounded to "nice" limits.
params_chart = (
    alt.Chart(params_data)
    .mark_point()
    .encode(
        x="parameter_value",
        y=alt.Y("metric_value", scale=alt.Scale(zero=False, nice=False)),
        color="noise",
        tooltip=datasets + ["metric", "metric_value", "parameter", "parameter_value"],
    )
    .facet(row="metric", column="parameter")
    .resolve_scale(y="independent", x="independent")
)

# Bare expression so the notebook renders the chart.
params_chart