# Imports

Import the packages we need

In [None]:
import numpy as np
import pickle
import pandas as pd
from sqlite3 import connect
import json

Weird little matplotlib dance...

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw a throwaway one-point scatter plot before the backend is changed in the next cell.
# NOTE(review): presumably this forces matplotlib/seaborn to initialise under the default
# backend first — TODO confirm why this is required.
df = pd.DataFrame(data={ 'x': [1], 'y': [1] })
sns.scatterplot(data=df, x="x", y="y")

In [None]:
import matplotlib
# Switch to the pgf backend and render all text with LaTeX (pdflatex, serif fonts).
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    # NOTE(review): presumably leaves font setup to LaTeX instead of matplotlib's rc fonts — confirm.
    'pgf.rcfonts': False,
})

Create the `graphs` folder if it does not exist yet

In [None]:
import os
# Create the output folder for all figures/tables; exist_ok avoids the
# check-then-create race and makes the cell safe to re-run.
os.makedirs('graphs', exist_ok=True)

# Functions

Here we define some helper functions that are required

In [None]:
# https://stackoverflow.com/a/56253636/3142952

def legend_without_duplicate_labels(ax):
    """Draw the legend of `ax`, keeping only the first handle of every distinct label."""
    handles, labels = ax.get_legend_handles_labels()

    kept_handles, kept_labels = [], []
    for handle, label in zip(handles, labels):
        if label in kept_labels:
            # A previous artist already carries this label.
            continue
        kept_handles.append(handle)
        kept_labels.append(label)

    if kept_labels:
        ax.legend(tuple(kept_handles), tuple(kept_labels))
    else:
        # Nothing labelled: fall back to a bare legend() call.
        ax.legend()

In [None]:
def plot_scatter(y_test, yhat, var, name):
    """Scatter predicted against actual values and save the figure as graphs/<name>.pdf.

    Parameters
    ----------
    y_test, yhat : arrays indexed as ``[:, column]``
        Actual and predicted values. Column 0 is used for 'lai', column 1 for 'cm'.
    var : str
        'lai', 'cm' or 'agb'. For 'agb' the product of columns 0 and 1 is plotted,
        unless ``y_test`` has a single column, in which case column 0 is used directly.
    name : str
        File stem of the PDF written into the ``graphs`` folder.

    Raises
    ------
    ValueError
        If ``var`` is not one of the supported names (previously this surfaced as an
        opaque "cannot unpack NoneType" error).
    """
    def get_x_y_color():
        if y_test.shape[1] == 1 and var == 'agb':
            return (y_test[:,0], yhat[:,0], 'green')

        if var == 'lai':
            return (y_test[:,0], yhat[:,0], 'blue')
        elif var == 'cm':
            return (y_test[:,1], yhat[:,1], 'orange')
        elif var == 'agb':
            return (y_test[:,0]*y_test[:,1], yhat[:,0]*yhat[:,1], 'green')

        raise ValueError("unsupported var {!r}; expected 'lai', 'cm' or 'agb'".format(var))

    # Resolve the data first so an invalid `var` does not leave an empty figure behind.
    x, y, color = get_x_y_color()

    fig, ax = plt.subplots(figsize=(6, 4))

    # Both axes use the range of the actual values, so predictions outside
    # that range fall outside the visible area.
    lower, upper = np.min(x), np.max(x)
    ax.set_xlim([lower, upper])
    ax.set_ylim([lower, upper])

    # `palette` is dropped: without a `hue` variable it has no effect, `color` decides the colour.
    sns.scatterplot(x=x, y=y, color=color, ax=ax)
    ax.set_xlabel("Actual value")
    ax.set_ylabel("Predicted value")

    fig.savefig('graphs/{}.pdf'.format(name))

In [None]:
import collections

def flatten(d, parent_key='', sep='_'):
    """Flatten a nested mapping into a single-level dict.

    Keys of nested mappings are joined to their parent key with `sep`
    (e.g. ``{'a': {'b': 1}}`` becomes ``{'a_b': 1}``). Keys are concatenated
    as strings, so they must be `str`. Empty nested mappings contribute nothing.

    Parameters
    ----------
    d : mapping to flatten.
    parent_key : prefix for every produced key (used by the recursion).
    sep : separator placed between the key levels.
    """
    # `collections.MutableMapping` was removed in Python 3.10; the ABC lives in collections.abc.
    from collections.abc import MutableMapping

    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

def explode_dicts(train, test):
    """Flatten `train` and `test` into one dict whose keys are prefixed with 'train_' / 'test_'."""
    combined = dict(train=train, test=test)
    return flatten(combined)

In [None]:
def map_df_table(df, pairs):
    """Build a per-model summary table of "mean (σ = std)" strings.

    Parameters
    ----------
    df : DataFrame with a 'model_name_alt' column plus the metric columns named in `pairs`.
    pairs : sequence of ``(column_name, header)`` tuples; one output column per tuple.

    Returns
    -------
    DataFrame indexed by 'Model' (groups in order of first appearance) whose cells are
    LaTeX-ready strings, or NaN where the group mean is NaN.
    """
    stats = df.groupby('model_name_alt', sort=False).describe()

    def maprow(row):
        if np.isnan(row['mean']):
            return np.nan

        # Raw string: "\s" is not a valid escape sequence in a normal string literal.
        return r"{:,.4f} ($\sigma$ = {:,.4f})".format(row['mean'], row['std'])

    # Iterate over the groups themselves (not over the first pair's frame) so an
    # empty `pairs` yields an empty table instead of an IndexError.
    items = []
    for model_name in stats.index:
        cells = [ maprow(stats[column].loc[model_name]) for column, _ in pairs ]
        items.append([ model_name, *cells ])

    table = pd.DataFrame(data=items, columns=['Model', *[header for _, header in pairs]], copy=False)
    table = table.set_index('Model')
    return table

# Load data

Connect to the sqlite3 database

In [None]:
# Module-level SQLite connection shared by the data-loading cells and `get_items`.
# It is not closed explicitly anywhere in the visible cells.
conn = connect('./db.db')

Uncomment the next cell to load the cached results from the csv file instead of gathering the data from the sqlite3 database.
That csv file is written by the code cell after the next one (the one that fetches the data from the database).

In [None]:
#all_df = pd.read_csv('./cross-results-test.csv', index_col=0)

Fetch the data from the database. This cell can be skipped if the previous cell has been run.

This cell takes a little while to run, so a csv file is written to skip this if you want to rerun the notebook.

In [None]:
%%time

all_matrix = []
def apply(cursor):
    """Append one flat record per row of `cursor` to the module-level `all_matrix` list.

    Each row is read by position (columns 0-9 of the SELECT that feeds it). The three
    JSON error blobs are parsed and flattened into 'art_train_*', 'art_test_*' and
    'field_test_*' keys; a missing field-test blob is treated as an empty object.
    """
    for row in cursor:
        errors = {
            'art': {
                'train': json.loads(row[5]),
                'test': json.loads(row[6]),
            },
            'field': {
                # LEFT JOIN: there may be no field rerun for this row.
                'test': json.loads(row[7] or '{}'),
            },
        }

        record = {
            'model': row[0],
            'tag': row[1],
            'param': row[2],
            'train_time': row[3],
            'run_time': row[4],
        }
        record.update(flatten(errors))
        record['run_id'] = row[8]
        record['method_specific'] = json.loads(row[9])

        all_matrix.append(record)

apply(conn.execute(
'''
SELECT
	run.model,
	run.tag,
	JSON_EXTRACT(art_row.res, '$.param') AS param,
	JSON_EXTRACT(art_row.res, '$.taken') AS train_time,
	JSON_EXTRACT(field_row.res, '$.taken') AS run_time,
	JSON_EXTRACT(art_row.res, '$.train_err') AS art_train_err,
	JSON_EXTRACT(art_row.res, '$.test_err') AS art_test_err,
	JSON_EXTRACT(field_row.res, '$.test_err') AS field_test_err,
	run.run_id,
    JSON_EXTRACT(art_row.res, '$.method_specific') AS art_method_specific
FROM rows AS art_row
LEFT JOIN rerun_rows AS field_row
	ON field_row.run_id=art_row.run_id AND field_row.iter=art_row.iter
JOIN runs AS run
	ON art_row.run_id=run.run_id
WHERE run.test_size = 0.5
'''
))
    
# One row per record collected in `all_matrix`; cached as csv so the database query can be skipped on a re-run.
# NOTE(review): `method_specific` holds parsed JSON objects here; after a csv round trip it
# presumably comes back as text — verify before using the cached csv with cells that index into it.
all_df = pd.DataFrame(data=all_matrix, copy=False)
all_df.to_csv('./cross-results-test.csv')

- Convert MAPE values from fractions to percentages.
- Fix the RMSE values being wrong, the database contains MSE values. So we convert them to RMSE by taking the square root of all the values.

In [None]:
# In-place unit conversion on every matching column:
#   *_mape_* : fraction -> percent, *_rmse_* : stored MSE -> RMSE (square root).
# NOTE: not idempotent — running this cell twice scales/roots the values a second time.
all_df[all_df.filter(regex='.+_mape_.+').columns] *= 100
all_df[all_df.filter(regex='.+_rmse_.+').columns] **= 1/2

Per row:
- Set the model name column
- Set the alternative model name column (this one contains 'single-output')

Then the rows are sorted by a specified order for the models, so that all figures follow the same ordering of the models that the thesis' text is structured around.

In [None]:
def handle_row(x):
    """Return ``(model name, alternative model name, sort position)`` for a result row.

    `x` only needs a ``model`` attribute. The alternative name carries a
    ' single-output' suffix for the ``*_agb`` models. Unknown model codes yield None.
    """
    names_by_model = {
        'rf': ('Random forests', 'Random forests', 0),
        'gp': ('Gaussian process', 'Gaussian process', 1),
        'ak': ('AutoKeras', 'AutoKeras', 2),
        'ask': ('Auto-sklearn', 'Auto-sklearn', 3),
        'rf_agb': ('Random forests', 'Random forests single-output', 4),
        'gp_agb': ('Gaussian process', 'Gaussian process single-output', 5),
        'ak_agb': ('AutoKeras', 'AutoKeras single-output', 6),
        'ask_agb': ('Auto-sklearn', 'Auto-sklearn single-output', 7),
    }
    return names_by_model.get(x.model)

# Derive the naming columns from each row's own values. The previous loop fed index
# *labels* to `.iloc` (positional), which is only correct while label == position.
model_info = [handle_row(row) for row in all_df.itertuples(index=False)]
all_df['model_name'] = [info[0] for info in model_info]
all_df['model_name_alt'] = [info[1] for info in model_info]
all_df['model_index'] = [info[2] for info in model_info]

# Stable sort into the thesis' model order. `ignore_index=True` renumbers the rows so
# that label == position still holds afterwards: later cells look rows up with
# `all_df.iloc[i]` for `i` taken from a filtered frame's index, which would otherwise
# return a different row once the sort has permuted the labels.
all_df = all_df.sort_values(by='model_index', kind='stable', ignore_index=True)

Create some useful dataframes that point to `all_df`:
- `baseline_df` contains the rows for the baseline models.
- `baseline_df_multi` contains the rows for the multi-output baseline models.
- `baseline_df_single` contains the rows for the single-output baseline models.

In [None]:
# Baseline runs: param == 250 for the AutoKeras models ("ak", "ak_agb"), param == 16000 for every other model.
baseline_df = all_df.query('((model != "ak" & model != "ak_agb") & param == 16000) | ((model == "ak" | model == "ak_agb") & param == 250)')
# Multi-output baselines: everything except the "*_agb" models.
baseline_df_multi = baseline_df.query('model != "ak_agb" and model != "ask_agb" and model != "rf_agb" and model != "gp_agb"')
# Single-output baselines: only the "*_agb" models.
baseline_df_single = baseline_df.query('model == "ak_agb" or model == "ask_agb" or model == "rf_agb" or model == "gp_agb"')

Get the raw rows for the baseline models (we call these rows 'items'). These rows contain method-specific results which are used in the auxiliary results graphs.

The function `get_items` is also used in some places to get the data for _all_ the models, not just the baseline.

`get_items` returns a generator, so one can efficiently use all the results with low memory overhead.

In [None]:
def get_items(model, param):
    """Lazily yield the raw result objects of all 'norm'-tagged runs of `model`.

    Parameters
    ----------
    model : str
        Model code stored in ``runs.model`` (e.g. 'rf', 'ak_agb').
    param : number or None
        When not None, only runs whose ``runs.params`` equals this value are returned.

    Returns
    -------
    generator of dict
        ``{'art': <parsed rows.res>, 'field': <parsed rerun_rows.res>}`` per row;
        'field' is an empty dict when no matching rerun row exists (LEFT JOIN).
    """
    # Values are bound as parameters instead of being formatted into the SQL text,
    # and string literals use single quotes (double quotes denote identifiers in SQL).
    query = '''
    SELECT
        art_row.res,
        field_row.res
    FROM rows AS art_row
    LEFT JOIN rerun_rows AS field_row
        ON field_row.run_id=art_row.run_id AND field_row.iter=art_row.iter
    JOIN runs AS run
        ON art_row.run_id=run.run_id
    WHERE
        run.test_size = 0.5
        AND run.model = ?
        AND tag = 'norm'
    '''
    args = [model]
    if param is not None:
        query += ' AND run.params = ?'
        args.append(param)

    cur = conn.execute(query, args)
    return ({ 'art': json.loads(row[0]), 'field': json.loads(row[1] or '{}') } for row in cur)

In [None]:
# Materialise the AutoKeras baseline items (param 250); one list() is enough to drain the generator.
ak_items = list(get_items('ak', 250))
ak_agb_items = list(get_items('ak_agb', 250))

In [None]:
# Materialise the Auto-sklearn baseline items (param 16000); one list() is enough to drain the generator.
ask_items = list(get_items('ask', 16000))
ask_agb_items = list(get_items('ask_agb', 16000))

In [None]:
rf_items = list(get_items('rf', 16000))
rf_agb_items = list(get_items('rf_agb', 16000))

In [None]:
gp_items = list(get_items('gp', 16000))
gp_agb_items = list(get_items('gp_agb', 16000))

Show an example of the method specific results per model

In [None]:
print('ak', ak_items[0]['art']['method_specific'])
print('ask', ask_items[0]['art']['method_specific'])
print('rf', rf_items[0]['art']['method_specific'])
print('gp', gp_items[0]['art']['method_specific'])

# Artificial data

## Combined results

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_ylim([5,30])

df = all_df.query('tag == "norm" and (model == "ak" or model == "ask" or model == "gp" or model == "rf")')

# One log-x regression curve per multi-output model, drawn in this order.
for model_filter, label in (
    ('model == "gp"', "Gaussian process"),
    ('model == "rf"', "Random forests"),
    ('model == "ak"', "AutoKeras"),
    ('model == "ask"', "Auto-sklearn"),
):
    sns.regplot(x="train_time", y="art_test_mape_product", data=df.query(model_filter), logx=True, scatter=False, label=label, ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("AGB Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/all-art-time-logx-mean.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_ylim([5,30])

# Multi-output models only, y axis clipped to 5-30 %.
df = all_df.query('tag == "norm" and (model == "ak" or model == "ask" or model == "gp" or model == "rf")')
sns.scatterplot(x="train_time", y="art_test_mape_product", data=df, hue='model_name', ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("AGB Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/all-art-time-scatter-mean.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

# Multi-output models only, full y range.
df = all_df.query('tag == "norm" and (model == "ak" or model == "ask" or model == "gp" or model == "rf")')
sns.scatterplot(x="train_time", y="art_test_mape_product", data=df, hue='model_name', ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("AGB Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/all-art-time-scatter-multi.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_ylim([10,30])

# Bin the training time to the nearest 500 s so runs share x positions.
# The bin is computed from each row's own value on a new frame: the old loop read
# `all_df.iloc[label]` (a positional lookup with an index label) and wrote into a query result.
df = (
    all_df
    .query('tag == "norm" and (model == "ak" or model == "ask" or model == "gp" or model == "rf")')
    .assign(train_time=lambda runs: (runs['train_time'] / 500).round() * 500)
)

sns.lineplot(x="train_time", y="art_test_mape_product", data=df, hue='model_name', ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("AGB Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/all-art-time-line-multi.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

# Single-output ("*_agb") models only.
df = all_df.query('tag == "norm" and (model == "ak_agb" or model == "ask_agb" or model == "gp_agb" or model == "rf_agb")')
sns.scatterplot(x="train_time", y="art_test_mape_product", data=df, hue='model_name', ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("AGB Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/all-art-time-scatter-single.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_ylim([10,30])

# Bin the training time to the nearest 500 s so runs share x positions.
# The bin is computed from each row's own value on a new frame: the old loop read
# `all_df.iloc[label]` (a positional lookup with an index label) and wrote into a query result.
df = (
    all_df
    .query('tag == "norm" and (model == "ak_agb" or model == "ask_agb" or model == "gp_agb" or model == "rf_agb")')
    .assign(train_time=lambda runs: (runs['train_time'] / 500).round() * 500)
)

sns.lineplot(x="train_time", y="art_test_mape_product", data=df, hue='model_name', ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("AGB Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/all-art-time-line-single.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 6))

ax.set_xlim([0, 60*40])

# One scatter series per AutoML framework (multi-output, normal runs).
for model_filter, label in (
    ('model == "ak" and tag == "norm"', "AutoKeras"),
    ('model == "ask" and tag == "norm"', "Auto-sklearn"),
):
    sns.scatterplot(x="train_time", y="art_test_mape_product", data=all_df.query(model_filter), label=label, ax=ax)

# The per-model frames are not kept around after plotting.
ak_df = ask_df = None

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("AGB Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/automl-art-time-scatter.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_xlim([0, 60*40])

# One log-x regression curve per AutoML framework (multi-output, normal runs).
for model_filter, label in (
    ('model == "ak" and tag == "norm"', "AutoKeras"),
    ('model == "ask" and tag == "norm"', "Auto-sklearn"),
):
    sns.regplot(x="train_time", y="art_test_mape_product", data=all_df.query(model_filter), label=label, logx=True, scatter=False, ax=ax)

# The per-model frames are not kept around after plotting.
ak_df = ask_df = None

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("AGB Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/automl-art-time-logx.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_xlim([0, 60*40])

# Log-x regression of the AGB RMSE against training time, one curve per AutoML framework.
ak_df = all_df.query('model == "ak" and tag == "norm"')
sns.regplot(x="train_time", y="art_test_rmse_product", data=ak_df, label="AutoKeras", logx=True, scatter=False, ax=ax)
ak_df = None

ask_df = all_df.query('model == "ask" and tag == "norm"')
sns.regplot(x="train_time", y="art_test_rmse_product", data=ask_df, label="Auto-sklearn", logx=True, scatter=False, ax=ax)
ask_df = None

ax.set_xlabel("Time taken (s)")
# The plotted metric is the RMSE (art_test_rmse_product), not the MAPE the label used to claim.
ax.set_ylabel("AGB Root mean squared error [g cm$^{-2}$]")
ax.legend()

fig.savefig('graphs/automl-art-time-logx-rmse.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_ylim([0, 25])

# One strip layer per target variable of the multi-output baselines.
for column, color, label in (
    ("art_test_mape_lai", 'blue', 'LAI'),
    ("art_test_mape_cm", 'orange', 'cm'),
    ("art_test_mape_product", 'green', 'AGB'),
):
    sns.stripplot(x="model_name", y=column, data=baseline_df_multi.query('tag == "norm"'), color=color, label=label, ax=ax)

ax.set_xlabel("Model")
ax.set_ylabel("Mean absolute percentage error (%)")

# Every layer repeats its label once per model; keep one legend entry per label.
legend_without_duplicate_labels(ax)

fig.savefig('graphs/models-strip-art-multi.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_ylim([0, 25])

# AGB error of the single-output baselines, one strip per model.
single_norm_runs = baseline_df_single.query('tag == "norm"')
sns.stripplot(x="model_name", y="art_test_mape_product", data=single_norm_runs, color='green', label='AGB', ax=ax)

ax.set_xlabel("Model")
ax.set_ylabel("Mean absolute percentage error (%)")

legend_without_duplicate_labels(ax)

fig.savefig('graphs/models-strip-art-single.pdf')

In [None]:
# RMSE summary table of the baseline models, exported as LaTeX and shown as cell output.
out = map_df_table(
    baseline_df.query('tag == "norm"'),
    [ ('art_test_rmse_lai', 'LAI RMSE')
    , ('art_test_rmse_cm', '$C_m$ [g cm$^{-2}$] RMSE')
    , ('art_test_rmse_product', 'AGB [g cm$^{-2}$] RMSE')
    ]
)
# Context manager so the file is flushed and closed deterministically.
with open('graphs/all-art-rmse.tex', 'w') as tex_file:
    print(out.to_latex(bold_rows=True, na_rep='', escape=False), file=tex_file)
out

In [None]:
# MAE summary table of the baseline models, exported as LaTeX and shown as cell output.
out = map_df_table(
    baseline_df.query('tag == "norm"'),
    [ ('art_test_mae_lai', 'LAI MAE')
    , ('art_test_mae_cm', '$C_m$ [g cm$^{-2}$] MAE')
    , ('art_test_mae_product', 'AGB [g cm$^{-2}$] MAE')
    ]
)
# Context manager so the file is flushed and closed deterministically.
with open('graphs/all-art-mae.tex', 'w') as tex_file:
    print(out.to_latex(bold_rows=True, na_rep='', escape=False), file=tex_file)
out

In [None]:
# MAPE summary table of the baseline models, exported as LaTeX and shown as cell output.
out = map_df_table(
    baseline_df.query('tag == "norm"'),
    [ ('art_test_mape_lai', 'LAI MAPE')
    , ('art_test_mape_cm', '$C_m$ [g cm$^{-2}$] MAPE')
    , ('art_test_mape_product', 'AGB [g cm$^{-2}$] MAPE')
    ]
)
# Context manager so the file is flushed and closed deterministically.
with open('graphs/all-art-mape.tex', 'w') as tex_file:
    print(out.to_latex(bold_rows=True, na_rep='', escape=False), file=tex_file)
out

## AutoKeras

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_ylim([0, 50])

# Bin the training time to the nearest 500 s so runs share x positions.
# The bin is computed from each row's own value on a new frame: the old loop read
# `all_df.iloc[label]` (a positional lookup with an index label) and wrote into a query result.
df = (
    all_df
    .query('tag == "norm" and model == "ak"')
    .assign(train_time=lambda runs: (runs['train_time'] / 500).round() * 500)
)

sns.lineplot(data=df, x="train_time", y="art_test_mape_lai", label="LAI", ax=ax)
sns.lineplot(data=df, x="train_time", y="art_test_mape_cm", label="cm", ax=ax)
sns.lineplot(data=df, x="train_time", y="art_test_mape_product", label="AGB", ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/autokeras-art-time.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_ylim([0, 50])

# Bin the training time to the nearest 500 s so runs share x positions.
# The bin is computed from each row's own value on a new frame: the old loop read
# `all_df.iloc[label]` (a positional lookup with an index label) and wrote into a query result.
df = (
    all_df
    .query('tag == "norm" and model == "ak_agb"')
    .assign(train_time=lambda runs: (runs['train_time'] / 500).round() * 500)
)

sns.lineplot(data=df, x="train_time", y="art_test_mape_product", label="AGB", color='green', ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/autokeras-art-time-single.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

# Bin the training time to the nearest 500 s so runs share x positions.
# The bin is computed from each row's own value on a new frame: the old loop read
# `all_df.iloc[label]` (a positional lookup with an index label) and wrote into a query result.
df = (
    all_df
    .query('tag == "norm" and model == "ak"')
    .assign(train_time=lambda runs: (runs['train_time'] / 500).round() * 500)
)

sns.lineplot(data=df, x="train_time", y="art_test_mape_product", label="AGB", color='green', ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/autokeras-art-time-agb.pdf')

In [None]:
# MAPE of the three targets against max_trials for multi-output AutoKeras, limited to param <= 500.
fig, ax = plt.subplots(figsize=(6, 4))

plt.ylim([0, 160])

df = all_df.query('tag == "norm" and model == "ak" and param <= 500')

sns.lineplot(data=df, x="param", y="art_test_mape_lai", label="LAI")
sns.lineplot(data=df, x="param", y="art_test_mape_cm", label="cm")
sns.lineplot(data=df, x="param", y="art_test_mape_product", label="AGB")

plt.xlabel("Max trials")
plt.ylabel("Mean absolute percentage error (%)")
ax.legend()

# NOTE(review): a later cell saves an unfiltered variant of this plot under the same
# file name, so this file is overwritten on a full run — confirm which version is wanted.
plt.savefig('graphs/autokeras-art-max_trials.pdf')

"""
fig, ax = plt.subplots(figsize=(6, 4))

plt.ylim([0, 20])

df = all_df.query('tag == "norm" and model == "ak" and param <= 500')

sns.regplot(data=df, x="param", y="art_test_mape_lai", robust=True, scatter=False, label="LAI")
sns.regplot(data=df, x="param", y="art_test_mape_cm", robust=True, scatter=False, label="cm")
sns.regplot(data=df, x="param", y="art_test_mape_product", robust=True, scatter=False, label="AGB")

plt.xlabel("Max trials")
plt.ylabel("Mean absolute percentage error (%)")
ax.legend()

plt.savefig('graphs/autokeras-art-max_trials.pdf')
"""

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_ylim([0, 160])

# AGB MAPE against max_trials for single-output AutoKeras, limited to param <= 500.
df = all_df.query('tag == "norm" and model == "ak_agb" and param <= 500')

sns.lineplot(data=df, x="param", y="art_test_mape_product", label="AGB", color='green', ax=ax)

ax.set_xlabel("Max trials")
ax.set_ylabel("Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/autokeras-art-max_trials-single.pdf')

In [None]:
# Training time against max_trials (<= 1000) for multi- and single-output AutoKeras,
# read straight from the raw items; points are the per-max_trials means (x_estimator).
fig, ax = plt.subplots(figsize=(6, 4))

gen = ([r['art']['param'], r['art']['taken']] for r in get_items('ak', None))
df = pd.DataFrame(data=gen, columns=['max_trials', 'time_taken'])
sns.regplot(data=df.query('max_trials <= 1000'), x='max_trials', y='time_taken', logx=True, label="multi output", x_estimator=np.mean)
gen = ([r['art']['param'], r['art']['taken']] for r in get_items('ak_agb', None))
df = pd.DataFrame(data=gen, columns=['max_trials', 'time_taken'])
sns.regplot(data=df.query('max_trials <= 1000'), x='max_trials', y='time_taken', logx=True, label="single output", x_estimator=np.mean)

plt.xlabel("max_trials")
plt.ylabel("Time taken (s)")
ax.legend()

# NOTE(review): the following cell saves a scatter plot under the same file name,
# so this file is overwritten on a full run — confirm which version is wanted.
plt.savefig('graphs/autokeras-max_trials-time_taken.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_xlim([-1000, 17000])
ax.set_ylim([-1000, 21000])

# Raw training time against max_trials, one series per AutoKeras variant.
for model_filter, label in (
    ('model == "ak"', 'multi-output'),
    ('model == "ak_agb"', 'single-output'),
):
    sns.scatterplot(data=all_df.query(model_filter), x='param', y='train_time', label=label, ax=ax)

ax.set_xlabel("max_trials")
ax.set_ylabel("Time taken (s)")
ax.legend()

# NOTE(review): the preceding cell writes a regression plot to this same file name.
fig.savefig('graphs/autokeras-max_trials-time_taken.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_xlim([-1000, 17000])
ax.set_ylim([-1000, 21000])

# Robust regression of training time on max_trials, one fit per AutoKeras variant;
# points are the per-max_trials means (x_estimator).
for model_filter, label in (
    ('model == "ak"', 'multi-output'),
    ('model == "ak_agb"', 'single-output'),
):
    sns.regplot(data=all_df.query(model_filter), x='param', y='train_time', robust=True, label=label, x_estimator=np.mean, ax=ax)

ax.set_xlabel("max_trials")
ax.set_ylabel("Time taken (s)")
ax.legend()

fig.savefig('graphs/autokeras-max_trials-time_taken-robust.pdf')

In [None]:
# Own figure: without it the regression is drawn onto whatever axes are current,
# i.e. on top of the previous cell's plot when that figure is still open.
fig, ax = plt.subplots(figsize=(6, 4))

# (max_trials, epochs used) for every multi-output AutoKeras item.
rows = ([r['art']['param'], r['art']['method_specific']['epochs_used']] for r in get_items('ak', None))
df = pd.DataFrame(data=rows, columns=['max_trials', 'epochs_used'])

sns.regplot(data=df, x='max_trials', y='epochs_used', robust=True, scatter=False, ax=ax)

# The x label used to be set through a second ylabel() call and was immediately overwritten.
ax.set_xlabel("max_trials")
ax.set_ylabel("Epochs used")

fig.savefig('graphs/autokeras-art-max_trials-epochs_used.pdf')

In [None]:
# Own figure: without it the density is drawn onto whatever axes are current,
# i.e. on top of the previous cell's plot when that figure is still open.
fig, ax = plt.subplots(figsize=(6, 4))

# (max_trials, epochs used) for every multi-output AutoKeras item.
rows = ([r['art']['param'], r['art']['method_specific']['epochs_used']] for r in get_items('ak', None))
df = pd.DataFrame(data=rows, columns=['max_trials', 'epochs_used'])

sns.kdeplot(data=df, x="epochs_used", fill=True, bw_adjust=.8, ax=ax)
sns.rugplot(data=df, x="epochs_used", ax=ax)

ax.set_xlabel("Epochs used")
# Epoch counts cannot be negative, so cut the density's left tail at 0.
ax.set_xlim(0, np.max(df['epochs_used']))

fig.savefig('graphs/autokeras-art-kde-epochs_used.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

# AGB MAPE against max_trials: multi-output versus single-output AutoKeras.
df = all_df.query('tag == "norm"')
for model_filter, label in (
    ('model == "ak"', "multi output"),
    ('model == "ak_agb"', "single output"),
):
    sns.lineplot(data=df.query(model_filter), x="param", y="art_test_mape_product", label=label, ax=ax)

ax.set_xlabel("max_trials")
ax.set_ylabel("Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/autokeras-art-single-vs-multi.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

# Log-x regression of AGB MAPE on max_trials: multi-output versus single-output AutoKeras.
df = all_df.query('tag == "norm"')
for model_filter, label in (
    ('model == "ak"', "multi output"),
    ('model == "ak_agb"', "single output"),
):
    sns.regplot(data=df.query(model_filter), x="param", y="art_test_mape_product", label=label, logx=True, scatter=False, ax=ax)

ax.set_xlabel("max_trials")
ax.set_ylabel("Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/autokeras-art-single-vs-multi-regress.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

# MAPE of the three targets against max_trials for multi-output AutoKeras (all param values).
df = all_df.query('tag == "norm" and model == "ak"')
for column, label in (
    ("art_test_mape_lai", "LAI"),
    ("art_test_mape_cm", "cm"),
    ("art_test_mape_product", "AGB"),
):
    sns.lineplot(data=df, x="param", y=column, label=label, ax=ax)

ax.set_xlabel("max_trials")
ax.set_ylabel("Mean absolute percentage error (%)")
ax.legend()

# NOTE(review): an earlier cell (param <= 500 variant) writes to this same file name.
fig.savefig('graphs/autokeras-art-max_trials.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

# Robust regression of each target's MAPE on max_trials for multi-output AutoKeras.
df = all_df.query('tag == "norm" and model == "ak"')
for column, label in (
    ("art_test_mape_lai", "LAI"),
    ("art_test_mape_cm", "cm"),
    ("art_test_mape_product", "AGB"),
):
    sns.regplot(data=df, x="param", y=column, label=label, robust=True, ax=ax)

ax.set_xlabel("max_trials")
ax.set_ylabel("Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/autokeras-art-max_trials-robust.pdf')

In [None]:
# Sample size: the number of test points of one AutoKeras run.
n = len(ak_items[0]['art']['y_test'])

# Pool the test points of all runs with a flat comprehension (linear) instead of
# sum(list_of_lists, []), which re-copies the accumulated list for every run.
y_test = np.array([sample for row in ak_items for sample in row['art']['y_test']])
# NOTE: unseeded draw — the sampled subset (and the scatter plots) differ between runs.
m = np.random.choice(y_test.shape[0], n, replace=False)
y_test = y_test[m, :]
yhat = np.array([sample for row in ak_items for sample in row['art']['yhat']])
yhat = yhat[m, :]

In [None]:
plot_scatter(y_test, yhat, 'lai', 'autokeras-scatter-art-multi-lai')

In [None]:
plot_scatter(y_test, yhat, 'cm', 'autokeras-scatter-art-multi-cm')

In [None]:
plot_scatter(y_test, yhat, 'agb', 'autokeras-scatter-art-multi-agb')

In [None]:
# Sample size: the number of test points of one (multi-output) AutoKeras run.
n = len(ak_items[0]['art']['y_test'])

# Pool the test points of all runs with a flat comprehension (linear) instead of
# sum(list_of_lists, []), which re-copies the accumulated list for every run.
y_test = np.array([sample for row in ak_agb_items for sample in row['art']['y_test']])
# NOTE: unseeded draw — the sampled subset (and the scatter plot) differs between runs.
m = np.random.choice(y_test.shape[0], n, replace=False)
# plot_scatter indexes columns, so the actual values are reshaped into a single column.
y_test = np.reshape(y_test[m], (-1,1))
yhat = np.array([sample for row in ak_agb_items for sample in row['art']['yhat']])
yhat = yhat[m, :]

In [None]:
plot_scatter(y_test, yhat, 'agb', 'autokeras-scatter-art-single-agb')

## auto-sklearn

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_ylim([0,80])

# Bin the training time to the nearest 500 s so runs share x positions.
# The bin is computed from each row's own value on a new frame: the old loop read
# `all_df.iloc[label]` (a positional lookup with an index label) and wrote into a query result.
df = (
    all_df
    .query('tag == "norm" and model == "ask"')
    .assign(train_time=lambda runs: (runs['train_time'] / 500).round() * 500)
)

sns.lineplot(data=df, x="train_time", y="art_test_mape_lai", label="LAI", ax=ax)
sns.lineplot(data=df, x="train_time", y="art_test_mape_cm", label="cm", ax=ax)
sns.lineplot(data=df, x="train_time", y="art_test_mape_product", label="AGB", ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/autosklearn-art-time.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_ylim([0,80])

# Bin the training time to the nearest 500 s so runs share x positions.
# The bin is computed from each row's own value on a new frame: the old loop read
# `all_df.iloc[label]` (a positional lookup with an index label) and wrote into a query result.
df = (
    all_df
    .query('tag == "norm" and model == "ask_agb"')
    .assign(train_time=lambda runs: (runs['train_time'] / 500).round() * 500)
)

sns.lineplot(data=df, x="train_time", y="art_test_mape_product", label="AGB", color='green', ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/autosklearn-art-time-single.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

# Bin the training time to the nearest 500 s so runs share x positions.
# The bin is computed from each row's own value on a new frame: the old loop read
# `all_df.iloc[label]` (a positional lookup with an index label) and wrote into a query result.
df = (
    all_df
    .query('tag == "norm" and model == "ask"')
    .assign(train_time=lambda runs: (runs['train_time'] / 500).round() * 500)
)

sns.lineplot(data=df, x="train_time", y="art_test_mape_product", label="AGB", color='green', ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/autosklearn-art-time-agb.pdf')

In [None]:
# Sample size is taken from the AutoKeras items.
# NOTE(review): presumably so every model's scatter plot shows the same number of points — confirm.
n = len(ak_items[0]['art']['y_test'])

# Pool the test points of all runs with a flat comprehension (linear) instead of
# sum(list_of_lists, []), which re-copies the accumulated list for every run.
y_test = np.array([sample for row in ask_items for sample in row['art']['y_test']])
# NOTE: unseeded draw — the sampled subset (and the scatter plots) differ between runs.
m = np.random.choice(y_test.shape[0], n, replace=False)
y_test = y_test[m, :]
yhat = np.array([sample for row in ask_items for sample in row['art']['yhat']])
yhat = yhat[m, :]

In [None]:
plot_scatter(y_test, yhat, 'lai', 'autosklearn-scatter-art-multi-lai')

In [None]:
plot_scatter(y_test, yhat, 'cm', 'autosklearn-scatter-art-multi-cm')

In [None]:
plot_scatter(y_test, yhat, 'agb', 'autosklearn-scatter-art-multi-agb')

In [None]:
# Sample size is taken from the AutoKeras items.
# NOTE(review): presumably so every model's scatter plot shows the same number of points — confirm.
n = len(ak_items[0]['art']['y_test'])

# Pool the test points of all runs with a flat comprehension (linear) instead of
# sum(list_of_lists, []), which re-copies the accumulated list for every run.
y_test = np.array([sample for row in ask_agb_items for sample in row['art']['y_test']])
# NOTE: unseeded draw — the sampled subset (and the scatter plot) differs between runs.
m = np.random.choice(y_test.shape[0], n, replace=False)
# plot_scatter indexes columns, so both arrays are reshaped into a single column.
y_test = np.reshape(y_test[m], (-1,1))
yhat = np.array([sample for row in ask_agb_items for sample in row['art']['yhat']])
yhat = np.reshape(yhat[m], (-1,1))

In [None]:
plot_scatter(y_test, yhat, 'agb', 'autosklearn-scatter-art-single-agb')

## Random forests

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_ylim([5, 25])

# Bin the training time to the nearest 500 s so runs share x positions.
# The bin is computed from each row's own value on a new frame: the old loop read
# `all_df.iloc[label]` (a positional lookup with an index label) and wrote into a query result.
df = (
    all_df
    .query('tag == "norm" and model == "rf"')
    .assign(train_time=lambda runs: (runs['train_time'] / 500).round() * 500)
)

sns.lineplot(data=df, x="train_time", y="art_test_mape_lai", label="LAI", ax=ax)
sns.lineplot(data=df, x="train_time", y="art_test_mape_cm", label="cm", ax=ax)
sns.lineplot(data=df, x="train_time", y="art_test_mape_product", label="AGB", ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/rf-art-time.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_ylim([5, 25])

# Bin the training time to the nearest 500 s so runs share x positions.
# The bin is computed from each row's own value on a new frame: the old loop read
# `all_df.iloc[label]` (a positional lookup with an index label) and wrote into a query result.
df = (
    all_df
    .query('tag == "norm" and model == "rf_agb"')
    .assign(train_time=lambda runs: (runs['train_time'] / 500).round() * 500)
)

sns.lineplot(data=df, x="train_time", y="art_test_mape_product", label="AGB", color='green', ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/rf-art-time-single.pdf')

In [None]:
# Keep the normal-tagged multi-output random-forest runs whose first method-specific value is at most 100.
# This indexes into `method_specific`, so the column must hold parsed objects (not text).
df = all_df[all_df.apply(lambda x: x['model'] == 'rf' and x['tag'] == 'norm' and x['method_specific']['x'][0] <= 100, axis=1)]

# One record per run: the first method-specific value plus the three MAPE scores.
items = []
for (_, row) in df.iterrows():
    items.append([
        # NOTE(review): labelled 'n_estimators' here, while a later cell reads x[0] as max_depth — confirm.
        row['method_specific']['x'][0],
        row['art_test_mape_lai'],
        row['art_test_mape_cm'],
        row['art_test_mape_product'],
    ])

# NOTE(review): no visible cell reads this frame before `df` is reassigned.
df = pd.DataFrame(data=items, columns=['n_estimators', 'art_test_mape_lai', 'art_test_mape_cm', 'art_test_mape_product'], copy=False)

In [None]:
# Sample size is taken from the AutoKeras items.
# NOTE(review): presumably so every model's scatter plot shows the same number of points — confirm.
n = len(ak_items[0]['art']['y_test'])

# Pool the test points of all runs with a flat comprehension (linear) instead of
# sum(list_of_lists, []), which re-copies the accumulated list for every run.
y_test = np.array([sample for row in rf_items for sample in row['art']['y_test']])
# NOTE: unseeded draw — the sampled subset (and the scatter plots) differ between runs.
m = np.random.choice(y_test.shape[0], n, replace=False)
y_test = y_test[m, :]
yhat = np.array([sample for row in rf_items for sample in row['art']['yhat']])
yhat = yhat[m, :]

In [None]:
# One record per random-forest item: param, the first three method-specific values
# (named max_depth, min_samples_split, min_samples_leaf) and the AGB MAPE in percent.
items = []
for row in get_items('rf', None):
    art = row['art']
    hyper = art['method_specific']['x']
    items.append([
        art['param'],
        hyper[0],
        hyper[1],
        hyper[2],
        art['test_err']['mape']['product'] * 100,  # fraction -> percent
    ])

df = pd.DataFrame(data=items, columns=['param', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'art_test_mape_product'], copy=False)
df

In [None]:
# AGB MAPE against max_depth, y axis zoomed to 20-22 %.
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_ylim([20, 22])
sns.lineplot(data=df, x='max_depth', y='art_test_mape_product', ax=ax)

ax.set_xlabel("max_depth")
ax.set_ylabel("Mean absolute percentage error (%)")

fig.savefig('graphs/rf-max_depth-mape.pdf')

In [None]:
# AGB MAPE against min_samples_split, y axis zoomed to 20-22 %.
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_ylim([20, 22])
sns.lineplot(data=df, x='min_samples_split', y='art_test_mape_product', ax=ax)

ax.set_xlabel("min_samples_split")
ax.set_ylabel("Mean absolute percentage error (%)")

fig.savefig('graphs/rf-min_samples_split-mape.pdf')

In [None]:
# AGB MAPE against min_samples_leaf, y axis zoomed to 20-22 %.
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_ylim([20, 22])
sns.lineplot(data=df, x='min_samples_leaf', y='art_test_mape_product', ax=ax)

ax.set_xlabel("min_samples_leaf")
ax.set_ylabel("Mean absolute percentage error (%)")

fig.savefig('graphs/rf-min_samples_leaf-mape.pdf')

In [None]:
plot_scatter(y_test, yhat, 'lai', 'rf-scatter-art-multi-lai')

In [None]:
plot_scatter(y_test, yhat, 'cm', 'rf-scatter-art-multi-cm')

In [None]:
plot_scatter(y_test, yhat, 'agb', 'rf-scatter-art-multi-agb')

In [None]:
# Sample size is taken from the AutoKeras items.
# NOTE(review): presumably so every model's scatter plot shows the same number of points — confirm.
n = len(ak_items[0]['art']['y_test'])

# Pool the test points of all runs with a flat comprehension (linear) instead of
# sum(list_of_lists, []), which re-copies the accumulated list for every run.
y_test = np.array([sample for row in rf_agb_items for sample in row['art']['y_test']])
# NOTE: unseeded draw — the sampled subset (and the scatter plot) differs between runs.
m = np.random.choice(y_test.shape[0], n, replace=False)
# plot_scatter indexes columns, so both arrays are reshaped into a single column.
y_test = np.reshape(y_test[m], (-1,1))
yhat = np.array([sample for row in rf_agb_items for sample in row['art']['yhat']])
yhat = np.reshape(yhat[m], (-1,1))

In [None]:
plot_scatter(y_test, yhat, 'agb', 'rf-scatter-art-single-agb')

## Gaussian process

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_ylim([0, 20])

# Bin the training time to the nearest 500 s so runs share x positions.
# The bin is computed from each row's own value on a new frame: the old loop read
# `all_df.iloc[label]` (a positional lookup with an index label) and wrote into a query result.
df = (
    all_df
    .query('tag == "norm" and model == "gp"')
    .assign(train_time=lambda runs: (runs['train_time'] / 500).round() * 500)
)

sns.lineplot(data=df, x="train_time", y="art_test_mape_lai", label="LAI", ax=ax)
sns.lineplot(data=df, x="train_time", y="art_test_mape_cm", label="cm", ax=ax)
sns.lineplot(data=df, x="train_time", y="art_test_mape_product", label="AGB", ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/gp-art-time.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_ylim([0, 20])

# Bin the training time to the nearest 500 s so runs share x positions.
# The bin is computed from each row's own value on a new frame: the old loop read
# `all_df.iloc[label]` (a positional lookup with an index label) and wrote into a query result.
df = (
    all_df
    .query('tag == "norm" and model == "gp_agb"')
    .assign(train_time=lambda runs: (runs['train_time'] / 500).round() * 500)
)

sns.lineplot(data=df, x="train_time", y="art_test_mape_product", label="AGB", color='green', ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/gp-art-time-single.pdf')

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

# Bin the training time to the nearest 500 s so runs share x positions.
# The bin is computed from each row's own value on a new frame: the old loop read
# `all_df.iloc[label]` (a positional lookup with an index label) and wrote into a query result.
df = (
    all_df
    .query('tag == "norm" and model == "gp"')
    .assign(train_time=lambda runs: (runs['train_time'] / 500).round() * 500)
)

sns.lineplot(data=df, x="train_time", y="art_test_mape_product", label="AGB", color='green', ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/gp-art-time-agb.pdf')

In [None]:
# Own figure: without it the scatter is drawn onto whatever axes are current,
# i.e. on top of the previous cell's plot when that figure is still open.
fig, ax = plt.subplots(figsize=(6, 4))

# (param, iterations) for every multi-output Gaussian-process item.
rows = ([r['art']['param'], r['art']['method_specific']['iterations']] for r in get_items('gp', None))
df = pd.DataFrame(data=rows, columns=['max_trials', 'iterations'])

sns.scatterplot(data=df, x='max_trials', y='iterations', ax=ax)

# The x label used to be set through a second ylabel() call and was immediately overwritten.
# NOTE(review): the x values are the run's `param`; the label text is kept as written — confirm it is a time.
ax.set_xlabel("Time taken (s)")
ax.set_ylabel("Iterations")

fig.savefig('graphs/gp-art-time-iterations.pdf')

In [None]:
# Keep only the multi-output GP runs carrying the "norm" tag.
df = all_df[all_df.apply(lambda run: run['model'] == 'gp' and run['tag'] == 'norm', axis=1)]

# One record per run: two entries of the stored hyper-parameter vector
# (second-to-last and first) followed by the three artificial-data MAPEs.
# NOTE(review): the first two column names below look inherited from another
# model's cell -- confirm they describe x[-2] and x[0] for GP runs.
items = [
    [
        run['method_specific']['x'][-2],
        run['method_specific']['x'][0],
        run['art_test_mape_lai'],
        run['art_test_mape_cm'],
        run['art_test_mape_product'],
    ]
    for _, run in df.iterrows()
]

df = pd.DataFrame(
    data=items,
    columns=['n_estimators', 'kernel', 'art_test_mape_lai', 'art_test_mape_cm', 'art_test_mape_product'],
    copy=False,
)

In [None]:
# Sample size: number of artificial test samples in the first AutoKeras run.
n = len(ak_items[0]['art']['y_test'])

# Pool the artificial test targets of every GP run. The nested comprehension
# flattens in linear time; `sum(lists, [])` re-copied the growing list for
# every run.
y_test = np.array([ sample for row in gp_items for sample in row['art']['y_test'] ])
# Draw n distinct row positions and reuse them for the predictions so that
# target/prediction pairs stay aligned.
# NOTE(review): the draw is not seeded in this cell, so the subset changes
# between runs unless a seed is set elsewhere in the notebook.
m = np.random.choice(y_test.shape[0], n, replace=False)
y_test = y_test[m, :]
yhat = np.array([ sample for row in gp_items for sample in row['art']['yhat'] ])
yhat = yhat[m, :]

In [None]:
# Tabulate every GP run: its `param`, the first six entries of the stored
# hyper-parameter vector, and the artificial-data AGB MAPE as a percentage.
gp_columns = ['param', 'kernel_name', 'fixed', 'n_restarts', 'alpha', 'scale_x', 'normalize_y', 'art_test_mape_product']

items = []
for row in get_items('gp', None):
    art = row['art']
    x = art['method_specific']['x']
    # x[0..5] map, in order, onto kernel_name ... normalize_y.
    hyper_params = [x[position] for position in range(6)]
    items.append([ art['param'], *hyper_params, art['test_err']['mape']['product'] * 100 ])

df = pd.DataFrame(data=items, columns=gp_columns, copy=False)
df

In [None]:
# AGB MAPE of the GP runs as a function of the `n_restarts` hyper-parameter.
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_ylim([10, 16])
sns.lineplot(data=df, x='n_restarts', y='art_test_mape_product', ax=ax)

ax.set_xlabel("Number of restarts")
ax.set_ylabel("Mean absolute percentage error (%)")

fig.savefig('graphs/gp-n_restarts-mape.pdf')

In [None]:
# AGB MAPE of the GP runs as a function of the `alpha` hyper-parameter.
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_ylim([10, 16])
sns.lineplot(data=df, x='alpha', y='art_test_mape_product', ax=ax)

ax.set_xlabel("alpha")
ax.set_ylabel("Mean absolute percentage error (%)")

fig.savefig('graphs/gp-alpha-mape.pdf')

In [None]:
# Distribution of the GP runs' AGB MAPE per kernel, as box plots.
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_ylim([10, 16])
sns.boxplot(data=df, x='kernel_name', y='art_test_mape_product', ax=ax)

ax.set_xlabel("Kernel")
ax.set_ylabel("Mean absolute percentage error (%)")

fig.savefig('graphs/gp-kernel_name-mape.pdf')

In [None]:
# Distribution of the GP runs' AGB MAPE per `normalize_y` setting, as box plots.
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_ylim([10, 16])
sns.boxplot(data=df, x='normalize_y', y='art_test_mape_product', ax=ax)

ax.set_xlabel("Normalize y-value")
ax.set_ylabel("Mean absolute percentage error (%)")

fig.savefig('graphs/gp-normalize_y-mape.pdf')

In [None]:
# Actual vs. predicted LAI (column 0 of the sampled GP arrays), drawn in blue
# by plot_scatter and saved as graphs/gp-scatter-art-multi-lai.pdf.
plot_scatter(y_test, yhat, 'lai', 'gp-scatter-art-multi-lai')

In [None]:
# Actual vs. predicted cm (column 1 of the sampled GP arrays), drawn in orange
# by plot_scatter and saved as graphs/gp-scatter-art-multi-cm.pdf.
plot_scatter(y_test, yhat, 'cm', 'gp-scatter-art-multi-cm')

In [None]:
# Actual vs. predicted AGB for the sampled GP arrays; when y_test has more than
# one column, plot_scatter uses the product of columns 0 and 1 on both axes.
plot_scatter(y_test, yhat, 'agb', 'gp-scatter-art-multi-agb')

In [None]:
# Sample size: number of artificial test samples in the first AutoKeras run.
n = len(ak_items[0]['art']['y_test'])

# Pool the artificial test targets of every single-output GP run. The nested
# comprehension flattens in linear time; `sum(lists, [])` re-copied the
# growing list for every run.
y_test = np.array([ sample for row in gp_agb_items for sample in row['art']['y_test'] ])
# Draw n distinct row positions and reuse them for the predictions so that
# target/prediction pairs stay aligned.
# NOTE(review): the draw is not seeded in this cell.
m = np.random.choice(y_test.shape[0], n, replace=False)
# Force a single column so plot_scatter can index y_test[:, 0] / yhat[:, 0].
y_test = np.reshape(y_test[m], (-1,1))
yhat = np.array([ sample for row in gp_agb_items for sample in row['art']['yhat'] ])
yhat = np.reshape(yhat[m], (-1,1))

In [None]:
# Actual vs. predicted AGB for the single-column GP sample built in the
# previous cell; saved as graphs/gp-scatter-art-single-agb.pdf.
plot_scatter(y_test, yhat, 'agb', 'gp-scatter-art-single-agb')

# Field Data

## Combined results

In [None]:
# Field-data AGB MAPE against raw training time for the "ak", "ask", "gp" and
# "rf" runs tagged "norm", coloured by model name.
fig, ax = plt.subplots(figsize=(6, 4))

# Sorting by model name fixes the order in which seaborn meets the hue levels.
df = (
    all_df
    .query('tag == "norm" and (model == "ak" or model == "ask" or model == "gp" or model == "rf")')
    .sort_values(by='model_name')
)
sns.scatterplot(data=df, x="train_time", y="field_test_mape_product", hue='model_name', ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("AGB Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/all-field-time-scatter-multi.pdf')

In [None]:
# Field-data AGB MAPE against binned training time for the "ak", "ask", "gp"
# and "rf" runs tagged "norm", one line per model name.
fig, ax = plt.subplots(figsize=(6, 4))

# Explicit copy: the binning below must not write into a slice of `all_df`.
df = all_df.query('tag == "norm" and (model == "ak" or model == "ask" or model == "gp" or model == "rf")').copy()
df = df.sort_values(by='model_name')
# Snap training times to the nearest multiple of 500 s, computed from the
# row's own value (the old `all_df.iloc[label]` lookup confused index labels
# with positions).
df['train_time'] = (df['train_time'] / 500).round() * 500

sns.lineplot(x="train_time", y="field_test_mape_product", data=df, hue='model_name')

plt.xlabel("Time taken (s)")
plt.ylabel("AGB Mean absolute percentage error (%)")
ax.legend()

plt.savefig('graphs/all-field-time-line-multi.pdf')

In [None]:
# Field-data AGB MAPE against raw training time for the "*_agb" runs tagged
# "norm", coloured by model name.
fig, ax = plt.subplots(figsize=(6, 4))

# Sorting by model name fixes the order in which seaborn meets the hue levels.
df = (
    all_df
    .query('tag == "norm" and (model == "ak_agb" or model == "ask_agb" or model == "gp_agb" or model == "rf_agb")')
    .sort_values(by='model_name')
)
sns.scatterplot(data=df, x="train_time", y="field_test_mape_product", hue='model_name', ax=ax)

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("AGB Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/all-field-time-scatter-single.pdf')

In [None]:
# Field-data AGB MAPE against binned training time for the "*_agb" runs
# tagged "norm", one line per model name.
fig, ax = plt.subplots(figsize=(6, 4))

# Explicit copy: the binning below must not write into a slice of `all_df`.
df = all_df.query('tag == "norm" and (model == "ak_agb" or model == "ask_agb" or model == "gp_agb" or model == "rf_agb")').copy()
df = df.sort_values(by='model_name')
# Snap training times to the nearest multiple of 500 s, computed from the
# row's own value (the old `all_df.iloc[label]` lookup confused index labels
# with positions).
df['train_time'] = (df['train_time'] / 500).round() * 500

sns.lineplot(x="train_time", y="field_test_mape_product", data=df, hue='model_name')

plt.xlabel("Time taken (s)")
plt.ylabel("AGB Mean absolute percentage error (%)")
ax.legend()

plt.savefig('graphs/all-field-time-line-single.pdf')

In [None]:
# Field-data AGB MAPE against training time for the two AutoML systems
# ("simplenoise_0.5" tag), restricted to the first 40 minutes on the x axis.
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_xlim([0, 60*40])

ak_df = all_df.query('model == "ak" and tag == "simplenoise_0.5"')
sns.scatterplot(data=ak_df, x="train_time", y="field_test_mape_product", label="AutoKeras", ax=ax)
# Drop the reference once plotted, as the original cell did.
ak_df = None

ask_df = all_df.query('model == "ask" and tag == "simplenoise_0.5"')
sns.scatterplot(data=ask_df, x="train_time", y="field_test_mape_product", label="Auto-sklearn", ax=ax)
ask_df = None

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("AGB Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/automl-field-time-scatter.pdf')

In [None]:
# Log-x regression fits (no scatter points) of field-data AGB MAPE against
# training time for the two AutoML systems ("simplenoise_0.5" tag).
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_xlim([0, 60*40])

ak_df = all_df.query('model == "ak" and tag == "simplenoise_0.5"')
sns.regplot(data=ak_df, x="train_time", y="field_test_mape_product", label="AutoKeras", logx=True, scatter=False, ax=ax)
# Drop the reference once plotted, as the original cell did.
ak_df = None

ask_df = all_df.query('model == "ask" and tag == "simplenoise_0.5"')
sns.regplot(data=ask_df, x="train_time", y="field_test_mape_product", label="Auto-sklearn", logx=True, scatter=False, ax=ax)
ask_df = None

ax.set_xlabel("Time taken (s)")
ax.set_ylabel("AGB Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/automl-field-time-logx.pdf')

In [None]:
# Strip plot of field-data AGB MAPE per model for the "simplenoise_0.5" rows
# of baseline_df_multi.
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_ylim([0, 420])

field_rows = baseline_df_multi.query('tag == "simplenoise_0.5"')
sns.stripplot(data=field_rows, x="model_name", y="field_test_mape_product", color='green', label='AGB', ax=ax)

ax.set_xlabel("Model")
ax.set_ylabel("Mean absolute percentage error (%)")

# Collapse repeated 'AGB' legend entries into one.
legend_without_duplicate_labels(ax)

fig.savefig('graphs/models-strip-field-multi.pdf')

In [None]:
# Strip plot of field-data AGB MAPE per model for the "simplenoise_0.5" rows
# of baseline_df_single.
fig, ax = plt.subplots(figsize=(6, 4))

ax.set_ylim([0, 420])

field_rows = baseline_df_single.query('tag == "simplenoise_0.5"')
sns.stripplot(data=field_rows, x="model_name", y="field_test_mape_product", color='green', label='AGB', ax=ax)

ax.set_xlabel("Model")
ax.set_ylabel("Mean absolute percentage error (%)")

# Collapse repeated 'AGB' legend entries into one.
legend_without_duplicate_labels(ax)

fig.savefig('graphs/models-strip-field-single.pdf')

In [None]:
# Summary table of field-data AGB MAE for the "simplenoise_0.5" baseline rows,
# exported as LaTeX and displayed in the notebook.
out = map_df_table(
    baseline_df.query('tag == "simplenoise_0.5"'),
    [ ('field_test_mae_product', 'AGB [g cm$^{-2}$] MAE') ]
)
# Write through a context manager so the file is flushed and closed
# deterministically instead of relying on garbage collection of an anonymous
# handle.
with open('graphs/all-field.tex', 'w') as tex_file:
    print(out.to_latex(bold_rows=True, na_rep='', escape=False), file=tex_file)
out

In [None]:
# Summary table of field-data AGB MAE, RMSE and MAPE for the "simplenoise_0.5"
# baseline rows, exported as LaTeX and displayed in the notebook.
# NOTE(review): the MAPE column header carries the unit g cm^-2 although MAPE
# is plotted as a percentage elsewhere in this notebook -- confirm the header.
out = map_df_table(
    baseline_df.query('tag == "simplenoise_0.5"'),
    [ ('field_test_mae_product', 'AGB [g cm$^{-2}$] MAE')
    , ('field_test_rmse_product', 'AGB [g cm$^{-2}$] RMSE')
    , ('field_test_mape_product', 'AGB [g cm$^{-2}$] MAPE')
    ]
)
# Write through a context manager so the file is flushed and closed
# deterministically instead of relying on garbage collection of an anonymous
# handle.
with open('graphs/all-field-full.tex', 'w') as tex_file:
    print(out.to_latex(bold_rows=True, na_rep='', escape=False), file=tex_file)
out

## AutoKeras

In [None]:
# Field-data AGB MAPE of the AutoKeras runs ("simplenoise_0.5" tag) against
# binned training time, multi-output vs. single-output.
fig, ax = plt.subplots(figsize=(6, 4))

plt.xlim([0, 10000])

# Explicit copy: the binning below must not write into a slice of `all_df`.
df = all_df.query('tag == "simplenoise_0.5" and (model == "ak" or model == "ak_agb")').copy()
# Snap training times to the nearest multiple of 500 s, computed from the
# row's own value (the old `all_df.iloc[label]` lookup confused index labels
# with positions).
df['train_time'] = (df['train_time'] / 500).round() * 500

sns.lineplot(data=df.query('model == "ak"'), x="train_time", y="field_test_mape_product", label="Multi-output")
sns.lineplot(data=df.query('model == "ak_agb"'), x="train_time", y="field_test_mape_product", label="Single-output")

plt.xlabel("Time taken (s)")
plt.ylabel("Mean absolute percentage error (%)")
ax.legend()

plt.savefig('graphs/autokeras-field-time.pdf')

In [None]:
# Field-data AGB MAPE of the AutoKeras runs ("simplenoise_0.5" tag) against
# their `param` value, multi-output vs. single-output.
fig, ax = plt.subplots(figsize=(6, 4))

df = all_df.query('tag == "simplenoise_0.5" and (model == "ak" or model == "ak_agb")')

sns.lineplot(data=df.query('model == "ak"'), x="param", y="field_test_mape_product", label="Multi-output", ax=ax)
sns.lineplot(data=df.query('model == "ak_agb"'), x="param", y="field_test_mape_product", label="Single-output", ax=ax)

ax.set_xlabel("Max trials")
ax.set_ylabel("Mean absolute percentage error (%)")
ax.legend()

fig.savefig('graphs/autokeras-field-max_trials.pdf')

In [None]:
# Sample size: number of field test samples in the first AutoKeras run.
n = len(ak_items[0]['field']['y_test'])

# Pool the field targets of every multi-output AutoKeras run. The nested
# comprehension flattens in linear time; `sum(lists, [])` re-copied the
# growing list for every run.
y_test = np.array([ sample for row in ak_items for sample in row['field']['y_test'] ])
# Draw n distinct row positions and reuse them for the predictions so that
# target/prediction pairs stay aligned.
# NOTE(review): the draw is not seeded in this cell.
m = np.random.choice(y_test.shape[0], n, replace=False)
y_test = y_test[m, :]
yhat = np.array([ sample for row in ak_items for sample in row['field']['yhat'] ])
# Multiply the predicted outputs of each sample together and keep the result
# as a single column.
yhat = np.reshape(np.prod(yhat[m, :], axis=1), (-1, 1))

In [None]:
# Actual vs. predicted AGB on the field sample of the multi-output AutoKeras
# runs (yhat already reduced to one column in the previous cell).
plot_scatter(y_test, yhat, 'agb', 'autokeras-scatter-field-multi-agb')

In [None]:
# Sample size: number of field test samples in the first AutoKeras run.
n = len(ak_items[0]['field']['y_test'])

# Pool the field targets of every single-output AutoKeras run. The nested
# comprehension flattens in linear time; `sum(lists, [])` re-copied the
# growing list for every run.
y_test = np.array([ sample for row in ak_agb_items for sample in row['field']['y_test'] ])
# Draw n distinct row positions and reuse them for the predictions so that
# target/prediction pairs stay aligned.
# NOTE(review): the draw is not seeded in this cell.
m = np.random.choice(y_test.shape[0], n, replace=False)
y_test = y_test[m, :]
yhat = np.array([ sample for row in ak_agb_items for sample in row['field']['yhat'] ])
# Row-wise product over the prediction columns, kept as a single column.
yhat = np.reshape(np.prod(yhat[m, :], axis=1), (-1, 1))

In [None]:
# Actual vs. predicted AGB on the field sample of the single-output AutoKeras
# runs; saved as graphs/autokeras-scatter-field-single-agb.pdf.
plot_scatter(y_test, yhat, 'agb', 'autokeras-scatter-field-single-agb')

## auto-sklearn

In [None]:
# Debug aid: shapes of whatever y_test / yhat the most recently executed
# sampling cell left behind.
# NOTE(review): in top-to-bottom order these are still the AutoKeras arrays,
# not auto-sklearn ones -- consider removing this cell.
print(y_test.shape, yhat.shape)

In [None]:
# Field-data AGB MAPE of the auto-sklearn runs ("simplenoise_0.5" tag) against
# binned training time, multi-output vs. single-output.
fig, ax = plt.subplots(figsize=(6, 4))

# Explicit copy: the binning below must not write into a slice of `all_df`.
df = all_df.query('tag == "simplenoise_0.5" and (model == "ask" or model == "ask_agb")').copy()
# Snap training times to the nearest multiple of 500 s, computed from the
# row's own value (the old `all_df.iloc[label]` lookup confused index labels
# with positions).
df['train_time'] = (df['train_time'] / 500).round() * 500

sns.lineplot(data=df.query('model == "ask"'), x="train_time", y="field_test_mape_product", label="Multi-output")
sns.lineplot(data=df.query('model == "ask_agb"'), x="train_time", y="field_test_mape_product", label="Single-output")

plt.xlabel("Time taken (s)")
plt.ylabel("Mean absolute percentage error (%)")
ax.legend()

plt.savefig('graphs/autosklearn-field-time.pdf')

In [None]:
# Sample size: number of field test samples in the first AutoKeras run.
n = len(ak_items[0]['field']['y_test'])

# Pool the field targets of every multi-output auto-sklearn run. The nested
# comprehension flattens in linear time; `sum(lists, [])` re-copied the
# growing list for every run.
y_test = np.array([ sample for row in ask_items for sample in row['field']['y_test'] ])
# Draw n distinct row positions and reuse them for the predictions so that
# target/prediction pairs stay aligned.
# NOTE(review): the draw is not seeded in this cell.
m = np.random.choice(y_test.shape[0], n, replace=False)
y_test = y_test[m, :]
yhat = np.array([ sample for row in ask_items for sample in row['field']['yhat'] ])
# Multiply the predicted outputs of each sample together and keep the result
# as a single column.
yhat = np.reshape(np.prod(yhat[m, :], axis=1), (-1, 1))

In [None]:
# Actual vs. predicted AGB on the field sample of the multi-output
# auto-sklearn runs (yhat already reduced to one column in the previous cell).
plot_scatter(y_test, yhat, 'agb', 'autosklearn-scatter-field-multi-agb')

In [None]:
# Sample size: number of field test samples in the first AutoKeras run.
n = len(ak_items[0]['field']['y_test'])

# Pool the field targets of every single-output auto-sklearn run. The nested
# comprehension flattens in linear time; `sum(lists, [])` re-copied the
# growing list for every run.
y_test = np.array([ sample for row in ask_agb_items for sample in row['field']['y_test'] ])
# Draw n distinct row positions and reuse them for the predictions so that
# target/prediction pairs stay aligned.
# NOTE(review): the draw is not seeded in this cell.
m = np.random.choice(y_test.shape[0], n, replace=False)
y_test = y_test[m, :]
yhat = np.array([ sample for row in ask_agb_items for sample in row['field']['yhat'] ])
# View the predictions as one column so the row/column indexing below works.
yhat = np.reshape(yhat, (-1, 1))
# With a single column the row-wise product leaves the values unchanged; the
# step is kept so this cell mirrors the multi-output one.
yhat = np.reshape(np.prod(yhat[m, :], axis=1), (-1, 1))

In [None]:
# Actual vs. predicted AGB on the field sample of the single-output
# auto-sklearn runs; saved as graphs/autosklearn-scatter-field-single-agb.pdf.
plot_scatter(y_test, yhat, 'agb', 'autosklearn-scatter-field-single-agb')

## Random forests

In [None]:
# Field-data AGB MAPE of the random-forest runs ("simplenoise_0.5" tag)
# against binned training time, multi-output vs. single-output.
fig, ax = plt.subplots(figsize=(6, 4))

# Explicit copy: the binning below must not write into a slice of `all_df`.
df = all_df.query('tag == "simplenoise_0.5" and (model == "rf" or model == "rf_agb")').copy()
# Snap training times to the nearest multiple of 500 s, computed from the
# row's own value (the old `all_df.iloc[label]` lookup confused index labels
# with positions).
df['train_time'] = (df['train_time'] / 500).round() * 500

sns.lineplot(data=df.query('model == "rf"'), x="train_time", y="field_test_mape_product", label="Multi-output")
sns.lineplot(data=df.query('model == "rf_agb"'), x="train_time", y="field_test_mape_product", label="Single-output")

plt.xlabel("Time taken (s)")
plt.ylabel("Mean absolute percentage error (%)")
ax.legend()

plt.savefig('graphs/rf-field-time.pdf')

In [None]:
# Sample size: number of field test samples in the first AutoKeras run.
n = len(ak_items[0]['field']['y_test'])

# Pool the field targets of every multi-output random-forest run. The nested
# comprehension flattens in linear time; `sum(lists, [])` re-copied the
# growing list for every run.
y_test = np.array([ sample for row in rf_items for sample in row['field']['y_test'] ])
# Draw n distinct row positions and reuse them for the predictions so that
# target/prediction pairs stay aligned.
# NOTE(review): the draw is not seeded in this cell.
m = np.random.choice(y_test.shape[0], n, replace=False)
y_test = y_test[m, :]
yhat = np.array([ sample for row in rf_items for sample in row['field']['yhat'] ])
# Multiply the predicted outputs of each sample together and keep the result
# as a single column.
yhat = np.reshape(np.prod(yhat[m, :], axis=1), (-1, 1))

In [None]:
# Actual vs. predicted AGB on the field sample of the multi-output
# random-forest runs (yhat already reduced to one column in the previous cell).
plot_scatter(y_test, yhat, 'agb', 'rf-scatter-field-multi-agb')

In [None]:
# Sample size: number of field test samples in the first AutoKeras run.
n = len(ak_items[0]['field']['y_test'])

# Pool the field targets of every single-output random-forest run. The nested
# comprehension flattens in linear time; `sum(lists, [])` re-copied the
# growing list for every run.
y_test = np.array([ sample for row in rf_agb_items for sample in row['field']['y_test'] ])
# Draw n distinct row positions and reuse them for the predictions so that
# target/prediction pairs stay aligned.
# NOTE(review): the draw is not seeded in this cell.
m = np.random.choice(y_test.shape[0], n, replace=False)
y_test = y_test[m, :]
yhat = np.array([ sample for row in rf_agb_items for sample in row['field']['yhat'] ])
# View the predictions as one column so the row/column indexing below works.
yhat = np.reshape(yhat, (-1, 1))
# With a single column the row-wise product leaves the values unchanged; the
# step is kept so this cell mirrors the multi-output one.
yhat = np.reshape(np.prod(yhat[m, :], axis=1), (-1, 1))

In [None]:
# Actual vs. predicted AGB on the field sample of the single-output
# random-forest runs; saved as graphs/rf-scatter-field-single-agb.pdf.
plot_scatter(y_test, yhat, 'agb', 'rf-scatter-field-single-agb')

## Gaussian process

In [None]:
# Field-data AGB MAPE of the Gaussian-process runs ("simplenoise_0.5" tag)
# against binned training time, multi-output vs. single-output.
fig, ax = plt.subplots(figsize=(6, 4))

# Explicit copy: the binning below must not write into a slice of `all_df`.
df = all_df.query('tag == "simplenoise_0.5" and (model == "gp" or model == "gp_agb")').copy()
# Snap training times to the nearest multiple of 500 s, computed from the
# row's own value (the old `all_df.iloc[label]` lookup confused index labels
# with positions).
df['train_time'] = (df['train_time'] / 500).round() * 500

sns.lineplot(data=df.query('model == "gp"'), x="train_time", y="field_test_mape_product", label="Multi-output")
sns.lineplot(data=df.query('model == "gp_agb"'), x="train_time", y="field_test_mape_product", label="Single-output")

plt.xlabel("Time taken (s)")
plt.ylabel("Mean absolute percentage error (%)")
ax.legend()

plt.savefig('graphs/gp-field-time.pdf')

In [None]:
# Sample size: number of field test samples in the first AutoKeras run.
n = len(ak_items[0]['field']['y_test'])

# Pool the field targets of every multi-output GP run. The nested
# comprehension flattens in linear time; `sum(lists, [])` re-copied the
# growing list for every run.
y_test = np.array([ sample for row in gp_items for sample in row['field']['y_test'] ])
# Draw n distinct row positions and reuse them for the predictions so that
# target/prediction pairs stay aligned.
# NOTE(review): the draw is not seeded in this cell.
m = np.random.choice(y_test.shape[0], n, replace=False)
y_test = y_test[m, :]
yhat = np.array([ sample for row in gp_items for sample in row['field']['yhat'] ])
# Multiply the predicted outputs of each sample together and keep the result
# as a single column.
yhat = np.reshape(np.prod(yhat[m, :], axis=1), (-1, 1))

In [None]:
# Actual vs. predicted AGB on the field sample of the multi-output GP runs
# (yhat already reduced to one column in the previous cell).
plot_scatter(y_test, yhat, 'agb', 'gp-scatter-field-multi-agb')

In [None]:
# Sample size: number of field test samples in the first AutoKeras run.
n = len(ak_items[0]['field']['y_test'])

# Pool the field targets of every single-output GP run. The nested
# comprehension flattens in linear time; `sum(lists, [])` re-copied the
# growing list for every run.
y_test = np.array([ sample for row in gp_agb_items for sample in row['field']['y_test'] ])
# Draw n distinct row positions and reuse them for the predictions so that
# target/prediction pairs stay aligned.
# NOTE(review): the draw is not seeded in this cell.
m = np.random.choice(y_test.shape[0], n, replace=False)
y_test = y_test[m, :]
yhat = np.array([ sample for row in gp_agb_items for sample in row['field']['yhat'] ])
# View the predictions as one column before indexing rows and columns, as the
# other single-output field cells do; without this a flat prediction array
# makes `yhat[m, :]` raise IndexError. A no-op when the data is already (N, 1).
yhat = np.reshape(yhat, (-1, 1))
# With a single column the row-wise product leaves the values unchanged; the
# step is kept so this cell mirrors the multi-output one.
yhat = np.reshape(np.prod(yhat[m, :], axis=1), (-1, 1))

In [None]:
# Actual vs. predicted AGB on the field sample of the single-output GP runs;
# saved as graphs/gp-scatter-field-single-agb.pdf.
plot_scatter(y_test, yhat, 'agb', 'gp-scatter-field-single-agb')