In [53]:
import pathlib
from typing import List, Dict

import gym
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd
from IPython.display import HTML
from lcs.agents.acs2 import Configuration, ACS2
from lcs.metrics import population_metrics
from lcs.strategies.action_selection import EpsilonGreedy, ActionDelay, KnowledgeArray
from myst_nb import glue
from tabulate import tabulate

from src.bayes_estimation import bayes_estimate
from src.decorators import repeat, get_from_cache_or_run
from src.utils import build_plots_dir_path, build_cache_dir_path
from src.visualization import biased_exploration_colors, PLOT_DPI

# Colour palette; indexed further down by the strategy keys 'eg', 'ad', 'ka' and 'oiq'.
COLORS = biased_exploration_colors()

plt.ioff()  # turn off interactive plotting

# NOTE(review): the project root is taken to be three levels above the current
# working directory - this depends on where the notebook is executed from; confirm.
root_dir = pathlib.Path().cwd().parent.parent.parent
cwd_dir = pathlib.Path().cwd()

# Plot and cache locations are namespaced by the name of the working directory.
plot_dir = build_plots_dir_path(root_dir) / cwd_dir.name
cache_dir = build_cache_dir_path(root_dir) / cwd_dir.name


def run_alternating_experiment(env_provider, trials, **conf):
    """Run a single ACS2 agent with ``explore_exploit`` and tabulate its metrics.

    Args:
        env_provider: zero-argument callable returning the environment to use.
        trials: number of trials handed to ``ACS2.explore_exploit``.
        **conf: keyword arguments forwarded unchanged to ``Configuration``.

    Returns:
        DataFrame with the columns ``trial``, ``reward``, ``population`` and
        ``reliable`` (one row per collected metrics record) and an additional
        ``phase`` column.
    """
    def phase_label(position):
        return "exploit" if position % 2 == 0 else "explore"

    environment = env_provider()
    environment.reset()

    agent = ACS2(Configuration(**conf))
    collected = agent.explore_exploit(environment, trials)

    # Keep only the metric fields that are used downstream
    columns = ['trial', 'reward', 'population', 'reliable']
    rows = [[record[name] for name in columns] for record in collected]
    result = pd.DataFrame(rows, columns=columns)

    # The label is computed from the row position (default integer index), not from
    # the 'trial' column.
    # NOTE(review): plot_rmpx swaps these two labels "due to the bug in cached
    # results", so the even/odd assignment here is presumably inverted. Cached data
    # already carries these labels - confirm before changing the mapping.
    result['phase'] = result.index.map(phase_label)

    return result


def average_experiment_runs(runs_dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """Average the metrics of several runs trial by trial.

    All runs are stacked, grouped by ``(trial, phase)`` and averaged. The result
    is indexed by ``trial`` and keeps ``phase`` as an ordinary column.
    """
    stacked_runs = pd.concat(runs_dfs)
    mean_per_trial = stacked_runs.groupby(['trial', 'phase']).mean()
    return mean_per_trial.reset_index(level='phase')


def plot_rmpx(epsilon_greedy_df, action_delay_df, knowledge_array_df, op_initial_df, env_name, bins, plot_filename=None):
    """Build the two-panel figure comparing the four exploration strategies.

    The upper panel shows the moving average of the reward, drawn separately for
    the two phases of every strategy (a faint dashed and a solid line). The lower
    panel shows the moving average of the ``reliable`` column.

    Args:
        epsilon_greedy_df, action_delay_df, knowledge_array_df, op_initial_df:
            per-strategy frames holding the ``phase``, ``reward`` and ``reliable``
            columns.
        env_name: environment name inserted into the figure title.
        bins: number of bins inserted into the figure title.
        plot_filename: when truthy, the figure is also saved to this path.

    Returns:
        The created matplotlib figure.
    """
    ma_window = 500  # moving average window

    # (data, legend label, colour) of every strategy, in drawing order
    strategies = [
        (epsilon_greedy_df, 'Epsilon Greedy', COLORS['eg']),
        (action_delay_df, 'Action Delay', COLORS['ad']),
        (knowledge_array_df, 'Knowledge Array', COLORS['ka']),
        (op_initial_df, 'Optimistic Initial Quality', COLORS['oiq']),
    ]

    def moving_average(series):
        return series.rolling(window=ma_window).mean()

    def draw_reward_curves(frame, label, color, ax):
        # manually renamed phases due to the bug in cached results:
        # rows labelled 'exploit' are drawn as exploration and vice versa
        explore_part = frame[frame['phase'] == 'exploit'].reset_index()
        exploit_part = frame[frame['phase'] == 'explore'].reset_index()

        moving_average(explore_part['reward']).plot(label=label, color=color, ls='--', alpha=0.2, ax=ax)
        moving_average(exploit_part['reward']).plot(label=label, color=color, ax=ax)

    def hide_top_right_spines(ax):
        for side in ('top', 'right'):
            ax.spines[side].set_visible(False)

    fig = plt.figure(figsize=(14, 10))

    # Two panels stacked vertically
    grid = fig.add_gridspec(2, 1, hspace=.8)
    reward_ax = fig.add_subplot(grid[0])
    reliable_ax = fig.add_subplot(grid[1])

    fig.suptitle(f'Performance of [{env_name}] environment discretized with {bins} bins', fontsize=24)

    # Upper panel: average reward
    for frame, label, color in strategies:
        draw_reward_curves(frame, label, color, reward_ax)

    hide_top_right_spines(reward_ax)
    reward_ax.set_title('Average Reward')
    reward_ax.set_xlabel('Trial')
    reward_ax.set_ylabel('Reward')
    reward_ax.set_ylim(300, 1050)
    reward_ax.axhline(y=1000, color='black', linewidth=1, linestyle="--")

    # Lower panel: reliable classifiers
    for frame, label, color in strategies:
        moving_average(frame['reliable']).plot(label=label, c=color, ax=reliable_ax)

    hide_top_right_spines(reliable_ax)
    reliable_ax.set_xlabel('Trial')
    reliable_ax.set_ylabel('Classifiers count')
    reliable_ax.set_title('Reliable Classifiers')
    tick_format = '%.0f'
    reliable_ax.xaxis.set_major_formatter(mtick.FormatStrFormatter(tick_format))
    reliable_ax.yaxis.set_major_formatter(mtick.FormatStrFormatter(tick_format))

    # One shared legend, taken from the lower panel (one entry per strategy)
    handles, labels = reliable_ax.get_legend_handles_labels()
    fig.legend(handles, labels, loc='lower center', ncol=4)

    # Optionally persist the figure
    if plot_filename:
        fig.savefig(plot_filename, dpi=PLOT_DPI)

    return fig

# Experiment 2 - Single-step problem performance

## Setup

In [22]:
from src.observation_wrappers import BinnedObservationWrapper

# Size of the real multiplexer; it selects the gym environment id and, plus one,
# the classifier length in the base configuration.
rmpx_bits = 6  # available sizes: 3, 6


def rmpx_metrics(agent, env):
    """Collect the custom metrics recorded for every trial.

    Returns a dict holding the number of reliable classifiers under the key
    ``'reliable'``, extended with everything ``population_metrics`` reports for
    the agent's population.
    """
    population = agent.population
    reliable_count = sum(1 for classifier in population if classifier.is_reliable())

    collected = {'reliable': reliable_count}
    collected.update(population_metrics(population, env))
    return collected


def rmpx_env_provider(bins):
    """Create the real-multiplexer environment wrapped in ``BinnedObservationWrapper``.

    Args:
        bins: value forwarded to ``BinnedObservationWrapper``.
    """
    # Imported for its side effect only (presumably it registers the
    # 'real-multiplexer-*' ids with gym - confirm).
    import gym_multiplexer  # noqa: F401

    env_id = f'real-multiplexer-{rmpx_bits}bit-v0'
    raw_env = gym.make(env_id)
    return BinnedObservationWrapper(raw_env, bins)


# Configuration shared by all strategies. Every run merges its own overrides on top
# of this dict and forwards the result to ``Configuration``.
rmpx_base_params = {
    "classifier_length": rmpx_bits + 1,  # one attribute more than the multiplexer bits
    "number_of_possible_actions": 2,
    "epsilon": 0.8,
    "beta": 0.2,
    "gamma": 0.95,
    "initial_q": 0.5,
    "theta_exp": 50,
    "theta_ga": 50,
    "do_ga": True,
    "chi": 0.0,
    "mu": 0.03,
    "u_max": 4,
    "metrics_trial_frequency": 1,  # presumably collects metrics on every trial - confirm
    "user_metrics_collector_fcn": rmpx_metrics  # adds the 'reliable' count to the metrics
}

# Start experiments
NUM_EXPERIMENTS = 50  # passed as num_times to the @repeat decorator
USE_RAY = True  # passed as use_ray to the @repeat decorator

rmpx_trials = 2 * 15_000  # explore-exploit-explore-...

# Discretization variants (number of bins) compared in this experiment
bins_v1 = 5
bins_v2 = 6
bins_v3 = 7


def run_rmpx_biased_exploration(bins):
    """Run the four exploration strategies on the rMPX discretized with ``bins`` bins.

    Every strategy is the base configuration plus its own overrides, executed for
    ``rmpx_trials`` alternating trials.

    Returns:
        Tuple ``(eg, ad, ka, oiq)`` of result frames, in that order.
    """
    def env_provider():
        return rmpx_env_provider(bins)

    # Overrides in execution order: eg, ad, ka, oiq
    strategy_overrides = [
        {'action_selector': EpsilonGreedy},
        {'action_selector': ActionDelay, 'biased_exploration_prob': 0.5},
        {'action_selector': KnowledgeArray, 'biased_exploration_prob': 0.5},
        # NOTE(review): the "optimistic initial quality" variant does not override
        # 'initial_q'; it only sets 'biased_exploration_prob' together with
        # EpsilonGreedy. If EpsilonGreedy ignores that setting, this run is
        # configured exactly like the first one - confirm the intended parameters.
        {'action_selector': EpsilonGreedy, 'biased_exploration_prob': 0.8},
    ]

    eg, ad, ka, oiq = (
        run_alternating_experiment(env_provider, rmpx_trials, **(rmpx_base_params | overrides))
        for overrides in strategy_overrides
    )

    return eg, ad, ka, oiq


@get_from_cache_or_run(cache_path=f'{cache_dir}/rmpx/bins_{bins_v1}.dill')
@repeat(num_times=NUM_EXPERIMENTS, use_ray=USE_RAY)
def rmpx_bins_v1():
    """Run all four strategies on the environment discretized with ``bins_v1`` bins.

    NOTE(review): presumably ``repeat`` executes the body NUM_EXPERIMENTS times and
    ``get_from_cache_or_run`` reuses the file at ``cache_path`` when it exists -
    confirm against ``src.decorators``.
    """
    return run_rmpx_biased_exploration(bins_v1)


@get_from_cache_or_run(cache_path=f'{cache_dir}/rmpx/bins_{bins_v2}.dill')
@repeat(num_times=NUM_EXPERIMENTS, use_ray=USE_RAY)
def rmpx_bins_v2():
    """Run all four strategies on the environment discretized with ``bins_v2`` bins.

    NOTE(review): presumably ``repeat`` executes the body NUM_EXPERIMENTS times and
    ``get_from_cache_or_run`` reuses the file at ``cache_path`` when it exists -
    confirm against ``src.decorators``.
    """
    return run_rmpx_biased_exploration(bins_v2)


@get_from_cache_or_run(cache_path=f'{cache_dir}/rmpx/bins_{bins_v3}.dill')
@repeat(num_times=NUM_EXPERIMENTS, use_ray=USE_RAY)
def rmpx_bins_v3():
    """Run all four strategies on the environment discretized with ``bins_v3`` bins.

    NOTE(review): presumably ``repeat`` executes the body NUM_EXPERIMENTS times and
    ``get_from_cache_or_run`` reuses the file at ``cache_path`` when it exists -
    confirm against ``src.decorators``.
    """
    return run_rmpx_biased_exploration(bins_v3)

## Results

In [29]:
def extract(experiment_runs):
    """Regroup per-run result tuples into per-strategy lists.

    Args:
        experiment_runs: iterable of 4-element items ``(eg, ad, ka, oiq)``.

    Returns:
        Tuple of four lists ``(eg_dfs, ad_dfs, ka_dfs, oiq_dfs)``, each holding the
        corresponding element of every run in the original order.
    """
    per_strategy = ([], [], [], [])

    for run in experiment_runs:
        # Unpacking enforces that every run carries exactly four results
        eg_df, ad_df, ka_df, oiq_df = run
        for bucket, frame in zip(per_strategy, (eg_df, ad_df, ka_df, oiq_df)):
            bucket.append(frame)

    return per_strategy


# Run calculations
# One figure per discretization; the runs come from the cached/repeated experiment
# functions and are averaged per strategy before plotting.
for bins, run_output in zip(
        [bins_v1, bins_v2, bins_v3],
        [rmpx_bins_v1(), rmpx_bins_v2(), rmpx_bins_v3()]):

    eg_dfs, ad_dfs, ka_dfs, oiq_dfs = extract(run_output)
    plot_fig = plot_rmpx(
        average_experiment_runs(eg_dfs),
        average_experiment_runs(ad_dfs),
        average_experiment_runs(ka_dfs),
        average_experiment_runs(oiq_dfs),
        # an environment is instantiated here only to read its spec id for the title
        env_name=rmpx_env_provider(bins).unwrapped.spec.id,
        bins=bins,
        plot_filename=f'{plot_dir}/rmpx-{bins}-bins-performance.png'
    )
    # Stored under 'rmpx_<bins>_bins_fig', the key the glue:figure directives refer to
    glue(f'rmpx_{bins}_bins_fig', plot_fig, display=False)

````{tabbed} 5 bins discretization
```{glue:figure} rmpx_5_bins_fig
:name: "rmpx_5_bins_fig"
6 bit rMPX discretized with 5 bins
```
````

````{tabbed} 6 bins discretization
```{glue:figure} rmpx_6_bins_fig
:name: "rmpx_6_bins_fig"
6 bit rMPX discretized with 6 bins
```
````

````{tabbed} 7 bins discretization
```{glue:figure} rmpx_7_bins_fig
:name: "rmpx_7_bins_fig"
6 bit rMPX discretized with 7 bins
```
````

## Statistical verification

```{admonition} Hypothesis testing
:class: tip
A novel idea would be to introduce a metric combining both the average reward and the number of classifiers (like an _efficiency index_). Such an indicator could then be used to plot the change when modifying the number of bins. Otherwise, a regular table might be used.
```

In [58]:
def train_bayes_model(dfs, query_condition, field):
    """Fit ``bayes_estimate`` to one column of the selected rows.

    Args:
        dfs: frames to concatenate.
        query_condition: ``DataFrame.query`` expression selecting the rows.
        field: name of the column whose values are passed to ``bayes_estimate``.

    Returns:
        The ``'mu'`` and ``'std'`` entries of the estimated model, as a tuple.
    """
    combined = pd.concat(dfs)
    observations = combined.query(query_condition)[field].to_numpy()

    model = bayes_estimate(observations)
    return model['mu'], model['std']

def build_models(dfs: Dict, field: str, query_condition: str):
    """Train one Bayesian model per strategy for every discretization.

    Args:
        dfs: mapping ``bins -> iterable`` whose items are each a group of frames
            accepted by ``train_bayes_model``.
        field: column passed on to ``train_bayes_model``.
        query_condition: row filter passed on to ``train_bayes_model``.

    Returns:
        Dict ``bins -> list`` of ``train_bayes_model`` results, in input order.
    """
    return {
        bins: [train_bayes_model(strategy_runs, query_condition, field) for strategy_runs in per_strategy]
        for bins, per_strategy in dfs.items()
    }

def print_bayes_table(data):
    """Render per-discretization results as an HTML table.

    Despite the name nothing is printed: the table is returned as an
    ``IPython.display.HTML`` object.

    Args:
        data: mapping ``bins -> list`` of cell values, one per strategy column.
    """
    column_titles = ['', 'Epsilon Greedy', 'Action Delay', 'Knowledge Array', 'Optimistic Initial Quality']

    rows = []
    for bins, values in data.items():
        # The first cell names the discretization, the rest are the strategy values
        rows.append([f'{bins} bins'] + values)

    rendered = tabulate(rows, headers=column_titles, tablefmt="html", stralign='right', floatfmt=".2f")
    return HTML(rendered)

def print_row(r):
    """Format the first element of ``r`` as ``'<mean> ± <std>'`` (2 decimals).

    Only ``r[0]`` is used; any further elements of ``r`` are ignored.
    """
    samples = r[0]
    return f'{round(samples.mean(), 2)} ± {round(samples.std(), 2)}'

# Per-strategy runs for every discretization, keyed by the number of bins.
# The experiment functions are invoked again here instead of reusing the earlier
# results (presumably served from the cache - confirm).
experiments_data = {}
for bins, run_output in zip(
        [bins_v1, bins_v2, bins_v3],
        [rmpx_bins_v1(), rmpx_bins_v2(), rmpx_bins_v3()]):

    experiments_data[bins] = extract(run_output)

# Average reward
def get_average_reward(dfs: List[pd.DataFrame], last_n_runs: int = 100):
    """Average reward over the final trials of the rows labelled ``"explore"``.

    The frames are stacked, restricted to rows whose ``phase`` equals
    ``"explore"``, averaged per ``trial`` across the frames, and the mean of the
    last ``last_n_runs`` per-trial averages is returned.

    Args:
        dfs: frames holding the ``trial``, ``phase`` and ``reward`` columns.
        last_n_runs: number of trailing trials (not runs, despite the name) that
            contribute to the result.

    Returns:
        The mean reward as a float.
    """
    labelled_rows = pd.concat(dfs).query('phase == "explore"')

    # Pick 'reward' before aggregating: the frame still contains the string
    # 'phase' column, and averaging it raises TypeError on pandas >= 2.0, where
    # non-numeric columns are no longer dropped silently.
    reward_per_trial = labelled_rows.groupby('trial')['reward'].mean()

    return reward_per_trial.iloc[-last_n_runs:].mean()

# One average reward per strategy (in the order produced by extract) for every
# discretization; these lists become the rows of the average-reward table.
average_rewards_data = {}
for bins, dfs in experiments_data.items():
    average_rewards_data[bins] = list(map(get_average_reward, dfs))

# Reliable classifiers
@get_from_cache_or_run(cache_path=f'{cache_dir}/rmpx/bayes/reliable.dill')
def build_reliable_models(dfs: Dict):
    """Build models of the ``reliable`` column for rows where ``trial == rmpx_trials``.

    NOTE(review): if the recorded trial numbers are 0-based, the final trial is
    ``rmpx_trials - 1`` and this query selects no rows - confirm against the data.
    NOTE(review): the cache path does not depend on ``dfs``; presumably a cached
    result is returned regardless of the argument - confirm.
    """
    return build_models(dfs, field='reliable', query_condition=f'trial == {rmpx_trials}')

reliable_data = build_reliable_models(experiments_data)

# Turn every model (a tuple whose first element is summarised) into a
# '<mean> ± <std>' table cell.
reliable_table_data = {}
for bins, models in reliable_data.items():
    reliable_table_data[bins] = list(map(print_row, models))

# Add glue objects
# The keys below are referenced verbatim by the {glue:} roles in the markdown.
glue('rmpx_average_reward', print_bayes_table(average_rewards_data), display=False)
glue('rmpx_bayes_reliable_classifies', print_bayes_table(reliable_table_data), display=False)

```{tabbed} Average Reward
{glue:}`rmpx_average_reward`
```

```{tabbed} Reliable classifiers
{glue:}`rmpx_bayes_reliable_classifies`
```

## Observations
...

---

**Software packages used**

In [26]:
import session_info

# Presumably renders the packages (and versions) loaded in this session, matching
# the "Software packages used" heading - confirm.
session_info.show()