In [2]:
import sys

# Extend the module search path with the directory three levels above the working directory
# (presumably the repository root that holds the custom `src` package - confirm)
sys.path.append('../../..')

In [3]:
from lcs.agents.acs2 import Configuration, ACS2
from src.decorators import repeat, get_from_cache_or_run
from src.visualization import biased_exploration_colors, PLOT_DPI
from lcs.metrics import population_metrics
from lcs.strategies.action_selection import EpsilonGreedy, ActionDelay, KnowledgeArray

from typing import List

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd
import gym
import pathlib

# How many times every experiment is repeated (handed to the `repeat` decorator)
NUM_EXPERIMENTS = 50

# Colour lookup for the plots, indexed by strategy key ('eg', 'ad', 'ka', 'oiq')
COLORS = biased_exploration_colors()

# Switch matplotlib's interactive mode off
plt.ioff()

# Output locations, both resolved against the current working directory
cache_dir = '{}/cache'.format(pathlib.Path().absolute())
plot_dir = '{}/plots'.format(pathlib.Path().absolute())


def run_alternating_experiment(env_provider, trials, **conf):
    """Run one ACS2 agent through `explore_exploit` and return its metrics as a DataFrame.

    Parameters
    ----------
    env_provider : callable
        Zero-argument factory returning the environment; the environment is reset once before use.
    trials : int
        Number of trials passed to ``ACS2.explore_exploit``.
    **conf
        Keyword arguments forwarded unchanged to ``Configuration``.

    Returns
    -------
    pd.DataFrame
        Columns ``trial``, ``reward``, ``population``, ``reliable`` taken from the collected
        metrics, plus ``phase`` which labels even row positions "explore" and odd ones "exploit".
    """
    columns = ['trial', 'reward', 'population', 'reliable']

    def phase_of(position):
        # rows at even positions are labelled as exploration, odd ones as exploitation
        return "explore" if position % 2 == 0 else "exploit"

    environment = env_provider()
    environment.reset()

    agent = ACS2(Configuration(**conf))
    collected = agent.explore_exploit(environment, trials)

    # keep only the metrics needed downstream, one row per collected entry
    rows = [[entry[name] for name in columns] for entry in collected]
    results = pd.DataFrame(rows, columns=columns)

    # NOTE(review): the phase comes from the row position (default index), not from the
    # 'trial' column - presumably equivalent only when metrics are collected every trial
    results['phase'] = results.index.map(phase_of)

    return results

def average_experiment_runs(runs_dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """Average the results of several runs for every (trial, phase) pair.

    The frames are stacked, grouped by ``trial`` and ``phase`` and averaged; the returned
    frame is indexed by ``trial`` and carries ``phase`` back as a regular column.
    """
    stacked = pd.concat(runs_dfs)
    per_trial_mean = stacked.groupby(['trial', 'phase']).mean()
    return per_trial_mean.reset_index(level='phase')

# Experiment 2 - Single-step problem performance

## Setup

In [4]:
import gym_multiplexer  # noqa: F401
from src.observation_wrappers import BinnedObservationWrapper

# Number of bits interpolated into the gym environment id `real-multiplexer-{rmpx_bits}bit-v0`
rmpx_bits = 6  # available sizes: 3, 6
# Bin count handed to BinnedObservationWrapper and reported in the plot title
rmpx_discretization_bins = 6


def rmpx_metrics(agent, env):
    """Collect per-trial metrics: the reliable-classifier count plus the `population_metrics` output.

    Returns a dict holding ``reliable`` (number of classifiers in ``agent.population`` whose
    ``is_reliable()`` is truthy), updated with whatever ``population_metrics`` returns.
    """
    classifiers = agent.population
    reliable_count = sum(1 for cl in classifiers if cl.is_reliable())

    collected = dict(reliable=reliable_count)
    collected.update(population_metrics(classifiers, env))
    return collected


def rmpx_env_provider():
    """Create a fresh real-multiplexer gym environment wrapped in `BinnedObservationWrapper`."""
    env_id = f'real-multiplexer-{rmpx_bits}bit-v0'
    raw_env = gym.make(env_id)
    return BinnedObservationWrapper(raw_env, rmpx_discretization_bins)


# Configuration shared by every rMPX experiment; individual experiments override or extend it
rmpx_base_params = dict(
    classifier_length=rmpx_bits + 1,
    number_of_possible_actions=2,
    epsilon=0.8,
    beta=0.2,
    gamma=0.95,
    initial_q=0.5,
    theta_exp=50,
    theta_ga=50,
    do_ga=True,
    chi=0.0,  # do not cross-over
    mu=0.03,
    u_max=4,
    metrics_trial_frequency=1,
    user_metrics_collector_fcn=rmpx_metrics,
)

# Start experiments
# Trials alternate explore-exploit-explore-..., hence the factor of two
rmpx_trials = 2 * 10_000


@get_from_cache_or_run(cache_path=f'{cache_dir}/rmpx/epsilon_greedy.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def rmpx_epsilon_greedy():
    """Run the alternating rMPX experiment with the `EpsilonGreedy` action selector.

    The decorators come from `src.decorators`; presumably they repeat the run
    NUM_EXPERIMENTS times and cache the outcome under `cache_path` - confirm.
    """
    params = {**rmpx_base_params, 'action_selector': EpsilonGreedy}
    return run_alternating_experiment(rmpx_env_provider, rmpx_trials, **params)

@get_from_cache_or_run(cache_path=f'{cache_dir}/rmpx/action_delay.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def rmpx_action_delay():
    """Run the alternating rMPX experiment with the `ActionDelay` action selector.

    Uses the base parameters plus ``biased_exploration_prob=0.5``. The decorators come from
    `src.decorators`; presumably they repeat the run and cache the outcome - confirm.
    """
    params = {
        **rmpx_base_params,
        'action_selector': ActionDelay,
        'biased_exploration_prob': 0.5,
    }
    return run_alternating_experiment(rmpx_env_provider, rmpx_trials, **params)

@get_from_cache_or_run(cache_path=f'{cache_dir}/rmpx/knowledge_array.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def rmpx_knowledge_array():
    """Run the alternating rMPX experiment with the `KnowledgeArray` action selector.

    Uses the base parameters plus ``biased_exploration_prob=0.5``. The decorators come from
    `src.decorators`; presumably they repeat the run and cache the outcome - confirm.
    """
    params = {
        **rmpx_base_params,
        'action_selector': KnowledgeArray,
        'biased_exploration_prob': 0.5,
    }
    return run_alternating_experiment(rmpx_env_provider, rmpx_trials, **params)

@get_from_cache_or_run(cache_path=f'{cache_dir}/rmpx/oiq.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def rmpx_oiq():
    """Run the alternating rMPX experiment plotted as "Optimistic Initial Quality".

    Uses the base parameters with the `EpsilonGreedy` selector and
    ``biased_exploration_prob=0.8``. The decorators come from `src.decorators`;
    presumably they repeat the run and cache the outcome - confirm.
    """
    # NOTE(review): only `biased_exploration_prob` differs from the plain epsilon-greedy
    # setup; `initial_q` keeps the base value 0.5 - confirm that no optimistic
    # `initial_q` override is missing here.
    params = {
        **rmpx_base_params,
        'action_selector': EpsilonGreedy,
        'biased_exploration_prob': 0.8,
    }
    return run_alternating_experiment(rmpx_env_provider, rmpx_trials, **params)

def plot_rmpx(epsilon_greedy_df, action_delay_df, knowledge_array_df, op_initial_df, env_name, bins):
    """Draw the rMPX comparison figure and save it as `{plot_dir}/rmpx-performance.png`.

    The figure has two stacked panels: the moving average of the reward (one line per
    phase and strategy) and the moving average of the reliable-classifier count.

    Parameters
    ----------
    epsilon_greedy_df, action_delay_df, knowledge_array_df, op_initial_df : pd.DataFrame
        Results of each strategy. Every frame must provide the columns ``phase``
        (values ``'explore'`` / ``'exploit'``), ``reward`` and ``reliable``.
        The frames are not modified.
    env_name
        Environment name interpolated into the figure title.
    bins
        Number of discretization bins interpolated into the figure title.

    Returns
    -------
    None
        The figure is written to disk and closed afterwards.
    """
    ma_window = 500  # moving average window

    # (legend label, line colour, results) of every compared strategy, in drawing order
    strategies = [
        ('Epsilon Greedy', COLORS['eg'], epsilon_greedy_df),
        ('Action Delay', COLORS['ad'], action_delay_df),
        ('Knowledge Array', COLORS['ka'], knowledge_array_df),
        ('Optimistic Initial Quality', COLORS['oiq'], op_initial_df),
    ]

    def plot_by_phase(df, window, label, color, ax):
        """Plot the rolling mean reward per phase: explore dashed and faint, exploit solid."""
        # `reset_index()` returns a new frame instead of mutating the filtered slice;
        # after it the x-axis is the row position within the phase
        explore_df = df[df['phase'] == 'explore'].reset_index()
        exploit_df = df[df['phase'] == 'exploit'].reset_index()

        explore_df['reward'].rolling(window=window).mean().plot(label=label, color=color, ls='--', alpha=0.2, ax=ax)
        exploit_df['reward'].rolling(window=window).mean().plot(label=label, color=color, ax=ax)

    fig = plt.figure(figsize=(14, 10))

    # Plots layout
    gs = fig.add_gridspec(2, 1, hspace=.8)
    ax1 = fig.add_subplot(gs[0])
    ax2 = fig.add_subplot(gs[1])

    # Global title
    fig.suptitle(f'Performance of [{env_name}] environment discretized with {bins} bins', fontsize=24)

    # Average reward
    for label, color, df in strategies:
        plot_by_phase(df, ma_window, label, color, ax1)

    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax1.set_title('Average Reward')
    ax1.set_xlabel('Trial')
    ax1.set_ylabel('Reward')
    ax1.set_ylim(300, 1050)
    # reference line at reward 1000
    ax1.axhline(y=1000, color='black', linewidth=1, linestyle="--")

    # Population
    for label, color, df in strategies:
        df['reliable'].rolling(window=ma_window).mean().plot(label=label, c=color, ax=ax2)

    ax2.spines['top'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    ax2.set_xlabel('Trial')
    ax2.set_ylabel('Classifiers count')
    ax2.set_title('Reliable Classifiers')
    ax2.xaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f'))
    ax2.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f'))

    # Create legend; the second panel has exactly one line per strategy, so its
    # handles give one legend entry each
    handles, labels = ax2.get_legend_handles_labels()
    fig.legend(handles, labels, loc='lower center', ncol=4)

    # Save plot to file; make sure the target directory exists so that saving
    # does not fail with FileNotFoundError on a fresh checkout
    pathlib.Path(plot_dir).mkdir(parents=True, exist_ok=True)
    try:
        fig.savefig(f'{plot_dir}/rmpx-performance.png', dpi=PLOT_DPI)
    finally:
        # figures created through pyplot stay registered until closed explicitly
        plt.close(fig)

## Results

In [None]:
# Run calculations
# NOTE(review): an earlier TODO here stated that only 5 repetitions were executed, while
# NUM_EXPERIMENTS is 50 - confirm the cached results were produced with the current value.
rmpx_epsilon_greedy_dfs = rmpx_epsilon_greedy()  # about 360 s per iteration
rmpx_action_delay_dfs = rmpx_action_delay()
rmpx_knowledge_array_dfs = rmpx_knowledge_array()
rmpx_oiq_dfs = rmpx_oiq()

# Plot results: every strategy is averaged over its runs before plotting
plot_rmpx(
    epsilon_greedy_df=average_experiment_runs(rmpx_epsilon_greedy_dfs),
    action_delay_df=average_experiment_runs(rmpx_action_delay_dfs),
    knowledge_array_df=average_experiment_runs(rmpx_knowledge_array_dfs),
    op_initial_df=average_experiment_runs(rmpx_oiq_dfs),
    env_name=rmpx_env_provider().unwrapped.spec.id,
    bins=rmpx_discretization_bins,
)

:::{figure-md} rmpx-fig
:class: full-width
<img src="plots/rmpx-performance.png">

Performance in the rMPX environment
:::

## Statistical verification

```{admonition} Hypothesis testing
:class: tip
A novel idea would be to introduce a metric combining both the average reward and the number of classifiers (like an _efficiency index_). Such an indicator could then be used to plot how performance changes when the number of bins is modified. Otherwise, a regular table might be used.
```

## Observations
...

---

**Software packages used**

In [6]:
# Report the software packages used in this session (see the heading above)
import session_info
session_info.show()