In [2]:
import sys

# put custom scripts to module path
sys.path.append('../../..')

In [3]:
import logging

logging.basicConfig(level=logging.WARN, format='%(relativeCreated)6d %(threadName)s %(message)s')

import gym
from typing import List
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

import pathlib
from src.decorators import repeat, get_from_cache_or_run
from src.metrics import parse_experiments_results, corridor_transition_knowledge, grid_transition_knowledge

from lcs.agents.acs2 import Configuration, ACS2
from lcs.metrics import population_metrics
from lcs.strategies.action_selection import EpsilonGreedy, ActionDelay, KnowledgeArray

import gym_corridor  # noqa: F401
import gym_grid  # noqa: F401
import gym_multiplexer  # noqa: F401

plt.ioff()  # turn off interactive plotting

# Output locations, built from the absolute path of the current working directory
cache_dir = f'{pathlib.Path().absolute()}/cache'
plot_dir = f'{pathlib.Path().absolute()}/plots'

# Repetition count handed to the `repeat` decorator of the Corridor, Grid and rMPX experiments
NUM_EXPERIMENTS = 50
# Resolution passed to `fig.savefig` for every saved figure
PLOT_DPI = 200

# Specify unique colors for each line
cmap = plt.get_cmap('jet_r')
norm = plt.Normalize(vmin=0, vmax=4)

# ACTION SELECTION COLORS
# Keys match the plotted strategies: eg = Epsilon Greedy, ad = Action Delay,
# ka = Knowledge Array, oiq = Optimistic Initial Quality.
COLORS = {
    "eg": cmap(norm(0)),
    "ad": cmap(norm(1)),
    "ka": cmap(norm(2)),
    "oiq": cmap(norm(3))
}


def run_experiment(env_provider, explore_trials, exploit_trials, **conf):
    """Run one explore-then-exploit experiment and return its parsed metrics.

    A fresh environment is obtained from ``env_provider`` and reset. An ACS2
    agent configured from ``conf`` explores it, then a second agent seeded
    with the learned population runs the exploit phase.

    :param env_provider: zero-argument callable returning the environment
    :param explore_trials: number of trials passed to ``explore``
    :param exploit_trials: number of trials passed to ``exploit``
    :param conf: keyword arguments forwarded to ``Configuration``
    :return: value of ``parse_experiments_results`` for both phases
    """
    env = env_provider()
    env.reset()

    cfg = Configuration(**conf)

    explorer = ACS2(cfg)
    metrics_explore = explorer.explore(env, explore_trials)

    # The exploit phase must run on the agent created for it; it shares the
    # population learned by the explorer.
    exploiter = ACS2(cfg, explorer.population)
    metrics_exploit = exploiter.exploit(env, exploit_trials)

    # Parse results into DataFrame
    return parse_experiments_results(metrics_explore, metrics_exploit, cfg.metrics_trial_frequency)


def run_alternating_experiment(env_provider, trials, **conf):
    """Run an alternating explore/exploit experiment and tabulate its metrics.

    A fresh environment is obtained from ``env_provider`` and reset, then one
    ACS2 agent configured from ``conf`` is run with ``explore_exploit``.

    :param env_provider: zero-argument callable returning the environment
    :param trials: total number of trials passed to ``explore_exploit``
    :param conf: keyword arguments forwarded to ``Configuration``
    :return: DataFrame with columns ``trial``, ``reward``, ``population``,
        ``reliable`` and ``phase`` (``"explore"`` or ``"exploit"``)
    """
    env = env_provider()
    env.reset()

    cfg = Configuration(**conf)

    agent = ACS2(cfg)
    metrics = agent.explore_exploit(env, trials)

    # parse metrics
    columns = ['trial', 'reward', 'population', 'reliable']
    df = pd.DataFrame([[m[c] for c in columns] for m in metrics], columns=columns)

    # Label the phase from the recorded trial number rather than the row
    # position: the two only coincide when every trial is recorded
    # (metrics_trial_frequency == 1). Even trials are labelled "explore".
    # NOTE(review): assumes explore_exploit explores on even trials — confirm.
    df['phase'] = df['trial'].map(lambda t: "explore" if t % 2 == 0 else "exploit")

    return df


def average_experiment_runs(runs_dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """Average the metrics of several experiment runs.

    All frames are stacked, grouped by ``trial`` and ``phase`` and averaged.
    The result is indexed by ``trial`` with ``phase`` restored as a column.
    """
    stacked = pd.concat(runs_dfs)
    per_trial_mean = stacked.groupby(['trial', 'phase']).mean()
    return per_trial_mean.reset_index(level='phase')


def plot(epsilon_greedy_df, action_delay_df, knowledge_array_df, op_initial_df,
         env_name,
         num_explore_trials,
         first_knowledge_trials,
         first_population_trials,
         population_ylim,
         text_box_loc,
         plot_filename):
    """Draw the three-panel strategy comparison and save it to a file.

    Panels: knowledge (top-left), classifier population (top-right) and a
    moving average of steps per trial (bottom, full width).

    :param epsilon_greedy_df: averaged metrics of the Epsilon Greedy runs
    :param action_delay_df: averaged metrics of the Action Delay runs
    :param knowledge_array_df: averaged metrics of the Knowledge Array runs
    :param op_initial_df: averaged metrics of the Optimistic Initial Quality runs
    :param env_name: environment name shown in the figure title
    :param num_explore_trials: x position of the vertical marker in the steps panel
    :param first_knowledge_trials: number of leading rows shown in the knowledge panel
    :param first_population_trials: number of leading rows shown in the population panel
    :param population_ylim: y-limits of the population panel
    :param text_box_loc: keyword arguments (``x``, ``y``) locating the note in the steps panel
    :param plot_filename: destination passed to ``fig.savefig``
    """
    # (data, legend label, line color) per strategy, in legend order
    strategies = [
        (epsilon_greedy_df, 'Epsilon Greedy', COLORS['eg']),
        (action_delay_df, 'Action Delay', COLORS['ad']),
        (knowledge_array_df, 'Knowledge Array', COLORS['ka']),
        (op_initial_df, 'Optimistic Initial Quality', COLORS['oiq']),
    ]

    fig = plt.figure(figsize=(14, 8))
    try:
        # Layout
        gs = fig.add_gridspec(2, 2, wspace=.25, hspace=.4)
        ax1 = fig.add_subplot(gs[0, 0])
        ax2 = fig.add_subplot(gs[0, 1])
        ax3 = fig.add_subplot(gs[1, :])

        for ax in (ax1, ax2, ax3):
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)

        # Global title
        fig.suptitle(f'Performance of [{env_name}] environment', fontsize=24)

        # Knowledge
        for df, label, color in strategies:
            df['knowledge'][:first_knowledge_trials].plot(label=label, c=color, ax=ax1)

        ax1.set_title('Knowledge')
        ax1.set_xlabel('Trial')
        ax1.set_ylabel('Knowledge')
        ax1.axhline(y=100, color='black', linewidth=1, linestyle="--")
        ax1.yaxis.set_major_formatter(mtick.PercentFormatter())

        # Population
        for df, label, color in strategies:
            df['population'][:first_population_trials].plot(label=label, c=color, ax=ax2)

        ax2.set_xlabel('Trial')
        ax2.set_ylabel('Classifiers')
        ax2.set_title('Classifiers Population')
        ax2.set_ylim(population_ylim)
        ax2.xaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f'))
        ax2.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f'))

        # Steps in trial
        window = 3  # window for moving average
        for df, label, color in strategies:
            df['steps_in_trial'].rolling(window=window).mean().plot(label=label, c=color, ax=ax3)

        ax3.set_xlabel('Trial')
        ax3.set_ylabel('Steps')
        ax3.set_title('Steps in trial')
        ax3.axvline(x=num_explore_trials, color='black', linewidth=1, linestyle="--")
        ax3.text(**text_box_loc, s=f'Moving average of {window} samples', style='italic',
                 bbox={'facecolor': 'red', 'alpha': 0.2, 'pad': 10})

        # Create legend
        handles, labels = ax3.get_legend_handles_labels()
        fig.legend(handles, labels, loc='lower center', ncol=4)

        # Save plot to file
        fig.savefig(plot_filename, dpi=PLOT_DPI)
    finally:
        # Interactive mode is off, so nothing else releases the figure;
        # close it explicitly to avoid accumulating open figures.
        plt.close(fig)

# Biased exploration
Chapter goes here. This one is the interactive notebook.

## Corridor

In [32]:
from src.observation_wrappers import CorridorObservationWrapper


# Function for calculating relevant metrics
def corridor_metrics(agent, env):
    """Collect custom Corridor metrics for the agent's current population.

    Returns a dict holding ``knowledge`` (from
    ``corridor_transition_knowledge``) merged with whatever
    ``population_metrics`` reports.
    """
    population = agent.population
    collected = dict(knowledge=corridor_transition_knowledge(population, env))
    collected.update(population_metrics(population, env))
    return collected


def corridor_env_provider():
    """Create a new ``corridor-20-v0`` environment wrapped in ``CorridorObservationWrapper``."""
    # Plain literal: the former f-string had no placeholders.
    return CorridorObservationWrapper(gym.make('corridor-20-v0'))


# Configuration keyword arguments shared by every Corridor experiment; each
# experiment function merges its own `action_selector` (and overrides) on top.
corridor_base_params = {
    "classifier_length": 1,
    "number_of_possible_actions": 2,
    "epsilon": 0.8,
    "beta": 0.2,
    "gamma": 0.95,
    "initial_q": 0.5,
    "theta_exp": 50,
    "theta_ga": 50,
    "do_ga": True,
    "mu": 0.03,
    "u_max": 1,
    "metrics_trial_frequency": 1,
    "user_metrics_collector_fcn": corridor_metrics
}

# Start experiments
# Trial counts handed to run_experiment for the explore and exploit phases
corridor_explore_trials, corridor_exploit_trials = 60, 20


@get_from_cache_or_run(cache_path=f'{cache_dir}/corridor/epsilon_greedy.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def corridor_epsilon_greedy():
    """Corridor experiment with the ``EpsilonGreedy`` action selector."""
    params = {**corridor_base_params, 'action_selector': EpsilonGreedy}
    return run_experiment(corridor_env_provider,
                          corridor_explore_trials,
                          corridor_exploit_trials,
                          **params)


@get_from_cache_or_run(cache_path=f'{cache_dir}/corridor/action_delay.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def corridor_action_delay():
    """Corridor experiment with the ``ActionDelay`` selector and a 0.5 biased exploration probability."""
    params = {**corridor_base_params,
              'action_selector': ActionDelay,
              'biased_exploration_prob': 0.5}
    return run_experiment(corridor_env_provider,
                          corridor_explore_trials,
                          corridor_exploit_trials,
                          **params)


@get_from_cache_or_run(cache_path=f'{cache_dir}/corridor/knowledge_array.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def corridor_knowledge_array():
    """Corridor experiment with the ``KnowledgeArray`` selector and a 0.5 biased exploration probability."""
    params = {**corridor_base_params,
              'action_selector': KnowledgeArray,
              'biased_exploration_prob': 0.5}
    return run_experiment(corridor_env_provider,
                          corridor_explore_trials,
                          corridor_exploit_trials,
                          **params)


@get_from_cache_or_run(cache_path=f'{cache_dir}/corridor/oiq.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def corridor_oiq():
    """Corridor experiment for the "Optimistic Initial Quality" variant.

    Uses the ``EpsilonGreedy`` selector and raises ``initial_q`` from the
    base value of 0.5 to 0.8.
    """
    # The optimism belongs in `initial_q`. The previous override only set
    # `biased_exploration_prob`, leaving `initial_q` at the base value.
    # NOTE(review): presumably EpsilonGreedy ignores biased_exploration_prob — confirm.
    # NOTE(review): results already cached under oiq.dill presumably need regenerating.
    params = {**corridor_base_params,
              'action_selector': EpsilonGreedy,
              'initial_q': 0.8}
    return run_experiment(corridor_env_provider,
                          corridor_explore_trials,
                          corridor_exploit_trials,
                          **params)


# Execute calculations
# Each result is fed to average_experiment_runs below, so it is presumably
# the list of per-repetition DataFrames — confirm against `repeat`.
corridor_epsilon_greedy_dfs = corridor_epsilon_greedy()
corridor_action_delay_dfs = corridor_action_delay()
corridor_knowledge_array_dfs = corridor_knowledge_array()
corridor_oiq_dfs = corridor_oiq()

# Plot results
# The figure is written to plot_dir and embedded by the markdown cell that follows.
plot(
    average_experiment_runs(corridor_epsilon_greedy_dfs),
    average_experiment_runs(corridor_action_delay_dfs),
    average_experiment_runs(corridor_knowledge_array_dfs),
    average_experiment_runs(corridor_oiq_dfs),
    env_name='Corridor-20',
    num_explore_trials=corridor_explore_trials,
    first_knowledge_trials=30,
    first_population_trials=20,
    population_ylim=(17, 40),
    text_box_loc={"x": 63, "y": 120},
    plot_filename=f'{plot_dir}/corridor-performance.png'
)

:::{figure-md} corridor-fig
:class: full-width
<img src="plots/corridor-performance.png">

Performance in Corridor environment
:::

*TODO: add commentary on the Corridor results shown in the figure above.*

## Grid

In [31]:
# Function for calculating relevant metrics
def grid_metrics(agent, env):
    """Collect custom Grid metrics for the agent's current population.

    Returns a dict holding ``knowledge`` (from ``grid_transition_knowledge``)
    merged with whatever ``population_metrics`` reports.
    """
    population = agent.population
    collected = dict(knowledge=grid_transition_knowledge(population, env))
    collected.update(population_metrics(population, env))
    return collected


def grid_env_provider():
    """Create a new ``grid-20-v0`` environment."""
    # Plain literal: the former f-string had no placeholders.
    return gym.make('grid-20-v0')


# Configuration keyword arguments shared by every Grid experiment; each
# experiment function merges its own `action_selector` (and overrides) on top.
grid_base_params = {
    "classifier_length": 2,
    "number_of_possible_actions": 4,
    "epsilon": 0.8,
    "beta": 0.2,
    "gamma": 0.95,
    "initial_q": 0.5,
    "theta_exp": 50,
    "theta_ga": 50,
    "do_ga": True,
    "mu": 0.03,
    "u_max": 1,
    "metrics_trial_frequency": 1,
    "user_metrics_collector_fcn": grid_metrics
}

# Start experiments
# Trial counts handed to run_experiment for the explore and exploit phases
grid_explore_trials, grid_exploit_trials = 60, 20


@get_from_cache_or_run(cache_path=f'{cache_dir}/grid/epsilon_greedy.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def grid_epsilon_greedy():
    """Grid experiment with the ``EpsilonGreedy`` action selector."""
    params = {**grid_base_params, 'action_selector': EpsilonGreedy}
    return run_experiment(grid_env_provider,
                          grid_explore_trials,
                          grid_exploit_trials,
                          **params)


@get_from_cache_or_run(cache_path=f'{cache_dir}/grid/action_delay.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def grid_action_delay():
    """Grid experiment with the ``ActionDelay`` selector and a 0.5 biased exploration probability."""
    params = {**grid_base_params,
              'action_selector': ActionDelay,
              'biased_exploration_prob': 0.5}
    return run_experiment(grid_env_provider,
                          grid_explore_trials,
                          grid_exploit_trials,
                          **params)


@get_from_cache_or_run(cache_path=f'{cache_dir}/grid/knowledge_array.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def grid_knowledge_array():
    """Grid experiment with the ``KnowledgeArray`` selector and a 0.5 biased exploration probability."""
    params = {**grid_base_params,
              'action_selector': KnowledgeArray,
              'biased_exploration_prob': 0.5}
    return run_experiment(grid_env_provider,
                          grid_explore_trials,
                          grid_exploit_trials,
                          **params)


@get_from_cache_or_run(cache_path=f'{cache_dir}/grid/oiq.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def grid_oiq():
    """Grid experiment for the "Optimistic Initial Quality" variant.

    Uses the ``EpsilonGreedy`` selector and raises ``initial_q`` from the
    base value of 0.5 to 0.8.
    """
    # The optimism belongs in `initial_q`. The previous override only set
    # `biased_exploration_prob`, leaving `initial_q` at the base value.
    # NOTE(review): presumably EpsilonGreedy ignores biased_exploration_prob — confirm.
    # NOTE(review): results already cached under oiq.dill presumably need regenerating.
    params = {**grid_base_params,
              'action_selector': EpsilonGreedy,
              'initial_q': 0.8}
    return run_experiment(grid_env_provider,
                          grid_explore_trials,
                          grid_exploit_trials,
                          **params)


# Execute calculations
# Each result is fed to average_experiment_runs below, so it is presumably
# the list of per-repetition DataFrames — confirm against `repeat`.
grid_epsilon_greedy_dfs = grid_epsilon_greedy()
grid_action_delay_dfs = grid_action_delay()
grid_knowledge_array_dfs = grid_knowledge_array()
grid_oiq_dfs = grid_oiq()

# Plot results
# The figure is written to plot_dir and embedded by the markdown cell that follows.
plot(
    average_experiment_runs(grid_epsilon_greedy_dfs),
    average_experiment_runs(grid_action_delay_dfs),
    average_experiment_runs(grid_knowledge_array_dfs),
    average_experiment_runs(grid_oiq_dfs),
    env_name='Grid-20',
    num_explore_trials=grid_explore_trials,
    first_knowledge_trials=10,
    first_population_trials=30,
    population_ylim=(70, 105),
    text_box_loc={"x": 63, "y": 1000},
    plot_filename=f'{plot_dir}/grid-performance.png'
)

:::{figure-md} grid-fig
:class: full-width
<img src="plots/grid-performance.png">

Performance in Grid environment
:::

*TODO: add commentary on the Grid results shown in the figure above.*

## rMPX

In [30]:
from src.observation_wrappers import BinnedObservationWrapper

# Multiplexer size: selects the `real-multiplexer-{rmpx_bits}bit-v0` environment
# and sets the classifier length (rmpx_bits + 1).
rmpx_bits = 6  # available sizes: 3, 6
# Second argument passed to BinnedObservationWrapper
rmpx_discretization_bins = 6


def rmpx_metrics(agent, env):
    """Collect custom rMPX metrics for the agent's current population.

    Returns a dict holding ``reliable`` (the number of classifiers for which
    ``is_reliable()`` is true) merged with whatever ``population_metrics``
    reports.
    """
    population = agent.population
    reliable_count = sum(1 for cl in population if cl.is_reliable())
    collected = dict(reliable=reliable_count)
    collected.update(population_metrics(population, env))
    return collected


def rmpx_env_provider():
    """Create a new real-multiplexer environment wrapped in ``BinnedObservationWrapper``."""
    env_id = f'real-multiplexer-{rmpx_bits}bit-v0'
    raw_env = gym.make(env_id)
    return BinnedObservationWrapper(raw_env, rmpx_discretization_bins)


# Configuration keyword arguments shared by every rMPX experiment; each
# experiment function merges its own `action_selector` (and overrides) on top.
rmpx_base_params = {
    "classifier_length": rmpx_bits + 1,
    "number_of_possible_actions": 2,
    "epsilon": 0.8,
    "beta": 0.2,
    "gamma": 0.95,
    "initial_q": 0.5,
    "theta_exp": 50,
    "theta_ga": 50,
    "do_ga": True,
    "chi": 0.0,  # do not cross-over
    "mu": 0.03,
    "u_max": 4,
    "metrics_trial_frequency": 1,
    "user_metrics_collector_fcn": rmpx_metrics
}

# Start experiments
# Total trial count passed to run_alternating_experiment; phases alternate,
# so this is written as twice 10000.
rmpx_trials = 2 * 10000  # explore-exploit-explore-...


@get_from_cache_or_run(cache_path=f'{cache_dir}/rmpx/epsilon_greedy.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def rmpx_epsilon_greedy():
    """rMPX alternating experiment with the ``EpsilonGreedy`` action selector."""
    params = {**rmpx_base_params, 'action_selector': EpsilonGreedy}
    return run_alternating_experiment(rmpx_env_provider, rmpx_trials, **params)

@get_from_cache_or_run(cache_path=f'{cache_dir}/rmpx/action_delay.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def rmpx_action_delay():
    """rMPX alternating experiment with the ``ActionDelay`` selector and a 0.5 biased exploration probability."""
    params = {**rmpx_base_params,
              'action_selector': ActionDelay,
              'biased_exploration_prob': 0.5}
    return run_alternating_experiment(rmpx_env_provider, rmpx_trials, **params)

@get_from_cache_or_run(cache_path=f'{cache_dir}/rmpx/knowledge_array.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def rmpx_knowledge_array():
    """rMPX alternating experiment with the ``KnowledgeArray`` selector and a 0.5 biased exploration probability."""
    params = {**rmpx_base_params,
              'action_selector': KnowledgeArray,
              'biased_exploration_prob': 0.5}
    return run_alternating_experiment(rmpx_env_provider, rmpx_trials, **params)

@get_from_cache_or_run(cache_path=f'{cache_dir}/rmpx/oiq.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def rmpx_oiq():
    """rMPX alternating experiment for the "Optimistic Initial Quality" variant.

    Uses the ``EpsilonGreedy`` selector and raises ``initial_q`` from the
    base value of 0.5 to 0.8.
    """
    # The optimism belongs in `initial_q`. The previous override only set
    # `biased_exploration_prob`, leaving `initial_q` at the base value.
    # NOTE(review): presumably EpsilonGreedy ignores biased_exploration_prob — confirm.
    # NOTE(review): results already cached under oiq.dill presumably need regenerating.
    params = {**rmpx_base_params,
              'action_selector': EpsilonGreedy,
              'initial_q': 0.8}
    return run_alternating_experiment(rmpx_env_provider, rmpx_trials, **params)

def plot_rmpx(epsilon_greedy_df, action_delay_df, knowledge_array_df, op_initial_df, env_name, bins):
    """Draw the two-panel rMPX strategy comparison and save it to ``plot_dir``.

    Top panel: moving average of the reward, drawn separately for the explore
    phase (dashed, faint) and the exploit phase (solid). Bottom panel: moving
    average of the number of reliable classifiers.

    :param epsilon_greedy_df: averaged metrics of the Epsilon Greedy runs
    :param action_delay_df: averaged metrics of the Action Delay runs
    :param knowledge_array_df: averaged metrics of the Knowledge Array runs
    :param op_initial_df: averaged metrics of the Optimistic Initial Quality runs
    :param env_name: environment name shown in the figure title
    :param bins: number of discretization bins shown in the figure title
    """
    # (data, legend label, line color) per strategy, in legend order
    strategies = [
        (epsilon_greedy_df, 'Epsilon Greedy', COLORS['eg']),
        (action_delay_df, 'Action Delay', COLORS['ad']),
        (knowledge_array_df, 'Knowledge Array', COLORS['ka']),
        (op_initial_df, 'Optimistic Initial Quality', COLORS['oiq']),
    ]

    def plot_by_phase(df, window, label, color, ax):
        """Plot the reward moving average of each phase against its own 0-based position."""
        phase_styles = (('explore', {'ls': '--', 'alpha': 0.2}), ('exploit', {}))
        for phase, style in phase_styles:
            # A fresh positional index per phase, without mutating a filtered frame in place
            rewards = df.loc[df['phase'] == phase, 'reward'].reset_index(drop=True)
            rewards.rolling(window=window).mean().plot(label=label, color=color, ax=ax, **style)

    fig = plt.figure(figsize=(14, 10))
    try:
        # Plots layout
        gs = fig.add_gridspec(2, 1, hspace=.8)
        ax1 = fig.add_subplot(gs[0])
        ax2 = fig.add_subplot(gs[1])

        for ax in (ax1, ax2):
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)

        # Global title
        fig.suptitle(f'Performance of [{env_name}] environment discretized with {bins} bins', fontsize=24)

        # Each axis
        ma_window = 500  # moving average window

        # Average reward
        for df, label, color in strategies:
            plot_by_phase(df, ma_window, label, color, ax1)

        ax1.set_title('Average Reward')
        ax1.set_xlabel('Trial')
        ax1.set_ylabel('Reward')
        ax1.set_ylim(300, 1050)
        ax1.axhline(y=1000, color='black', linewidth=1, linestyle="--")

        # Population
        for df, label, color in strategies:
            df['reliable'].rolling(window=ma_window).mean().plot(label=label, c=color, ax=ax2)

        ax2.set_xlabel('Trial')
        ax2.set_ylabel('Classifiers count')
        ax2.set_title('Reliable Classifiers')
        ax2.xaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f'))
        ax2.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f'))

        # Create legend (from ax2, which holds exactly one line per strategy)
        handles, labels = ax2.get_legend_handles_labels()
        fig.legend(handles, labels, loc='lower center', ncol=4)

        # Save plot to file
        fig.savefig(f'{plot_dir}/rmpx-performance.png', dpi=PLOT_DPI)
    finally:
        # Interactive mode is off, so nothing else releases the figure;
        # close it explicitly to avoid accumulating open figures.
        plt.close(fig)

# Run calculations
# TODO: execute it 50 times (now only 5)
# NOTE(review): the decorators above already use NUM_EXPERIMENTS, so this TODO
# may be stale — confirm the cached results cover all repetitions.
rmpx_epsilon_greedy_dfs = rmpx_epsilon_greedy()  # author's timing note: 360 s per iteration
rmpx_action_delay_dfs = rmpx_action_delay()
rmpx_knowledge_array_dfs = rmpx_knowledge_array()
rmpx_oiq_dfs = rmpx_oiq()

# Plot results
# A throwaway environment is created only to read its registered id for the title.
plot_rmpx(
    average_experiment_runs(rmpx_epsilon_greedy_dfs),
    average_experiment_runs(rmpx_action_delay_dfs),
    average_experiment_runs(rmpx_knowledge_array_dfs),
    average_experiment_runs(rmpx_oiq_dfs),
    env_name=rmpx_env_provider().unwrapped.spec.id,
    bins=rmpx_discretization_bins
)

:::{figure-md} rmpx-fig
:class: full-width
<img src="plots/rmpx-performance.png">

Performance in rMPX environment
:::

## Cart Pole

In [29]:
# Bucket count per observation dimension, passed to CartPoleObservationWrapper.
# A dimension with a single bucket always discretizes to '0'.
cp_buckets = (1, 1, 6, 6)

class CartPoleObservationWrapper(gym.ObservationWrapper):
    """Discretize each observation dimension into a bucket index rendered as a string.

    Dimensions 0 and 2 take their bounds from the wrapped environment's
    observation space; dimensions 1 and 3 use the fixed bounds ±0.5 and ±3500.
    """
    # Reference: https://medium.com/@tuzzer/cart-pole-balancing-with-q-learning-b54c6068d947
    # Alternative bounds left by the author:
    # _high = [env.observation_space.high[0], 0.5, env.observation_space.high[2], math.radians(50)]
    # _low = [env.observation_space.low[0], -0.5, env.observation_space.low[2], -math.radians(50)]
    def __init__(self, env, buckets):
        """
        :param env: environment to wrap
        :param buckets: number of buckets for each observation dimension
        """
        super().__init__(env)
        self._high = [env.observation_space.high[0], 0.5, env.observation_space.high[2], 3500]
        self._low = [env.observation_space.low[0], -0.5, env.observation_space.low[2], -3500]
        self._buckets = buckets

    def observation(self, obs):
        """Map a raw observation to a list of bucket indices, each as a string.

        Values outside ``[low, high]`` are clipped to the first or last bucket.
        """
        dims = range(len(obs))
        # Relative position within [low, high]. Subtracting the lower bound is
        # correct for bounds of any sign (adding abs(low) only worked for low <= 0).
        ratios = [(obs[i] - self._low[i]) / (self._high[i] - self._low[i]) for i in dims]
        scaled = [int(round((self._buckets[i] - 1) * ratios[i])) for i in dims]
        clipped = [min(self._buckets[i] - 1, max(0, scaled[i])) for i in dims]
        return [str(o) for o in clipped]

def cp_env_provider():
    """Create a new ``CartPole-v0`` environment wrapped in ``CartPoleObservationWrapper``."""
    raw_env = gym.make('CartPole-v0')
    return CartPoleObservationWrapper(raw_env, cp_buckets)

def cp_metrics(agent, env):
    """Collect custom CartPole metrics: a new dict filled from ``population_metrics``."""
    collected = {}
    population = agent.population
    collected.update(population_metrics(population, env))
    return collected

# Configuration keyword arguments shared by every CartPole experiment; each
# experiment function merges its own `action_selector` (and overrides) on top.
cp_base_params = {
    "classifier_length": 4,
    "number_of_possible_actions": 2,
    "epsilon": 0.9,
    "beta": 0.01,
    "gamma": 0.995,
    "initial_q": 0.5,
    "theta_exp": 50,
    "theta_ga": 50,
    "do_ga": True,
    "chi": 0.0,
    "mu": 0.03,
    "metrics_trial_frequency": 2,  # TODO note this
    "user_metrics_collector_fcn": cp_metrics
}


# Trial counts handed to run_experiment for the explore and exploit phases
cp_explore_trials, cp_exploit_trials = 500, 500

# Repetition count for the CartPole experiments (used instead of NUM_EXPERIMENTS)
CP_EXPERIMENTS = 2

@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/epsilon_greedy.dill')
@repeat(num_times=CP_EXPERIMENTS)
def cp_epsilon_greedy():
    """CartPole experiment with the ``EpsilonGreedy`` action selector."""
    params = {**cp_base_params, 'action_selector': EpsilonGreedy}
    return run_experiment(cp_env_provider,
                          cp_explore_trials,
                          cp_exploit_trials,
                          **params)

@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/action_delay.dill')
@repeat(num_times=CP_EXPERIMENTS)
def cp_action_delay():
    """CartPole experiment with the ``ActionDelay`` selector and a 0.5 biased exploration probability."""
    params = {**cp_base_params,
              'action_selector': ActionDelay,
              'biased_exploration_prob': 0.5}
    return run_experiment(cp_env_provider,
                          cp_explore_trials,
                          cp_exploit_trials,
                          **params)


@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/knowledge_array.dill')
@repeat(num_times=CP_EXPERIMENTS)
def cp_knowledge_array():
    """CartPole experiment with the ``KnowledgeArray`` selector and a 0.5 biased exploration probability."""
    params = {**cp_base_params,
              'action_selector': KnowledgeArray,
              'biased_exploration_prob': 0.5}
    return run_experiment(cp_env_provider,
                          cp_explore_trials,
                          cp_exploit_trials,
                          **params)


@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/oiq.dill')
@repeat(num_times=CP_EXPERIMENTS)
def cp_oiq():
    """CartPole experiment for the "Optimistic Initial Quality" variant.

    Uses the ``EpsilonGreedy`` selector and raises ``initial_q`` from the
    base value of 0.5 to 0.8.
    """
    # The optimism belongs in `initial_q`. The previous override only set
    # `biased_exploration_prob`, leaving `initial_q` at the base value.
    # NOTE(review): presumably EpsilonGreedy ignores biased_exploration_prob — confirm.
    # NOTE(review): results already cached under oiq.dill presumably need regenerating.
    params = {**cp_base_params,
              'action_selector': EpsilonGreedy,
              'initial_q': 0.8}
    return run_experiment(cp_env_provider,
                          cp_explore_trials,
                          cp_exploit_trials,
                          **params)

def plot_cp(epsilon_greedy_df, action_delay_df, knowledge_array_df, op_initial_df, explore_trials, buckets):
    """Draw the two-panel CartPole strategy comparison and save it to ``plot_dir``.

    Top panel: moving average of steps per trial. Bottom panel: moving
    average of the number of reliable classifiers. A red dashed vertical
    line marks ``explore_trials`` in both panels.

    :param epsilon_greedy_df: averaged metrics of the Epsilon Greedy runs
    :param action_delay_df: averaged metrics of the Action Delay runs
    :param knowledge_array_df: averaged metrics of the Knowledge Array runs
    :param op_initial_df: averaged metrics of the Optimistic Initial Quality runs
    :param explore_trials: x position of the vertical marker
    :param buckets: discretization buckets shown in the figure title
    """
    # (data, legend label, line color) per strategy, in legend order
    strategies = [
        (epsilon_greedy_df, 'Epsilon Greedy', COLORS['eg']),
        (action_delay_df, 'Action Delay', COLORS['ad']),
        (knowledge_array_df, 'Knowledge Array', COLORS['ka']),
        (op_initial_df, 'Optimistic Initial Quality', COLORS['oiq']),
    ]

    # Each axis
    ma_window = 5  # moving average window

    def plot_rolling(column, ax):
        """Plot the moving average of ``column`` for every strategy on ``ax``."""
        for df, label, color in strategies:
            df[column].rolling(window=ma_window).mean().plot(label=label, c=color, ax=ax)

    fig = plt.figure(figsize=(14, 10))
    try:
        # Plots layout
        gs = fig.add_gridspec(2, 1, hspace=.4)
        ax1 = fig.add_subplot(gs[0])
        ax2 = fig.add_subplot(gs[1])

        # Global title
        fig.suptitle(f'Performance of CartPole environment discretized with {buckets} buckets', fontsize=24)

        # Steps in trial
        plot_rolling('steps_in_trial', ax1)

        ax1.axvline(x=explore_trials, color='red', linewidth=1, linestyle="--")
        ax1.axhline(y=195, color='black', linewidth=1, linestyle="--")

        ax1.set_xlabel('Trial')
        ax1.set_ylabel('Steps')
        ax1.set_title('Steps in each trial')
        ax1.set_ylim(0, 200)

        # Population
        plot_rolling('reliable', ax2)

        ax2.axvline(x=explore_trials, color='red', linewidth=1, linestyle="--")

        ax2.set_xlabel('Trial')
        ax2.set_ylabel('Classifiers')
        ax2.set_title('Reliable classifiers')

        # Create legend
        handles, labels = ax2.get_legend_handles_labels()
        fig.legend(handles, labels, loc='lower center', ncol=4)

        # Save plot to file
        fig.savefig(f'{plot_dir}/cartpole-performance.png', dpi=PLOT_DPI)
    finally:
        # Interactive mode is off, so nothing else releases the figure;
        # close it explicitly to avoid accumulating open figures.
        plt.close(fig)

# Run the calculations
# Each result is fed to average_experiment_runs below, so it is presumably
# the list of per-repetition DataFrames — confirm against `repeat`.
cp_epsilon_greedy_dfs = cp_epsilon_greedy()
cp_action_delay_dfs = cp_action_delay()
cp_knowledge_array_dfs = cp_knowledge_array()
cp_oiq_dfs = cp_oiq()

# Plot result
# plot_cp writes cartpole-performance.png into plot_dir.
plot_cp(
    average_experiment_runs(cp_epsilon_greedy_dfs),
    average_experiment_runs(cp_action_delay_dfs),
    average_experiment_runs(cp_knowledge_array_dfs),
    average_experiment_runs(cp_oiq_dfs),
    explore_trials=cp_explore_trials,
    buckets=cp_buckets)

:::{figure-md} cp-fig
:class: full-width
<img src="plots/cartpole-performance.png">

Performance in CartPole environment
:::

### Classifiers lookup

In [52]:
@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/epsilon_greedy_single_run.dill')
def cp_single_run():
    """Explore CartPole once with ``EpsilonGreedy`` and return the trained agent.

    Only the resulting population is of interest; the metrics returned by
    ``explore`` are discarded.
    """
    params = {**cp_base_params, 'action_selector': EpsilonGreedy}
    agent = ACS2(Configuration(**params))
    env = cp_env_provider()
    agent.explore(env, cp_explore_trials)
    return agent

# execute run
cp_agent = cp_single_run()

# List the reliable classifiers, highest fitness first (sorted on negated fitness)
reliable = [cl for cl in cp_agent.population if cl.is_reliable()]
for cl in sorted(reliable, key=lambda cl: -cl.fitness):
    # One tab-separated line per classifier: [condition action effect], mark, quality, reward, numerosity
    print(f'[{cl.condition} {cl.action} {cl.effect}]\t\tmark: {cl.mark}\tquality: {cl.q:.2f}\treward: {cl.r:.2f}\tnumerosity: {cl.num}')

[##23 0 ####]		mark: 00##	quality: 0.92	reward: 3.26	numerosity: 1
[##32 1 ####]		mark: 00##	quality: 0.91	reward: 3.20	numerosity: 1
[##22 1 ####]		mark: 00##	quality: 0.97	reward: 2.64	numerosity: 2
[##33 0 ####]		mark: 00##	quality: 0.96	reward: 2.36	numerosity: 1
[##12 0 ####]		mark: 00##	quality: 0.96	reward: 1.35	numerosity: 2
[##43 1 ####]		mark: 00##	quality: 0.98	reward: 1.31	numerosity: 1
[##12 1 ####]		mark: empty	quality: 1.00	reward: 1.27	numerosity: 18
[##43 0 ####]		mark: empty	quality: 0.99	reward: 1.26	numerosity: 20


---

**Software packages used**

In [53]:
import session_info
session_info.show()