In [6]:
import sys

# Put the custom scripts (such as the `src` package imported below) on the
# module path. The guard keeps the cell idempotent: re-running it no longer
# stacks duplicate entries on sys.path.
if '../../..' not in sys.path:
    sys.path.append('../../..')

In [7]:
from lcs.agents.acs2 import Configuration, ACS2
from src.decorators import repeat, get_from_cache_or_run
from src.metrics import parse_experiments_results, corridor_transition_knowledge, grid_transition_knowledge
from src.visualization import biased_exploration_colors, PLOT_DPI
from lcs.metrics import population_metrics
from lcs.strategies.action_selection import EpsilonGreedy, ActionDelay, KnowledgeArray

from typing import List

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd
import gym
import pathlib

# Value handed to the `repeat` decorator of every experiment function
NUM_EXPERIMENTS = 50
# Colour lookup used by `plot` (keys 'eg', 'ad', 'ka' and 'oiq' are read there)
COLORS = biased_exploration_colors()

# turn off interactive plotting - `plot` writes its figure to a file instead
plt.ioff()

# Cache files and rendered plots are kept below the current working directory
_working_dir = pathlib.Path().absolute()
cache_dir = f'{_working_dir}/cache'
plot_dir = f'{_working_dir}/plots'


def run_experiment(env_provider, explore_trials, exploit_trials, **conf):
    """Train an ACS2 agent and then evaluate what it has learned.

    A new environment is created and reset, one agent explores it for
    ``explore_trials`` trials, and a second agent seeded with the explorer's
    classifier population is then run in exploit mode for ``exploit_trials``
    trials.

    Args:
        env_provider: zero-argument callable returning a new environment.
        explore_trials: number of trials passed to ``explore``.
        exploit_trials: number of trials passed to ``exploit``.
        **conf: keyword arguments forwarded unchanged to ``Configuration``.

    Returns:
        The object built by ``parse_experiments_results`` from both metric
        collections (callers in this notebook treat it as a DataFrame).
    """
    env = env_provider()
    env.reset()

    cfg = Configuration(**conf)

    explorer = ACS2(cfg)
    metrics_explore = explorer.explore(env, explore_trials)

    # Exploit with the dedicated agent built from the learned population.
    # Previously this agent was created but never used, and the explorer
    # itself was asked to exploit.
    exploiter = ACS2(cfg, explorer.population)
    metrics_exploit = exploiter.exploit(env, exploit_trials)

    # Parse results into DataFrame
    return parse_experiments_results(metrics_explore, metrics_exploit, cfg.metrics_trial_frequency)


def average_experiment_runs(runs_dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """Collapse several experiment runs into one frame of per-trial means.

    All frames are stacked, rows are grouped by the ``trial``/``phase`` pair
    and the remaining columns are averaged. ``phase`` is then moved out of the
    index back into a regular column, leaving ``trial`` as the index.
    """
    stacked_runs = pd.concat(runs_dfs)
    mean_per_trial = stacked_runs.groupby(['trial', 'phase']).mean()
    return mean_per_trial.reset_index(level='phase')


def plot(epsilon_greedy_df, action_delay_df, knowledge_array_df, op_initial_df,
         env_name,
         num_explore_trials,
         first_knowledge_trials,
         first_population_trials,
         population_ylim,
         text_box_loc,
         plot_filename=None):
    """Draw the three-panel comparison of the four exploration strategies.

    The figure has a knowledge panel (top left), a classifier-population panel
    (top right) and a full-width steps-in-trial panel (bottom) that shows a
    moving average. One shared legend is placed at the bottom of the figure.

    Args:
        epsilon_greedy_df, action_delay_df, knowledge_array_df, op_initial_df:
            one frame per strategy; the ``knowledge``, ``population`` and
            ``steps_in_trial`` columns are read.
        env_name: environment name inserted into the figure title.
        num_explore_trials: x position of the dashed vertical line on the
            steps panel.
        first_knowledge_trials: number of leading rows drawn on the knowledge
            panel.
        first_population_trials: number of leading rows drawn on the
            population panel.
        population_ylim: value passed to ``set_ylim`` of the population panel.
        text_box_loc: mapping unpacked into ``ax.text`` to position the
            moving-average note (callers in this notebook pass ``x`` and ``y``).
        plot_filename: optional target of ``fig.savefig``; when it is a path,
            missing parent directories are created first.
    """
    # (frame, legend label, key into COLORS) for each strategy, in drawing order
    strategies = (
        (epsilon_greedy_df, 'Epsilon Greedy', 'eg'),
        (action_delay_df, 'Action Delay', 'ad'),
        (knowledge_array_df, 'Knowledge Array', 'ka'),
        (op_initial_df, 'Optimistic Initial Quality', 'oiq'),
    )

    fig = plt.figure(figsize=(14, 8))

    # Layout: two panels in the top row, one full-width panel below
    gs = fig.add_gridspec(2, 2, wspace=.25, hspace=.4)
    ax1 = fig.add_subplot(gs[0, 0])
    ax2 = fig.add_subplot(gs[0, 1])
    ax3 = fig.add_subplot(gs[1, :])

    # Global title
    fig.suptitle(f'Performance of [{env_name}] environment', fontsize=24)

    # Knowledge
    for frame, label, color_key in strategies:
        frame['knowledge'][:first_knowledge_trials].plot(label=label, c=COLORS[color_key], ax=ax1)

    ax1.set_title('Knowledge')
    ax1.set_xlabel('Trial')
    ax1.set_ylabel('Knowledge')
    ax1.axhline(y=100, color='black', linewidth=1, linestyle="--")
    ax1.yaxis.set_major_formatter(mtick.PercentFormatter())

    # Population
    for frame, label, color_key in strategies:
        frame['population'][:first_population_trials].plot(label=label, c=COLORS[color_key], ax=ax2)

    ax2.set_xlabel('Trial')
    ax2.set_ylabel('Classifiers')
    ax2.set_title('Classifiers Population')
    ax2.set_ylim(population_ylim)
    ax2.xaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f'))
    ax2.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f'))

    # Steps in trial
    window = 3  # window for moving average
    for frame, label, color_key in strategies:
        frame['steps_in_trial'].rolling(window=window).mean().plot(label=label, c=COLORS[color_key], ax=ax3)

    ax3.set_xlabel('Trial')
    ax3.set_ylabel('Steps')
    ax3.set_title('Steps in trial')
    ax3.axvline(x=num_explore_trials, color='black', linewidth=1, linestyle="--")
    ax3.text(**text_box_loc, s=f'Moving average of {window} samples', style='italic',
             bbox={'facecolor': 'red', 'alpha': 0.2, 'pad': 10})

    # Every panel drops its top and right border
    for ax in (ax1, ax2, ax3):
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

    # Create legend (all panels share the same labels, so one axis is enough)
    handles, labels = ax3.get_legend_handles_labels()
    fig.legend(handles, labels, loc='lower center', ncol=4)

    if plot_filename:
        # savefig does not create missing directories, so a fresh checkout
        # without the plots folder would fail here. File-like targets are
        # passed through untouched.
        if isinstance(plot_filename, (str, pathlib.PurePath)):
            pathlib.Path(plot_filename).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(plot_filename, dpi=PLOT_DPI)

# Experiment 1 - Multi-step problems performance

## Setup

### Corridor

In [8]:
import gym_corridor  # noqa: F401
from src.observation_wrappers import CorridorObservationWrapper

# Function for calculating relevant metrics
def corridor_metrics(agent, env):
    """Collect the metrics recorded for a Corridor agent.

    The result holds a ``knowledge`` entry computed by
    ``corridor_transition_knowledge`` plus every entry returned by
    ``population_metrics`` for the agent's population.
    """
    classifiers = agent.population
    collected = dict(knowledge=corridor_transition_knowledge(classifiers, env))
    collected.update(population_metrics(classifiers, env))
    return collected


def corridor_env_provider():
    """Create a new ``corridor-20-v0`` environment wrapped in ``CorridorObservationWrapper``."""
    # Plain string literal: the former f-string had no placeholders.
    return CorridorObservationWrapper(gym.make('corridor-20-v0'))


# Keyword arguments that every Corridor experiment hands to `Configuration`
# (through `run_experiment`); each experiment merges its own `action_selector`
# settings on top of these.
corridor_base_params = dict(
    classifier_length=1,
    number_of_possible_actions=2,
    epsilon=0.8,
    beta=0.2,
    gamma=0.95,
    initial_q=0.5,
    theta_exp=50,
    theta_ga=50,
    do_ga=True,
    mu=0.03,
    u_max=1,
    metrics_trial_frequency=1,
    user_metrics_collector_fcn=corridor_metrics,
)

# Number of explore trials and of the exploit trials that follow them
corridor_explore_trials = 60
corridor_exploit_trials = 20

@get_from_cache_or_run(cache_path=f'{cache_dir}/corridor/epsilon_greedy.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def corridor_epsilon_greedy():
    """Corridor experiment driven by the ``EpsilonGreedy`` action selector.

    The shared Corridor parameters are used as they are, with only
    ``action_selector`` added on top.
    """
    conf = {**corridor_base_params, 'action_selector': EpsilonGreedy}
    return run_experiment(env_provider=corridor_env_provider,
                          explore_trials=corridor_explore_trials,
                          exploit_trials=corridor_exploit_trials,
                          **conf)


@get_from_cache_or_run(cache_path=f'{cache_dir}/corridor/action_delay.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def corridor_action_delay():
    """Corridor experiment driven by the ``ActionDelay`` action selector.

    Extends the shared Corridor parameters with the selector and a
    ``biased_exploration_prob`` of 0.5.
    """
    overrides = {
        'action_selector': ActionDelay,
        'biased_exploration_prob': 0.5,
    }
    conf = {**corridor_base_params, **overrides}
    return run_experiment(env_provider=corridor_env_provider,
                          explore_trials=corridor_explore_trials,
                          exploit_trials=corridor_exploit_trials,
                          **conf)


@get_from_cache_or_run(cache_path=f'{cache_dir}/corridor/knowledge_array.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def corridor_knowledge_array():
    """Corridor experiment driven by the ``KnowledgeArray`` action selector.

    Extends the shared Corridor parameters with the selector and a
    ``biased_exploration_prob`` of 0.5.
    """
    overrides = {
        'action_selector': KnowledgeArray,
        'biased_exploration_prob': 0.5,
    }
    conf = {**corridor_base_params, **overrides}
    return run_experiment(env_provider=corridor_env_provider,
                          explore_trials=corridor_explore_trials,
                          exploit_trials=corridor_exploit_trials,
                          **conf)


@get_from_cache_or_run(cache_path=f'{cache_dir}/corridor/oiq.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def corridor_oiq():
    """Corridor experiment plotted as "Optimistic Initial Quality".

    Extends the shared Corridor parameters with the ``EpsilonGreedy`` selector
    and a ``biased_exploration_prob`` of 0.8.

    NOTE(review): ``initial_q`` keeps its base value here, so apart from
    ``biased_exploration_prob`` this setup equals the epsilon-greedy one.
    Confirm that this is the intended optimistic-initial-quality configuration.
    """
    overrides = {
        'action_selector': EpsilonGreedy,
        'biased_exploration_prob': 0.8,
    }
    conf = {**corridor_base_params, **overrides}
    return run_experiment(env_provider=corridor_env_provider,
                          explore_trials=corridor_explore_trials,
                          exploit_trials=corridor_exploit_trials,
                          **conf)

### Grid

In [9]:
import gym_grid  # noqa: F401

# Function for calculating relevant metrics
def grid_metrics(agent, env):
    """Collect the metrics recorded for a Grid agent.

    The result holds a ``knowledge`` entry computed by
    ``grid_transition_knowledge`` plus every entry returned by
    ``population_metrics`` for the agent's population.
    """
    classifiers = agent.population
    collected = dict(knowledge=grid_transition_knowledge(classifiers, env))
    collected.update(population_metrics(classifiers, env))
    return collected


def grid_env_provider():
    """Create a new ``grid-20-v0`` environment."""
    # Plain string literal: the former f-string had no placeholders.
    return gym.make('grid-20-v0')


# Keyword arguments that every Grid experiment hands to `Configuration`
# (through `run_experiment`); each experiment merges its own `action_selector`
# settings on top of these.
grid_base_params = dict(
    classifier_length=2,
    number_of_possible_actions=4,
    epsilon=0.8,
    beta=0.2,
    gamma=0.95,
    initial_q=0.5,
    theta_exp=50,
    theta_ga=50,
    do_ga=True,
    mu=0.03,
    u_max=1,
    metrics_trial_frequency=1,
    user_metrics_collector_fcn=grid_metrics,
)

# Number of explore trials and of the exploit trials that follow them
grid_explore_trials = 60
grid_exploit_trials = 20


@get_from_cache_or_run(cache_path=f'{cache_dir}/grid/epsilon_greedy.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def grid_epsilon_greedy():
    """Grid experiment driven by the ``EpsilonGreedy`` action selector.

    The shared Grid parameters are used as they are, with only
    ``action_selector`` added on top.
    """
    conf = {**grid_base_params, 'action_selector': EpsilonGreedy}
    return run_experiment(env_provider=grid_env_provider,
                          explore_trials=grid_explore_trials,
                          exploit_trials=grid_exploit_trials,
                          **conf)


@get_from_cache_or_run(cache_path=f'{cache_dir}/grid/action_delay.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def grid_action_delay():
    """Grid experiment driven by the ``ActionDelay`` action selector.

    Extends the shared Grid parameters with the selector and a
    ``biased_exploration_prob`` of 0.5.
    """
    overrides = {
        'action_selector': ActionDelay,
        'biased_exploration_prob': 0.5,
    }
    conf = {**grid_base_params, **overrides}
    return run_experiment(env_provider=grid_env_provider,
                          explore_trials=grid_explore_trials,
                          exploit_trials=grid_exploit_trials,
                          **conf)


@get_from_cache_or_run(cache_path=f'{cache_dir}/grid/knowledge_array.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def grid_knowledge_array():
    """Grid experiment driven by the ``KnowledgeArray`` action selector.

    Extends the shared Grid parameters with the selector and a
    ``biased_exploration_prob`` of 0.5.
    """
    overrides = {
        'action_selector': KnowledgeArray,
        'biased_exploration_prob': 0.5,
    }
    conf = {**grid_base_params, **overrides}
    return run_experiment(env_provider=grid_env_provider,
                          explore_trials=grid_explore_trials,
                          exploit_trials=grid_exploit_trials,
                          **conf)


@get_from_cache_or_run(cache_path=f'{cache_dir}/grid/oiq.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def grid_oiq():
    """Grid experiment plotted as "Optimistic Initial Quality".

    Extends the shared Grid parameters with the ``EpsilonGreedy`` selector and
    a ``biased_exploration_prob`` of 0.8.

    NOTE(review): ``initial_q`` keeps its base value here, so apart from
    ``biased_exploration_prob`` this setup equals the epsilon-greedy one.
    Confirm that this is the intended optimistic-initial-quality configuration.
    """
    overrides = {
        'action_selector': EpsilonGreedy,
        'biased_exploration_prob': 0.8,
    }
    conf = {**grid_base_params, **overrides}
    return run_experiment(env_provider=grid_env_provider,
                          explore_trials=grid_explore_trials,
                          exploit_trials=grid_exploit_trials,
                          **conf)


## Results

In [10]:
# Obtain the per-run result frames of every strategy
corridor_epsilon_greedy_dfs = corridor_epsilon_greedy()
corridor_action_delay_dfs = corridor_action_delay()
corridor_knowledge_array_dfs = corridor_knowledge_array()
corridor_oiq_dfs = corridor_oiq()

# Average the runs of each strategy (in the order `plot` expects) and draw the figure
plot(
    *map(average_experiment_runs, (
        corridor_epsilon_greedy_dfs,
        corridor_action_delay_dfs,
        corridor_knowledge_array_dfs,
        corridor_oiq_dfs,
    )),
    env_name='Corridor-20',
    num_explore_trials=corridor_explore_trials,
    first_knowledge_trials=30,
    first_population_trials=20,
    population_ylim=(17, 40),
    text_box_loc=dict(x=63, y=120),
    plot_filename=f'{plot_dir}/corridor-performance.png'
)

:::{figure-md} corridor-fig
:class: full-width
<img src="plots/corridor-performance.png">

Performance in Corridor environment
:::

In [11]:
# Obtain the per-run result frames of every strategy
grid_epsilon_greedy_dfs = grid_epsilon_greedy()
grid_action_delay_dfs = grid_action_delay()
grid_knowledge_array_dfs = grid_knowledge_array()
grid_oiq_dfs = grid_oiq()

# Average the runs of each strategy (in the order `plot` expects) and draw the figure
plot(
    *map(average_experiment_runs, (
        grid_epsilon_greedy_dfs,
        grid_action_delay_dfs,
        grid_knowledge_array_dfs,
        grid_oiq_dfs,
    )),
    env_name='Grid-20',
    num_explore_trials=grid_explore_trials,
    first_knowledge_trials=10,
    first_population_trials=30,
    population_ylim=(70, 105),
    text_box_loc=dict(x=63, y=1000),
    plot_filename=f'{plot_dir}/grid-performance.png'
)

:::{figure-md} grid-fig
:class: full-width
<img src="plots/grid-performance.png">

Performance in Grid environment
:::

---

**Software packages used**

In [12]:
# Report the software packages used in this session (see the heading above)
import session_info
session_info.show()