In [None]:
import sys

# Add the directory three levels above the working directory to the module
# search path so that project-local scripts (presumably the `src` package
# imported in the next cell) can be resolved.
sys.path.append('../../..')

In [2]:
from lcs.agents.acs2 import Configuration, ACS2
from src.decorators import repeat, get_from_cache_or_run
from src.visualization import biased_exploration_colors, PLOT_DPI
from src.metrics import parse_experiments_results
from lcs.metrics import population_metrics
from lcs.strategies.action_selection import EpsilonGreedy, ActionDelay, KnowledgeArray

from typing import List

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd
import gym
import pathlib

# NOTE(review): not referenced by the CartPole experiments visible in this
# notebook, which use CP_EXPERIMENTS instead - confirm whether it is still needed.
NUM_EXPERIMENTS = 50
# Color lookup used by plot_cp; indexed there with the keys 'eg', 'ad', 'ka' and 'oiq'.
COLORS = biased_exploration_colors()

plt.ioff()  # turn off interactive plotting

# Cache and plot locations, both resolved as absolute paths under the
# current working directory.
cache_dir = f'{pathlib.Path().absolute()}/cache'
plot_dir = f'{pathlib.Path().absolute()}/plots'


def run_experiment(env_provider, explore_trials, exploit_trials, **conf):
    """Run a single explore-then-exploit ACS2 experiment and return its metrics.

    An ACS2 agent configured with ``conf`` explores a freshly created
    environment for ``explore_trials`` trials. A second agent is then built
    on top of the population learned by the first one and is run in exploit
    mode for ``exploit_trials`` trials.

    Parameters
    ----------
    env_provider
        Zero-argument callable returning the environment to run on.
    explore_trials
        Number of trials passed to the exploration phase.
    exploit_trials
        Number of trials passed to the exploitation phase.
    **conf
        Keyword arguments forwarded unchanged to ``Configuration``.

    Returns
    -------
    The result of ``parse_experiments_results`` for both phases (a DataFrame
    according to that helper's usage in this notebook).
    """
    env = env_provider()
    env.reset()

    cfg = Configuration(**conf)

    # Phase 1 - exploration
    explorer = ACS2(cfg)
    metrics_explore = explorer.explore(env, explore_trials)

    # Phase 2 - exploitation, performed by a dedicated agent that starts from
    # the explorer's population. The exploit call previously went through
    # `explorer`, which left this agent constructed but unused.
    exploiter = ACS2(cfg, explorer.population)
    metrics_exploit = exploiter.exploit(env, exploit_trials)

    # Parse results into DataFrame
    metrics_df = parse_experiments_results(metrics_explore, metrics_exploit, cfg.metrics_trial_frequency)

    return metrics_df

def average_experiment_runs(runs_dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """Average the metrics of several experiment runs.

    All frames are stacked, grouped by ``trial`` and ``phase`` and reduced
    with the mean. ``phase`` is then moved back from the index into a regular
    column, leaving ``trial`` as the index of the returned frame.
    """
    stacked_runs = pd.concat(runs_dfs)
    mean_per_trial_and_phase = stacked_runs.groupby(['trial', 'phase']).mean()
    return mean_per_trial_and_phase.reset_index(level='phase')

# Experiment 3 - Balancing the pole

## Setup

In [4]:
# Number of discretization buckets per observation component, handed to
# CartPoleObservationWrapper by cp_env_provider. A component with a single
# bucket is always mapped to '0' by the wrapper.
cp_buckets = (1, 1, 6, 6)

class CartPoleObservationWrapper(gym.ObservationWrapper):
    """Observation wrapper that discretizes each observation component into a bucket index.

    Every component is rescaled using fixed per-component bounds, mapped onto
    ``0 .. buckets[i] - 1`` and returned as a string.

    Bounds for components 0 and 2 come from the wrapped environment's
    observation space; components 1 and 3 use the hard-coded ranges
    ``[-0.5, 0.5]`` and ``[-3500, 3500]``.
    """
    # https://medium.com/@tuzzer/cart-pole-balancing-with-q-learning-b54c6068d947
    # _high = [env.observation_space.high[0], 0.5, env.observation_space.high[2], math.radians(50)]
    # _low = [env.observation_space.low[0], -0.5, env.observation_space.low[2], -math.radians(50)]
    def __init__(self, env, buckets):
        """Store the per-component bounds and the bucket counts."""
        super().__init__(env)
        space = env.observation_space
        self._high = [space.high[0], 0.5, space.high[2], 3500]
        self._low = [space.low[0], -0.5, space.low[2], -3500]
        self._buckets = buckets

    def observation(self, obs):
        """Return the bucket index of every component of ``obs`` as a list of strings."""
        discretized = []
        for idx, value in enumerate(obs):
            lower, upper = self._low[idx], self._high[idx]
            last_bucket = self._buckets[idx] - 1

            # Position of the value inside its range (lower bounds are shifted by their magnitude)
            ratio = (value + abs(lower)) / (upper - lower)

            # Nearest bucket, clamped so out-of-range values land in the edge buckets
            bucket = int(round(last_bucket * ratio))
            bucket = min(last_bucket, max(0, bucket))

            discretized.append(str(bucket))
        return discretized

def cp_env_provider():
    """Create a new 'CartPole-v0' environment wrapped with the ``cp_buckets`` discretization."""
    raw_env = gym.make('CartPole-v0')
    return CartPoleObservationWrapper(raw_env, cp_buckets)

def cp_metrics(agent, env):
    """Collect the metrics for one measurement point.

    Returns a new dict filled with whatever ``population_metrics`` reports
    for the agent's current population and the given environment.
    """
    return dict(population_metrics(agent.population, env))

# Base keyword arguments shared by all CartPole experiments. Each experiment
# merges its own overrides (e.g. the action selector) into this dict and the
# result is forwarded to `Configuration`.
cp_base_params = {
    "classifier_length": 4,  # same length as cp_buckets / the wrapper's bounds lists
    "number_of_possible_actions": 2,
    "epsilon": 0.9,
    "beta": 0.01,
    "gamma": 0.995,
    "initial_q": 0.5,
    "theta_exp": 50,
    "theta_ga": 50,
    "do_ga": True,
    "chi": 0.0,
    "mu": 0.03,
    # TODO note this. The value is also handed to parse_experiments_results in
    # run_experiment; presumably metrics are sampled every 2nd trial - confirm.
    "metrics_trial_frequency": 2,
    "user_metrics_collector_fcn": cp_metrics
}


# Number of trials in the explore and exploit phases; the explore count is
# also used as the position of the phase separator line in the plot.
cp_explore_trials, cp_exploit_trials = 500, 500

# Passed as `num_times` to the `repeat` decorator of every CartPole experiment.
CP_EXPERIMENTS = 2

@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/epsilon_greedy.dill')
@repeat(num_times=CP_EXPERIMENTS)
def cp_epsilon_greedy():
    """CartPole experiment using the base parameters with the ``EpsilonGreedy`` action selector."""
    params = {**cp_base_params, 'action_selector': EpsilonGreedy}
    return run_experiment(cp_env_provider, cp_explore_trials, cp_exploit_trials, **params)

@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/action_delay.dill')
@repeat(num_times=CP_EXPERIMENTS)
def cp_action_delay():
    """CartPole experiment using the ``ActionDelay`` action selector with ``biased_exploration_prob`` of 0.5."""
    params = {
        **cp_base_params,
        'action_selector': ActionDelay,
        'biased_exploration_prob': 0.5,
    }
    return run_experiment(cp_env_provider, cp_explore_trials, cp_exploit_trials, **params)


@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/knowledge_array.dill')
@repeat(num_times=CP_EXPERIMENTS)
def cp_knowledge_array():
    """CartPole experiment using the ``KnowledgeArray`` action selector with ``biased_exploration_prob`` of 0.5."""
    params = {
        **cp_base_params,
        'action_selector': KnowledgeArray,
        'biased_exploration_prob': 0.5,
    }
    return run_experiment(cp_env_provider, cp_explore_trials, cp_exploit_trials, **params)


@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/oiq.dill')
@repeat(num_times=CP_EXPERIMENTS)
def cp_oiq():
    """CartPole experiment plotted as "Optimistic Initial Quality".

    Uses the ``EpsilonGreedy`` action selector with ``biased_exploration_prob``
    of 0.8.

    NOTE(review): apart from ``biased_exploration_prob`` this is the same
    configuration as ``cp_epsilon_greedy``; ``initial_q`` keeps the base value
    of 0.5 - confirm this is the intended setup for this strategy.
    """
    params = {
        **cp_base_params,
        'action_selector': EpsilonGreedy,
        'biased_exploration_prob': 0.8,
    }
    return run_experiment(cp_env_provider, cp_explore_trials, cp_exploit_trials, **params)

def plot_cp(epsilon_greedy_df, action_delay_df, knowledge_array_df, op_initial_df, explore_trials, buckets):
    """Plot the CartPole comparison of the four strategies and save it as a PNG.

    Two stacked panels are drawn, each smoothed with a moving average of
    window 5: the ``steps_in_trial`` column (top) and the ``reliable`` column
    (bottom) of every passed frame. The figure is written to
    ``{plot_dir}/cartpole-performance.png`` (the directory is created when
    missing) and is closed afterwards, so nothing is returned.

    Parameters
    ----------
    epsilon_greedy_df, action_delay_df, knowledge_array_df, op_initial_df
        Frames providing the ``steps_in_trial`` and ``reliable`` columns; the
        notebook passes the output of ``average_experiment_runs``.
    explore_trials
        X position of the dashed red vertical line drawn on both panels.
    buckets
        Value interpolated into the figure title.
    """
    ma_window = 5  # moving average window

    # (frame, legend label, key into COLORS) for every strategy, in drawing order
    strategies = [
        (epsilon_greedy_df, 'Epsilon Greedy', 'eg'),
        (action_delay_df, 'Action Delay', 'ad'),
        (knowledge_array_df, 'Knowledge Array', 'ka'),
        (op_initial_df, 'Optimistic Initial Quality', 'oiq'),
    ]

    def plot_smoothed(column, ax):
        # Draw the moving average of `column` for every strategy on `ax`
        for df, label, color_key in strategies:
            df[column].rolling(window=ma_window).mean().plot(label=label, c=COLORS[color_key], ax=ax)

    fig = plt.figure(figsize=(14, 10))
    try:
        # Plots layout
        gs = fig.add_gridspec(2, 1, hspace=.4)
        ax1 = fig.add_subplot(gs[0])
        ax2 = fig.add_subplot(gs[1])

        # Global title
        fig.suptitle(f'Performance of CartPole environment discretized with {buckets} buckets', fontsize=24)

        # Steps in trial
        plot_smoothed('steps_in_trial', ax1)

        ax1.axvline(x=explore_trials, color='red', linewidth=1, linestyle="--")
        # Reference level at 195 steps (presumably the CartPole-v0 "solved" threshold - confirm)
        ax1.axhline(y=195, color='black', linewidth=1, linestyle="--")

        ax1.set_xlabel('Trial')
        ax1.set_ylabel('Steps')
        ax1.set_title('Steps in each trial')
        ax1.set_ylim(0, 200)

        # Population
        plot_smoothed('reliable', ax2)

        ax2.axvline(x=explore_trials, color='red', linewidth=1, linestyle="--")

        ax2.set_xlabel('Trial')
        ax2.set_ylabel('Classifiers')
        ax2.set_title('Reliable classifiers')

        # Create legend (both panels share the same entries, so one axis is enough)
        handles, labels = ax2.get_legend_handles_labels()
        fig.legend(handles, labels, loc='lower center', ncol=4)

        # Save plot to file; make sure the target directory exists first
        pathlib.Path(plot_dir).mkdir(parents=True, exist_ok=True)
        fig.savefig(f'{plot_dir}/cartpole-performance.png', dpi=PLOT_DPI)
    finally:
        # Interactive mode is off, so the figure would otherwise stay registered
        # with pyplot (and in memory) for the lifetime of the kernel.
        plt.close(fig)

## Results

In [None]:
# Run the calculations
cp_epsilon_greedy_dfs = cp_epsilon_greedy()
cp_action_delay_dfs = cp_action_delay()
cp_knowledge_array_dfs = cp_knowledge_array()
cp_oiq_dfs = cp_oiq()

# Plot visualization
plot_cp(
    average_experiment_runs(cp_epsilon_greedy_dfs),
    average_experiment_runs(cp_action_delay_dfs),
    average_experiment_runs(cp_knowledge_array_dfs),
    average_experiment_runs(cp_oiq_dfs),
    explore_trials=cp_explore_trials,
    buckets=cp_buckets)

:::{figure-md} cp-fig
:class: full-width
<img src="plots/cartpole-performance.png">

Performance in CartPole environment
:::

### Classifiers lookup

In [5]:
@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/epsilon_greedy_single_run.dill')
def cp_single_run():
    """Run one exploration phase with the ``EpsilonGreedy`` selector and return the agent.

    The agent itself is returned (rather than metrics) because only its
    resulting classifier population is inspected afterwards.
    """
    params = {**cp_base_params, 'action_selector': EpsilonGreedy}
    agent = ACS2(Configuration(**params))

    env = cp_env_provider()
    agent.explore(env, cp_explore_trials)

    return agent  # only interested in resulting population

# execute run
cp_agent = cp_single_run()

# Keep only the classifiers reporting themselves as reliable and list them
# from the highest to the lowest fitness (sorting on the negated fitness).
reliable = [cl for cl in cp_agent.population if cl.is_reliable()]
for cl in sorted(reliable, key=lambda cl: -cl.fitness):
    # One line per classifier: [condition action effect], mark, quality, reward, numerosity
    print(f'[{cl.condition} {cl.action} {cl.effect}]\t\tmark: {cl.mark}\tquality: {cl.q:.2f}\treward: {cl.r:.2f}\tnumerosity: {cl.num}')

[##23 0 ####]		mark: 00##	quality: 0.92	reward: 3.26	numerosity: 1
[##32 1 ####]		mark: 00##	quality: 0.91	reward: 3.20	numerosity: 1
[##22 1 ####]		mark: 00##	quality: 0.97	reward: 2.64	numerosity: 2
[##33 0 ####]		mark: 00##	quality: 0.96	reward: 2.36	numerosity: 1
[##12 0 ####]		mark: 00##	quality: 0.96	reward: 1.35	numerosity: 2
[##43 1 ####]		mark: 00##	quality: 0.98	reward: 1.31	numerosity: 1
[##12 1 ####]		mark: empty	quality: 1.00	reward: 1.27	numerosity: 18
[##43 0 ####]		mark: empty	quality: 0.99	reward: 1.26	numerosity: 20


## Statistical verification

```{admonition} Hypothesis testing
:class: tip
Here the best idea, in my opinion, would be to evaluate different combinations of discretization buckets. However, this requires research and experiments of its own. The suggestion is to propose 4-5 reasonable configurations and examine the distributions of the number of steps and the classifier count in each situation.
```

## Observations
...

---

**Software packages used**

In [1]:
# Display the software packages used in this session for reproducibility.
import session_info
session_info.show()