In [7]:
import pathlib
from typing import List, Tuple, Dict

import gym
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import HTML
from lcs.agents.acs2 import Configuration, ACS2
from lcs.metrics import population_metrics
from lcs.strategies.action_selection import EpsilonGreedy, ActionDelay, KnowledgeArray
from myst_nb import glue
from tabulate import tabulate

from src.bayes_estimation import bayes_estimate
from src.decorators import repeat, get_from_cache_or_run
from src.metrics import parse_experiments_results
from src.visualization import biased_exploration_colors, PLOT_DPI

# Colour lookup used by the plots below; indexed by strategy key ('eg', 'ad', 'ka', 'oiq').
COLORS = biased_exploration_colors()

plt.ioff()  # turn off interactive plotting

# Output locations are resolved against the current working directory.
cache_dir = f'{pathlib.Path().absolute()}/cache'
plot_dir = f'{pathlib.Path().absolute()}/plots'


def run_experiment(env_provider, explore_trials, exploit_trials, **conf):
    """Run one explore-then-exploit ACS2 experiment and return its metrics.

    Args:
        env_provider: zero-argument callable returning a fresh environment.
        explore_trials: number of trials executed in the explore phase.
        exploit_trials: number of trials executed in the exploit phase.
        **conf: keyword arguments forwarded verbatim to ``Configuration``.

    Returns:
        Whatever ``parse_experiments_results`` builds from the explore and
        exploit metrics (consumed downstream as a pandas DataFrame).
    """
    env = env_provider()
    env.reset()

    cfg = Configuration(**conf)

    # Phase 1: build the classifier population while exploring.
    explorer = ACS2(cfg)
    metrics_explore = explorer.explore(env, explore_trials)

    # Phase 2: a separate agent seeded with the learned population exploits it.
    # The original code created this agent but then called exploit() on the
    # explorer, leaving `exploiter` unused.
    exploiter = ACS2(cfg, explorer.population)
    metrics_exploit = exploiter.exploit(env, exploit_trials)

    # Parse results into DataFrame
    return parse_experiments_results(metrics_explore, metrics_exploit, cfg.metrics_trial_frequency)


def average_experiment_runs(runs_dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """Average the metrics of several runs per (trial, phase) pair.

    The frames are stacked, grouped by ``trial`` and ``phase``, averaged, and
    ``phase`` is moved back from the index into a regular column so the result
    is indexed by ``trial`` only.
    """
    stacked_runs = pd.concat(runs_dfs)
    mean_per_trial = stacked_runs.groupby(['trial', 'phase']).mean()
    return mean_per_trial.reset_index(level='phase')


def plot_cp(epsilon_greedy_df, action_delay_df, knowledge_array_df, op_initial_df, explore_trials, buckets):
    """Plot steps-per-trial and reliable-classifier counts for the four strategies.

    Each frame must provide ``steps_in_trial`` and ``reliable`` columns; both are
    smoothed with a 5-sample moving average before plotting. A dashed red
    vertical line is drawn at ``explore_trials`` on both panels and a dashed
    black horizontal line at 195 steps on the upper panel. ``buckets`` is only
    used in the figure title.

    The figure is written to ``{plot_dir}/cartpole-performance.png`` (the file
    name is fixed, so successive calls overwrite each other). Nothing is returned.
    """
    ma_window = 5  # moving average window

    # (frame, legend label, key into COLORS) in plotting order
    strategies = (
        (epsilon_greedy_df, 'Epsilon Greedy', 'eg'),
        (action_delay_df, 'Action Delay', 'ad'),
        (knowledge_array_df, 'Knowledge Array', 'ka'),
        (op_initial_df, 'Optimistic Initial Quality', 'oiq'),
    )

    fig = plt.figure(figsize=(14, 10))
    fig.suptitle(f'Performance of CartPole environment discretized with {buckets} buckets', fontsize=24)

    # Two stacked panels: steps on top, population below
    grid = fig.add_gridspec(2, 1, hspace=.4)
    steps_ax = fig.add_subplot(grid[0])
    population_ax = fig.add_subplot(grid[1])

    for column, axis in (('steps_in_trial', steps_ax), ('reliable', population_ax)):
        for frame, label, color_key in strategies:
            smoothed = frame[column].rolling(window=ma_window).mean()
            smoothed.plot(label=label, c=COLORS[color_key], ax=axis)

    # Steps in trial
    steps_ax.axvline(x=explore_trials, color='red', linewidth=1, linestyle="--")
    steps_ax.axhline(y=195, color='black', linewidth=1, linestyle="--")
    steps_ax.set_xlabel('Trial')
    steps_ax.set_ylabel('Steps')
    steps_ax.set_title('Steps in each trial')
    steps_ax.set_ylim(0, 200)

    # Population
    population_ax.axvline(x=explore_trials, color='red', linewidth=1, linestyle="--")
    population_ax.set_xlabel('Trial')
    population_ax.set_ylabel('Classifiers')
    population_ax.set_title('Reliable classifiers')

    # One shared legend, built from the lower panel's entries
    legend_handles, legend_labels = population_ax.get_legend_handles_labels()
    fig.legend(legend_handles, legend_labels, loc='lower center', ncol=4)

    # Save plot to file
    fig.savefig(f'{plot_dir}/cartpole-performance.png', dpi=PLOT_DPI)

# Experiment 3 - Balancing the pole

## Setup

In [8]:
class CartPoleObservationWrapper(gym.ObservationWrapper):
    """Discretize each observation component into a bucket index (as a string).

    Every component ``i`` is linearly mapped from ``[_low[i], _high[i]]`` onto
    ``0 .. buckets[i] - 1``, rounded, clamped to that range and converted to
    ``str``. A bucket count of 1 therefore always yields ``'0'``.

    Bounds for components 0 and 2 come from the wrapped environment's
    observation space; components 1 and 3 use hand-picked limits.
    NOTE(review): the rationale for +/-0.5 and +/-3500 is not visible here; the
    commented-out variant below (from the linked article) used radians(50).
    """
    # https://medium.com/@tuzzer/cart-pole-balancing-with-q-learning-b54c6068d947
    # _high = [env.observation_space.high[0], 0.5, env.observation_space.high[2], math.radians(50)]
    # _low = [env.observation_space.low[0], -0.5, env.observation_space.low[2], -math.radians(50)]
    def __init__(self, env, buckets):
        """Wrap ``env``; ``buckets`` holds one bucket count per observation component."""
        super().__init__(env)
        self._high = [env.observation_space.high[0], 0.5, env.observation_space.high[2], 3500]
        self._low = [env.observation_space.low[0], -0.5, env.observation_space.low[2], -3500]
        self._buckets = buckets

    def observation(self, obs):
        """Return the list of bucket indices (strings) for the raw observation ``obs``."""
        discretized = []
        for i in range(len(obs)):
            # Offset by the lower bound itself; `obs + abs(low)` was only
            # correct for non-positive lower bounds.
            ratio = (obs[i] - self._low[i]) / (self._high[i] - self._low[i])
            top_index = self._buckets[i] - 1
            index = int(round(top_index * ratio))
            # Values outside [_low, _high] are clamped to the outermost buckets.
            index = min(top_index, max(0, index))
            discretized.append(str(index))
        return discretized


def cp_env_provider(buckets: Tuple[int, ...]):
    """Create a 'CartPole-v0' environment wrapped in ``CartPoleObservationWrapper``.

    Args:
        buckets: one bucket count per observation component (variable-length
            tuple of ints; the previous ``Tuple[int]`` hint described a 1-tuple).

    Returns:
        The wrapped environment producing discretized observations.
    """
    return CartPoleObservationWrapper(gym.make('CartPole-v0'), buckets)


def cp_metrics(agent, env):
    """Collect per-trial metrics: a fresh dict filled from ``population_metrics``."""
    collected = {}
    collected.update(population_metrics(agent.population, env))
    return collected


# Configuration keyword arguments shared by every run. Each strategy in
# run_cart_pole_biased_exploration merges its own overrides on top of these.
cp_base_params = {
    "classifier_length": 4,  # the observation wrapper defines bounds for 4 components
    "number_of_possible_actions": 2,
    "epsilon": 0.9,
    "beta": 0.01,
    "gamma": 0.995,
    "initial_q": 0.5,
    "theta_exp": 50,
    "theta_ga": 50,
    "do_ga": True,
    "chi": 0.0,
    "mu": 0.03,
    "metrics_trial_frequency": 1,  # presumably: collect metrics every trial - TODO confirm
    "user_metrics_collector_fcn": cp_metrics
}

NUM_EXPERIMENTS = 50  # passed as num_times to the @repeat decorator below
USE_RAY = True  # passed as use_ray to the @repeat decorator below

# Number of trials in the explore and in the exploit phase of a single run
explore_trials, exploit_trials = 500, 500

# Bucket configurations
# One bucket count per observation component; a count of 1 collapses that
# component to the single value '0'.
buckets_v1 = (1, 1, 6, 6)
buckets_v2 = (4, 4, 4, 4)
buckets_v3 = (2, 2, 6, 6)
buckets_v4 = (1, 2, 4, 4)
buckets_v5 = (1, 1, 8, 8)

def buckets_to_str(buckets, delimiter='_'):
    """Join the bucket counts into one string, e.g. ``(1, 1, 6, 6)`` -> ``'1_1_6_6'``."""
    return delimiter.join(str(count) for count in buckets)


def run_cart_pole_biased_exploration(buckets):
    """Run one experiment per exploration strategy on CartPole discretized with ``buckets``.

    Returns:
        A 4-tuple of ``run_experiment`` results in the order
        (epsilon greedy, action delay, knowledge array, optimistic initial quality).
    """
    def env_provider():
        return cp_env_provider(buckets)

    # Per-strategy overrides merged on top of cp_base_params, in execution order.
    # NOTE(review): the last ('oiq') entry differs from the first only by
    # biased_exploration_prob; no initial_q override is set here - confirm intended.
    strategy_overrides = (
        {'action_selector': EpsilonGreedy},
        {'action_selector': ActionDelay, 'biased_exploration_prob': 0.5},
        {'action_selector': KnowledgeArray, 'biased_exploration_prob': 0.5},
        {'action_selector': EpsilonGreedy, 'biased_exploration_prob': 0.8},
    )

    return tuple(
        run_experiment(env_provider, explore_trials, exploit_trials, **(cp_base_params | overrides))
        for overrides in strategy_overrides
    )


@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/{buckets_to_str(buckets_v1)}.dill')
@repeat(num_times=NUM_EXPERIMENTS, use_ray=USE_RAY)
def cp_buckets_v1():
    """Four-strategy CartPole comparison for the ``buckets_v1`` discretization.

    NOTE(review): presumably ``repeat`` runs this NUM_EXPERIMENTS times and
    ``get_from_cache_or_run`` reuses the result stored at ``cache_path`` -
    confirm in src.decorators.
    """
    return run_cart_pole_biased_exploration(buckets_v1)


@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/{buckets_to_str(buckets_v2)}.dill')
@repeat(num_times=NUM_EXPERIMENTS, use_ray=USE_RAY)
def cp_buckets_v2():
    """Four-strategy CartPole comparison for the ``buckets_v2`` discretization.

    NOTE(review): presumably ``repeat`` runs this NUM_EXPERIMENTS times and
    ``get_from_cache_or_run`` reuses the result stored at ``cache_path`` -
    confirm in src.decorators.
    """
    return run_cart_pole_biased_exploration(buckets_v2)


@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/{buckets_to_str(buckets_v3)}.dill')
@repeat(num_times=NUM_EXPERIMENTS, use_ray=USE_RAY)
def cp_buckets_v3():
    """Four-strategy CartPole comparison for the ``buckets_v3`` discretization.

    NOTE(review): presumably ``repeat`` runs this NUM_EXPERIMENTS times and
    ``get_from_cache_or_run`` reuses the result stored at ``cache_path`` -
    confirm in src.decorators.
    """
    return run_cart_pole_biased_exploration(buckets_v3)


@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/{buckets_to_str(buckets_v4)}.dill')
@repeat(num_times=NUM_EXPERIMENTS, use_ray=USE_RAY)
def cp_buckets_v4():
    """Four-strategy CartPole comparison for the ``buckets_v4`` discretization.

    NOTE(review): presumably ``repeat`` runs this NUM_EXPERIMENTS times and
    ``get_from_cache_or_run`` reuses the result stored at ``cache_path`` -
    confirm in src.decorators.
    """
    return run_cart_pole_biased_exploration(buckets_v4)


@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/{buckets_to_str(buckets_v5)}.dill')
@repeat(num_times=NUM_EXPERIMENTS, use_ray=USE_RAY)
def cp_buckets_v5():
    """Four-strategy CartPole comparison for the ``buckets_v5`` discretization.

    NOTE(review): presumably ``repeat`` runs this NUM_EXPERIMENTS times and
    ``get_from_cache_or_run`` reuses the result stored at ``cache_path`` -
    confirm in src.decorators.
    """
    return run_cart_pole_biased_exploration(buckets_v5)

TypeError: buckets_to_str() missing 1 required positional argument: 'delimiter'

## Results

In [None]:
def extract(experiment_runs):
    """Regroup per-run 4-tuples into four per-strategy lists.

    ``experiment_runs`` is an iterable of ``(eg, ad, ka, oiq)`` items; the
    result is ``(eg_list, ad_list, ka_list, oiq_list)`` with run order kept.
    An empty input yields four empty lists.
    """
    per_strategy = ([], [], [], [])

    for run in experiment_runs:
        eg_item, ad_item, ka_item, oiq_item = run  # each run must hold exactly four results
        for collected, item in zip(per_strategy, (eg_item, ad_item, ka_item, oiq_item)):
            collected.append(item)

    return per_strategy


# Run the calculations
# Each call returns the repeated runs for one bucket configuration; extract()
# regroups them into one list of per-run results per strategy (eg, ad, ka, oiq).
cp_bv1_eg_dfs, cp_bv1_ad_dfs, cp_bv1_ka_dfs, cp_bv1_oiq_dfs = extract(cp_buckets_v1())
cp_bv2_eg_dfs, cp_bv2_ad_dfs, cp_bv2_ka_dfs, cp_bv2_oiq_dfs = extract(cp_buckets_v2())
cp_bv3_eg_dfs, cp_bv3_ad_dfs, cp_bv3_ka_dfs, cp_bv3_oiq_dfs = extract(cp_buckets_v3())
cp_bv4_eg_dfs, cp_bv4_ad_dfs, cp_bv4_ka_dfs, cp_bv4_oiq_dfs = extract(cp_buckets_v4())
cp_bv5_eg_dfs, cp_bv5_ad_dfs, cp_bv5_ka_dfs, cp_bv5_oiq_dfs = extract(cp_buckets_v5())

# Plot visualization
# Only the buckets_v1 configuration is plotted; runs are averaged per trial first.
plot_cp(
    average_experiment_runs(cp_bv1_eg_dfs),
    average_experiment_runs(cp_bv1_ad_dfs),
    average_experiment_runs(cp_bv1_ka_dfs),
    average_experiment_runs(cp_bv1_oiq_dfs),
    explore_trials=explore_trials,
    buckets=buckets_v1)

:::{figure-md} cp-fig
:class: full-width
<img src="plots/cartpole-performance.png">

Performance in CartPole environment
:::

### Classifiers lookup

In [None]:
@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/epsilon_greedy_single_run.dill')
def cp_single_run():
    """Train one epsilon-greedy ACS2 agent on the ``buckets_v1`` CartPole and return it.

    Only the explore phase is executed; callers inspect the resulting population.
    """
    single_run_params = {**cp_base_params, 'action_selector': EpsilonGreedy}
    trained_agent = ACS2(Configuration(**single_run_params))
    trained_agent.explore(cp_env_provider(buckets_v1), explore_trials)
    return trained_agent  # only interested in resulting population


# execute run
cp_agent = cp_single_run()

# List the reliable classifiers, highest fitness first (sort key is the negated fitness).
reliable = [cl for cl in cp_agent.population if cl.is_reliable()]
for cl in sorted(reliable, key=lambda cl: -cl.fitness):
    print(
        f'[{cl.condition} {cl.action} {cl.effect}]\t\tmark: {cl.mark}\tquality: {cl.q:.2f}\treward: {cl.r:.2f}\tnumerosity: {cl.num}')

## Statistical verification

```{admonition} Hypothesis testing
:class: tip
In my opinion, the best approach here would be to evaluate different combinations of discretization buckets. However, this requires research and experiments of its own. The suggestion is to propose 4-5 reasonable configurations and to compare the distributions of the number of steps and of the classifier count in each of them.
```

In [None]:
# Per bucket configuration: the per-run results of each strategy, always in the
# order [eg, ad, ka, oiq] - the same order as the column headers in print_bayes_table.
experiments_data = {
    buckets_v1: [cp_bv1_eg_dfs, cp_bv1_ad_dfs, cp_bv1_ka_dfs, cp_bv1_oiq_dfs],
    buckets_v2: [cp_bv2_eg_dfs, cp_bv2_ad_dfs, cp_bv2_ka_dfs, cp_bv2_oiq_dfs],
    buckets_v3: [cp_bv3_eg_dfs, cp_bv3_ad_dfs, cp_bv3_ka_dfs, cp_bv3_oiq_dfs],
    buckets_v4: [cp_bv4_eg_dfs, cp_bv4_ad_dfs, cp_bv4_ka_dfs, cp_bv4_oiq_dfs],
    buckets_v5: [cp_bv5_eg_dfs, cp_bv5_ad_dfs, cp_bv5_ka_dfs, cp_bv5_oiq_dfs]
}

def train_bayes_model(dfs, query_condition, field):
    """Fit ``bayes_estimate`` on one column of the concatenated, filtered runs.

    Args:
        dfs: DataFrames to concatenate (one per run).
        query_condition: expression passed to ``DataFrame.query`` to select rows.
        field: column whose values are fed to the estimator.

    Returns:
        The ``'mu'`` and ``'std'`` entries of the estimator's result, as a pair.
    """
    selected_rows = pd.concat(dfs).query(query_condition)
    observations = selected_rows[field].to_numpy()
    posterior = bayes_estimate(observations)
    return posterior['mu'], posterior['std']

def build_models(dfs: Dict, field: str, query_condition: str):
    """Train one Bayesian model per strategy for every bucket configuration.

    ``dfs`` maps a bucket configuration to a list whose items are themselves
    lists of per-run DataFrames (one item per strategy). The result maps each
    bucket configuration to the list of ``train_bayes_model`` outputs, in the
    same strategy order.
    """
    return {
        bucket: [train_bayes_model(strategy_runs, query_condition, field) for strategy_runs in per_strategy_runs]
        for bucket, per_strategy_runs in dfs.items()
    }

def print_bayes_table(data):
    """Render ``data`` as an HTML table with one row per bucket configuration.

    ``data`` maps a bucket tuple to a list of four cell values (one per strategy);
    the first column shows the bucket counts joined with commas.
    """
    column_headers = ['', 'Epsilon Greedy', 'Action Delay', 'Knowledge Array', 'Optimistic Initial Quality']

    rows = []
    for bucket, rewards in data.items():
        rows.append([buckets_to_str(bucket, ',')] + rewards)

    html_table = tabulate(rows, headers=column_headers, tablefmt="html", stralign='right', floatfmt=".2f")
    return HTML(html_table)

# Formats one model for the table. `r` is the (mu, std) pair returned by
# train_bayes_model; only r[0] is used, i.e. the mean and spread of the `mu` values.
# NOTE(review): r[1] is never read - confirm that is intended.
print_row = lambda r: f'{round(r[0].mean(), 2)} ± {round(r[0].std(), 2)}'

# Average Steps in exploit phase
# (despite the name, `avg_reward` averages the 'steps_in_trial' column)
avg_reward = lambda dfs: pd.concat(dfs).query('phase == "exploit"')['steps_in_trial'].mean()

# One average per strategy for every bucket configuration
average_rewards_data = {}
for bucket, dfs in experiments_data.items():
    average_rewards_data[bucket] = list(map(avg_reward, dfs))

# reliable classifiers
@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/bayes/reliable.dill')
def build_reliable_models(dfs: Dict):
    """Build models of the ``reliable`` column at trial ``explore_trials - 1``."""
    last_explore_trial_filter = f'trial == {explore_trials - 1}'
    return build_models(dfs, field='reliable', query_condition=last_explore_trial_filter)

# run computations
reliable_data = build_reliable_models(experiments_data)

# Turn every model into a formatted table cell
reliable_table_data = {}
for bucket, models in reliable_data.items():
    reliable_table_data[bucket] = list(map(print_row, models))

# Add glue objects
# The keys below are referenced by the {glue:} roles in the tabbed blocks that follow.
glue('average_steps', print_bayes_table(average_rewards_data), display=False)
glue('bayes_reliable_classifies', print_bayes_table(reliable_table_data), display=False)

```{tabbed} Average Number of Steps
{glue:}`average_steps`
```

```{tabbed} Reliable classifiers
{glue:}`bayes_reliable_classifies`
```

## Observations
Why bucketing was chosen by hand (problem with hyperparameter tuning)

---

**Software packages used**

In [None]:
import session_info

# Display the session information for the "Software packages used" section.
session_info.show()