In [3]:
import sys

# Put custom scripts on the module path. The entry is relative, so it is
# resolved against the current working directory; it is what lets the
# `src.*` imports further down resolve (presumably '../../..' is the
# repository root - confirm).
sys.path.append('../../..')

In [4]:
import logging

# Emit only WARN-and-above records; each line is prefixed with the milliseconds
# elapsed since the logging module was loaded and the emitting thread's name.
logging.basicConfig(level=logging.WARN, format='%(relativeCreated)6d %(threadName)s %(message)s')

import gym
from typing import List
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

import pathlib
from src.decorators import repeat, get_from_cache_or_run
from src.metrics import parse_experiments_results

from lcs.agents.acs2 import Configuration, ACS2
from lcs.metrics import population_metrics
from lcs.strategies.action_selection import EpsilonGreedy, ActionDelay, KnowledgeArray


import gym_multiplexer  # noqa: F401

plt.ioff()  # turn off interactive plotting

# Output locations, built as plain strings from the absolute path of the
# current working directory (so they depend on where the notebook is run).
cache_dir = f'{pathlib.Path().absolute()}/cache'
plot_dir = f'{pathlib.Path().absolute()}/plots'

# NOTE(review): presumably the number of independent repetitions of each
# experiment; it is not used in the code visible here - confirm.
NUM_EXPERIMENTS = 50


def run_experiment(env_provider, explore_trials, exploit_trials, **conf):
    """Run a single explore-then-exploit ACS2 experiment.

    Parameters
    ----------
    env_provider
        Zero-argument callable returning a fresh environment instance.
    explore_trials
        Number of trials passed to the exploration phase.
    exploit_trials
        Number of trials passed to the exploitation phase.
    **conf
        Keyword arguments forwarded unchanged to ``Configuration``.

    Returns
    -------
    The result of ``parse_experiments_results`` for both phases
    (presumably a ``pandas.DataFrame`` - confirm against ``src.metrics``).
    """
    env = env_provider()
    env.reset()

    cfg = Configuration(**conf)

    explorer = ACS2(cfg)
    metrics_explore = explorer.explore(env, explore_trials)

    # Exploit with a dedicated agent seeded with the population learned during
    # exploration. Previously this agent was constructed but never used: the
    # exploit phase was (re)run on `explorer` instead.
    exploiter = ACS2(cfg, explorer.population)
    metrics_exploit = exploiter.exploit(env, exploit_trials)

    # Merge the metrics of both phases into a single result.
    return parse_experiments_results(metrics_explore, metrics_exploit, cfg.metrics_trial_frequency)


def run_alternating_experiment(env_provider, trials, **conf):
    """Run one ACS2 experiment using the agent's ``explore_exploit`` mode.

    Parameters
    ----------
    env_provider
        Zero-argument callable returning a fresh environment instance.
    trials
        Number of trials passed to ``ACS2.explore_exploit``.
    **conf
        Keyword arguments forwarded unchanged to ``Configuration``. The
        collected metrics must expose the ``trial``, ``reward``,
        ``population`` and ``reliable`` keys.

    Returns
    -------
    pandas.DataFrame
        One row per collected metrics record with the columns ``trial``,
        ``reward``, ``population``, ``reliable`` and ``phase``.
    """
    env = env_provider()
    env.reset()

    cfg = Configuration(**conf)

    agent = ACS2(cfg)
    metrics = agent.explore_exploit(env, trials)

    # Keep only the metrics that are analysed later on.
    columns = ['trial', 'reward', 'population', 'reliable']
    df = pd.DataFrame([[d[c] for c in columns] for d in metrics], columns=columns)

    # Label the phase from the recorded trial number (even -> explore,
    # odd -> exploit) rather than from the positional row index: the two only
    # coincide when a metrics record is collected for every single trial.
    df['phase'] = df['trial'].map(lambda t: "explore" if t % 2 == 0 else "exploit")

    return df


def average_experiment_runs(runs_dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """Average the metrics of several experiment runs.

    All runs are stacked, grouped by the ``trial`` and ``phase`` columns and
    averaged. The result is indexed by ``trial``, with ``phase`` moved back
    into a regular column.
    """
    stacked = pd.concat(runs_dfs)
    per_trial_phase = stacked.groupby(['trial', 'phase'])
    averaged = per_trial_phase.mean()
    return averaged.reset_index(level='phase')


# Biased exploration

Increase the speed of gaining knowledge by determining the most valuable regions of the input space. Tell why it is important for the real-valued case. Describe the action selection methods (epsilon-greedy, action delay, knowledge array and OIQ - novel).

## Experimental evaluation
This section presents the motivation, goals and set-up of the performed experiments, as well as their results.

## Research questions
The conducted research aims to answer the following questions regarding the rACS algorithm and the interval-based representation:

1. Do the biased exploration methods (AD, KA, OIQ) significantly accelerate the agent's learning speed?
2. Can the OIQ method improve the performance in terms of ingesting knowledge or reducing classifier population size?

## Goals of the experiments

```{admonition} _Experiment 1 - Performance in simple multi-step environments (Corridor, Grid)_
Show the difference between EG, AD, KA and OIQ in terms of the classifier population, knowledge and step size metrics, using the ACS2 agent.
```

```{admonition} _Experiment 2 - Performance in single-step rMPX_
Show the difference between EG, AD, KA and OIQ in terms of the average obtained reward and the reliable classifiers count, using the ACS2 agent.
```

```{admonition} _Experiment 3 - Performance in demanding CartPole environment_
Show the difference between EG, AD, KA and OIQ in terms of the average obtained reward and the reliable classifiers count, using the ACS2 agent.
```

## Experiments

```{tableofcontents}
```

## Answers to research questions
...

---

**Software packages used**

In [5]:
# List the software packages used in this session (see the heading above),
# so the environment behind the results is recorded.
import session_info
session_info.show()