In [1]:
import pathlib
from typing import List, Dict

import gym
import gym_fsw  # noqa: F401
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lcs import Perception
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
from myst_nb import glue

from src.basic_rl import run_q_learning_alternating, run_r_learning_alternating, qlearning, rlearning
from src.decorators import repeat, get_from_cache_or_run
from src.diminishing_reward import common_metrics
from src.observation_wrappers import FSWObservationWrapper
from src.payoff_landscape import get_all_state_action, plot_payoff_landscape
from src.runner import run_experiments_alternating
from src.utils import build_plots_dir_path, build_cache_dir_path
from src.visualization import PLOT_DPI, diminishing_reward_colors

plt.ioff()  # turn off interactive plotting
plt.style.use('../../../src/phd.mplstyle')

root_dir = pathlib.Path().cwd().parent.parent.parent
cwd_dir = pathlib.Path().cwd()

plot_dir = build_plots_dir_path(root_dir) / cwd_dir.name
cache_dir = build_cache_dir_path(root_dir) / cwd_dir.name


def average_experiment_runs(run_df: pd.DataFrame) -> pd.DataFrame:
    """Average metrics over repeated experiment runs.

    Rows sharing the same ``agent``, ``trial`` and ``phase`` values are
    collapsed into their column-wise mean.

    Args:
        run_df: metrics of all runs; must contain ``agent``, ``trial`` and
            ``phase`` columns next to the metric columns.

    Returns:
        A frame indexed by ``(agent, trial)`` with ``phase`` moved back to a
        regular column and one averaged column per metric.
    """
    group_keys = ['agent', 'trial', 'phase']
    averaged = run_df.groupby(group_keys).mean()
    return averaged.reset_index(level='phase')

def _exploit_trials_frame(expl_df, alg):
    """Return the rows of ``alg`` re-indexed by consecutive exploit-trial numbers (1..n)."""
    alg_df = expl_df.loc[alg]
    idx = pd.Index(name='exploit trial', data=np.arange(1, len(alg_df) + 1))
    # Deliberately not in-place: ``alg_df`` is a selection of ``expl_df`` and
    # mutating it in place can raise pandas' SettingWithCopyWarning.
    return alg_df.set_index(idx)


def _style_ticks(axis):
    """Apply the shared inward major/minor tick styling to one x- or y-axis."""
    axis.set_tick_params(which='major', size=10, width=2, direction='in')
    axis.set_tick_params(which='minor', size=5, width=1, direction='in')


def plot_pop_and_rho(df, trials, plot_filename=None):
    """Plot exploit-phase performance and the estimated average reward rho.

    The figure has two stacked panels:

    * top - 10-trial rolling mean of ``steps_in_trial`` for every algorithm,
    * bottom - the ``rho`` column for the algorithms that provide it
      (AACS2_v1, AACS2_v2, R-Learning).

    Args:
        df: aggregated metrics whose first index level is the agent name and
            which contains ``phase``, ``steps_in_trial`` and ``rho`` columns.
        trials: total number of trials; half of it is used as the upper
            x-limit of the rho panel.
        plot_filename: optional path; when given the figure is also saved there.

    Returns:
        The created matplotlib figure.
    """
    colors = diminishing_reward_colors()

    expl_df = df[df['phase'] == 'exploit']

    xmax = trials / 2
    steps_xmax = 500  # the steps panel only shows the first 500 exploit trials

    fig, axs = plt.subplots(2, 1, figsize=(18, 16))
    steps_ax, rho_ax = axs[0], axs[1]

    # Steps in trial plot
    for alg in ['ACS2', 'AACS2_v1', 'AACS2_v2', 'Q-Learning', 'R-Learning']:
        alg_df = _exploit_trials_frame(expl_df, alg)
        smoothed_steps = alg_df['steps_in_trial'].rolling(window=10).mean()
        smoothed_steps.plot(ax=steps_ax, label=alg, linewidth=2, color=colors[alg])

    steps_ax.set_xlabel("Exploit trial")
    steps_ax.set_xlim(1, steps_xmax)
    steps_ax.xaxis.set_major_locator(MultipleLocator(50))
    steps_ax.xaxis.set_minor_locator(MultipleLocator(10))
    steps_ax.xaxis.set_major_formatter(FormatStrFormatter('%1.0f'))
    _style_ticks(steps_ax.xaxis)

    steps_ax.set_ylabel("Number of steps")
    steps_ax.yaxis.set_major_locator(MultipleLocator(1))
    _style_ticks(steps_ax.yaxis)

    steps_ax.set_title('Steps in trial')
    steps_ax.legend(loc='upper right', frameon=False)

    # Rho plot
    for alg in ['AACS2_v1', 'AACS2_v2', 'R-Learning']:
        alg_df = _exploit_trials_frame(expl_df, alg)
        alg_df['rho'].plot(ax=rho_ax, label=alg, linewidth=2, color=colors[alg])

    rho_ax.set_xlim(0, xmax)
    rho_ax.set_xlabel("Exploit trial")
    rho_ax.xaxis.set_major_locator(MultipleLocator(500))
    rho_ax.xaxis.set_minor_locator(MultipleLocator(100))
    rho_ax.xaxis.set_major_formatter(FormatStrFormatter('%1.0f'))
    _style_ticks(rho_ax.xaxis)

    rho_ax.set_ylabel(r"$\mathregular{\rho}$")
    rho_ax.yaxis.set_major_locator(MultipleLocator(2))
    rho_ax.yaxis.set_minor_locator(MultipleLocator(1))
    _style_ticks(rho_ax.yaxis)
    rho_ax.set_ylim(0, 11)

    rho_ax.set_title(r'Estimated average $\mathregular{\rho}$')

    if plot_filename:
        fig.savefig(plot_filename, dpi=PLOT_DPI, bbox_inches='tight')

    return fig

ImportError: cannot import name 'plot_pop_and_rho' from 'src.diminishing_reward' (/home/arc/Projects/pwr/phd/src/diminishing_reward.py)

# Experiment 2 - Finite-State-World environment

In [None]:
# Hyper-parameters shared by the agents compared in this experiment
learning_rate = 0.5  # used as `beta` in the ACS2/AACS2 config and passed to the Q-/R-Learning runners
discount_factor = 0.95  # used as `gamma` in the ACS2/AACS2 config and passed to the Q-Learning runners
epsilon = 0.1  # used as `epsilon` in the ACS2/AACS2 config and passed to the Q-/R-Learning runners
zeta = 0.0001  # used as `zeta` in the ACS2/AACS2 config and passed to the R-Learning runners


def fsw_env_provider():
    """Create a new Finite-State-World environment instance.

    Returns:
        The ``fsw-10-v0`` gym environment wrapped in ``FSWObservationWrapper``.
    """
    # Plain string literal: the former f-string had no placeholders.
    return FSWObservationWrapper(gym.make('fsw-10-v0'))


# Configuration dictionary handed to the ACS2/AACS2 experiment runner.
# The learning hyper-parameters defined above are mapped onto the agent's
# `beta`, `epsilon`, `gamma` and `zeta` settings.
basic_cfg = dict(
    perception_bits=1,
    possible_actions=2,
    do_ga=False,
    beta=learning_rate,
    epsilon=epsilon,
    gamma=discount_factor,
    zeta=zeta,
    user_metrics_collector_fcn=common_metrics,
    biased_exploration_prob=0,
    metrics_trial_freq=1,
)

NUM_EXPERIMENTS = 10  # number of repetitions: given to `repeat` and to the alternating Q-/R-Learning runners
trials = 10_000  # number of trials passed to every runner below


@get_from_cache_or_run(cache_path=f'{cache_dir}/fsw/acs2.dill')
@repeat(num_times=NUM_EXPERIMENTS)
def run_acs2_in_fsw():
    """Run the alternating ACS2/AACS2 experiment in the FSW environment.

    The undecorated body performs a single call to
    ``run_experiments_alternating`` with the shared ``basic_cfg``.
    NOTE(review): repetition and on-disk caching are presumably handled by the
    ``repeat`` / ``get_from_cache_or_run`` decorators - confirm in src.decorators.
    """
    experiment_result = run_experiments_alternating(fsw_env_provider, trials, basic_cfg)
    return experiment_result


@get_from_cache_or_run(cache_path=f'{cache_dir}/fsw/qlearning.dill')
def run_qlearning_in_fsw():
    """Run the alternating Q-Learning experiments in the FSW environment.

    The Q-table starts as zeros with one row per observation and one column
    per action; a perception is mapped to a state index by casting its first
    element to ``int``.
    """
    env = fsw_env_provider()
    n_states, n_actions = env.observation_space.n, env.action_space.n
    q_table = np.zeros((n_states, n_actions))

    def first_element_as_state(p):
        return int(p[0])

    return run_q_learning_alternating(
        NUM_EXPERIMENTS, trials, env, epsilon, learning_rate, discount_factor, q_table,
        perception_to_state_mapper=first_element_as_state,
    )


@get_from_cache_or_run(cache_path=f'{cache_dir}/fsw/rlearning.dill')
def run_rlearning_in_fsw():
    """Run the alternating R-Learning experiments in the FSW environment.

    The R-table starts as zeros with one row per observation and one column
    per action; a perception is mapped to a state index by casting its first
    element to ``int``.
    """
    env = fsw_env_provider()
    n_states, n_actions = env.observation_space.n, env.action_space.n
    r_table = np.zeros((n_states, n_actions))

    def first_element_as_state(p):
        return int(p[0])

    return run_r_learning_alternating(
        NUM_EXPERIMENTS, trials, env, epsilon, learning_rate, zeta, r_table,
        perception_to_state_mapper=first_element_as_state,
    )


# Execute (or load) the three experiment families
acs2_runs_details = run_acs2_in_fsw()
q_learning_metrics = run_qlearning_in_fsw()
r_learning_metrics = run_rlearning_in_fsw()

# Every ACS2 run-details entry is a 4-tuple whose last element is the metrics frame
acs2_metrics = [
    metrics_df
    for _pop_acs2, _pop_aacs2v1, _pop_aacs2v2, metrics_df in acs2_runs_details
]

# Average each family over its runs and stack them into one aggregated frame
agg_df = pd.concat([
    average_experiment_runs(family_df)
    for family_df in (
        pd.concat(acs2_metrics),
        pd.DataFrame(q_learning_metrics),
        pd.DataFrame(r_learning_metrics),
    )
])

In [None]:
# Environment instance reused by the single-run Q-/R-Learning functions in this cell
fsw_env = fsw_env_provider()
# State-action pairs of the environment; iterated when computing the payoff landscape
state_action = get_all_state_action(fsw_env.state_action())


def calculate_state_action_payoffs(state_actions: List, pop_acs2, pop_aacs2v1, pop_aacs2v2, Q, R) -> Dict:
    """Collect the payoff of every state-action pair for each algorithm.

    For the three classifier populations the payoff is the mean ``r`` value of
    the classifiers in the action set formed for the given perception and
    action. For Q-Learning and R-Learning the payoff is read directly from the
    respective table at ``[int(state), action]``.

    Args:
        state_actions: hashable objects exposing ``state`` and ``action``.
        pop_acs2: ACS2 classifier population.
        pop_aacs2v1: AACS2_v1 classifier population.
        pop_aacs2v2: AACS2_v2 classifier population.
        Q: Q-Learning table indexed by ``[state, action]``.
        R: R-Learning table indexed by ``[state, action]``.

    Returns:
        Mapping ``state_action -> {algorithm name -> payoff}``. When a
        population has no classifier for a pair, a warning is printed and the
        payoff is NaN.
    """
    populations = {
        'ACS2': pop_acs2,
        'AACS2_v1': pop_aacs2v1,
        'AACS2_v2': pop_aacs2v2,
    }

    payoffs = {}

    for sa in state_actions:
        p = Perception((sa.state,))
        sa_payoffs = {}

        for alg, population in populations.items():
            action_set = population.form_match_set(p).form_action_set(sa.action)
            rewards = [cl.r for cl in action_set]

            if rewards:
                sa_payoffs[alg] = np.mean(rewards)
            else:
                # Uncovered pair: report it and store NaN explicitly instead of
                # letting np.mean([]) emit a RuntimeWarning.
                print(f"WARN: No {alg} classifiers for perception: {p}, action: {sa.action}")
                sa_payoffs[alg] = np.nan

        sa_payoffs['Q-Learning'] = Q[int(sa.state), sa.action]
        sa_payoffs['R-Learning'] = R[int(sa.state), sa.action]

        payoffs[sa] = sa_payoffs

    return payoffs


# Use the populations from the first experiment run to present the payoff landscape.
# Each run-details entry unpacks into the ACS2, AACS2_v1 and AACS2_v2 populations
# plus one trailing element that is not needed here.
pop_acs2, pop_aacs2v1, pop_aacs2v2, _ = acs2_runs_details[0]


@get_from_cache_or_run(cache_path=f'{cache_dir}/fsw/qlearning-single.dill')
def run_single_qlearning():
    """Train one Q-Learning agent on the module-level ``fsw_env`` and return its Q-table.

    The table starts as zeros (observations x actions); a perception is mapped
    to a state index by casting its first element to ``int``.
    """
    table_shape = (fsw_env.observation_space.n, fsw_env.action_space.n)
    Q_init = np.zeros(table_shape)

    def first_element_as_state(p):
        return int(p[0])

    learned_Q, _ = qlearning(
        fsw_env, trials, Q_init, epsilon, learning_rate, discount_factor,
        perception_to_state_mapper=first_element_as_state,
    )
    return learned_Q


@get_from_cache_or_run(cache_path=f'{cache_dir}/fsw/rlearning-single.dill')
def run_single_rlearning():
    """Train one R-Learning agent on the module-level ``fsw_env``.

    The table starts as zeros (observations x actions); a perception is mapped
    to a state index by casting its first element to ``int``.

    Returns:
        Tuple ``(R, rho)`` - the first two values returned by ``rlearning``.
    """
    table_shape = (fsw_env.observation_space.n, fsw_env.action_space.n)
    R_init = np.zeros(table_shape)

    def first_element_as_state(p):
        return int(p[0])

    learned_R, learned_rho, _ = rlearning(
        fsw_env, trials, R_init, epsilon, learning_rate, zeta,
        perception_to_state_mapper=first_element_as_state,
    )
    return learned_R, learned_rho


# Single-run tables used for the payoff landscape
Q = run_single_qlearning()
R, rho = run_single_rlearning()

# Payoff of every state-action pair for all five algorithms
payoffs = calculate_state_action_payoffs(state_action, pop_acs2, pop_aacs2v1, pop_aacs2v2, Q, R)

In [None]:
# Build both figures; each call also writes a PNG into the plot directory
fsw_performance_fig = plot_pop_and_rho(
    agg_df,
    trials=trials,
    plot_filename=f'{plot_dir}/fsw-performance.png',
)
fsw_payoff_fig = plot_payoff_landscape(
    payoffs,
    rho=rho,
    rho_text_location={'x': 30, 'y': 60},
    plot_filename=f'{plot_dir}/fsw-payoff-landscape.png',
)

# Expose the figures under the keys referenced by the glue:figure directives below
glue('51-fsw-fig', fsw_performance_fig, display=False)
glue('51-fsw-payoff-fig', fsw_payoff_fig, display=False)

````{tabbed} Performance
```{glue:figure} 51-fsw-fig
:name: "51-fsw-fig"
Performance in FSW-10 environment
```
````

````{tabbed} Payoff Landscape
```{glue:figure} 51-fsw-payoff-fig
:name: "51-fsw-payoff-fig"
FSW-10 payoff landscape
```
````

## Statistical verification

```{admonition} Hypothesis testing
:class: tip
Three subplots (AACS2_v1, AACS2_v2, R-Learning) showing the distribution of the estimated average ρ for three FSW sizes.
```

## Observations
Much more stable than Corridor (a smaller moving-average window is sufficient)