# Paper study - policy optimization

## Init Env

In [None]:
%load_ext autoreload
%autoreload 2

import os
import logging
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as st

from settings import Metadata, EnvMetadata, ExperimentPhase, PROJECT_FOLDER, DATA_FOLDER, setup_matplotlib_config
from bin.main import run_simulation
from src.utils import lineplot_ci
from src.data_classes import Episode
from src.environment.channel.utils import create_K_MPR_matrix
from src.view.plot import plot_episode, plot_validation_metrics, plot_per_step_metrics, plot_validation_metrics_barplots, \
    plot_validation_metrics_per_n_steps_model_learning, \
    plot_p_transmit_aac, plot_critic_value_aac, plot_training_actor_critic, plot_coma_actors_critic, \
    plot_tdma_actors_intermediary_probabilities
from src.view.plot_model import plot_dirichlet_data_generation, plot_dirichlet_mpr_channel
from src.view.metrics import get_experiment_throughput, get_experiment_fairness, experiment_throughput_mean_dev, get_return, \
    get_buffer_info, get_channel_collisions, get_aac_info, get_training_info, get_state_distribution,  \
    get_coma_info, get_loss, get_critic_training_info, get_gradients_info, estimate_Q_value_COMA


# Apply the matplotlib configuration helper imported from `settings`.
setup_matplotlib_config()

# Reset the root logger so that `basicConfig` below actually takes effect:
# `logging.basicConfig` does nothing when the root logger already has handlers.
root = logging.getLogger()
# Iterate over a copy: `removeHandler` mutates `root.handlers`, and removing
# items from a list while iterating over it skips every other handler.
for handler in list(root.handlers):
    root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

def does_experiment_exist(experiment_name):
    """Tell whether a directory named `experiment_name` already exists.

    The directory is looked up under `PROJECT_FOLDER/DATA_FOLDER`.
    """
    experiment_path = os.path.join(PROJECT_FOLDER, DATA_FOLDER, experiment_name)
    return os.path.isdir(experiment_path)


## Setup config

In [None]:
%autoreload 2

from experimental_setup import *

# Prefix shared by every experiment name built in this notebook.
EXPERIMENT_NAME_PREFIX = "paper_policy_opt"

# Print the data-generation and channel settings used by the experiments.
# DEFAULT_JOINT_DISTRIBUTION and MPR_MATRIX are presumably provided by the
# `experimental_setup` star import above.
_JOINT_SINGLE_PACKET_GENERATED_PROBABILITY = DEFAULT_JOINT_DISTRIBUTION["1,0"]
# NOTE(review): the marginal is computed as p / (1 + p), with p the joint
# probability of the "1,0" outcome — confirm this derivation against the
# definition of DEFAULT_JOINT_DISTRIBUTION.
_MARGINAL_DATA_GEN_PROBABILITY = _JOINT_SINGLE_PACKET_GENERATED_PROBABILITY / (1 + _JOINT_SINGLE_PACKET_GENERATED_PROBABILITY)
print(f"Marginal packet generation probability per agent : {_MARGINAL_DATA_GEN_PROBABILITY}")
print(f"Joint node distribution : {DEFAULT_JOINT_DISTRIBUTION}")
print(f"MPR Matrix : {MPR_MATRIX}")

## TDMA baselines

In [None]:
%autoreload 2

def get_metadata_tdma(agents_conf):
    """Build the `Metadata` of a TDMA baseline run.

    All model-learning and policy-training episode/step counts are set to
    zero, and the TDMA policy optimizer receives `agents_conf` as its
    `tdma_agents_config`.
    """
    tdma_train_metadata = {
        "train_model_n_episodes": 0,
        "train_model_max_steps": 0,
        "train_policy_n_episodes": 0,
        "train_policy_max_steps": 0,
        "digital_twin_class": "DigitalTwinPolicyPassthrough",
        "digital_twin_kwargs": {},
        "policy_optimizer_class": "PolicyOptimizerTDMA",
        "policy_optimizer_kwargs": {"tdma_agents_config": agents_conf},
    }
    metadata_dict = {
        "env_metadata": ENV_METADATA,
        "train_metadata": tdma_train_metadata,
    }
    return Metadata.from_dict(metadata_dict)


### 4 slots frames (no prior info on data generation process)

In [None]:
# One configuration per agent: agents 1 to 4 are assigned transmission slots
# 0 to 3 respectively, each transmitting with probability 1 in its slot.
TDMA_4_SLOTS_AGENTS_CONF = [
    {
        "frame_length": TDMA_FRAME_LENGTH,
        "transmission_slot": slot,
        "transmission_probability": 1,
    }
    for slot in range(4)
]

metadata_tdma_4_slots = get_metadata_tdma(TDMA_4_SLOTS_AGENTS_CONF)
# experiment_name_tdma_4_slots = f"{EXPERIMENT_NAME_PREFIX}_tdma_4_slots_01"

### 2 slots frames (prior info on data generation joint probability)

In [None]:
# Two-slot frames: agents 1 and 2 share slot 0, agents 3 and 4 share slot 1.
# Every agent transmits with probability 1 in its slot.
TDMA_2_SLOTS_AGENTS_CONF = [
    {
        "frame_length": 2,
        "transmission_slot": slot,
        "transmission_probability": 1,
    }
    for slot in (0, 0, 1, 1)
]

# Metadata of the 2-slot TDMA baseline, built by `get_metadata_tdma`.
metadata_tdma_2_slots = get_metadata_tdma(TDMA_2_SLOTS_AGENTS_CONF)

## Model-Free baseline (COMA)

In [None]:
# Number of model-free (COMA) training runs launched in this section.
N_TRAININGS_MODEL_FREE = 5
# Name of the experiment that logs the test episodes; later runs pass it as
# `load_forced_test_experiment_name`.
EXPERIMENT_NAME_TEST_EPISODES_MODEL_FREE = f"{EXPERIMENT_NAME_PREFIX}_model_free_01_test_episodes"

def get_metadata_model_free():
    """Build the `Metadata` of a model-free COMA training run.

    No model learning is scheduled (zero episodes and steps); the policy is
    trained for `N_EPISODES_TRAINING_POLICY` episodes of at most
    `N_STEPS_TRAINING_POLICY` steps with the pass-through digital twin.
    """
    model_free_train_metadata = {
        "digital_twin_class": "DigitalTwinPolicyPassthrough",
        "digital_twin_kwargs": {},
        "train_model_n_episodes": 0,
        "train_model_max_steps": 0,
        "train_policy_n_episodes": N_EPISODES_TRAINING_POLICY,
        "train_policy_max_steps": N_STEPS_TRAINING_POLICY,
        "policy_optimizer_class": "PolicyOptimizerCOMA",
        "policy_optimizer_kwargs": POLICY_OPTIMIZER_METADATA,
    }
    metadata_dict = {
        "env_metadata": ENV_METADATA,
        "train_metadata": model_free_train_metadata,
    }
    return Metadata.from_dict(metadata_dict)

def get_experiment_name_model_free(n_training):
    """Return the experiment name of model-free training run `n_training`."""
    run_suffix = f"model_free_01_n_training_{n_training}"
    return f"{EXPERIMENT_NAME_PREFIX}_{run_suffix}"

# Metadata reused by every model-free training run of this section.
metadata_model_free = get_metadata_model_free()

### Test episodes

In [None]:
%autoreload 2

# Run the 4-slot TDMA configuration (no training) and log it under
# EXPERIMENT_NAME_TEST_EPISODES_MODEL_FREE. The following cells pass that
# name as `load_forced_test_experiment_name`.
# NOTE(review): presumably this makes every later run evaluate on these same
# test episodes — confirm in `run_simulation`.
run_simulation(
    metadata_tdma_4_slots,
    log=True,
    log_train=False,
    log_experiment_name=EXPERIMENT_NAME_TEST_EPISODES_MODEL_FREE,
    suffix_log_experiment_name=False
)

### Train and test model-free policy multiple times

In [None]:
%autoreload 2

# Launch N_TRAININGS_MODEL_FREE runs with the same metadata, each logged
# (training included) under its own experiment name and all referencing the
# test-episodes experiment through `load_forced_test_experiment_name`.
for n_training in range(N_TRAININGS_MODEL_FREE):
    run_simulation(
        metadata_model_free,
        log=True,
        log_train=True,
        log_experiment_name=get_experiment_name_model_free(n_training),
        suffix_log_experiment_name=False,
        load_forced_test_experiment_name=EXPERIMENT_NAME_TEST_EPISODES_MODEL_FREE
    )

### Test TDMA baselines

In [None]:
# Evaluate TDMA baselines using the same test episodes
# (the ones logged under EXPERIMENT_NAME_TEST_EPISODES_MODEL_FREE).
# The value returned by `run_simulation` is kept as the experiment name.
# NOTE(review): with `suffix_log_experiment_name=True` the logged name
# presumably gets a timestamp suffix (cf. the `_202208241633` names hard-coded
# in the next cell) — confirm in `run_simulation`.

# TDMA 4 slots
experiment_name_tdma_4_slots_on_mf_test = run_simulation(
    metadata_tdma_4_slots,
    log=True,
    log_train=False,
    log_experiment_name=f"{EXPERIMENT_NAME_PREFIX}_tdma_4_slots_on_mf_test",
    suffix_log_experiment_name=True,
    load_forced_test_experiment_name=EXPERIMENT_NAME_TEST_EPISODES_MODEL_FREE
)



# TDMA 2 slots
experiment_name_tdma_2_slots_on_mf_test = run_simulation(
    metadata_tdma_2_slots,
    log=True,
    log_train=False,
    log_experiment_name=f"{EXPERIMENT_NAME_PREFIX}_tdma_2_slots_on_mf_test",
    suffix_log_experiment_name=True,
    load_forced_test_experiment_name=EXPERIMENT_NAME_TEST_EPISODES_MODEL_FREE
)

In [None]:
# Names of previously logged TDMA runs. These assignments override whatever
# the previous cell returned.
experiment_name_tdma_4_slots_on_mf_test = "paper_policy_opt_tdma_4_slots_on_mf_test_202208241633"
experiment_name_tdma_2_slots_on_mf_test = "paper_policy_opt_tdma_2_slots_on_mf_test_202208241633"

# Test-policy episodes of each model-free training run, in run order.
experiments_model_free_on_mf_test = [
    Episode.load_experiment(
        get_experiment_name_model_free(n_training),
        experiment_phase=ExperimentPhase.TEST_POLICY
    )
    for n_training in range(N_TRAININGS_MODEL_FREE)
]

# Test-policy episodes of the two TDMA baselines.
experiment_tdma_4_slots_on_mf_test = Episode.load_experiment(
    experiment_name_tdma_4_slots_on_mf_test, experiment_phase=ExperimentPhase.TEST_POLICY
)
experiment_tdma_2_slots_on_mf_test = Episode.load_experiment(
    experiment_name_tdma_2_slots_on_mf_test, experiment_phase=ExperimentPhase.TEST_POLICY
)

### Plot model-free baseline against TDMA baselines

In [None]:
subplots = plt.subplots(1, 4)

# (second positional argument, experiments, bar colour) for each baseline:
# TDMA 4 slots, TDMA 2 slots, then the model-free trainings.
barplot_series = (
    (0, [experiment_tdma_4_slots_on_mf_test], "tab:red"),
    (1, [experiment_tdma_2_slots_on_mf_test], "tab:green"),
    (2, experiments_model_free_on_mf_test, "tab:blue"),
)

for series_index, series_experiments, series_color in barplot_series:
    plot_validation_metrics_barplots(
        subplots,
        series_index,
        series_experiments,
        N_STEPS_RETURN_PLOT,
        RETURN_DISCOUNT,
        barplot_kwargs={"color": series_color},
        n_packets_max=N_PACKETS_MAX,
        max_throughput=1.2
    )

## Train multiple models on random exploration policy

In [None]:
EXPERIMENT_VERSION = "08"  # Version of runs

### Test episodes

In [None]:
%autoreload 2

LOG_POLICY_TRAINING = False  # Logging training can take a lot of disk


def get_metadata_model_based_test_episodes():
    """Build the `Metadata` used to generate the model-based test episodes.

    This is the 4-slot TDMA baseline configuration (no model learning, no
    policy training). It delegates to `get_metadata_tdma` instead of
    duplicating its metadata dictionary, so the two cannot drift apart.
    """
    return get_metadata_tdma(TDMA_4_SLOTS_AGENTS_CONF)

# Name of the experiment logging the test episodes of this section; the
# policy-optimization runs below pass it as `load_forced_test_experiment_name`.
EXPERIMENT_NAME_TEST_EPISODES_MODEL_BASED = f"{EXPERIMENT_NAME_PREFIX}_model_based_test_episodes_{EXPERIMENT_VERSION}"

In [None]:
%autoreload 2

# Run the TDMA-based test configuration once (no training logs) and store it
# under EXPERIMENT_NAME_TEST_EPISODES_MODEL_BASED, without a name suffix, so
# later cells can reference it by that exact name.
metadata_test_episodes = get_metadata_model_based_test_episodes()
run_simulation(
    metadata_test_episodes,
    log=True,
    log_train=False,
    log_experiment_name=EXPERIMENT_NAME_TEST_EPISODES_MODEL_BASED,
    suffix_log_experiment_name=False
)

### Train models

In [None]:
# Number of independently learned models.
N_MODELS_PER_STEP = 50
# Spacing between the model-learning step counts that are evaluated.
N_STEPS_INTERVAL = 1
# Largest model-learning step count (also used as `train_model_max_steps`).
N_STEPS_MAX = 20
# Evaluated step counts; with the values above: 0, 1, ..., 20 (N_STEPS_MAX included).
N_STEPS_RANGE = list(range(0, N_STEPS_MAX+N_STEPS_INTERVAL, N_STEPS_INTERVAL))
# Every (n_model, n_steps) pair, ordered model by model.
RANGE_MODEL_STEPS = list(product(range(N_MODELS_PER_STEP), N_STEPS_RANGE))

def get_metadata_model_learning():
    """Build the `Metadata` of a model-learning run.

    The environment metadata is `ENV_METADATA` with `test_n_episodes` forced
    to 0. The model is trained for one episode of at most `N_STEPS_MAX`
    steps with a random exploration policy, and no policy training is
    scheduled.
    """
    env_metadata = dict(ENV_METADATA, test_n_episodes=0)
    digital_twin_kwargs = {
        "n_packets_rollouts": N_PACKETS,
        "prior_dirichlet_concentration": PRIOR_DIRICHLET_CONCETRATION,
        "model_sampling_method": "posterior_sample",
        "exploration_policy_type": "random",
    }
    train_metadata = {
        "digital_twin_class": "DigitalTwinSeparateModel",
        "digital_twin_kwargs": digital_twin_kwargs,
        "train_model_n_episodes": 1,
        "train_model_max_steps": N_STEPS_MAX,
        "train_policy_n_episodes": 0,
        "train_policy_max_steps": 0,
        "policy_optimizer_class": "PolicyOptimizerCOMA",
        "policy_optimizer_kwargs": POLICY_OPTIMIZER_METADATA,
    }
    return Metadata.from_dict({
        "env_metadata": env_metadata,
        "train_metadata": train_metadata,
    })


def get_model_learning_experiment_name(n_model, n_steps, output_experiment_name_prefix=False):
    """Return the experiment name of model `n_model` after `n_steps` learning steps.

    When `output_experiment_name_prefix` is true, the `_n_steps_<n_steps>`
    suffix is left out and only the per-model prefix is returned (`n_steps`
    is then ignored).
    """
    model_prefix = f"{EXPERIMENT_NAME_PREFIX}_model_learning_random_{EXPERIMENT_VERSION}_n_model_{n_model}"
    if output_experiment_name_prefix:
        return model_prefix
    return f"{model_prefix}_n_steps_{n_steps}"

In [None]:
%autoreload 2

# Learn each model once. A model is skipped as soon as one of its per-step
# experiment folders is already on disk.
# NOTE(review): only steps 0..N_STEPS_MAX-1 are checked, while N_STEPS_RANGE
# also contains N_STEPS_MAX — confirm this is intended.
for n_model in range(N_MODELS_PER_STEP):
    metadata_model_learning = get_metadata_model_learning()
    experiment_name_model_learning_prefix = get_model_learning_experiment_name(
        n_model, None, output_experiment_name_prefix=True
    )
    candidate_experiment_names = [
        f"{experiment_name_model_learning_prefix}_n_steps_{n_steps}"
        for n_steps in range(N_STEPS_MAX)
    ]
    existing_experiment_names = [
        candidate for candidate in candidate_experiment_names
        if does_experiment_exist(candidate)
    ]
    skip_model = bool(existing_experiment_names)
    if skip_model:
        # Report the first existing experiment only.
        experiment_name = existing_experiment_names[0]
        print(f"""
                Experiment '{experiment_name}' already done... 
                Skipping models with prefix '{experiment_name_model_learning_prefix}' !
            """)
        continue
    run_simulation(
        metadata_model_learning,
        log=True,
        log_train=True,
        log_experiment_name=experiment_name_model_learning_prefix,
        suffix_log_experiment_name=False,
        log_model_at_each_step=True
    )

In [None]:
# First model-training episode of model 0.
model_0_experiment_name = get_model_learning_experiment_name(
    0, None, output_experiment_name_prefix=True
)
episode_max_steps = Episode.load_experiment(
    model_0_experiment_name, experiment_phase=ExperimentPhase.TRAIN_MODEL
)[0]

# Keyword arguments shared by all the Dirichlet plots below.
dirichlet_plot_kwargs = {
    "prior_dirichlet_concentration": PRIOR_DIRICHLET_CONCETRATION,
    "prior_dirichlet_concentration_map": PRIOR_DIRICHLET_CONCETRATION_MAP,
    "n_steps_per_plot": N_STEPS_INTERVAL,
}

# Data-generation plots, one per cluster.
for cluster_name in ("cluster_1", "cluster_2"):
    plot_dirichlet_data_generation(
        episode_max_steps,
        cluster_name,
        true_transition_probability={"": DEFAULT_JOINT_DISTRIBUTION},
        x_axis_range=[0, 1],
        x_axis_step=0.001,
        **dirichlet_plot_kwargs
    )

# MPR channel plot.
plot_dirichlet_mpr_channel(
    episode_max_steps,
    max_packets_transmitted=4,
    true_mpr_matrix=MPR_MATRIX,
    **dirichlet_plot_kwargs
)

## Optimize policies on model

### Model as posterior sample from Bayesian estimator

In [None]:
def get_metadata_policy_opt_on_posterior_model(lr_actor=POLICY_OPTIMIZER_METADATA["learning_rate_actor"]):
    """Build the `Metadata` for policy optimization on a posterior-sampled model.

    No model learning is scheduled here; the policy is trained with COMA,
    using `POLICY_OPTIMIZER_METADATA` with its actor learning rate replaced
    by `lr_actor`.
    """
    digital_twin_kwargs = {
        "n_packets_rollouts": N_PACKETS,
        "prior_dirichlet_concentration": PRIOR_DIRICHLET_CONCETRATION,
        "model_sampling_method": "posterior_sample",
        "n_steps_between_model_update": N_STEPS_BETWEEN_POSTERIOR_SAMPLE,
        "exploration_policy_type": "random",
    }
    policy_optimizer_kwargs = dict(POLICY_OPTIMIZER_METADATA, learning_rate_actor=lr_actor)
    train_metadata = {
        "digital_twin_class": "DigitalTwinSeparateModel",
        "digital_twin_kwargs": digital_twin_kwargs,
        "train_model_n_episodes": 0,
        "train_model_max_steps": 0,
        "train_policy_n_episodes": N_EPISODES_TRAINING_POLICY,
        "train_policy_max_steps": N_STEPS_TRAINING_POLICY,
        "policy_optimizer_class": "PolicyOptimizerCOMA",
        "policy_optimizer_kwargs": policy_optimizer_kwargs,
    }
    return Metadata.from_dict({
        "env_metadata": ENV_METADATA,
        "train_metadata": train_metadata,
    })

def get_experiment_name_policy_opt_on_posterior_model(n_model, n_steps):
    """Return the experiment name of the posterior-model policy optimization for (`n_model`, `n_steps`)."""
    family_prefix = f"{EXPERIMENT_NAME_PREFIX}_policy_opt_on_posterior_model_{EXPERIMENT_VERSION}"
    return f"{family_prefix}_n_model_{n_model}_n_steps_{n_steps}"

In [None]:
%autoreload 2

# Optimize a policy on every (model, n_steps) posterior model that has not
# been processed yet. When a run fails, its partial output folder is moved
# aside (suffix `_ERROR`) and the run is retried once with half the default
# actor learning rate.
for n_model, n_steps in RANGE_MODEL_STEPS:
    policy_opt_experiment_name = get_experiment_name_policy_opt_on_posterior_model(n_model, n_steps)
    model_experiment_name = get_model_learning_experiment_name(n_model, n_steps)
    metadata = get_metadata_policy_opt_on_posterior_model()
    if does_experiment_exist(policy_opt_experiment_name):
        print(f"Experiment '{policy_opt_experiment_name}' already done...")
        continue

    # Arguments shared by the first attempt and the retry.
    simulation_kwargs = dict(
        log=True,
        log_train=LOG_POLICY_TRAINING,
        log_trained_model_or_policy=True,
        log_experiment_name=policy_opt_experiment_name,
        load_model_experiment_name=model_experiment_name,
        load_forced_test_experiment_name=EXPERIMENT_NAME_TEST_EPISODES_MODEL_BASED,
        suffix_log_experiment_name=False
    )
    try:
        run_simulation(metadata, **simulation_kwargs)
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit stop the notebook instead of silently triggering a retry.
    except Exception as e:
        print(e)
        print("ERROR ! Trying with half the learning rate for the actor...")
        if does_experiment_exist(policy_opt_experiment_name):
            os.rename(
                os.path.join(PROJECT_FOLDER, DATA_FOLDER, policy_opt_experiment_name),
                os.path.join(PROJECT_FOLDER, DATA_FOLDER, f"{policy_opt_experiment_name}_ERROR"),
            )
        new_actor_lr = POLICY_OPTIMIZER_METADATA["learning_rate_actor"] / 2
        metadata = get_metadata_policy_opt_on_posterior_model(new_actor_lr)
        run_simulation(metadata, **simulation_kwargs)

### Model as maximum likelihood estimator

In [None]:
def get_metadata_policy_opt_on_max_likelihood_model(lr_actor=POLICY_OPTIMIZER_METADATA["learning_rate_actor"]):
    """Build the `Metadata` for policy optimization on a maximum-likelihood model.

    No model learning is scheduled here; the policy is trained with COMA,
    using `POLICY_OPTIMIZER_METADATA` with its actor learning rate replaced
    by `lr_actor`.
    """
    digital_twin_kwargs = {
        "n_packets_rollouts": N_PACKETS,
        "prior_dirichlet_concentration": PRIOR_DIRICHLET_CONCETRATION,
        "model_sampling_method": "maximum_likelihood",
        "exploration_policy_type": "random",
    }
    policy_optimizer_kwargs = dict(POLICY_OPTIMIZER_METADATA, learning_rate_actor=lr_actor)
    train_metadata = {
        "digital_twin_class": "DigitalTwinSeparateModel",
        "digital_twin_kwargs": digital_twin_kwargs,
        "train_model_n_episodes": 0,
        "train_model_max_steps": 0,
        "train_policy_n_episodes": N_EPISODES_TRAINING_POLICY,
        "train_policy_max_steps": N_STEPS_TRAINING_POLICY,
        "policy_optimizer_class": "PolicyOptimizerCOMA",
        "policy_optimizer_kwargs": policy_optimizer_kwargs,
    }
    return Metadata.from_dict({
        "env_metadata": ENV_METADATA,
        "train_metadata": train_metadata,
    })

def get_experiment_name_policy_opt_on_max_likelihood_model(n_model, n_steps):
    """Return the experiment name of the maximum-likelihood policy optimization for (`n_model`, `n_steps`)."""
    family_prefix = f"{EXPERIMENT_NAME_PREFIX}_policy_opt_on_max_likelihood_model_{EXPERIMENT_VERSION}"
    return f"{family_prefix}_n_model_{n_model}_n_steps_{n_steps}"

In [None]:
# Optimize a policy on every (model, n_steps) maximum-likelihood model that
# has not been processed yet. When a run fails, its partial output folder is
# moved aside (suffix `_ERROR`) and the run is retried once with half the
# default actor learning rate.
for n_model, n_steps in RANGE_MODEL_STEPS:
    policy_opt_experiment_name = get_experiment_name_policy_opt_on_max_likelihood_model(n_model, n_steps)
    model_experiment_name = get_model_learning_experiment_name(n_model, n_steps)
    metadata = get_metadata_policy_opt_on_max_likelihood_model()
    if does_experiment_exist(policy_opt_experiment_name):
        print(f"Experiment '{policy_opt_experiment_name}' already done...")
        continue

    # Arguments shared by the first attempt and the retry.
    simulation_kwargs = dict(
        log=True,
        log_train=LOG_POLICY_TRAINING,
        log_trained_model_or_policy=True,
        log_experiment_name=policy_opt_experiment_name,
        load_model_experiment_name=model_experiment_name,
        load_forced_test_experiment_name=EXPERIMENT_NAME_TEST_EPISODES_MODEL_BASED,
        suffix_log_experiment_name=False
    )
    try:
        run_simulation(metadata, **simulation_kwargs)
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit stop the notebook instead of silently triggering a retry.
    except Exception as e:
        print(e)
        print("ERROR ! Trying with half the learning rate for the actor...")
        if does_experiment_exist(policy_opt_experiment_name):
            os.rename(
                os.path.join(PROJECT_FOLDER, DATA_FOLDER, policy_opt_experiment_name),
                os.path.join(PROJECT_FOLDER, DATA_FOLDER, f"{policy_opt_experiment_name}_ERROR"),
            )
        new_actor_lr = POLICY_OPTIMIZER_METADATA["learning_rate_actor"] / 2
        metadata = get_metadata_policy_opt_on_max_likelihood_model(new_actor_lr)
        run_simulation(metadata, **simulation_kwargs)

### Model as maximum a posteriori estimator (MAP)

In [None]:
def get_metadata_policy_opt_on_map_model(lr_actor=POLICY_OPTIMIZER_METADATA["learning_rate_actor"]):
    """Build the `Metadata` for policy optimization on a maximum-a-posteriori model.

    Uses the MAP-specific prior `PRIOR_DIRICHLET_CONCETRATION_MAP`. No model
    learning is scheduled here; the policy is trained with COMA, using
    `POLICY_OPTIMIZER_METADATA` with its actor learning rate replaced by
    `lr_actor`.
    """
    digital_twin_kwargs = {
        "n_packets_rollouts": N_PACKETS,
        "prior_dirichlet_concentration": PRIOR_DIRICHLET_CONCETRATION_MAP,
        "model_sampling_method": "maximum_a_posteriori",
        "exploration_policy_type": "random",
    }
    policy_optimizer_kwargs = dict(POLICY_OPTIMIZER_METADATA, learning_rate_actor=lr_actor)
    train_metadata = {
        "digital_twin_class": "DigitalTwinSeparateModel",
        "digital_twin_kwargs": digital_twin_kwargs,
        "train_model_n_episodes": 0,
        "train_model_max_steps": 0,
        "train_policy_n_episodes": N_EPISODES_TRAINING_POLICY,
        "train_policy_max_steps": N_STEPS_TRAINING_POLICY,
        "policy_optimizer_class": "PolicyOptimizerCOMA",
        "policy_optimizer_kwargs": policy_optimizer_kwargs,
    }
    return Metadata.from_dict({
        "env_metadata": ENV_METADATA,
        "train_metadata": train_metadata,
    })

def get_experiment_name_policy_opt_on_map_model(n_model, n_steps):
    """Return the experiment name of the MAP-model policy optimization for (`n_model`, `n_steps`)."""
    family_prefix = f"{EXPERIMENT_NAME_PREFIX}_policy_opt_on_map_model_{EXPERIMENT_VERSION}"
    return f"{family_prefix}_n_model_{n_model}_n_steps_{n_steps}"

In [None]:
# Optimize a policy on every (model, n_steps) MAP model that has not been
# processed yet. When the first run fails, its partial output folder is moved
# aside (suffix `_ERROR`) and the run is retried with half the default actor
# learning rate.
for n_model, n_steps in RANGE_MODEL_STEPS:
    policy_opt_experiment_name = get_experiment_name_policy_opt_on_map_model(n_model, n_steps)
    model_experiment_name = get_model_learning_experiment_name(n_model, n_steps)
    metadata = get_metadata_policy_opt_on_map_model()
    if does_experiment_exist(policy_opt_experiment_name):
        print(f"Experiment '{policy_opt_experiment_name}' already done...")
        continue

    # Arguments shared by the first attempt and all the retries.
    simulation_kwargs = dict(
        log=True,
        log_train=LOG_POLICY_TRAINING,
        log_trained_model_or_policy=True,
        log_experiment_name=policy_opt_experiment_name,
        load_model_experiment_name=model_experiment_name,
        load_forced_test_experiment_name=EXPERIMENT_NAME_TEST_EPISODES_MODEL_BASED,
        suffix_log_experiment_name=False
    )
    try:
        run_simulation(metadata, **simulation_kwargs)
    except Exception as e:
        print(e)
        print("ERROR ! Trying with half the learning rate for the actor...")
        if does_experiment_exist(policy_opt_experiment_name):
            os.rename(
                os.path.join(PROJECT_FOLDER, DATA_FOLDER, policy_opt_experiment_name),
                os.path.join(PROJECT_FOLDER, DATA_FOLDER, f"{policy_opt_experiment_name}_ERROR"),
            )
        new_actor_lr = POLICY_OPTIMIZER_METADATA["learning_rate_actor"] / 2
        metadata = get_metadata_policy_opt_on_map_model(new_actor_lr)
        # Up to 5 attempts. The learning rate is halved once: every attempt
        # reuses the same halved value.
        n_max_attempts = 5
        for n_attempt in range(n_max_attempts):
            try:
                run_simulation(metadata, **simulation_kwargs)
            except Exception as e:
                print(e)
                print(f"ERROR ! Retrying with halved learning rate for the actor (attempt {n_attempt})...")
            else:
                # Success: stop retrying.
                break
        else:
            # Every attempt failed: say so instead of moving on silently.
            print(
                f"ERROR ! Experiment '{policy_opt_experiment_name}' still failing "
                f"after {n_max_attempts} attempts, skipping it."
            )

In [None]:
# NOTE(review): leftover scratch cell. Its first line was the incomplete
# assignment `l =`, which is a SyntaxError and aborts the cell; it is dropped
# here because the intended value is unknown. Nothing visible in this
# notebook reads `l` or `a`.
a = []

### Model free baseline

In [None]:
# Number of model-free baseline trainings that are run and later loaded.
N_TRAININGS_MODEL_FREE_BASELINE = 50

def get_metadata_model_free_baseline():
    """Build the `Metadata` of a model-free COMA baseline run.

    No model learning is scheduled (zero episodes and steps); the policy is
    trained for `N_EPISODES_TRAINING_POLICY` episodes of at most
    `N_STEPS_TRAINING_POLICY` steps with the pass-through digital twin.
    """
    baseline_train_metadata = {
        "digital_twin_class": "DigitalTwinPolicyPassthrough",
        "digital_twin_kwargs": {},
        "train_model_n_episodes": 0,
        "train_model_max_steps": 0,
        "train_policy_n_episodes": N_EPISODES_TRAINING_POLICY,
        "train_policy_max_steps": N_STEPS_TRAINING_POLICY,
        "policy_optimizer_class": "PolicyOptimizerCOMA",
        "policy_optimizer_kwargs": POLICY_OPTIMIZER_METADATA,
    }
    metadata_dict = {
        "env_metadata": ENV_METADATA,
        "train_metadata": baseline_train_metadata,
    }
    return Metadata.from_dict(metadata_dict)

def get_experiment_name_model_free_baseline(n_training):
    """Return the experiment name of model-free baseline training `n_training`."""
    family_prefix = f"{EXPERIMENT_NAME_PREFIX}_policy_opt_model_free_baseline_{EXPERIMENT_VERSION}"
    return f"{family_prefix}_n_training_{n_training}"


In [None]:
# Run every model-free baseline training whose experiment folder is not on
# disk yet.
for n_training in range(N_TRAININGS_MODEL_FREE_BASELINE):
    policy_opt_experiment_name = get_experiment_name_model_free_baseline(n_training)
    metadata = get_metadata_model_free_baseline()
    if does_experiment_exist(policy_opt_experiment_name):
        print(f"Experiment '{policy_opt_experiment_name}' already done...")
        continue
    baseline_run_kwargs = {
        "log": True,
        "log_train": True,
        "log_experiment_name": policy_opt_experiment_name,
        "load_forced_test_experiment_name": EXPERIMENT_NAME_TEST_EPISODES_MODEL_BASED,
        "suffix_log_experiment_name": False,
    }
    run_simulation(metadata, **baseline_run_kwargs)

## Plot performance Metrics

### Load experiments functions

In [None]:
# Episodes requested from each experiment (passed as `select_episodes`).
SELECT_EPISODES = ["ep_0"]

# Fields requested from each episode (passed as `light_selection`).
# NOTE(review): presumably `True` loads a field entirely and an empty dict
# loads none of its sub-fields — confirm in `Episode.load_experiment`.
LIGHT_SELECTION = {
    "rewards": True,
    "actions": True,
    "info": {
        "buffer_overflow": True,
        "channel_collision": True,
    },
    "state": True,
    "digital_twin_info": {},
    "train_info": {}
}

def load_model_free_test_experiments():
    """Load the test episodes of every model-free baseline training.

    The list of loaded experiments (one per training) is repeated
    `len(N_STEPS_RANGE)` times, so the returned list has one entry per
    (step count, training) pair.
    """
    episodes_per_training = []
    for n_training in range(N_TRAININGS_MODEL_FREE_BASELINE):
        experiment_name = get_experiment_name_model_free_baseline(n_training)
        episodes_per_training.append(
            Episode.load_experiment(
                experiment_name,
                experiment_phase=ExperimentPhase.TEST_POLICY,
                light_selection=LIGHT_SELECTION,
                select_episodes=SELECT_EPISODES
            )
        )
    model_free_test_episodes = episodes_per_training * len(N_STEPS_RANGE)
    print("Model free test episodes loaded !")
    return model_free_test_episodes

def load_posterior_test_experiments_generator():
    """Yield the test episodes of each posterior-model policy optimization.

    Experiments are yielded in `RANGE_MODEL_STEPS` order. A progress message
    is printed after the `N_STEPS_MAX` experiment of a model is consumed.
    """
    for n_model, n_steps in RANGE_MODEL_STEPS:
        experiment_name = get_experiment_name_policy_opt_on_posterior_model(n_model, n_steps)
        yield Episode.load_experiment(
            experiment_name,
            experiment_phase=ExperimentPhase.TEST_POLICY,
            light_selection=LIGHT_SELECTION,
            select_episodes=SELECT_EPISODES
        )
        if n_steps != N_STEPS_MAX:
            continue
        print(f"Posterior model {n_model} loaded...")

def load_frequentist_test_experiments_generator(map_estimator=True):
    """Yield the test episodes of each frequentist-model policy optimization.

    `map_estimator` selects the MAP experiments when true and the
    maximum-likelihood ones otherwise. Experiments are yielded in
    `RANGE_MODEL_STEPS` order, with a progress message after the
    `N_STEPS_MAX` experiment of a model is consumed.
    """
    if map_estimator:
        get_experiment_name_func = get_experiment_name_policy_opt_on_map_model
    else:
        get_experiment_name_func = get_experiment_name_policy_opt_on_max_likelihood_model

    for n_model, n_steps in RANGE_MODEL_STEPS:
        experiment_name = get_experiment_name_func(n_model, n_steps)
        yield Episode.load_experiment(
            experiment_name,
            experiment_phase=ExperimentPhase.TEST_POLICY,
            light_selection=LIGHT_SELECTION,
            select_episodes=SELECT_EPISODES
        )
        if n_steps != N_STEPS_MAX:
            continue
        print(f"{'MAP' if map_estimator else 'Max likelihood'} model {n_model} loaded...")


### Plot functions

In [None]:
# Frequentist curve: MAP experiments when True, maximum-likelihood otherwise.
USE_MAP_ESTIMATOR = True

# One (figure, axes) pair per metric, each paired with the flags selecting
# that single metric: index 0 = return, 1 = throughput, 2 = buffer,
# 3 = collision, 4 = overflow.
INDIVIDUAL_PLOTS = [
    (
        plt.subplots(1, 1),
        {
            "plot_return": i == 0,
            "plot_throughput": i == 1,
            "plot_buffer": i == 2,
            "plot_collision": i == 3,
            "plot_overflow": i == 4,
        }
    )
    for i in range(5)
]

# Keyword arguments forwarded to every
# `plot_validation_metrics_per_n_steps_model_learning` call.
# NOTE(review): the `range_*` entries are presumably y-axis limits — confirm
# in the plotting function.
PLOT_KWARGS ={
    "plot_buffer_max": False,
    "range_return": [-1700, 600],
    "range_throughput": [-0.1, 1.15],
    "range_buffer": [55, 105],
    "range_overflow": [0.1, 0.45],
    "range_collision": [0, 0.5],
}

def _plot_experiment_series(
    title,
    load_experiments,
    n_steps_per_experiment,
    plot_args,
    color,
    annotate_x_pos,
    annotate_label,
):
    """Load one family of experiments and draw it on every individual plot.

    `load_experiments` is a zero-argument callable returning the list of
    experiments. The list only lives inside this function, so it is released
    before the next family is loaded.
    """
    banner = f"PLOTTING {title}"
    print(banner)
    print("-" * len(banner))
    print("Loading test experiments")
    experiments = load_experiments()
    print("Getting plots")
    for subplot, plot_selection_kwargs in INDIVIDUAL_PLOTS:
        plot_validation_metrics_per_n_steps_model_learning(
            subplot,
            n_steps_per_experiment,
            experiments,
            *plot_args,
            lineplot_kwargs={"color": color},
            annotate_x_pos=annotate_x_pos,
            annotate_label=annotate_label,
            **PLOT_KWARGS,
            **plot_selection_kwargs
        )
    print("Freeing memory")
    del experiments


def load_experiments_and_compute_plots():
    """Draw the model-free, Bayesian and frequentist curves on `INDIVIDUAL_PLOTS`.

    The three experiment families are loaded and plotted one after the other
    so that only one of them is held in memory at a time. The figures are
    resized at the end.
    """
    # Positional arguments shared by every plotting call.
    plot_args = [
        N_STEPS_RETURN_PLOT,
        RETURN_DISCOUNT,
        N_PACKETS_MAX
    ]
    # Step count of each loaded experiment. The model-free list repeats each
    # step count once per baseline training, matching the layout returned by
    # `load_model_free_test_experiments`.
    n_steps_range_model_free = [
        n_steps
        for n_steps in N_STEPS_RANGE
        for _ in range(N_TRAININGS_MODEL_FREE_BASELINE)
    ]
    n_steps_range_all_models = [n_steps for _, n_steps in RANGE_MODEL_STEPS]

    # Model-free baseline
    _plot_experiment_series(
        "MODEL FREE BASELINE",
        load_model_free_test_experiments,
        n_steps_range_model_free,
        plot_args,
        color="blue",
        annotate_x_pos=12,
        annotate_label="Oracle-aided",
    )

    # Bayesian model based
    _plot_experiment_series(
        "BAYESIAN MODEL BASED",
        lambda: list(load_posterior_test_experiments_generator()),
        n_steps_range_all_models,
        plot_args,
        color="orange",
        annotate_x_pos=9,
        annotate_label="Bayesian",
    )

    # Frequentist model based (MAP or maximum likelihood, see USE_MAP_ESTIMATOR).
    # The banner used to read "BAYESIAN MODEL BASED" (copy-paste slip).
    _plot_experiment_series(
        "FREQUENTIST MODEL BASED",
        lambda: list(load_frequentist_test_experiments_generator(map_estimator=USE_MAP_ESTIMATOR)),
        n_steps_range_all_models,
        plot_args,
        color="green",
        annotate_x_pos=6,
        annotate_label="Frequentist",
    )

    # Plot params
    print("Setting up plot parameters")
    figsize = (10, 4)
    for (fig, _ax), _ in INDIVIDUAL_PLOTS:
        fig.set_size_inches(*figsize)

In [None]:
load_experiments_and_compute_plots()

### Plots

In [None]:
# Return
INDIVIDUAL_PLOTS[0][0][0]

In [None]:
# Throughput
INDIVIDUAL_PLOTS[1][0][0]

In [None]:
# Buffer Occupancy
INDIVIDUAL_PLOTS[2][0][0]

In [None]:
# Overflow
INDIVIDUAL_PLOTS[4][0][0]

In [None]:
# Collision
INDIVIDUAL_PLOTS[3][0][0]