# Optical RL-Gym

## Training the Stable Baselines agents using the DeepRMSA environment

This file contains examples of how to train agents for the DeepRMSA environment.

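The agents used in this file come from the [Stable-Baselines3](https://github.com/DLR-RM/stable-baselines3) framework (PPO and DQN) and from its companion package [SB3-Contrib](https://github.com/Stable-Baselines-Team/stable-baselines3-contrib) (TRPO).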

This notebook is based upon the one available [here](https://github.com/Stable-Baselines-Team/rl-colab-notebooks/blob/master/monitor_training.ipynb).

Before running this notebook, make sure to install Stable-Baselines3, SB3-Contrib (needed for TRPO), and the Optical RL-Gym in your Python environment.

### General imports

In [1]:
import os
import pickle
import numpy as np
from IPython.display import clear_output

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [2]:
import tensorflow as tf
# silencing tensorflow warnings
import logging
logging.getLogger('tensorflow').setLevel(logging.FATAL)
tf.__version__ # printing out tensorflow version used

'2.9.1'

### Stable Baselines imports

In [3]:
import stable_baselines3
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy
from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MlpPolicy as MLP_PPO
from stable_baselines3 import DQN
from stable_baselines3.dqn.policies import MlpPolicy as MLP_DQN
from sb3_contrib import TRPO
from sb3_contrib.trpo import MlpPolicy as MLP_TRPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common import results_plotter

stable_baselines3.__version__ # printing out stable_baselines version used

  from .autonotebook import tqdm as notebook_tqdm


'1.4.1a0'

### Environment imports

In this particular example, there is no need to import anything specific to the Optical RL-Gym. Importing OpenAI Gym below already gives you all the functionality needed.

In [4]:
import gym

### Define a callback function

In [5]:
# callback from https://stable-baselines.readthedocs.io/en/master/guide/examples.html#using-callback-monitoring-training
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback for saving a model (the check is done every ``check_freq`` steps)
    based on the training reward (in practice, we recommend using ``EvalCallback``).

    On every check, the episode rewards logged by the ``Monitor`` wrapper in
    ``log_dir`` are loaded and averaged over the last 100 episodes. When that
    mean is at least as good as the best one seen so far and more than
    ``min_timesteps`` timesteps have elapsed, the model is saved as
    ``best_model.zip`` inside ``log_dir``.

    :param check_freq: (int) Number of callback calls between two checks.
    :param log_dir: (str) Path to the folder where the model will be saved.
      It must contain the file created by the ``Monitor`` wrapper.
    :param verbose: (int) When greater than 0, progress messages are printed
      (and the notebook cell output is cleared) on every check. The model is
      saved regardless of this value.
    :param min_timesteps: (int) No model is saved until the number of
      timesteps exceeds this value.
    """
    def __init__(self, check_freq: int, log_dir: str, verbose=1, min_timesteps: int = 2000):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = log_dir
        self.min_timesteps = min_timesteps
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        # Nothing to prepare: the folder is expected to exist already, since
        # it must hold the file written by the ``Monitor`` wrapper.
        return

    def _on_step(self) -> bool:
        """Check the training reward every ``check_freq`` calls; always return True."""
        if self.n_calls % self.check_freq != 0:
            return True

        # Retrieve training reward
        x, y = ts2xy(load_results(self.log_dir), 'timesteps')
        if len(x) == 0:
            # No data logged yet
            return True

        # Mean training reward over the last 100 episodes
        mean_reward = np.mean(y[-100:])
        if self.verbose > 0:
            print("Num timesteps: {} - ".format(self.num_timesteps), end="")
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(self.best_mean_reward, mean_reward))

        # New best model: remember its reward and save the agent
        if mean_reward >= self.best_mean_reward and self.num_timesteps > self.min_timesteps:
            self.best_mean_reward = mean_reward
            if self.verbose > 0:
                print("Saving new best model to {}".format(self.save_path))
            # Saving must not depend on the verbosity level; os.path.join also
            # keeps the path valid when log_dir has no trailing separator.
            self.model.save(os.path.join(self.save_path, 'best_model.zip'))

        if self.verbose > 0:
            # wait=True postpones the clearing until new output is available
            clear_output(wait=True)

        return True

### Setting up the environment

The parameters are set as in the [DeepRMSA](https://doi.org/10.1109/JLT.2019.2923615) work and its [publicly available repository](https://github.com/xiaoliangchenUCD/DeepRMSA).

In [6]:
from explainable.utils import linear_schedule

# Run configuration: the algorithm to train, the topology name, and the
# k value that is part of the topology file name
alg_name = 'PPO'
top_name = 'nsfnet'
k_path = 3

# The topology is a pickled object stored one directory above this one
topology_dir = f'/topologies/demo/{top_name}_{k_path}.h5'
with open('..' + topology_dir, 'rb') as topology_file:
    topology = pickle.load(topology_file)

# Node request probabilities for every explicitly named topology.
# NOTE(review): 'italiana' and 'uknet' carry identical values - confirm this is intended.
known_request_probabilities = {
    # ---------------------- ARPANET ----------------------
    'arpanet': [
        0.10131117, 0.12078696, 0.06144304, 0.00394418, 0.06218475,
        0.04044608, 0.09256297, 0.02113283, 0.02084576, 0.07330581,
        0.04822402, 0.01407012, 0.0251201, 0.04523283, 0.12847282,
        0.01805554, 0.01488939, 0.03013041, 0.06110418, 0.01673704,
    ],
    # ---------------------- EON ----------------------
    'eon': [
        0.13956028, 0.02775406, 0.1583229, 0.01276534, 0.06687379,
        0.02519223, 0.02306825, 0.01166695, 0.0594671, 0.00071904,
        0.06957169, 0.13642354, 0.03778149, 0.05543918, 0.07873654,
        0.02076745, 0.00419003, 0.02320005, 0.01052078, 0.03797931,
    ],
    # ---------------------- EUROCORE ----------------------
    'eurocore': [
        0.01711661, 0.05418066, 0.11466408, 0.37467221, 0.01244822,
        0.00672383, 0.00170215, 0.14903192, 0.20510173, 0.02759766,
        0.03676094,
    ],
    # ---------------------- ITALIANA ----------------------
    'italiana': [
        0.06646663, 0.28975685, 0.04804817, 0.12453275, 0.09512295,
        0.05196806, 0.02895454, 0.0071567, 0.02332887, 0.05678903,
        0.0026715, 0.00254033, 0.0123507, 0.00230415, 0.02801925,
        0.00800734, 0.01208697, 0.02598813, 0.06745542, 0.02508004,
        0.0213716,
    ],
    # ---------------------- NSFNET ----------------------
    'nsfnet': [
        0.01801802, 0.04004004, 0.05305305, 0.01901902, 0.04504505,
        0.02402402, 0.06706707, 0.08908909, 0.13813814, 0.12212212,
        0.07607608, 0.12012012, 0.01901902, 0.16916917,
    ],
    # ---------------------- UKNET ----------------------
    'uknet': [
        0.06646663, 0.28975685, 0.04804817, 0.12453275, 0.09512295,
        0.05196806, 0.02895454, 0.0071567, 0.02332887, 0.05678903,
        0.0026715, 0.00254033, 0.0123507, 0.00230415, 0.02801925,
        0.00800734, 0.01208697, 0.02598813, 0.06745542, 0.02508004,
        0.0213716,
    ],
}

# ---------------------- USNET ----------------------
# Used for every topology name that is not listed in the table above.
fallback_request_probabilities = [
    1.23807304e-02, 2.92335629e-02, 2.62436887e-06, 8.26014201e-03,
    3.64143708e-03, 2.22290607e-03, 4.72909952e-03, 9.72772742e-03,
    1.15971163e-02, 1.77577532e-02, 1.24663926e-02, 2.65205341e-02,
    5.24785875e-03, 4.82902294e-02, 6.37146993e-04, 2.54697119e-02,
    1.23918630e-02, 1.87683811e-02, 3.47080980e-03, 5.06542659e-03,
    3.70125617e-02, 7.91621028e-02, 8.62783971e-03, 2.70442037e-02,
    4.79671702e-02, 5.16253403e-02, 2.03925432e-03, 9.14041312e-04,
    4.27046339e-03, 4.82949487e-02, 2.37528831e-03, 1.25420925e-02,
    7.26742589e-02, 1.74783004e-02, 2.70110059e-02, 8.69783866e-03,
    2.66141267e-02, 4.12887779e-02, 4.23491085e-04, 3.18204224e-02,
    1.03186416e-01, 3.16394394e-02, 7.55135878e-03, 3.57289387e-02,
    2.49978391e-03, 1.36290810e-02,
]

# Pick the values for the configured topology (USNET values when the name is unknown)
node_request_probabilities = np.array(
    known_request_probabilities.get(top_name, fallback_request_probabilities)
)

# Keyword arguments forwarded to the environment constructor
env_args = {
    'topology': topology,
    'seed': 10,
    'allow_rejection': False,
    'k_paths': k_path,
    'j': 1,
    'mean_service_holding_time': 7.5,
    'mean_service_inter_arrival_time': 0.1,
    'episode_length': 50,
    'node_request_probabilities': node_request_probabilities,
    'only_spectrum_obs': False,
}

### Creating the monitors and agent
# Folder that receives the monitor log (and is handed to the callback as its log_dir)
log_dir = f"./tmp/{top_name}_{k_path}/"
os.makedirs(log_dir, exist_ok=True)
callback = SaveOnBestTrainingRewardCallback(check_freq=100, log_dir=log_dir)
env = gym.make('DeepRMSA-v0', **env_args)

# Wrap the environment in a Monitor that logs into log_dir; on top of the
# usual monitored things, the two info keywords below (service and bit rate
# blocking rates) are requested as well
env = Monitor(
    env,
    log_dir,
    info_keywords=(
        'episode_service_blocking_rate',
        'episode_bit_rate_blocking_rate',
    ),
)

# Arguments of the policy network: five hidden layers of 128 units each.
# NOTE(review): no activation function is configured here, so whatever the
# chosen policy class uses by default applies.
policy_args = {'net_arch': [128] * 5}
tensorboard_log = f"./tb/{top_name}_{k_path}/"

# Settings shared by every algorithm
shared_agent_kwargs = {
    'verbose': 0,
    'tensorboard_log': tensorboard_log,
    'policy_kwargs': policy_args,
    'gamma': .95,
    'seed': 10,
}

# Select the agent class, its policy and its learning-rate schedule.
# Any alg_name other than 'PPO' or 'DQN' selects TRPO.
# NOTE(review): 10e-5 equals 1e-4 (and 10e-4 equals 1e-3) - confirm these are the intended rates.
if alg_name == 'PPO':
    agent_class, policy_class = PPO, MLP_PPO
    learning_rate = linear_schedule(5*10e-5, 5*10e-6)
elif alg_name == 'DQN':
    agent_class, policy_class = DQN, MLP_DQN
    learning_rate = linear_schedule(10e-4)
else: # TRPO
    agent_class, policy_class = TRPO, MLP_TRPO
    learning_rate = linear_schedule(10e-4)

agent = agent_class(policy_class, env, learning_rate=learning_rate, **shared_agent_kwargs)

### Training the agent:
# Train for one million timesteps with the callback attached
a = agent.learn(callback=callback, total_timesteps=1_000_000)

Num timesteps: 1001400 - Best mean reward: 44.44 - Last mean reward per episode: 41.92
