In [None]:
from sb3_contrib.common.maskable.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from CustomEncoder import CustomCombinedExtractor
from wandb.integration.sb3 import WandbCallback
from sb3_contrib.ppo_mask import MaskablePPO
from benchmark import get_benchmarking_data
from env import MPSPEnv
import numpy as np
import torch
import wandb
import os
# Set the wandb-related environment variables in one go, then log in.
os.environ.update({
    'WANDB_NOTEBOOK_NAME': 'sb3.ipynb',
    'WANDB_SILENT': 'true',
})
wandb.login()

In [None]:
# Single source of truth for every tunable value used by the cells below.
config = dict(
    # Environment
    ROWS=6,
    COLUMNS=2,
    N_PORTS=4,
    # Model
    EMBEDDING_SIZE=10,
    PI_LAYER_SIZES=[256, 512, 256],
    VF_LAYER_SIZES=[256, 512, 256],
    # Training
    TOTAL_TIMESTEPS=1000,
    START_LEARNING_RATE=7e-5,
    END_LEARNING_RATE=4e-6,
    BATCH_SIZE=128,
)

In [None]:
# The run name is built from the configured ports / rows / columns values.
run_name = f"N{config['N_PORTS']}_R{config['ROWS']}_C{config['COLUMNS']}"

run = wandb.init(
    entity="rl-msps",
    project="PPO-SB3",
    name=run_name,
    tags=["test"],
    config=config,
    sync_tensorboard=True,
)

In [None]:
# Build the environment from the configured ROWS, COLUMNS and N_PORTS
# (passed positionally, in that order) and wrap it in a Monitor.
env = Monitor(
    MPSPEnv(*(config[key] for key in ('ROWS', 'COLUMNS', 'N_PORTS')))
)

In [None]:
# NOTE: this schedule is currently disabled; the matching `learning_rate=`
# argument of the MaskablePPO constructor is commented out as well.
# def linear_schedule(start, end):
#     """
#     Linear learning rate schedule.
#
#     :param start: Learning rate at the beginning of training.
#     :param end: Learning rate at the end of training.
#     :return: schedule that computes
#       current learning rate depending on remaining progress
#     """
#
#     def func(progress_remaining: float) -> float:
#         """
#         Progress will decrease from 1 (beginning) to 0.
#
#         :param progress_remaining:
#         :return: current learning rate
#         """
#         # progress_remaining == 1 -> start, progress_remaining == 0 -> end.
#         # (The previous form, start + progress_remaining * (end - start),
#         # returned `end` at the beginning of training and `start` at the end.)
#         return end + progress_remaining * (start - end)
#
#     return func

In [None]:
# Policy network settings: ReLU activations with separate layer stacks for
# the policy ('pi') and value-function ('vf') heads, sized from `config`.
policy_kwargs = dict(
    activation_fn=torch.nn.ReLU,
    net_arch=[dict(
        pi=config['PI_LAYER_SIZES'],
        vf=config['VF_LAYER_SIZES'],
    )],
    # features_extractor_class=CustomCombinedExtractor,
    # features_extractor_kwargs=dict(
    #     n_ports=config['N_PORTS'],
    #     embedding_size=config['EMBEDDING_SIZE'],
    # ),
)

# TensorBoard logs are written under a directory named after the wandb run id.
model = MaskablePPO(
    env=env,
    policy='MultiInputPolicy',
    policy_kwargs=policy_kwargs,
    batch_size=config['BATCH_SIZE'],
    tensorboard_log=f"runs/{run.id}",
    verbose=0,
    # learning_rate=linear_schedule(
    #     start=config['START_LEARNING_RATE'],
    #     end=config['END_LEARNING_RATE']
    # )
)

In [None]:
# The wandb callback is given a save path named after the wandb run id.
wandb_callback = WandbCallback(model_save_path=f"models/{run.id}")

model.learn(
    callback=wandb_callback,
    total_timesteps=config['TOTAL_TIMESTEPS'],
)

In [None]:
# Evaluate the trained policy on the training environment for 1000 episodes
# and store the mean / standard deviation of the episode reward in the
# wandb run summary.
# The result is bound to `policy_evaluation` rather than `eval`, so the
# builtin `eval` is no longer shadowed in the notebook namespace.
mean_reward, std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=1000
)
policy_evaluation = {
    'mean_reward': mean_reward,
    'std_reward': std_reward
}
run.summary['evaluation'] = policy_evaluation

In [None]:
# Load the benchmark set and keep only the instances whose R / C / N values
# match the configured ROWS / COLUMNS / N_PORTS.
eval_data = list(filter(
    lambda entry: (
        entry['R'] == config['ROWS'] and
        entry['C'] == config['COLUMNS'] and
        entry['N'] == config['N_PORTS']
    ),
    get_benchmarking_data('rl-mpsp-benchmark/set_2'),
))

In [None]:
# Run the trained model once over every benchmark instance in eval_data and
# record its total reward next to the result reported for that instance.
eval_rewards = []
# Negative because env returns negative reward for shifts
paper_rewards = [-e['paper_result'] for e in eval_data]
paper_seeds = [e['seed'] for e in eval_data]

for e in eval_data:
    total_reward = 0
    # Reset the environment onto this instance's transportation matrix.
    obs = env.reset(
        transportation_matrix=e['transportation_matrix']
    )
    done = False
    while not done:
        # deterministic=True: predict() otherwise samples from the action
        # distribution, which makes the benchmark rewards vary between runs
        # and inconsistent with evaluate_policy (deterministic by default).
        action, _ = model.predict(
            obs,
            action_masks=env.action_masks(),
            deterministic=True
        )
        obs, reward, done, _ = env.step(action)
        total_reward += reward

    eval_rewards.append(total_reward)

# Bound to `benchmark_evaluation` rather than `eval` so the builtin `eval`
# is not shadowed in the notebook namespace.
benchmark_evaluation = {
    'mean_reward': np.mean(eval_rewards),
    'mean_paper_reward': np.mean(paper_rewards),
    'rewards': eval_rewards,
    'paper_rewards': paper_rewards,
    'paper_seeds': paper_seeds
}
run.summary['evaluation_benchmark'] = benchmark_evaluation

In [None]:
run.finish()