In [1]:
from stable_baselines3.common.env_util import make_vec_env
from wandb.integration.sb3 import WandbCallback
from sb3_contrib.ppo_mask import MaskablePPO
from benchmark import get_benchmarking_data
from env import MPSPEnv
import numpy as np
import torch
import wandb
import os
# Set the wandb notebook name and the silent flag through environment
# variables, then authenticate; the login result is the cell's displayed value.
os.environ.update({
    'WANDB_NOTEBOOK_NAME': 'sb3.ipynb',
    'WANDB_SILENT': 'true',
})
wandb.login()

pygame 2.1.2 (SDL 2.0.18, Python 3.9.13)
Hello from the pygame community. https://www.pygame.org/contribute.html


True

In [None]:
from torch import nn

In [None]:
# Toy pipeline for shape experiments: flatten the integer ids of each sample,
# look every id up in a 7-entry, 2-dimensional embedding table, then flatten
# again so each sample ends up as a single vector.
test = nn.Sequential(
    nn.Flatten(),
    nn.Embedding(num_embeddings=7, embedding_dim=2),
    nn.Flatten(),
)

In [None]:
# Batch of two 2x3 grids of integer ids (largest id is 6).
test_tensor = torch.tensor([
    [[1, 2, 3], [4, 5, 6]],
    [[4, 2, 3], [4, 5, 1]],
])

In [None]:
# Two identical 3x3 grids counting 1..9, stacked along a leading batch axis.
square_tensor = torch.arange(1, 10).reshape(1, 3, 3).repeat(2, 1, 1)
square_tensor

In [None]:
# (row, col) index pairs of the strictly upper triangle of a 3x3 matrix;
# offset=1 starts one step above the main diagonal, so the diagonal is excluded.
indeces = torch.triu_indices(row=3, col=3, offset=1)

In [None]:
# Extract the strictly-upper-triangular entries of square_tensor. The leading
# `:` keeps the batch dimension, so the same (row, col) pairs from `indeces`
# are gathered from every matrix in the batch.
square_tensor[:, indeces[0], indeces[1]]

In [None]:
# Run the toy Flatten -> Embedding -> Flatten pipeline on the sample batch.
test(test_tensor)

In [2]:
# Single source of truth for the experiment; also passed to wandb as run config.
config = dict(
    # Environment: bay dimensions and number of ports
    ROWS=10,
    COLUMNS=4,
    N_PORTS=10,
    # Model: hidden-layer widths of the policy (pi) and value (vf) networks
    PI_LAYER_SIZES=[64, 128, 64],
    VF_LAYER_SIZES=[64, 128, 64],
    # Training
    TOTAL_TIMESTEPS=4_800_000,
    BATCH_SIZE=128,
)

In [None]:
# Start a wandb run in project "PPO-SB3" under entity "rl-msps". The run name
# encodes the port count and bay dimensions taken from `config`, and the whole
# `config` dict is attached to the run.
# NOTE(review): sync_tensorboard=True presumably relies on the model writing
# TensorBoard logs, but tensorboard_log is commented out where the model is
# built - confirm that training metrics still reach wandb.
run = wandb.init(
    project="PPO-SB3",
    entity="rl-msps",
    sync_tensorboard=True,
    name=f"N{config['N_PORTS']}_R{config['ROWS']}_C{config['COLUMNS']}",
    config=config,
    tags=["test"]
)

In [None]:
# Vectorised environment: eight copies of the same MPSPEnv configuration, each
# created lazily by the factory lambda.
# NOTE(review): no vec_env_cls is passed; make_vec_env is documented to fall
# back to DummyVecEnv (sequential stepping) in that case - confirm whether the
# 8 cores mentioned below are actually being used.
env = make_vec_env(
    lambda: MPSPEnv(config['ROWS'], config['COLUMNS'], config['N_PORTS']),
    n_envs=8,  # M2 with 8 cores
)

In [3]:
# Single (non-vectorised) environment built from the configured bay dimensions
# and port count; this rebinds `env`.
env = MPSPEnv(config['ROWS'], config['COLUMNS'], config['N_PORTS'])

In [56]:
# Draw two random bay layouts from the observation space and batch them.
bay1 = env.observation_space.sample()['bay_matrix']
bay2 = env.observation_space.sample()['bay_matrix']
# Stack into one ndarray before handing it to torch: building a tensor from a
# Python list of ndarrays is slow and triggers a PyTorch UserWarning.
bay = torch.tensor(np.stack([bay1, bay2]), dtype=torch.long)
# One container id per batch element, shape (2, 1).
container = torch.tensor([[2], [1]], dtype=torch.long)
print(bay)
print(container)


tensor([[[ 6,  8,  8,  7],
         [ 9,  4,  0,  7],
         [ 1,  9,  2, 10],
         [ 5,  9,  1,  9],
         [ 3,  6,  0,  4],
         [ 1,  7,  3,  7],
         [ 6,  4,  9,  0],
         [ 1,  3,  4,  4],
         [ 3,  5,  1,  1],
         [ 1,  7,  4,  4]],

        [[ 9,  8,  4, 10],
         [ 6,  4,  9,  7],
         [ 4,  7,  1,  2],
         [ 2,  9,  8,  4],
         [ 7,  4,  0,  2],
         [ 4,  1,  0,  2],
         [ 0,  6,  8,  6],
         [10,  5,  3,  4],
         [ 1,  0,  2,  7],
         [ 3,  4,  5,  9]]])
tensor([[2],
        [1]])


In [67]:
# Embedding table with N_PORTS + 1 rows (ids 0..N_PORTS), each mapped to an
# embed_size-dimensional vector.
embed_size = 2
embedding = torch.nn.Embedding(
    num_embeddings=config['N_PORTS'] + 1,
    embedding_dim=embed_size,
)

In [77]:
# Look up the embedding vector for each container id in the batch.
embedding(container)

tensor([[[ 0.5578,  1.5095]],

        [[-2.1424, -0.7622]]], grad_fn=<EmbeddingBackward0>)

In [68]:
# Each bay column is embedded cell by cell and then flattened, so one column
# contributes ROWS * embed_size features. Deriving the size from config keeps
# the layer valid when ROWS is changed (a literal 10 only matches ROWS=10).
linear_input_size = config['ROWS'] * embed_size
linear_output_size = 5
linear = torch.nn.Linear(linear_input_size, linear_output_size)

In [71]:
# Swap the bay's last two axes (.mT), embed every cell, then merge everything
# from axis 2 onward into one feature vector.
output = embedding(bay.mT).flatten(start_dim=2)
print(output)

tensor([[[ 0.9959, -0.6139,  1.8658, -1.1731, -2.1424, -0.7622, -1.1650,
          -0.3566, -0.2853,  0.9731, -2.1424, -0.7622,  0.9959, -0.6139,
          -2.1424, -0.7622, -0.2853,  0.9731, -2.1424, -0.7622],
         [-0.8377, -0.0974,  0.2452,  1.3679,  1.8658, -1.1731,  1.8658,
          -1.1731,  0.9959, -0.6139, -0.5232, -0.7095,  0.2452,  1.3679,
          -0.2853,  0.9731, -1.1650, -0.3566, -0.5232, -0.7095],
         [-0.8377, -0.0974,  1.2323, -1.7347,  0.5578,  1.5095, -2.1424,
          -0.7622,  1.2323, -1.7347, -0.2853,  0.9731,  1.8658, -1.1731,
           0.2452,  1.3679, -2.1424, -0.7622,  0.2452,  1.3679],
         [-0.5232, -0.7095, -0.5232, -0.7095,  1.9376, -0.3788,  1.8658,
          -1.1731,  0.2452,  1.3679, -0.5232, -0.7095,  1.2323, -1.7347,
           0.2452,  1.3679, -2.1424, -0.7622,  0.2452,  1.3679]],

        [[ 1.8658, -1.1731,  0.9959, -0.6139,  0.2452,  1.3679,  0.5578,
           1.5095, -0.5232, -0.7095,  0.2452,  1.3679,  1.2323, -1.7347,
        

In [76]:
# Project the flattened embeddings through the linear layer. nn.Linear acts on
# the last axis only, so the leading axes are preserved.
linear(output)

tensor([[[-0.6926, -0.5299,  0.0261,  1.6368,  0.0278],
         [ 0.0350, -0.2245, -0.3679,  0.2331, -0.4008],
         [ 0.3575,  0.7104,  0.1020,  0.2330, -0.4625],
         [ 0.7231, -0.1233, -0.3874,  0.3690, -0.5451]],

        [[ 0.5493,  1.0466,  0.5138, -0.3125,  1.0142],
         [-0.4114,  0.1759, -0.1001, -0.2051, -1.0957],
         [-0.7512, -0.5073,  0.8956,  0.6195, -0.0052],
         [ 0.7329, -0.3540, -0.1081, -0.1655,  0.1595]]],
       grad_fn=<ViewBackward0>)

In [74]:
# Collapse everything after the batch axis into one feature vector per sample.
linear(output).flatten(start_dim=1)

tensor([[-0.6926, -0.5299,  0.0261,  1.6368,  0.0278,  0.0350, -0.2245, -0.3679,
          0.2331, -0.4008,  0.3575,  0.7104,  0.1020,  0.2330, -0.4625,  0.7231,
         -0.1233, -0.3874,  0.3690, -0.5451],
        [ 0.5493,  1.0466,  0.5138, -0.3125,  1.0142, -0.4114,  0.1759, -0.1001,
         -0.2051, -1.0957, -0.7512, -0.5073,  0.8956,  0.6195, -0.0052,  0.7329,
         -0.3540, -0.1081, -0.1655,  0.1595]], grad_fn=<ReshapeAliasBackward0>)

In [None]:
# Inspect the env's private long-distance transportation-matrix helper, called
# with 10 (presumably the number of ports - TODO confirm against MPSPEnv).
env._get_long_distance_transportation_matrix(10)

In [None]:
# Inspect the env's private short-distance transportation-matrix helper, called
# with 10 (presumably the number of ports - TODO confirm against MPSPEnv).
env._get_short_distance_transportation_matrix(10)

In [None]:
# Inspect the env's private mixed-distance transportation-matrix helper, called
# with 10 (presumably the number of ports - TODO confirm against MPSPEnv).
env._get_mixed_distance_transportation_matrix(10)

In [None]:
# Rebuild a fresh single environment from the configured dimensions; this
# rebinds `env` once more.
env = MPSPEnv(config['ROWS'], config['COLUMNS'], config['N_PORTS'])

In [None]:
# Reset the environment and keep what it returns; a later cell indexes this
# with 'bay_matrix'.
observation = env.reset()

In [None]:
import torch.nn as nn

In [None]:
# 3x3 convolution mapping 1 input channel to 3 feature maps. Stride 1 with
# padding 1 leaves the spatial size unchanged.
conv1 = nn.Conv2d(1, 3, kernel_size=3, stride=1, padding=1)

In [None]:
# Convert the observation's bay matrix to float32 and add a leading axis of
# size 1 (presumably serving as the channel axis for conv1 - TODO confirm).
# NOTE(review): the name `input` shadows the Python builtin for the rest of the
# kernel session; it is shared with the following cells, so rename them together.
input = torch.tensor(observation['bay_matrix'], dtype=torch.float32).unsqueeze(0)
input

In [None]:
# Apply the 3x3 convolution and display the resulting feature maps.
# NOTE(review): this rebinds `input`, so re-running the cell feeds conv1 its own
# 3-channel output even though it was built with in_channels=1.
input = conv1(input)
input

In [None]:
# 2x2 max pooling with stride 2, applied to the current `input` tensor, which
# is rebound to the pooled result and displayed.
pool = nn.MaxPool2d(kernel_size=2, stride=2)
input = pool(input)
input

In [None]:
# Display the raw value returned by env.reset().
observation

In [None]:
# ReLU activations; the policy (pi) and value (vf) layer widths come from
# `config` and are passed through the policy's net_arch argument.
policy_kwargs = dict(
    activation_fn=torch.nn.ReLU,
    net_arch=[dict(pi=config['PI_LAYER_SIZES'], vf=config['VF_LAYER_SIZES'])],
)

# Set to a wandb run path to restore that run's saved model.zip; leave it
# falsy (None) to build a new model.
wandb_run_path = None

if not wandb_run_path:
    # Fresh MaskablePPO on the current `env`.
    model = MaskablePPO(
        policy='MultiInputPolicy',
        env=env,
        batch_size=config['BATCH_SIZE'],
        verbose=0,
        # tensorboard_log=f"runs/{run.id}",
        policy_kwargs=policy_kwargs,
    )
else:
    # Fetch the checkpoint file from wandb and load it against the current `env`.
    model_file = wandb.restore('model.zip', run_path=wandb_run_path)
    model = MaskablePPO.load(model_file.name, env=env)

In [None]:
# Dump the model's instance attributes for inspection (debugging aid).
vars(model)

In [None]:
# Train for the configured number of timesteps. WandbCallback is given a save
# path under models/<run id> and a save frequency of one quarter of the total
# timesteps.
# NOTE(review): needs `run` from the wandb.init cell to exist in the kernel.
model.learn(
    total_timesteps=config['TOTAL_TIMESTEPS'],
    callback=WandbCallback(
        model_save_path=f"models/{run.id}",
        model_save_freq=config['TOTAL_TIMESTEPS'] // 4,
    )
)

In [None]:
# Load benchmark set 2 and keep only the instances whose R, C and N fields
# match the configured rows, columns and port count.
eval_data = get_benchmarking_data('rl-mpsp-benchmark/set_2')
eval_data = [
    instance
    for instance in eval_data
    if all(
        instance[key] == config[setting]
        for key, setting in (('R', 'ROWS'), ('C', 'COLUMNS'), ('N', 'N_PORTS'))
    )
]

In [None]:
# Creating separate env for evaluation
env = MPSPEnv(
    config['ROWS'],
    config['COLUMNS'],
    config['N_PORTS']
)

eval_rewards = []
# Negative because env returns negative reward for shifts
paper_rewards = [-e['paper_result'] for e in eval_data]
paper_seeds = [e['seed'] for e in eval_data]

# Play one episode per benchmark instance, restricting the policy to the
# actions the env reports as valid, and record the episode's total reward.
for e in eval_data:
    total_reward = 0
    obs = env.reset(
        transportation_matrix=e['transportation_matrix']
    )
    done = False
    while not done:
        action, _ = model.predict(
            obs,
            action_masks=env.action_masks()
        )
        obs, reward, done, _ = env.step(action)
        total_reward += reward

    eval_rewards.append(total_reward)

# Named `evaluation_summary` rather than `eval` so the builtin eval() is not
# shadowed for the rest of the kernel session.
evaluation_summary = {
    'mean_reward': np.mean(eval_rewards),
    'mean_paper_reward': np.mean(paper_rewards),
    'rewards': eval_rewards,
    'paper_rewards': paper_rewards,
    'paper_seeds': paper_seeds
}
# Attach the benchmark comparison to the wandb run summary.
run.summary['evaluation_benchmark'] = evaluation_summary

In [None]:
# Mark the wandb run as finished.
run.finish()