In [14]:
import PPO
from env import MPSPEnv
import wandb
import numpy as np
import torch
import os
# wandb reads WANDB_NOTEBOOK_NAME to label the run with this notebook's file name.
os.environ["WANDB_NOTEBOOK_NAME"] = "ppo.ipynb"

# Show tensors in plain decimal notation rather than scientific notation.
torch.set_printoptions(sci_mode=False)

# Kept as the last expression so the cell displays the login result.
wandb.login()



True

In [15]:
# Single source of truth for the run's hyperparameters. The whole mapping is
# passed to wandb.init(config=...), and later cells add OBSERVATION_SPACE and
# ACTION_SPACE to it.
#
# NOTE(review): in the visible cells only ROWS, COLUMNS, N_PORTS, EPISODES,
# BATCH_SIZE, ALPHA, EPOCHS and N are read; the remaining entries are only
# logged. Confirm whether they are leftovers from another experiment.
config = dict(
    # Env
    ROWS=5,
    COLUMNS=5,
    N_PORTS=7,
    # Training
    EPISODES=2000,
    LEARNING_RATE=1e-3,
    ADAM_EPSILON=1e-2,
    MEM_SIZE=10000,
    BATCH_SIZE=100,
    GAMMA=0.95,
    EXPLORATION_MAX=1.0,
    EXPLORATION_DECAY=0.999,
    EXPLORATION_MIN=0.005,
    EVAL_EPISODES=50,
    MAX_EPISODE_STEPS=200,
    TARGET_UPDATE_FREQ=500,
    GRADIENT_CLIP=5,
    # Model
    HIDDEN_SIZE=256,
    N_LAYERS=4,
    ALPHA=3e-4,
    EPOCHS=4,
    N=20,  # agent.learn() is triggered every N environment steps
)

In [16]:
# Build the environment from the grid dimensions and port count in the config.
env = MPSPEnv(config['ROWS'], config['COLUMNS'], config['N_PORTS'])

# The observation is indexed as two components; the agent's input size is the
# total number of elements across both once each is flattened.
config['OBSERVATION_SPACE'] = sum(
    np.prod(env.observation_space[idx].shape) for idx in (0, 1)
)
config['ACTION_SPACE'] = env.action_space.n


In [17]:
# Create the PPO agent: input/output sizes come from the environment-derived
# config entries, the rest from the hyperparameters.
agent = PPO.Agent(
    input_dims=config['OBSERVATION_SPACE'],
    n_actions=config['ACTION_SPACE'],
    alpha=config['ALPHA'],
    batch_size=config['BATCH_SIZE'],
    n_epochs=config['EPOCHS'],
)

In [18]:
# Start a wandb run whose name encodes the problem size
# (rows x columns, then the number of ports) and attach the full config to it.
wandb.init(
    entity="rl-msps",
    project="PPO",
    name=f"{config['ROWS']}x{config['COLUMNS']}_{config['N_PORTS']}-ports",
    tags=["test"],
    config=config,
)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Avg. Score,▁▆▆▇▇▇▆▆▇▅▄▅█▇▆▆▅▃▂▂▄▂▂▅▅▆▇▇▅▅▅▆▇▆▅▅▆▆▇▇
Score,▆██▃██████▆▆▆▃▆▆█▆█▆█▃█▁█████▆████▆█████

0,1
Avg. Score,9.39
Score,8.0


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.03339629968007406, max=1.0)…

In [19]:
# Run-wide bookkeeping. best_score starts at the lower bound of the
# environment's reward range, so the first average can exceed it.
best_score = env.reward_range[0]
score_history = []
learn_iters, avg_score, n_steps = 0, 0, 0

for episode in range(config['EPISODES']):
    state, info = env.reset()
    score, done = 0, False

    while not done:
        # The agent consumes one flat vector built from both observation parts.
        flat_state = np.concatenate(
            [part.flatten() for part in (state[0], state[1])]
        )
        action, prob, val = agent.choose_action(flat_state, info['mask'])
        next_state, reward, done, info = env.step(action)

        n_steps += 1
        score += reward
        agent.remember(flat_state, action, prob, val, reward, done)

        # Learning is scheduled on the global step counter, so an update can
        # happen in the middle of an episode.
        if n_steps % config['N'] == 0:
            agent.learn()
            learn_iters += 1

        state = next_state

    # Moving average over (at most) the 100 most recent episodes.
    score_history.append(score)
    avg_score = np.mean(score_history[-100:])

    if avg_score > best_score:
        best_score = avg_score
        # Checkpointing is currently disabled:
        # agent.save_models()

    wandb.log({
        "Score": score,
        "Avg. Score": avg_score,
    })

# 1-based episode numbers, one per recorded score.
x = list(range(1, len(score_history) + 1))

In [20]:
# Close the active wandb run; the output below shows the final upload and run summary.
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Avg. Score,▁▆██████████████████████████████████████
Score,▆▆▇▄▅▁▂▅▁▆▁▆▇▇▅▄▆▄▆▅▇▃▇▆▆▅▃▇▇▄▅▇▆▇▆██▇▆▅

0,1
Avg. Score,1.59
Score,6.0
