In [1]:
import PPO
from env import MPSPEnv
import wandb
import numpy as np
import torch
import os
# Name of this notebook, exposed to Weights & Biases through the environment.
os.environ.update({'WANDB_NOTEBOOK_NAME': 'main.ipynb'})

# Print tensors in plain decimal notation instead of scientific notation.
torch.set_printoptions(sci_mode=False)

# Authenticate this session with Weights & Biases.
wandb.login()

In [2]:
# Single source of hyperparameters for the notebook. Later cells add
# 'OBSERVATION_SPACE' and 'ACTION_SPACE' once the environment exists.
config = dict(
    # --- Environment: arguments passed to MPSPEnv ---
    ROWS=3,
    COLUMNS=3,
    N_PORTS=5,
    # --- Training ---
    EPISODES=2000,
    LEARNING_RATE=0.001,
    ADAM_EPSILON=0.01,
    MEM_SIZE=10000,
    BATCH_SIZE=100,
    GAMMA=0.95,
    EXPLORATION_MAX=1.0,
    EXPLORATION_DECAY=0.999,
    EXPLORATION_MIN=0.005,
    EVAL_EPISODES=50,
    MAX_EPISODE_STEPS=200,
    TARGET_UPDATE_FREQ=500,
    GRADIENT_CLIP=5,
    # --- Model ---
    HIDDEN_SIZE=256,
    N_LAYERS=4,
    ALPHA=0.0003,   # passed to PPO.Agent as `alpha`
    EPOCHS=4,       # passed to PPO.Agent as `n_epochs`
    N=20,           # environment steps between agent.learn() calls
)

In [3]:
# Build the environment from the grid dimensions and port count in config.
env = MPSPEnv(config['ROWS'], config['COLUMNS'], config['N_PORTS'])

# The agent consumes one flat vector, so the input size is the total number
# of elements across the first two components of the observation space.
config['OBSERVATION_SPACE'] = sum(
    np.prod(env.observation_space[part].shape) for part in (0, 1)
)
# Number of discrete actions exposed by the environment.
config['ACTION_SPACE'] = env.action_space.n


In [4]:
# Instantiate the PPO agent; every argument is read from the shared config.
agent = PPO.Agent(
    input_dims=config['OBSERVATION_SPACE'],  # flattened observation length
    n_actions=config['ACTION_SPACE'],
    alpha=config['ALPHA'],
    batch_size=config['BATCH_SIZE'],
    n_epochs=config['EPOCHS'],
)

In [7]:
# Training loop: run config['EPISODES'] episodes, letting the agent learn
# every config['N'] environment steps, and log per-episode scores to W&B.

# No cell visible in this notebook calls wandb.init(), so on a fresh kernel
# wandb.log() below would have no active run to write to. Start one only if
# none exists; an already-active run is reused unchanged.
if wandb.run is None:
    wandb.init(config=config)

best_score = env.reward_range[0]  # lower end of the env's reward range
score_history = []                # total reward of every finished episode

learn_iters = 0  # number of agent.learn() calls so far
avg_score = 0
n_steps = 0      # environment steps taken, counted across all episodes

for i in range(config['EPISODES']):
    state, info = env.reset()

    done = False
    score = 0
    while not done:
        # The observation has two array components; flatten and join them
        # into the single vector the agent takes as input.
        state = np.concatenate((state[0].flatten(), state[1].flatten()))
        action, prob, val = agent.choose_action(state, info['mask'])
        state_, reward, done, info = env.step(action)
        n_steps += 1
        score += reward
        agent.remember(state, action, prob, val, reward, done)
        # The learning schedule follows the global step counter, so an update
        # can happen mid-episode and the counter is not reset between episodes.
        if n_steps % config['N'] == 0:
            agent.learn()
            learn_iters += 1
        state = state_
    score_history.append(score)
    # Running average over (at most) the 100 most recent episodes.
    avg_score = np.mean(score_history[-100:])

    if avg_score > best_score:
        best_score = avg_score
        # Checkpointing is currently disabled:
        # agent.save_models()

    wandb.log({
        "Score": score,
        "Avg. Score": avg_score,
    })

# 1-based episode numbers, one per recorded score.
x = list(range(1, len(score_history) + 1))

episode 0 score -34.0 avg score -34.0 time_steps 104 learning_steps 5
episode 1 score -513.0 avg score -273.5 time_steps 1168 learning_steps 58
episode 2 score -31.0 avg score -192.7 time_steps 1273 learning_steps 63
episode 3 score 5.0 avg score -143.2 time_steps 1302 learning_steps 65
episode 4 score 7.0 avg score -113.2 time_steps 1320 learning_steps 66
episode 5 score 9.0 avg score -92.8 time_steps 1344 learning_steps 67
episode 6 score 2.0 avg score -79.3 time_steps 1372 learning_steps 68
episode 7 score 6.0 avg score -68.6 time_steps 1391 learning_steps 69
episode 8 score 8.0 avg score -60.1 time_steps 1408 learning_steps 70
episode 9 score 8.0 avg score -53.3 time_steps 1435 learning_steps 71
episode 10 score 9.0 avg score -47.6 time_steps 1457 learning_steps 72
episode 11 score 10.0 avg score -42.8 time_steps 1480 learning_steps 74
episode 12 score 9.0 avg score -38.8 time_steps 1501 learning_steps 75
episode 13 score 6.0 avg score -35.6 time_steps 1517 learning_steps 75
episod