In [3]:
from ReplayBuffer import ReplayBuffer
from DQN_Solver import DQN_Solver
from env import MPSPEnv
from DQN import DQN
import numpy as np
import wandb
import torch
import os
# Publish the notebook's file name through the WANDB_NOTEBOOK_NAME environment
# variable before logging in; the login result is the cell's displayed output.
os.environ.update(WANDB_NOTEBOOK_NAME='main.ipynb')
wandb.login()

True

In [4]:
# Single source of truth for every tunable value in this notebook. Later
# cells read from it, and it is also passed to the experiment tracker.
config = dict(
    # Env: grid dimensions and number of ports handed to the environment
    ROWS=3,
    COLUMNS=3,
    N_PORTS=4,
    # Training: episode budget, optimiser step size, replay memory and
    # epsilon schedule parameters
    EPISODES=1000,
    LEARNING_RATE=0.00001,
    MEM_SIZE=10000,
    BATCH_SIZE=64,
    GAMMA=0.95,
    EXPLORATION_MAX=1.0,
    EXPLORATION_DECAY=0.999,
    EXPLORATION_MIN=0.001,
    # Model: widths of the two hidden layers
    HIDDEN_SIZE_1=1024,
    HIDDEN_SIZE_2=512,
)

In [5]:
# Build the environment from the grid dimensions and port count in `config`
# (passed positionally, in that order).
env = MPSPEnv(*(config[key] for key in ('ROWS', 'COLUMNS', 'N_PORTS')))

# The observation has two components that are flattened and joined before
# being used, so the input length is the sum of their element counts.
config['OBSERVATION_SPACE'] = sum(
    np.prod(env.observation_space[part].shape) for part in (0, 1)
)
# Number of discrete actions exposed by the environment.
config['ACTION_SPACE'] = env.action_space.n

In [6]:
# Start a tracked run whose name encodes the problem size, e.g. "3x3_4-ports".
# The full `config` dict (including the derived space sizes) is attached to it.
run_name = f"{config['ROWS']}x{config['COLUMNS']}_{config['N_PORTS']}-ports"
wandb.init(project="Q-learning", entity="rl-msps", name=run_name, config=config, tags=["test"])

In [7]:
# NOTE(review): `ReplayBuffer` and `DQN` are rebound here from the imported
# classes to instances of those classes. Re-running this cell without first
# re-running the import cell will therefore fail, and later cells rely on
# `DQN` being the network instance.

# Experience storage sized from the config.
ReplayBuffer = ReplayBuffer(**{
    'mem_size': config['MEM_SIZE'],
    'observation_space': config['OBSERVATION_SPACE'],
    'batch_size': config['BATCH_SIZE'],
})

# Network mapping a flattened observation to one output per action.
DQN = DQN(**{
    'input_size': config['OBSERVATION_SPACE'],
    'output_size': config['ACTION_SPACE'],
    'hidden_size_1': config['HIDDEN_SIZE_1'],
    'hidden_size_2': config['HIDDEN_SIZE_2'],
    'learning_rate': config['LEARNING_RATE'],
})

# The solver ties the buffer and network together with the exploration
# schedule and discount factor.
agent = DQN_Solver(**{
    'ReplayBuffer': ReplayBuffer,
    'DQN': DQN,
    'batch_size': config['BATCH_SIZE'],
    'exploration_max': config['EXPLORATION_MAX'],
    'gamma': config['GAMMA'],
    'exploration_decay': config['EXPLORATION_DECAY'],
    'exploration_min': config['EXPLORATION_MIN'],
})



In [8]:
def _flatten_observation(observation):
    """Join the two observation components into a single 1-D array.

    Each of ``observation[0]`` and ``observation[1]`` is flattened and the
    results are concatenated in that order.
    """
    return np.concatenate((observation[0].flatten(), observation[1].flatten()))


# Train for exactly config['EPISODES'] episodes. The upper bound is inclusive
# here because range(1, EPISODES) would stop one episode short.
for episode in range(1, config['EPISODES'] + 1):
    state, info = env.reset()
    state = _flatten_observation(state)
    score = 0      # sum of rewards collected in this episode
    sum_loss = 0   # sum of the values returned by agent.learn()
    count = 0      # number of environment steps taken

    while True:
        # info['mask'] comes from the latest reset/step and is handed to the
        # agent together with the env (presumably an action mask -- confirm).
        action = agent.choose_action(state, info['mask'], env)
        state_, reward, done, info = env.step(action)
        state_ = _flatten_observation(state_)

        # Store the transition, then run one learning update per step.
        agent.memory.add(state, action, reward, state_, done)
        sum_loss += agent.learn()

        state = state_
        score += reward
        count += 1

        if done:
            # One log entry per finished episode; count >= 1 at this point,
            # so the average is always defined.
            wandb.log({
                "Episode Reward": score,
                "Average Episode Loss": sum_loss / count,
                "Exploration Rate": agent.exploration_rate
            })
            break



In [9]:
# Persist the trained network's weights as "dqn.pt" inside the current W&B
# run directory, then close the run. `DQN` here is the network instance.
checkpoint_path = os.path.join(wandb.run.dir, "dqn.pt")
torch.save(DQN.state_dict(), checkpoint_path)
wandb.finish()

VBox(children=(Label(value='0.001 MB of 2.139 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.000469…

0,1
Average Episode Loss,▁▁▂▄▇▃▂▅█▄▄▃▃▂▄▁▄▂▂▃▃▃▄▄▃▄▆▆▃▂▂▂▄▃▃▂▄▃▃▄
Episode Reward,▁█▇█▇████████████████▇██████████████████
Exploration Rate,█▄▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Average Episode Loss,0.00875
Episode Reward,0.0
Exploration Rate,0.001
