In [1]:
from unityagents import UnityEnvironment
import numpy as np
import sys
import torch
%load_ext autoreload
%autoreload 2
sys.path.append("src")

In [2]:
from buffer import (
    ReplayBuffer,
    Qentry,
    Qcollection
)


from model import (
    Qnet,
    inference_episode,
    train_episode,
    inference_episode_test,
)
from typing import Dict, Any

from torch.utils.tensorboard import SummaryWriter

In [5]:
# Launch the Unity environment from the Linux build shipped under unity/.
env = UnityEnvironment(file_name="unity/Banana_Linux/Banana.x86_64")

# The first brain exposed by the environment is the one used in this notebook.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset in training mode and keep the info object belonging to that brain.
env_info = env.reset(train_mode=True)[brain_name]

# First entry of the vector observations; its length is used as the
# input dimension of the Q-networks.
state = env_info.vector_observations[0]
state_size = len(state)

# Action-space size as reported by the brain; used as the network output size.
action_size = brain.vector_action_space_size

print("state:\n\n", state, "\n")
print("state size:\n\n", state_size, "\n")

state:

 [1.         0.         0.         0.         0.84408134 0.
 0.         1.         0.         0.0748472  0.         1.
 0.         0.         0.25755    1.         0.         0.
 0.         0.74177343 0.         1.         0.         0.
 0.25854847 0.         0.         1.         0.         0.09355672
 0.         1.         0.         0.         0.31969345 0.
 0.        ] 

state size:

 37 



In [6]:
# --- Run identification -----------------------------------------------------
# Used for the TensorBoard run directory and the best-checkpoint file name.
experiement_name = "final"

# --- Optimisation -----------------------------------------------------------
lr = 0.001                   # learning rate handed to the online Qnet
gamma = 0.99                 # passed as "gamma" to inference and training episodes
batch_size = 128             # passed as "batch_size" to train_episode
update_steps = 20            # passed as "update_steps" to train_episode

# --- Replay buffer ----------------------------------------------------------
replay_buffer_size = 20000   # constructor argument of ReplayBuffer
sampling_beta = 1            # passed as "beta" to inference and training episodes

# --- Exploration schedule ---------------------------------------------------
epsilon = 1.0                # current epsilon; the training loop overwrites it
eps_end = 0.01               # lower bound applied to epsilon
eps_decay = 0.999            # per-iteration multiplicative factor

# --- Target network ---------------------------------------------------------
# Second argument of qnet_target.steal_weights()
# (presumably the soft-update interpolation factor; confirm in src/model.py).
copy_weight_scale = 0.0075

# --- Evaluation -------------------------------------------------------------
test_iter = 100              # passed as "iterations" to inference_episode_test
validation_iter = 500        # validate every this many training iterations

In [9]:
# Online network: built with the configured learning rate.
qnet = Qnet(
    state_size,
    action_size,
    lr=lr,
)

# Target network: built with lr=0; the training loop updates it through
# steal_weights().
qnet_target = Qnet(
    state_size,
    action_size,
    lr=0,
)

# TensorBoard writer; events are written under runs/<experiment name>.
writer = SummaryWriter(f"runs/{experiement_name}")

# Experience store (replay_buffer_size is presumably its capacity).
replaybuffer = ReplayBuffer(replay_buffer_size)

In [8]:
# Warm-start the online network from a previously saved state dict.
# torch.load unpickles the file, so only load checkpoints you trust.
# NOTE(review): only `qnet` receives the stored weights here; `qnet_target`
# keeps whatever weights it was constructed with -- confirm this is intended.
qnet.load_state_dict(
    torch.load("models/model.ckp")
)

<All keys matched successfully>

In [None]:
# Main training loop.
#
# Each iteration:
#   1. decays epsilon,
#   2. runs `inference_episode`, which returns the replay buffer, the episode
#      score and a hit count,
#   3. runs `train_episode` on the online/target network pair,
#   4. logs the episode KPIs to TensorBoard,
#   5. lets the target network take weights from the online network via
#      `steal_weights`.
# Every `validation_iter` iterations the online network is evaluated with
# `inference_episode_test`; the weights are checkpointed only when the
# evaluation score beats the best score seen so far.
#
# `epsilon` is the notebook-level variable, so re-running this cell resumes
# the decay from its current value rather than from the configured start.
best_test_score = -np.inf

# Runs iterations 1..9999 (the upper bound of range() is exclusive).
for iteration in range(1, 10000):

    # Multiplicative decay, clamped from below at eps_end.
    epsilon = max(eps_end, eps_decay * epsilon)

    # Settings for the data-collection episode.
    _param_dict = {
        "brain_name": brain_name,
        "gamma": gamma,
        "beta": sampling_beta,
        "reward_default": 0,
        "epsilon": epsilon,
    }

    replaybuffer, score, hits = inference_episode(
        meta=_param_dict,
        env=env,
        qnet=qnet,
        replaybuffer=replaybuffer,
    )

    # Settings for the optimisation phase.
    _param_dict = {
        "batch_size": batch_size,
        "gamma": gamma,
        "update_steps": update_steps,
        "replace_sampling": False,
        "beta": sampling_beta,
    }

    # The return values are not used below; they are kept as notebook
    # variables so they can still be inspected after the loop.
    qvalues_target, action = train_episode(
        meta=_param_dict,
        qnet=qnet,
        qnet_target=qnet_target,
        replaybuffer=replaybuffer,
    )

    writer.add_scalar("training/kpi/score", score, iteration)
    writer.add_scalar("training/kpi/hits", hits, iteration)
    writer.add_scalar("training/hyper/epsilon", epsilon, iteration)

    qnet_target.steal_weights(qnet, copy_weight_scale)

    if iteration % validation_iter == 0:

        _param_dict = {
            "brain_name": brain_name,
            "iterations": test_iter,
        }

        test_score = inference_episode_test(
            meta=_param_dict,
            env=env,
            qnet=qnet,
            train_mode=True,
        )

        print(f"iteration: {iteration} - score: {test_score}")

        writer.add_scalar("validation/kpi/score", test_score, iteration)

        if test_score > best_test_score:
            # Remember the new best so that later validations must beat it
            # before the checkpoint on disk is overwritten.
            best_test_score = test_score
            torch.save(qnet.state_dict(), f"models/{experiement_name}.ckp")

iteration: 500 - score: 0.75
iteration: 1000 - score: 2.27


In [31]:
# Final evaluation of the online network, using the same settings as the
# periodic validation inside the training loop.
_param_dict = {
    "brain_name": brain_name,
    "iterations": test_iter,
}

test_score = inference_episode_test(
    meta=_param_dict,
    env=env,
    qnet=qnet,
    train_mode=True,
)

# Store the run's hyper-parameters next to the final score in TensorBoard.
# NOTE(review): `epsilon` is reassigned on every training iteration, so if the
# training cell ran before this one, "epsilon_start" records the current
# (decayed) value rather than the configured starting value.
writer.add_hparams(
    hparam_dict={
        "replay_buffer_size": replay_buffer_size,
        "epsilon_start": epsilon,
        "eps_end": eps_end,
        "eps_decay": eps_decay,
        "gamma": gamma,
        "copy_weight_scale": copy_weight_scale,
    },
    metric_dict={"score": test_score},
    run_name="final_score",
)

In [None]:
# Persist the current online-network weights. This is the same path the
# checkpoint-loading cell reads from, so any existing file there is replaced.
torch.save(
    qnet.state_dict(),
    "models/model.ckp",
)