In [1]:
from unityagents import UnityEnvironment
import numpy as np
import sys
import torch
%load_ext autoreload
%autoreload 2
sys.path.append("src")

In [15]:
from buffer import (
    ReplayBuffer,
    Qentry,
    Qcollection
)


from model import (
    Qnet,
    inference_episode,
    train_episode,
    inference_episode_test,
    _state_to_torch
)

from torch.utils.tensorboard import SummaryWriter

## Setting up the Unitity Enviroment that we are going to solve

In [3]:
env = UnityEnvironment(file_name="unity/Banana_Linux/Banana.x86_64")

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of actions
action_size = brain.vector_action_space_size

# examine the state space 
state = env_info.vector_observations[0]

state_size = len(state)

print("state:\n\n", state, "\n")
print("state size:\n\n", state_size, "\n")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


state:

 [1.         0.         0.         0.         0.84408134 0.
 0.         1.         0.         0.0748472  0.         1.
 0.         0.         0.25755    1.         0.         0.
 0.         0.74177343 0.         1.         0.         0.
 0.25854847 0.         0.         1.         0.         0.09355672
 0.         1.         0.         0.         0.31969345 0.
 0.        ] 

state size:

 37 



## Setting up hyperparameters

In [1]:
# The name of the experiment run as displayed by tensorboard
experiement_name = "final"

# The number of training iterations
iterations = 10000

# The learning-rate used during training of the model
lr = 0.001

# The size of the replay buffer
replay_buffer_size = 20000

# The batchsize for each update run
batch_size = 128
# How many consecutive update runs are we going to do before we are producing new entries to the replaybuffer
update_steps = 20
# Controls the weighing of old entries to the replay buffer
# We normalize the weights of the entries in the replay buffer by the number times we have used the
# the entry to train the model and use sampling_beta as the exponent to that normalizing factor. 
# i.e. sampling_beta -> inf, we only care about new entries, sampling_beta = 0, we do not care of we have seen the 
# entry many times we will sample the entry. 
sampling_beta = 1

# The probability that we will choose one action on random instead of selecting the action with the highest q-value 
epsilon=1.0
# Controls how we are decreasing epsilon during training
eps_decay=0.999 
# The final epsilon
eps_end = 0.01

# Discount factor in the temporal difference q-learning loss, controls the focus on future vs immediate reward 
gamma=0.99

# lowpass filter factor that controls the q-learning's targets update speed in terms
copy_weight_scale = 0.0075

# How many episodes are we using when evaluating the performance of the model
test_iter = 100
# How frequently are we running a validation-run during training
validation_iter = 500

##  Setting of the models

In [None]:
qnet = Qnet(state_size, action_size, lr=lr)

qnet_target = Qnet(state_size, action_size, lr=0)

## Setting up the replaybuffer

In [None]:
replaybuffer = ReplayBuffer(replay_buffer_size)

## Setting up the tensorboard writer

In [None]:
writer = SummaryWriter(f"runs/{experiement_name}")

writer.add_graph(qnet, _state_to_torch(state))

### Load a previous trained model 

In [None]:
# "models/model.ckp" is the best verified model
qnet.load_state_dict(torch.load("models/model.ckp"))

# Train the model

In [7]:
best_test_score = -np.inf

for iteration in range(1, iterations):
    
    epsilon = max(eps_end, eps_decay*epsilon) # decrease epsilon
    
    # Do inference for one episode and store the results in the replaybuffer
    _param_dict = {}
    _param_dict["brain_name"] = brain_name
    _param_dict["gamma"] = gamma
    _param_dict["beta"] = sampling_beta
    _param_dict["reward_default"] = 0
    _param_dict["epsilon"] = epsilon

    replaybuffer, score, hits = inference_episode(
        meta=_param_dict,
        env=env,
        qnet=qnet,
        replaybuffer=replaybuffer
    )
    
    # Train the network using the stored entries in the replaybuffer
    _param_dict = {}
    _param_dict["batch_size"] = batch_size
    _param_dict["gamma"] = gamma
    _param_dict["update_steps"] = update_steps
    _param_dict["replace_sampling"] = False
    _param_dict["beta"] = sampling_beta

    qvalues_target, action = train_episode(
        meta=_param_dict,
        qnet=qnet,
        qnet_target=qnet_target,
        replaybuffer=replaybuffer,
    )
    
    # Write some statitics to tensorboard
    writer.add_scalar("training/kpi/score", score, iteration) # The score
    writer.add_scalar("training/kpi/hits", hits, iteration) # Number of bannanas taken (blue or yellow)
    writer.add_scalar("training/hyper/epsilon", epsilon, iteration) # Epsilon
    
    # Update the target network
    qnet_target.steal_weights(qnet, copy_weight_scale)
    
    # Validate the network 
    if iteration % validation_iter == 0:
        
        _param_dict = {}
        _param_dict["brain_name"] = brain_name
        _param_dict["iterations"] = test_iter
        
        test_score = inference_episode_test(
            meta=_param_dict,
            env=env,
            qnet=qnet,
            train_mode=True,
        )
        
        print(f"iteration: {iteration} - score: {test_score}")
        
        writer.add_scalar("validation/kpi/score", test_score, iteration)
        
        # Create a checkpoing for the best model, according to the test score
        if test_score > best_test_score:
            best_test_score = test_score
            torch.save(qnet.state_dict(), f"models/{experiement_name}.ckp")

iteration: 500 - score: 2.04
iteration: 1000 - score: 0.83
iteration: 1500 - score: 2.59
iteration: 2000 - score: 7.96
iteration: 2500 - score: 7.41
iteration: 3000 - score: 8.96
iteration: 3500 - score: 11.08
iteration: 4000 - score: 13.18
iteration: 4500 - score: 14.73
iteration: 5000 - score: 13.38
iteration: 5500 - score: 10.24
iteration: 6000 - score: 14.83
iteration: 6500 - score: 13.61
iteration: 7000 - score: 10.89
iteration: 7500 - score: 14.23
iteration: 8000 - score: 15.58
iteration: 8500 - score: 10.91
iteration: 9000 - score: 12.49
iteration: 9500 - score: 14.2


### Load the best model 

In [8]:
qnet.load_state_dict(torch.load(f"models/{experiement_name}.ckp"))

<All keys matched successfully>

# Evaluate the model

In [9]:
_param_dict = {}
_param_dict["brain_name"] = brain_name
_param_dict["iterations"] = test_iter

test_score = inference_episode_test(
    meta=_param_dict,
    env=env,
    qnet=qnet,
    train_mode=True,
)

writer.add_hparams(
    hparam_dict = {
        "replay_buffer_size": replay_buffer_size,
        "epsilon_start": epsilon,
        "eps_end": eps_end,
        "eps_decay": eps_decay,
        "gamma": gamma,
        "copy_weight_scale": copy_weight_scale,
    },
    metric_dict={"score": test_score},
    run_name="final_score"
)
test_score

15.55

### Update the stored primary model

In [10]:
torch.save(qnet.state_dict(), "models/model.ckp")