# Deep Recurrent Q-Networks in PyTorch



In [1]:
import copy
import math
import os
from collections import namedtuple
import gym
import ipywidgets as widgets
import matplotlib.pyplot as plt
import more_itertools as mitt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm
from collections import deque
import random
import sys
from datetime import datetime
from baselines import logger
import pickle

# Plot styling applied to every figure drawn after this point.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = [12, 4]

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [2]:
def set_global_seeds(seed):
    """Seed the NumPy, stdlib `random`, and PyTorch random number generators.

    Args:
        seed: value handed unchanged to each of the three seeding functions
    Returns:
        None
    """
    # Seeding order: NumPy, then stdlib `random`, then PyTorch.
    for seeder in (np.random.seed, random.seed, torch.manual_seed):
        seeder(seed)

In [3]:
def rolling_average(data, *, window_size):
    """Smooth a 1-d data array with a trailing rolling average.

    Output element ``i`` is the mean of ``data[max(0, i - window_size + 1) : i + 1]``,
    so the first ``window_size - 1`` outputs average over fewer samples.

    Args:
        data: 1-d numpy.array
        window_size: size of the smoothing window (number of samples)

    Returns:
        smooth_data: a 1-d numpy.array with the same size as data
    """
    assert data.ndim == 1
    kernel = np.ones(window_size)
    # Dividing by the convolved ones gives the number of samples that actually
    # fall inside the window at each position (fewer near the start).
    smooth_data = np.convolve(data, kernel) / np.convolve(
        np.ones_like(data), kernel
    )
    # The full convolution has data.size + window_size - 1 entries; keep the
    # first data.size.  Slicing with `-window_size + 1` would produce `[:0]`
    # (an empty array) when window_size == 1.
    return smooth_data[: data.size]

In [4]:
def select_action_epsilon_greedy(dqn_model, obs, hidden_state, cell_state, eps, device, env):
    """Pick an action epsilon-greedily and advance the recurrent state.

    The model is evaluated on every call, including exploratory ones, so the
    returned hidden and cell states always reflect the current observation.

    Args:
        dqn_model: The current estimate of Q-value
        obs: observation
        hidden_state: hidden state fed to the model
        cell_state: cell state fed to the model
        eps: probability of taking a uniformly random action
        device: device the observation tensor is moved to
        env: environment; only `env.action_space.n` is read

    Returns:
        action, next hidden_state, next cell_state
    """
    with torch.no_grad():
        # Draw the exploration coin before anything else.
        explore = np.random.rand() < eps

        obs_tensor = torch.FloatTensor(obs).to(device)
        result = dqn_model.forward(obs_tensor, 1, 1, hidden_state, cell_state)

        if explore:
            action = np.random.randint(0, env.action_space.n)
        else:
            q_values = result[0]
            action = int(torch.argmax(q_values[0]))

        recurrent_state = result[1]
        next_hidden = recurrent_state[0]
        next_cell = recurrent_state[1]
    return action, next_hidden, next_cell

In [5]:

class LinearSchedule:
    def __init__(self, value_from, value_to, nsteps):
        """Linear schedule from `value_from` to `value_to` in `nsteps` steps.

        :param value_from: value returned at the start of the schedule
        :param value_to: value returned once the schedule has finished
        :param nsteps: number of steps over which the value is interpolated
        """
        self.value_from = value_from
        self.value_to = value_to
        self.nsteps = nsteps

    def value(self, step) -> float:
        """Return the scheduled value at `step`.

        - `value_from` for a negative step (and, by interpolation, for step 0)
        - `value_to` for step >= nsteps - 1
        - the linear interpolation between the two otherwise

        :param step:  The step at which to compute the interpolation.
        :rtype: float.  The interpolated value.
        """
        last_step = self.nsteps - 1

        if step < 0:
            return self.value_from
        if step >= last_step:
            return self.value_to

        # Per-step increment; last_step > step >= 0 here, so it is non-zero.
        slope = (self.value_to - self.value_from) / last_step
        return self.value_from + step * slope

In [6]:
class RecurrentReplayMemory:
    def __init__(self, max_size, episode_min_len, episode_max_len):
        """Replay memory of whole episodes, implemented as a bounded queue.

        Args:
            - max_size: Maximum number of episodes kept; the oldest episode is
              dropped when a new one is appended to a full buffer.
            - episode_min_len: minimum length of an eligible episode
            - episode_max_len: maximum length of an eligible episode
        """
        self.max_size = max_size
        self.memory = deque(maxlen=self.max_size)
        self.episode_min_len = episode_min_len
        self.episode_max_len = episode_max_len

    def add_episode(self, episode):
        """Add an episode to the buffer.

        :param episode:  sequence of transitions; must contain at least
            `episode_min_len` of them.
        """
        assert len(episode) >= self.episode_min_len
        self.memory.append(episode)

    def pre_populate(self, env, replay_prepopulate_episodes):
        """Prepopulate the replay buffer with random-policy episodes.

        Episodes are rolled out with uniformly random actions and cut off after
        `episode_max_len` steps.  Only episodes with at least `episode_min_len`
        transitions are stored; shorter ones are discarded and re-rolled.

        Args:
            - env: Environment to run
            - replay_prepopulate_episodes: How many episodes to pre-populate
        """
        episode_cnt = 0
        while episode_cnt < replay_prepopulate_episodes:

            state = env.reset()
            # Keep only observation components 0 and 2.
            state = state[[0, 2]]

            episode = []
            for _step in range(self.episode_max_len):
                action = np.random.randint(0, env.action_space.n)
                next_state, reward, done, _info = env.step(action)
                next_state = next_state[[0, 2]]

                episode.append((state, action, reward, next_state, done))
                state = next_state

                if done:
                    break

            # `>=` matches the eligibility rule enforced by add_episode(); a
            # strict `>` would reject episodes of exactly the minimum length.
            if len(episode) >= self.episode_min_len:
                self.add_episode(episode)
                episode_cnt += 1

        print('Done pre-populated with %d episodes'%(len(self.memory)))

    def sample(self, batch_size, episode_len):
        """Sample a batch of fixed-length episode fragments.

        `batch_size` distinct episodes are drawn from the buffer and a random
        contiguous window of `episode_len` transitions is cut from each one.

        :param batch_size:  Number of episodes to sample.
        :param episode_len:  Length of the window taken from each episode.
        :rtype: Batch (list of `batch_size` lists of `episode_len` transitions)
        """
        sampled_episodes = random.sample(self.memory, batch_size)
        batch = []
        for episode in sampled_episodes:
            # Number of valid start offsets for a window of episode_len.
            n_starts = len(episode) + 1 - episode_len
            if n_starts > 0:
                start = np.random.randint(0, n_starts)
                batch.append(episode[start : start + episode_len])

        assert len(batch) == batch_size, (
            'some sampled episodes are shorter than the requested episode_len'
        )
        return batch

In [7]:
class DRQN(nn.Module):
    def __init__(self, obs_dim=2, action_dim=4, fc_hidden_dim=256, rnn_hidden_dim=512):
        """Deep Recurrent Q-Network PyTorch model.

        Two fully connected layers feed a single-layer LSTM whose outputs are
        projected to one Q-value per action.

        Args:
            - obs_dim: Dimensionality of observations
            - action_dim: Dimensionality of actions
            - fc_hidden_dim: Width of the two fully connected layers
            - rnn_hidden_dim: Dimensionality of the LSTM hidden state
        """

        super().__init__()
        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.fc_hidden_dim = fc_hidden_dim
        self.rnn_hidden_dim = rnn_hidden_dim

        self.fc1 = nn.Linear(self.obs_dim, self.fc_hidden_dim)
        self.fc2 = nn.Linear(self.fc_hidden_dim, self.fc_hidden_dim)

        self.lstm = nn.LSTM(
            input_size=self.fc_hidden_dim,
            hidden_size=self.rnn_hidden_dim,
            num_layers=1,
            batch_first=True,
        )
        self.fc4 = nn.Linear(self.rnn_hidden_dim, self.action_dim)

    def forward(self, observations, bsize, episode_length, hidden_state, cell_state) -> tuple:
        """Compute Q-values for every step of a batch of observation sequences.

        :param observations: tensor holding bsize * episode_length observations
                of size obs_dim (it is viewed to that layout internally).
        :param bsize: number of sequences in the batch.
        :param episode_length: number of time steps per sequence.
        :param hidden_state: initial LSTM hidden state.
        :param cell_state: initial LSTM cell state.
        :rtype: (q_values, (h_n, c_n)) where q_values has shape
                (bsize, episode_length, action_dim) and represents Q(s, .) at
                every step, and (h_n, c_n) is the final LSTM state.
        """
        observations = observations.view(bsize * episode_length, 1, self.obs_dim)
        x = F.relu(self.fc1(observations))
        x = F.relu(self.fc2(x))

        # Restore the (batch, time, feature) layout expected by the
        # batch_first LSTM.
        x = x.view(bsize, episode_length, self.fc_hidden_dim)
        out, (h_n, c_n) = self.lstm(x, (hidden_state, cell_state))

        x = self.fc4(out)

        return x, (h_n, c_n)

    def init_hidden_states(self, bsize):
        """Init hidden state values.

        The zero states are created with the dtype and on the device of the
        model's own parameters, so they can be fed straight into `forward`
        wherever the model lives.

        :param bsize: batch_size
        :rtype: pair (h, c) of zero tensors of shape (1, bsize, rnn_hidden_dim)
        """
        reference = next(self.parameters())
        shape = (1, bsize, self.rnn_hidden_dim)

        h = torch.zeros(shape, dtype=reference.dtype, device=reference.device)
        c = torch.zeros(shape, dtype=reference.dtype, device=reference.device)

        return h, c

### Single batch-update

In [8]:
def train_drqn_batch(optimizer, batch, episode_training_len, dqn_model, dqn_target, gamma):
    """Perform a single batch-update step on the given DQN model.

    Every time step of every sequence contributes to the loss: the TD target
    `r + gamma * max_a Q_target(s', a) * (1 - done)` is regressed against
    `Q(s, a)` with a mean-squared error.  Both networks start each sequence
    from the zero recurrent state returned by `dqn_model.init_hidden_states`.

    :param optimizer: nn.optim.Optimizer instance.
    :param batch:  Batch of episodes; each episode is a sequence of
        (state, action, reward, next_state, done) tuples of equal length.
    :param episode_training_len: The length of the episode to be trained (fixed).
    :param dqn_model:  The DQN model to be trained.
    :param dqn_target:  The target DQN model, ~NOT~ to be trained.
    :param gamma:  The discount factor.
    :rtype: float  The scalar loss associated with this batch.
    """
    batch_size = len(batch)

    # Put the batch on the device the trained model lives on.
    first_param = next(dqn_model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    hidden_batch, cell_batch = dqn_model.init_hidden_states(batch_size)

    # Transpose each episode into (states, actions, rewards, next_states, dones).
    columns = [list(zip(*episode)) for episode in batch]

    def _field_tensor(field, dtype):
        # Stack through NumPy first: building a tensor directly from nested
        # lists of ndarrays is very slow.
        stacked = np.asarray([cols[field] for cols in columns], dtype=dtype)
        return torch.from_numpy(stacked).to(model_device)

    torch_current_states = _field_tensor(0, np.float32)
    torch_acts = _field_tensor(1, np.int64)
    torch_rewards = _field_tensor(2, np.float32)
    torch_next_states = _field_tensor(3, np.float32)
    dones = _field_tensor(4, np.float32)

    # Targets are constants for the optimizer, so no autograd graph is needed.
    with torch.no_grad():
        Q_next, _ = dqn_target.forward(
            torch_next_states, batch_size, episode_training_len, hidden_batch, cell_batch
        )
        Q_next_max, _ = Q_next.max(dim=2)
        # (1 - dones) removes the bootstrap term on terminal transitions.
        target_values = torch_rewards + (gamma * Q_next_max) * (1 - dones)

    Q_s, _ = dqn_model.forward(
        torch_current_states, batch_size, episode_training_len, hidden_batch, cell_batch
    )
    # Q-value of the action that was actually taken at each step.
    Q_s_a = Q_s.gather(dim=2, index=torch_acts.unsqueeze(dim=2)).squeeze(dim=2)

    # testing that they share the same shapes
    assert (
        Q_s_a.shape == target_values.shape
    ), 'Shapes of values tensor and target_values tensor do not match.'

    # testing that the value tensor requires a gradient,
    # and the target_values tensor does not
    assert Q_s_a.requires_grad, 'values tensor should require gradients'
    assert (
        not target_values.requires_grad
    ), 'target_values tensor should not require gradients'

    loss = F.mse_loss(Q_s_a, target_values)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

In [9]:
def train_dqn(
    env,
    num_timesteps,
    *,
    replay_size,
    batch_size,
    exploration,
    gamma,
    train_freq=1,
    print_freq=100,
    model_save_freq=500,
    target_network_update_freq=500,
    num_prepopulate_episode=100,
    episode_training_len=8,
    episode_max_len=30):
    """
    DRQN training loop.

    Training runs for a given number of environment time-steps rather than a
    given number of episodes.  Each episode is cut off after `episode_max_len`
    steps; episodes with at least `episode_training_len` transitions are added
    to the replay buffer once they finish.

    Args:
        - env: The openai Gym environment
        - num_timesteps: Total number of environment steps used for training
          (the episode in progress when the limit is reached is completed)
        - replay_size: Maximum number of episodes in the replay memory
        - batch_size: Number of episode fragments in a training batch
        - exploration: schedule object whose `value(step)` gives epsilon
        - gamma: The discount factor
        - train_freq: run one batch update every `train_freq` steps
        - print_freq: log statistics every `print_freq` steps
        - model_save_freq: save the model and losses every `model_save_freq` steps
        - target_network_update_freq: copy the online weights into the target
          network every this many steps
        - num_prepopulate_episode: random episodes stored before training starts
        - episode_training_len: length of the fragments used for each update,
          and minimum length of an episode kept in the replay memory
        - episode_max_len: maximum number of steps per episode

    Returns:
        None.  Results are written as side effects:
        - `model_<step>.torch`: state dict of the online network
        - `loss.pkl`: pickled list with the loss of every training batch
        - statistics logged through `logger`; the "episodes", "mean reward"
          and "mean len" entries only cover episodes that ended with
          `done`, not the ones cut off by `episode_max_len`
    """
    # check that environment states are compatible with our DQN representation
    assert (
        isinstance(env.observation_space, gym.spaces.Box)
        and len(env.observation_space.shape) == 1
    )

    # initialize the DQN and DQN-target models
    # NOTE(review): DRQN() is built with its default action_dim; confirm that
    # it matches env.action_space.n for the environment in use.
    dqn_model = DRQN().float().to(device)
    dqn_target = DRQN().float().to(device)

    dqn_target.load_state_dict(dqn_model.state_dict())

    # initialize the optimizer
    optimizer = torch.optim.Adam(dqn_model.parameters(), lr=5e-4)

    # initialize the replay memory and prepopulating with some episodes
    replay_buffer = RecurrentReplayMemory(replay_size, episode_training_len, episode_max_len)
    replay_buffer.pre_populate(env, num_prepopulate_episode)

    last_100ep_returns = deque(maxlen=100)
    last_100ep_lens = deque(maxlen=100)

    losses = []

    t_total_steps = 0
    t_episode = 0
    short_episode_cnt = 0
    good_episode_cnt = 0

    # start training for a number of time steps
    while t_total_steps < num_timesteps:

        prev_state = env.reset()
        # Keep only observation components 0 and 2
        prev_state = prev_state[[0, 2]]

        current_episode = []
        rewards = []

        hidden_state, cell_state = dqn_model.init_hidden_states(bsize=1)

        for _step in range(episode_max_len):

            t_total_steps += 1

            epsilon = exploration.value(t_total_steps)
            explore = np.random.rand() < epsilon

            # The network is evaluated even for random actions so that the
            # recurrent state keeps tracking the observation history.  No
            # gradients are needed while acting; without no_grad the hidden
            # state would carry a growing autograd graph through the episode.
            with torch.no_grad():
                torch_x = torch.from_numpy(prev_state).float().to(device)
                q_values, (hidden_state, cell_state) = dqn_model.forward(
                    torch_x, 1, 1, hidden_state, cell_state
                )

            if explore:
                action = np.random.randint(0, env.action_space.n)
            else:
                action = int(torch.argmax(q_values[0]))

            next_state, reward, done, _info = env.step(action)
            next_state = next_state[[0, 2]]

            rewards.append(reward)
            current_episode.append((prev_state, action, reward, next_state, done))
            prev_state = next_state

            # Copy weights to the target network
            if (t_total_steps % target_network_update_freq) == 0:
                dqn_target.load_state_dict(dqn_model.state_dict())

            # Training
            if (t_total_steps % train_freq) == 0:
                batch = replay_buffer.sample(batch_size, episode_training_len)
                loss = train_drqn_batch(optimizer, batch, episode_training_len, dqn_model, dqn_target, gamma)
                losses.append(loss)

            # Debugging
            if t_total_steps % print_freq == 0:
                # Until an episode has finished there is nothing to average;
                # report NaN explicitly instead of triggering NumPy's
                # empty-mean warning.
                mean_return = np.mean(last_100ep_returns) if last_100ep_returns else float('nan')
                mean_len = np.mean(last_100ep_lens) if last_100ep_lens else float('nan')

                logger.record_tabular("steps", t_total_steps)
                logger.record_tabular("episodes", t_episode)
                logger.record_tabular("short episodes", short_episode_cnt)
                logger.record_tabular("good episodes", good_episode_cnt)

                logger.record_tabular("mean reward", mean_return)
                logger.record_tabular("mean len", mean_len)

                logger.record_tabular("% time spent exploring", int(100 * epsilon))
                logger.dump_tabular()

            # Saving models and losses
            if t_total_steps % model_save_freq == 0:
                torch.save(dqn_model.state_dict(),'model_{}.torch'.format(t_total_steps))

                with open("loss.pkl", "wb") as fp:
                    pickle.dump(losses, fp)

            if done:
                t_episode += 1

                # Discounted return of the finished episode.
                episode_return = sum(
                    r * pow(gamma, i) for i, r in enumerate(rewards)
                )

                last_100ep_lens.append(len(current_episode))
                last_100ep_returns.append(episode_return)

                break

        # Only episodes long enough to yield a training fragment are stored.
        if len(current_episode) >= episode_training_len:
            replay_buffer.add_episode(current_episode)
            good_episode_cnt += 1
        else:
            short_episode_cnt += 1

In [None]:
# ---- Hyper-parameters ----
env = "TwoBumps-v1"
seed = 0

# Training budget and optimisation settings
num_timesteps = 100000
batch_size = 32
gamma = 0.99
train_freq = 1
target_network_update_freq = 500
episode_training_len = 8

# Replay memory
replay_size = 50000
num_prepopulate_episode = 100

# Exploration: epsilon decays linearly to final_epsilon over the first
# exploration_fraction of the training steps
final_epsilon = 0.02
exploration_fraction = 0.1

# Reporting and checkpointing
print_freq = 100
model_save_freq = 10000

# ---- Environment and seeding ----
env = gym.make(env)
set_global_seeds(seed)

# ---- Exploration schedule ----
exploration_steps = int(num_timesteps * exploration_fraction)
exploration = LinearSchedule(1.0, final_epsilon, exploration_steps)

# ---- Logging: one directory per day, one sub-directory per launch time ----
today = datetime.today()
date = today.strftime('%d-%m-%Y')
time = today.strftime("%H:%M:%S")
logger.configure("~/logs/{}/{}/".format(date, time))

# ---- Training ----
train_dqn(
    env,
    num_timesteps,
    replay_size=replay_size,
    batch_size=batch_size,
    exploration=exploration,
    gamma=gamma,
    train_freq=train_freq,
    print_freq=print_freq,
    model_save_freq=model_save_freq,
    target_network_update_freq=target_network_update_freq,
    num_prepopulate_episode=num_prepopulate_episode,
    episode_training_len=episode_training_len,
)

Logging to /home/hainh22/logs/28-12-2019/17:06:08/
Done pre-populated with 100 episodes
-------------------------------------
| % time spent exploring | 99       |
| episodes               | 4        |
| good episodes          | 3        |
| mean len               | 7.5      |
| mean reward            | 0        |
| short episodes         | 3        |
| steps                  | 100      |
-------------------------------------
-------------------------------------
| % time spent exploring | 98       |
| episodes               | 6        |
| good episodes          | 7        |
| mean len               | 7        |
| mean reward            | 0        |
| short episodes         | 4        |
| steps                  | 200      |
-------------------------------------
-------------------------------------
| % time spent exploring | 97       |
| episodes               | 8        |
| good episodes          | 11       |
| mean len               | 7.12     |
| mean reward            | 0        |


-------------------------------------
| % time spent exploring | 75       |
| episodes               | 43       |
| good episodes          | 90       |
| mean len               | 9.88     |
| mean reward            | 0        |
| short episodes         | 22       |
| steps                  | 2.5e+03  |
-------------------------------------
-------------------------------------
| % time spent exploring | 74       |
| episodes               | 44       |
| good episodes          | 93       |
| mean len               | 9.8      |
| mean reward            | 0        |
| short episodes         | 23       |
| steps                  | 2.6e+03  |
-------------------------------------
-------------------------------------
| % time spent exploring | 73       |
| episodes               | 45       |
| good episodes          | 97       |
| mean len               | 9.89     |
| mean reward            | 0        |
| short episodes         | 23       |
| steps                  | 2.7e+03  |
------------

-------------------------------------
| % time spent exploring | 51       |
| episodes               | 84       |
| good episodes          | 177      |
| mean len               | 9.2      |
| mean reward            | 0.031    |
| short episodes         | 44       |
| steps                  | 4.9e+03  |
-------------------------------------
-------------------------------------
| % time spent exploring | 50       |
| episodes               | 89       |
| good episodes          | 183      |
| mean len               | 9.25     |
| mean reward            | 0.0395   |
| short episodes         | 45       |
| steps                  | 5e+03    |
-------------------------------------
-------------------------------------
| % time spent exploring | 50       |
| episodes               | 96       |
| good episodes          | 186      |
| mean len               | 9.16     |
| mean reward            | 0.0366   |
| short episodes         | 50       |
| steps                  | 5.1e+03  |
------------

-------------------------------------
| % time spent exploring | 28       |
| episodes               | 214      |
| good episodes          | 277      |
| mean len               | 8.01     |
| mean reward            | 0.103    |
| short episodes         | 119      |
| steps                  | 7.3e+03  |
-------------------------------------
-------------------------------------
| % time spent exploring | 27       |
| episodes               | 217      |
| good episodes          | 282      |
| mean len               | 7.83     |
| mean reward            | 0.112    |
| short episodes         | 119      |
| steps                  | 7.4e+03  |
-------------------------------------
-------------------------------------
| % time spent exploring | 26       |
| episodes               | 226      |
| good episodes          | 287      |
| mean len               | 7.82     |
| mean reward            | 0.112    |
| short episodes         | 125      |
| steps                  | 7.5e+03  |
------------

-------------------------------------
| % time spent exploring | 4        |
| episodes               | 314      |
| good episodes          | 374      |
| mean len               | 8.03     |
| mean reward            | 0.131    |
| short episodes         | 175      |
| steps                  | 9.7e+03  |
-------------------------------------
-------------------------------------
| % time spent exploring | 3        |
| episodes               | 317      |
| good episodes          | 379      |
| mean len               | 7.91     |
| mean reward            | 0.141    |
| short episodes         | 176      |
| steps                  | 9.8e+03  |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 322      |
| good episodes          | 384      |
| mean len               | 8.27     |
| mean reward            | 0.159    |
| short episodes         | 177      |
| steps                  | 9.9e+03  |
------------

-------------------------------------
| % time spent exploring | 2        |
| episodes               | 413      |
| good episodes          | 479      |
| mean len               | 9.28     |
| mean reward            | 0.379    |
| short episodes         | 218      |
| steps                  | 1.21e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 417      |
| good episodes          | 483      |
| mean len               | 9.28     |
| mean reward            | 0.369    |
| short episodes         | 221      |
| steps                  | 1.22e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 418      |
| good episodes          | 487      |
| mean len               | 9.3      |
| mean reward            | 0.369    |
| short episodes         | 221      |
| steps                  | 1.23e+04 |
------------

-------------------------------------
| % time spent exploring | 2        |
| episodes               | 500      |
| good episodes          | 571      |
| mean len               | 10.2     |
| mean reward            | 0.189    |
| short episodes         | 264      |
| steps                  | 1.45e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 503      |
| good episodes          | 574      |
| mean len               | 9.91     |
| mean reward            | 0.172    |
| short episodes         | 267      |
| steps                  | 1.46e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 509      |
| good episodes          | 579      |
| mean len               | 9.91     |
| mean reward            | 0.172    |
| short episodes         | 270      |
| steps                  | 1.47e+04 |
------------

-------------------------------------
| % time spent exploring | 2        |
| episodes               | 570      |
| good episodes          | 657      |
| mean len               | 9.01     |
| mean reward            | 0.112    |
| short episodes         | 308      |
| steps                  | 1.69e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 572      |
| good episodes          | 660      |
| mean len               | 8.81     |
| mean reward            | 0.112    |
| short episodes         | 310      |
| steps                  | 1.7e+04  |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 574      |
| good episodes          | 664      |
| mean len               | 9.1      |
| mean reward            | 0.12     |
| short episodes         | 310      |
| steps                  | 1.71e+04 |
------------

-------------------------------------
| % time spent exploring | 2        |
| episodes               | 632      |
| good episodes          | 747      |
| mean len               | 10.8     |
| mean reward            | 0.141    |
| short episodes         | 335      |
| steps                  | 1.93e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 635      |
| good episodes          | 751      |
| mean len               | 11.1     |
| mean reward            | 0.15     |
| short episodes         | 335      |
| steps                  | 1.94e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 641      |
| good episodes          | 755      |
| mean len               | 11.2     |
| mean reward            | 0.15     |
| short episodes         | 339      |
| steps                  | 1.95e+04 |
------------

-------------------------------------
| % time spent exploring | 2        |
| episodes               | 703      |
| good episodes          | 829      |
| mean len               | 8.84     |
| mean reward            | 0.0526   |
| short episodes         | 385      |
| steps                  | 2.17e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 705      |
| good episodes          | 832      |
| mean len               | 9.01     |
| mean reward            | 0.0526   |
| short episodes         | 386      |
| steps                  | 2.18e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 707      |
| good episodes          | 836      |
| mean len               | 8.78     |
| mean reward            | 0.0439   |
| short episodes         | 387      |
| steps                  | 2.19e+04 |
------------

-------------------------------------
| % time spent exploring | 2        |
| episodes               | 783      |
| good episodes          | 911      |
| mean len               | 6.97     |
| mean reward            | 0.0279   |
| short episodes         | 444      |
| steps                  | 2.41e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 786      |
| good episodes          | 915      |
| mean len               | 7.28     |
| mean reward            | 0.0279   |
| short episodes         | 445      |
| steps                  | 2.42e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 787      |
| good episodes          | 918      |
| mean len               | 7.24     |
| mean reward            | 0.0279   |
| short episodes         | 446      |
| steps                  | 2.43e+04 |
------------

-------------------------------------
| % time spent exploring | 2        |
| episodes               | 859      |
| good episodes          | 993      |
| mean len               | 8.41     |
| mean reward            | 0.0533   |
| short episodes         | 495      |
| steps                  | 2.65e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 864      |
| good episodes          | 996      |
| mean len               | 8.2      |
| mean reward            | 0.062    |
| short episodes         | 499      |
| steps                  | 2.66e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 868      |
| good episodes          | 1e+03    |
| mean len               | 8.19     |
| mean reward            | 0.062    |
| short episodes         | 502      |
| steps                  | 2.67e+04 |
------------

-------------------------------------
| % time spent exploring | 2        |
| episodes               | 938      |
| good episodes          | 1.08e+03 |
| mean len               | 8.26     |
| mean reward            | 0.0505   |
| short episodes         | 550      |
| steps                  | 2.89e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 942      |
| good episodes          | 1.08e+03 |
| mean len               | 8.33     |
| mean reward            | 0.0505   |
| short episodes         | 552      |
| steps                  | 2.9e+04  |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 945      |
| good episodes          | 1.08e+03 |
| mean len               | 8.41     |
| mean reward            | 0.0505   |
| short episodes         | 553      |
| steps                  | 2.91e+04 |
------------

-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1.02e+03 |
| good episodes          | 1.16e+03 |
| mean len               | 8.62     |
| mean reward            | 0.0163   |
| short episodes         | 610      |
| steps                  | 3.13e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1.02e+03 |
| good episodes          | 1.16e+03 |
| mean len               | 8.61     |
| mean reward            | 0.0163   |
| short episodes         | 612      |
| steps                  | 3.14e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1.03e+03 |
| good episodes          | 1.16e+03 |
| mean len               | 8.47     |
| mean reward            | 0.0163   |
| short episodes         | 614      |
| steps                  | 3.15e+04 |
------------

-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1.09e+03 |
| good episodes          | 1.23e+03 |
| mean len               | 7.38     |
| mean reward            | 0        |
| short episodes         | 660      |
| steps                  | 3.37e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1.09e+03 |
| good episodes          | 1.24e+03 |
| mean len               | 7.54     |
| mean reward            | 0        |
| short episodes         | 664      |
| steps                  | 3.38e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1.09e+03 |
| good episodes          | 1.24e+03 |
| mean len               | 7.3      |
| mean reward            | 0        |
| short episodes         | 666      |
| steps                  | 3.39e+04 |
------------

-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1.13e+03 |
| good episodes          | 1.31e+03 |
| mean len               | 6.33     |
| mean reward            | 0.00878  |
| short episodes         | 700      |
| steps                  | 3.61e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1.14e+03 |
| good episodes          | 1.32e+03 |
| mean len               | 6.31     |
| mean reward            | 0.00878  |
| short episodes         | 702      |
| steps                  | 3.62e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1.14e+03 |
| good episodes          | 1.32e+03 |
| mean len               | 6.29     |
| mean reward            | 0.00878  |
| short episodes         | 703      |
| steps                  | 3.63e+04 |
------------

-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1.19e+03 |
| good episodes          | 1.39e+03 |
| mean len               | 6.28     |
| mean reward            | 0.0176   |
| short episodes         | 749      |
| steps                  | 3.85e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1.19e+03 |
| good episodes          | 1.39e+03 |
| mean len               | 6.25     |
| mean reward            | 0.0176   |
| short episodes         | 750      |
| steps                  | 3.86e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1.2e+03  |
| good episodes          | 1.4e+03  |
| mean len               | 6.1      |
| mean reward            | 0.00886  |
| short episodes         | 754      |
| steps                  | 3.87e+04 |
------------

-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1.27e+03 |
| good episodes          | 1.46e+03 |
| mean len               | 5.41     |
| mean reward            | 0.0347   |
| short episodes         | 820      |
| steps                  | 4.09e+04 |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1.27e+03 |
| good episodes          | 1.47e+03 |
| mean len               | 5.47     |
| mean reward            | 0.0347   |
| short episodes         | 821      |
| steps                  | 4.1e+04  |
-------------------------------------
-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1.28e+03 |
| good episodes          | 1.47e+03 |
| mean len               | 5.37     |
| mean reward            | 0.0258   |
| short episodes         | 824      |
| steps                  | 4.11e+04 |
------------

-------------------------------------
| % time spent exploring | 2        |
| episodes               | 1.34e+03 |
| good episodes          | 1.54e+03 |
| mean len               | 6.66     |
| mean reward            | 0.00878  |
| short episodes         | 875      |
| steps                  | 4.33e+04 |
-------------------------------------


### Evaluation of DRQN



In [None]:
# Plot the training-loss curve stored in loss.pkl, smoothed with a rolling average.
import numpy as np
from matplotlib import pyplot as plt

# NOTE(review): pickle.load can execute arbitrary code from the file it reads;
# presumably loss.pkl was written by this notebook's own training run -- confirm
# before loading a file obtained from anywhere else.
with open('loss.pkl', 'rb') as loss_file:
    # Only the deserialisation needs the open handle (binary mode for pickle).
    losses_list = pickle.load(loss_file)

# rolling_average asserts a 1-d array, so convert the loaded object first.
raw = np.array(losses_list)
smooth = rolling_average(raw, window_size=100)

# Draw the smoothed curve in red and render the figure.
plt.plot(smooth, 'r')
plt.show()