In [147]:
import sys
sys.path.append('../r2d2_algo/')
import numpy as np
from gym import spaces
import torch
from torch import nn
from segment_tree import SumSegmentTree, MinSegmentTree
import random
import torch
from torch import nn
import torch.nn.functional as F
from model import RNNQNetwork, linear_schedule
from storage import ContinuousSequenceReplayBuffer, SequenceReplayBuffer
from envs import make_vec_envs
import torch.optim as optim
import random
import numpy as np
import gym
import gym_nav
import time

    
    
    
def get_action_dim(action_space):
    """
    Return how many scalar entries are needed to store one action.

    action_space: a gym ``spaces`` instance (Box, Discrete, MultiDiscrete
        or MultiBinary)

    Returns an int. Raises NotImplementedError for any other space type.
    """
    # Continuous box: one entry per element of the box shape
    if isinstance(action_space, spaces.Box):
        n_entries = np.prod(action_space.shape)
        return int(n_entries)

    # A single discrete action is stored as one integer
    if isinstance(action_space, spaces.Discrete):
        return 1

    # One entry per discrete sub-action
    if isinstance(action_space, spaces.MultiDiscrete):
        return int(len(action_space.nvec))

    # One entry per binary sub-action (flat MultiBinary only)
    if isinstance(action_space, spaces.MultiBinary):
        assert isinstance(
            action_space.n, int
        ), "Multi-dimensional MultiBinary action space is not supported. You can flatten it instead."
        return int(action_space.n)

    raise NotImplementedError(f"{action_space} action space is not supported")

In [345]:
  
class SequenceReplayBuffer:
    '''
    Prioritized replay buffer for R2D2 that stores and samples whole sequences
    of time steps rather than individual transitions.

    Steps are first staged per environment (``self.cur_*`` arrays). Once an
    environment reports done or its staged sequence reaches
    burn_in_length + sequence_length steps, the staged sequence is written to
    the main buffer at ``self.pos`` and given the current maximum priority.
    '''

    # Per-step arrays that exist both in the main buffer (self.<name>) and in
    #  the per-env staging area (self.cur_<name>)
    _STEP_FIELDS = ('observations', 'next_observations', 'actions', 'rewards',
                    'dones', 'hidden_states', 'next_hidden_states')
    # Number of zero-trainable-step draws tolerated when sampling by num_steps
    _MAX_SAMPLE_FAILS = 50

    def __init__(self, buffer_size, observation_space,
                 action_space, hidden_state_size, sequence_length=8,
                 burn_in_length=4, n_envs=1,
                 alpha=0.6, beta=0.4, 
                 beta_increment=0.0001, max_priority=1.0):
        '''
        A replay buffer for R2D2 algorithm that when sampled, produces sequences of time steps.
        Will continually store steps until a done is received or max sequence length is reached
            then store that sequence into the replay buffer
          
        self.pos keeps track of the next index to be written to. When it reaches the end 
          it loops back to the start.
        
        buffer_size: number of sequences to hold in buffer
        observation_space: space whose shape and dtype define stored observations
        action_space: space whose dtype (and get_action_dim) define stored actions
        hidden_state_size: size of the recurrent hidden state stored per step
        sequence_length: number of steps in sequence
        burn_in_length: number of steps before idx to be passed with sequence
        n_envs: number of environments feeding add()
        alpha: exponent applied to priorities before they are put in the trees
        beta: importance-sampling exponent, increased by beta_increment on every
            sample() call up to 1.0
        max_priority: priority given to newly stored sequences
        '''
        self.buffer_size = buffer_size
        self.bil_sl = burn_in_length + sequence_length
        self.n_envs = n_envs
        self.sequence_length = sequence_length
        self.burn_in_length = burn_in_length
        self.alpha = alpha
        self.beta = beta
        self.beta_increment = beta_increment
        self.max_priority = max_priority

        # capacity must be positive and a power of 2.
        tree_capacity = 1
        while tree_capacity < buffer_size:
            tree_capacity *= 2
        self.sum_tree = SumSegmentTree(tree_capacity) #trees hold actual priorities for faster updating and sampling
        self.min_tree = MinSegmentTree(tree_capacity)
        
        # buffer shape [buffer_size, sequence_length, data_dim]
        #  note that we add to the buffer regardless of which env the sequence comes from
        
        action_shape = get_action_dim(action_space)
        self.observations = np.zeros((buffer_size, self.bil_sl, *observation_space.shape), dtype=observation_space.dtype)
        self.next_observations = np.zeros((buffer_size, self.bil_sl, *observation_space.shape), dtype=observation_space.dtype)
        self.actions = np.zeros((buffer_size, self.bil_sl, action_shape), dtype=action_space.dtype)
        self.rewards = np.zeros((buffer_size, self.bil_sl), dtype=np.float32)
        self.dones = np.zeros((buffer_size, self.bil_sl), dtype=np.float32)
        self.hidden_states = np.zeros((buffer_size, self.bil_sl, hidden_state_size), dtype=np.float32)
        self.next_hidden_states = np.zeros((buffer_size, self.bil_sl, hidden_state_size), dtype=np.float32)
        # training_masks is used to keep track of which steps are trainable
        #  note that it only has length sequence_length, as opposed to bil+sl
        self.training_masks = np.zeros((buffer_size, self.sequence_length), dtype=np.float32)

        # staging area: the sequence currently being assembled for each env
        self.cur_observations = np.zeros((n_envs, self.bil_sl, *observation_space.shape), dtype=observation_space.dtype)
        self.cur_next_observations = np.zeros((n_envs, self.bil_sl, *observation_space.shape), dtype=observation_space.dtype)
        self.cur_actions = np.zeros((n_envs, self.bil_sl, action_shape), dtype=action_space.dtype)
        self.cur_rewards = np.zeros((n_envs, self.bil_sl), dtype=np.float32)
        self.cur_dones = np.zeros((n_envs, self.bil_sl), dtype=np.float32)
        self.cur_hidden_states = np.zeros((n_envs, self.bil_sl, hidden_state_size), dtype=np.float32)
        self.cur_next_hidden_states = np.zeros((n_envs, self.bil_sl, hidden_state_size), dtype=np.float32)

        self.pos = 0
        
        # number of steps currently staged for each env
        self.cur_pos = np.zeros(n_envs, dtype='long')
        # becomes True once self.pos has wrapped around at least once
        self.full = False
        
        
    def add(self, obs, next_obs, action, reward, done, hidden_state, next_hidden_state):
        '''
        Add to the buffer. Each incoming input should be of shape
            [n_envs, data_dim]
        except the hidden states, which are indexed as [:, env, :]

        A staged sequence is written to the main buffer whenever its env
        reports done or it reaches burn_in_length + sequence_length steps.
        '''
        for i in range(self.n_envs):
            t = self.cur_pos[i]
            self.cur_observations[i, t] = np.array(obs[i]).copy()
            self.cur_next_observations[i, t] = np.array(next_obs[i]).copy()
            self.cur_actions[i, t] = np.array(action[i]).copy()
            self.cur_rewards[i, t] = np.array(reward[i]).copy()
            self.cur_dones[i, t] = np.array(done[i]).copy()
            self.cur_hidden_states[i, t] = np.array(hidden_state[:, i, :]).copy()
            self.cur_next_hidden_states[i, t] = np.array(next_hidden_state[:, i, :]).copy()
            self.cur_pos[i] += 1

            if done[i] or self.cur_pos[i] == self.bil_sl:
                self._store_sequence(i, bool(done[i]))


    def _store_sequence(self, env_idx, episode_done):
        '''
        Write the staged sequence of env_idx into the main buffer at self.pos,
        then reset the staging area (keeping burn-in steps if the episode continues)
        '''
        n_steps = int(self.cur_pos[env_idx])

        # copy the sequence to the buffer
        for name in self._STEP_FIELDS:
            getattr(self, name)[self.pos] = getattr(self, 'cur_' + name)[env_idx]

        # The first burn_in_length steps are never trained on. Clamp at 0: a negative
        #  count would make the slice below mark steps as trainable when none are
        trainable_steps = max(0, n_steps - self.burn_in_length)
        training_mask = np.zeros((self.sequence_length,), dtype=np.float32)
        training_mask[:trainable_steps] = 1
        self.training_masks[self.pos] = training_mask

        # carry the last steps over as burn-in of the next sequence
        #  only if not done. If done, start a new sequence up
        carry_steps = 0 if episode_done else min(n_steps, self.burn_in_length)
        for name in self._STEP_FIELDS:
            staged = getattr(self, 'cur_' + name)[env_idx]  # view into the staging array
            if carry_steps > 0:
                staged[:carry_steps] = staged[n_steps - carry_steps:n_steps].copy()
            staged[carry_steps:] = 0.
        self.cur_pos[env_idx] = carry_steps

        # new sequences get the maximum priority seen so far
        priority = self.max_priority ** self.alpha
        self.sum_tree[self.pos] = priority
        self.min_tree[self.pos] = priority

        self.pos = (self.pos + 1) % self.buffer_size
        if self.pos == 0:
            # wrapped around: every slot now holds a sequence
            self.full = True

                
            
    def _sample_proportional(self, num_sequences):
        '''
        Use sum tree to sample indices from priorities in segments
        (one draw from each of num_sequences equal slices of the total priority)
        '''
        indices = []
        p_total = self.sum_tree.sum()
        segment = p_total / num_sequences
        
        for i in range(num_sequences):
            a = segment * i
            b = segment * (i + 1)
            
            upperbound = random.uniform(a, b)
            idx = self.sum_tree.retrieve(upperbound)
            indices.append(idx)
                    
        return indices
    
    
    def _sample_uniform(self, num_sequences=1):
        '''
        Use sum tree to get n sample indices without stratified sampling

        Returns (indices, total number of trainable steps in those sequences)
        '''
        indices = []
        p_total = self.sum_tree.sum()
        total_trainable_steps = 0

        for i in range(num_sequences):
            upperbound = random.uniform(0, p_total)
            idx = self.sum_tree.retrieve(upperbound)
            indices.append(idx)
            total_trainable_steps += (self.training_masks[idx].sum())

        return indices, total_trainable_steps
        
        
    def _calculate_weight(self, idx):
        '''Calculate the weight of the experience at idx.'''
        # number of stored sequences. Sequences are stored regardless of which
        #  env produced them, so there is no n_envs factor here
        size = len(self)
        
        # get max weight
        p_min = self.min_tree.min() / self.sum_tree.sum()
        max_weight = (p_min * size) ** (-self.beta)
        
        # calculate weights
        p_sample = self.sum_tree[idx] / self.sum_tree.sum()
        weight = (p_sample * size) ** (-self.beta)
        weight = weight / max_weight
        
        return weight
        
        
    def sample(self, num_steps=None, num_sequences=None):
        '''
        Generate a sample, either by number of sequences or number of steps
        Given a number of sequences, use stratified sampling
        Given a number of steps, keep drawing single sequences until they contain
            at least num_steps trainable steps (or too many draws had none)

        Returns a dict of tensors with leading dimension N (number of sequences),
            except the hidden states which are [1, N, hidden_state_size] (taken at
            the first step of each sequence), plus 'idxs', the list of buffer
            indices to pass back to update_priorities
        '''
        if num_sequences is None and num_steps is None:
            raise ValueError('One of num_steps or num_sequences must be given')
        if len(self) == 0:
            raise ValueError('Cannot sample from an empty buffer')

        if num_sequences is not None:
            idxs = self._sample_proportional(num_sequences)
        else:
            idxs = []
            num_fails = 0
            trainable_steps_batched = 0
            while trainable_steps_batched < num_steps and num_fails < self._MAX_SAMPLE_FAILS:
                new_idxs, n_trainable_steps = self._sample_uniform(1)
                idxs += new_idxs
                if n_trainable_steps <= 0:
                    num_fails += 1
                else:
                    trainable_steps_batched += n_trainable_steps
            
            if num_fails >= self._MAX_SAMPLE_FAILS:
                print('Warning - sampling failed 50 times')
            
        
        # weights are [N, 1] tensor to be multiplied to each sequence batch generated
        weights = torch.Tensor([self._calculate_weight(idx) for idx in idxs]).reshape(-1, 1)

        self.beta = min(1.0, self.beta + self.beta_increment)
                
        obs = torch.Tensor(self.observations[idxs])
        next_obs = torch.Tensor(self.next_observations[idxs])
        actions = torch.Tensor(self.actions[idxs])
        rewards = torch.Tensor(self.rewards[idxs])
        dones = torch.Tensor(self.dones[idxs])
        # NOTE(review): next_dones is read from the same array as dones - confirm intended
        next_dones = torch.Tensor(self.dones[idxs])
        hidden_states = torch.Tensor(self.hidden_states[idxs, 0, :]).unsqueeze(0)
        next_hidden_states = torch.Tensor(self.next_hidden_states[idxs, 0, :]).unsqueeze(0)
        training_masks = torch.Tensor(self.training_masks[idxs])
        
        sample = {
            'observations': obs,
            'next_observations': next_obs,
            'actions': actions,
            'rewards': rewards,
            'dones': dones,
            'next_dones': next_dones,
            'hidden_states': hidden_states,
            'next_hidden_states': next_hidden_states,
            'training_masks': training_masks,
            'weights': weights,
            'idxs': idxs,
        }
        
        return sample
    

    def update_priorities(self, idxs, priorities):
        '''
        Set new priorities for previously sampled sequences

        idxs: shape [N,]
        priorities: shape [N,], each strictly positive
        '''
        
        assert len(idxs) == len(priorities)

        for idx, priority in zip(idxs, priorities):
            assert priority > 0
            assert 0 <= idx < len(self)

            self.sum_tree[idx] = priority ** self.alpha
            self.min_tree[idx] = priority ** self.alpha

            self.max_priority = max(self.max_priority, priority)


    def __len__(self):
        '''Number of sequences currently stored'''
        if self.full:
            return self.buffer_size
        else:
            return self.pos

In [364]:
class R2D2Agent(nn.Module):
    """
    Recurrent Q-learning agent that collects experience from vectorized
    environments into a SequenceReplayBuffer and trains an RNNQNetwork on
    sampled sequences with prioritized replay.
    """

    def __init__(self, batch_size=128, burn_in_length=4, sequence_length=8,
                 gamma=0.99, tau=1., learning_rate=2.5e-4, hidden_size=64, adam_epsilon=1e-8,
                 device=torch.device('cpu'), buffer_size=10_000, 
                 learning_starts=10_000, train_frequency=10, target_network_frequency=500,
                 total_timesteps=30_000, start_e=1., end_e=0.05, exploration_fraction=0.5, 
                 alpha=0.6, beta=0.4,
                 seed=None, n_envs=1, dummy_env=True,
                 env_id='CartPole-v1', env_kwargs=None,
                 verbose=0, q_network=None,  deterministic=False, env=None,
                 writer=None, handle_target_network=True):
        """
        R2D2 setup following same parameters as args.py has
        verbose: Level of verbosity of print statements
            1: print episode lengths and returns means every 2000 steps
            2: print every episode length and return
        q_network: Mostly for use of evaluation with a saved q_network
          optionally pass in a q_network to use manually
        deterministic: stored on the agent; not read by any method of this class
        env: Also option to manually pass in an environment
        env_kwargs: dict of keyword arguments forwarded to make_vec_envs
            (None is treated as an empty dict)
        n_envs: option to make multiple envs and have q_network generate multiple
        dummy_env: whether to use DummyVecEnv as opposed to SubprocVecEnv for testing
        writer: option to pass a tensorboard SummaryWriter object
        handle_target_network: whether this class is in charge of updating the target network
            params
        """
        
        super().__init__()

        # avoid sharing one mutable default dict between agents
        if env_kwargs is None:
            env_kwargs = {}
        
        self.learning_rate = learning_rate
        self.buffer_size = buffer_size
        self.total_timesteps = total_timesteps
        self.learning_starts = learning_starts
        self.train_frequency = train_frequency
        self.gamma = gamma
        self.tau = tau
        self.adam_epsilon = adam_epsilon
        self.target_network_frequency = target_network_frequency
        self.handle_target_network = handle_target_network
        self.device = device

        self.start_e = start_e
        self.end_e = end_e
        self.exploration_fraction = exploration_fraction

        self.burn_in_length = burn_in_length
        self.sequence_length = sequence_length
        self.batch_size = batch_size
        self.n_envs = n_envs
        self.hidden_size = hidden_size

        self.seed = seed
        self.deterministic = deterministic
        if env is None:
            self.env = make_vec_envs(env_id, n_envs, env_kwargs=env_kwargs,
                                     dummy=dummy_env)
        else:
            self.env = env
        
        
        if q_network is None:
            self.q_network = RNNQNetwork(self.env, hidden_size).to(device)
        else:
            self.q_network = q_network
        self.target_network = RNNQNetwork(self.env, hidden_size).to(device)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate, eps=adam_epsilon)
        
        # self.rb = ContinuousSequenceReplayBuffer(buffer_size, self.env.observation_space, self.env.action_space,
        #                         hidden_size, sequence_length=sequence_length, 
        #                         burn_in_length=burn_in_length, n_envs=n_envs,
        #                         alpha=alpha, beta=beta)
        self.rb = SequenceReplayBuffer(buffer_size, self.env.observation_space, self.env.action_space,
                                hidden_size, sequence_length=sequence_length, 
                                burn_in_length=burn_in_length, n_envs=n_envs,
                                alpha=alpha, beta=beta)

        
        self.global_step = 0
        self.global_update_step = 0
        self.rnn_hxs = self.q_network.get_rnn_hxs(self.n_envs)
        self.obs = self.env.reset()
        self.masks = torch.zeros((self.n_envs, 1), dtype=torch.float32)
        
        # running length / return of the episode in progress in each env
        self.cur_episode_t = np.zeros(self.n_envs)
        self.cur_episode_r = np.zeros(self.n_envs)
        
        self.verbose = verbose
        self.writer = writer
        self.start_time = time.time()
        # lengths / returns of episodes finished since the last periodic reset
        self.lengths = []
        self.returns = []
        
    
    def act(self, obs, rnn_hxs, use_epsilon=True, masks=None):
        """Compute q values and sample policy. If use_epsilon is True,
        perform random action with probability based on current global timestep
        
        masks: tensor of shape (N, 1) which has entries 0.0 when done
            and 1.0 when not done, indicating when rnn_hxs should be reset
            Used for vectorized environments

        Returns (action, q_values, next_rnn_hxs) where action is a numpy array
            with at least 2 dimensions
        Raises ValueError if the observation rank is not one that is handled
        """            
        epsilon = self.get_epsilon(use_epsilon)
        
        obs_tensor = torch.Tensor(obs).to(self.device)
        # dimension of q_values that indexes the actions
        action_dim = None
        if obs_tensor.dim() < rnn_hxs.dim():
            # We have an observation from the environment but need to unsqueeze
            #  to tell the GRU that this is an observation of time length 1
            # If it is batched (dim == 2), then we add an axis in the middle
            #  otherwise add it to the start
            if obs_tensor.dim() == 1:
                obs_tensor = obs_tensor.unsqueeze(0)
                action_dim = 1
            elif obs_tensor.dim() == 2:
                obs_tensor = obs_tensor.unsqueeze(1)
                action_dim = 2
        
        else:
            if obs_tensor.dim() == 2:
                action_dim = 1
            elif obs_tensor.dim() == 3:
                action_dim = 2  

        if action_dim is None:
            raise ValueError(
                f'Unsupported observation with {obs_tensor.dim()} dimensions '
                f'for rnn_hxs with {rnn_hxs.dim()} dimensions')
            
        q_values, gru_out, next_rnn_hxs = self.q_network(obs_tensor, rnn_hxs, masks=masks)
                
        
        # greedy action; move to cpu first so this also works on other devices
        action = q_values.argmax(dim=action_dim).detach().cpu().numpy()
        if use_epsilon:
            # each entry is independently replaced by a random action with prob epsilon
            if len(action.shape) == 1:
                for i in range(action.shape[0]):
                    if random.random() < epsilon:
                        action[i] = self.env.action_space.sample()
            elif len(action.shape) == 2:
                for i in range(action.shape[0]):
                    for j in range(action.shape[1]):
                        if random.random() < epsilon:
                            action[i, j] = self.env.action_space.sample()
            
        if len(action.shape) == 1:
            action = action[np.newaxis, :]

        return action, q_values, next_rnn_hxs
                
        
    def collect(self, num_steps):
        """Perform policy for n steps and add to memory buffer
        
        Note that we will add a total of num_steps * self.n_envs to the buffer"""
        env = self.env
        
        for t in range(num_steps):
            action, q_values, next_rnn_hxs = self.act(self.obs, self.rnn_hxs, masks=self.masks)
            next_obs, reward, done, info = env.step(action)
            # Detach so the autograd graph does not keep growing across
            #  environment steps; nothing is backpropagated through collection
            next_rnn_hxs = next_rnn_hxs.detach()
            
            self.cur_episode_r += reward
            self.cur_episode_t += 1
            
            # Masks are used to reset hidden state when vectorized environments give dones
            self.masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])

            for i, done_ in enumerate(done):
                if done_:
                    if self.verbose == 2:
                        print(f'Episode R: {self.cur_episode_r[i]}, L: {self.cur_episode_t[i]}')
                        
                    if self.writer is not None:
                        self.writer.add_scalar('charts/episodic_return', self.cur_episode_r[i], self.global_step)
                        self.writer.add_scalar('charts/episodic_length', self.cur_episode_t[i], self.global_step)
                        self.writer.add_scalar('charts/epsilon', self.get_epsilon(), self.global_step)

                    self.lengths.append(self.cur_episode_t[i])
                    self.returns.append(self.cur_episode_r[i])
                    
                    self.cur_episode_r[i] = 0
                    self.cur_episode_t[i] = 0
                    

            # for ContinuousSequenceReplayBuffer
            # self.rb.add(self.obs, next_obs, action, reward, done, self.rnn_hxs.detach())
            self.rb.add(self.obs, next_obs, action, reward, done,
                        self.rnn_hxs.detach().cpu(), next_rnn_hxs.cpu())
            
            self.obs = next_obs
            self.rnn_hxs = next_rnn_hxs
            
            self.global_step += self.n_envs
            
            # global_step advances by n_envs, so "% freq < n_envs" fires once per freq steps
            if self.handle_target_network and self.global_step > self.learning_starts and \
                self.global_step % self.target_network_frequency < self.n_envs:
                for target_network_param, q_network_param in zip(self.target_network.parameters(), self.q_network.parameters()):
                    target_network_param.data.copy_(
                        self.tau * q_network_param.data + (1 - self.tau) * target_network_param.data
                    )
            
            if self.global_step % 2000 < self.n_envs:
                if self.verbose == 1:
                    print(f'Mean episode length {np.mean(self.lengths)}, mean return {np.mean(self.returns)}')
                self.lengths = []
                self.returns = []

                
            
    
    def update(self):
        """Sample from buffer and perform Q-learning"""
        
        # NOTE(review): the argument is passed positionally, i.e. as the first
        #  parameter of rb.sample (num_steps for SequenceReplayBuffer) - confirm
        #  that a step budget rather than a number of sequences is intended
        sample = self.rb.sample(self.batch_size//self.sequence_length)
        states = sample['observations'].to(self.device)
        next_states = sample['next_observations'].to(self.device)
        hidden_states = sample['hidden_states'].to(self.device)
        next_hidden_states = sample['next_hidden_states'].to(self.device)
        actions = sample['actions'].to(self.device)
        rewards = sample['rewards'].to(self.device)
        dones = sample['dones'].to(self.device)
        next_dones = sample['next_dones'].to(self.device)
        # training masks come from SequenceReplayBuffer and tell us which
        #  steps in each sequence actually are viable for training
        training_masks = sample['training_masks'].to(self.device)
        weights = sample['weights'].to(self.device)
        
        with torch.no_grad():
            target_q, _, _ = self.target_network(next_states, next_hidden_states, next_dones)
            target_max, _ = target_q.max(dim=2)
            td_target = rewards + self.gamma * target_max * (1 - dones)
        old_q, _, _ = self.q_network(states, hidden_states, dones)
        # Only drop the trailing action axis: a bare squeeze() would also drop
        #  the batch axis when a single sequence was sampled
        old_val = old_q.gather(2, actions.long()).squeeze(-1)

        # burn-in steps only warm up the hidden state and are excluded from the loss
        elementwise_loss = F.smooth_l1_loss(td_target[:, self.burn_in_length:],
                                            old_val[:, self.burn_in_length:], reduction='none')
        loss = torch.mean(elementwise_loss * weights * training_masks)
                
        if self.writer is not None and self.global_update_step % 10 == 0:
            self.writer.add_scalar('losses/td_loss', loss, self.global_step)
            self.writer.add_scalar('losses/q_values', old_val.mean().item(), self.global_step)
            sps = int(self.global_step / (time.time() - self.start_time))
            self.writer.add_scalar('charts/SPS', sps, self.global_step)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        # PER: update priorities
        # for ContinuousSequenceReplayBuffer
        # td_priorities = elementwise_loss.detach().cpu().numpy() + 1e-6
        # self.rb.update_priorities(sample['seq_idxs'][:, self.burn_in_length:],
        #                           sample['env_idxs'], td_priorities)
        # small constant keeps every priority strictly positive
        td_priorities = elementwise_loss.mean(dim=1).detach().cpu().numpy() + 1e-6
        self.rb.update_priorities(sample['idxs'], td_priorities)

        self.global_update_step += 1
    

    def train(self, n_updates):
        """Collect until learning_starts steps have been taken, then alternate
        train_frequency collection steps with one update, n_updates times"""
        if self.global_step < self.learning_starts:
            self.collect((self.learning_starts - self.global_step) // self.n_envs + 1)
        
        for i in range(n_updates):
            self.collect(self.train_frequency)
            self.update()


    def get_rnn_hxs(self):
        """Get a hidden state for n_envs environments from the q_network"""
        return self.q_network.get_rnn_hxs(self.n_envs)
    
    def get_epsilon(self, use_epsilon=True):
        """Exploration rate at the current global step, or 0 if use_epsilon is False"""
        if use_epsilon:
            epsilon = linear_schedule(self.start_e, self.end_e, 
                        self.exploration_fraction*self.total_timesteps,
                        self.global_step)
        else:
            epsilon = 0
            
        return epsilon

In [365]:
# Environment configuration, forwarded both to gym.make and to the agent below
env_kwargs = {
        'num_objects': 0, 'rew_structure': 'goal',
        'task_structure': 2, 'wall_colors': 4,
        'num_rays': 12, 'fov': 1
}
# NOTE(review): this standalone env is not handed to the agent (no env= argument),
#  so R2D2Agent builds its own vectorized envs through make_vec_envs
env = gym.make('NavEnv-v0', **env_kwargs)
# 4 environments stepped together; sequence_length is left at its default
agent = R2D2Agent(env_id='NavEnv-v0', env_kwargs=env_kwargs,
                 verbose=1, buffer_size=5000, alpha=0.6, batch_size=256,
                 burn_in_length=4, n_envs=4, dummy_env=True)

In [366]:
# Run the policy for 1000 vectorized steps (1000 * n_envs transitions) to fill the replay buffer
agent.collect(1000)

Mean episode length 202.0, mean return 0.0
Mean episode length 202.0, mean return 0.0


In [286]:
# The positional argument is num_steps: sequences are drawn until they hold at least 256 trainable steps
sample = agent.rb.sample(256)

In [287]:
# Expected layout: [num_sequences, burn_in_length + sequence_length, *observation shape]
sample['observations'].shape

torch.Size([37, 16, 24])

In [144]:

# One manual act + env.step, mirroring the first two lines of the loop in R2D2Agent.collect.
# Nothing is written back to agent.obs / agent.rnn_hxs or the replay buffer here.
action, q_values, next_rnn_hxs = agent.act(agent.obs, agent.rnn_hxs, masks=agent.masks)
env = agent.env
next_obs, reward, done, info = env.step(action)


In [122]:
%%timeit
# Timing of 1000 vectorized collection steps (each timed run also advances the agent's state)
agent.collect(1000)

Mean episode length 202.0, mean return 0.0
Mean episode length 179.16666666666666, mean return 0.16666666666666666
Mean episode length 202.0, mean return 0.0
Mean episode length 202.0, mean return 0.0
897 ms ± 22.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [165]:
%%timeit
# Timing of 8 vectorized collection steps
agent.collect(8)

Mean episode length 202.0, mean return 0.0
Mean episode length 202.0, mean return 0.0
Mean episode length 202.0, mean return 0.0
6.85 ms ± 279 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [367]:
%%timeit
# Timing of 2 vectorized collection steps
agent.collect(2)

Mean episode length 202.0, mean return 0.0
Mean episode length 202.0, mean return 0.0
Mean episode length 194.25, mean return 0.08333333333333333
3.05 ms ± 258 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [368]:
%%timeit
# Timing of one gradient update (each timed run really updates the network and the priorities)
agent.update()

8.46 ms ± 738 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [369]:
# NOTE(review): scratch arithmetic - presumably trainable steps per 1000 collected steps
#  with 12-step sequences of which 8 are trainable; confirm or delete
1000/12*8

666.6666666666666

In [36]:
%%timeit
# Timing of sampling with a budget of 256 trainable steps (beta is incremented on every call)
sample = agent.rb.sample(256)

3.2 ms ± 156 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [37]:
%%timeit
# Timing of sampling with a budget of 32 trainable steps
sample = agent.rb.sample(32)

511 µs ± 12.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [279]:
# Single gradient update, run once outside of timing
agent.update()

In [280]:
# Step-by-step copy of R2D2Agent.update(), kept at top level so that the
#  intermediate tensors (dones, training_masks, ...) can be inspected afterwards.
# Unlike update(), it does not increment agent.global_update_step.
sample = agent.rb.sample(agent.batch_size//agent.sequence_length)
states = sample['observations']
next_states = sample['next_observations']
hidden_states = sample['hidden_states']
next_hidden_states = sample['next_hidden_states']
actions = sample['actions']
rewards = sample['rewards']
dones = sample['dones']
next_dones = sample['next_dones']
#training_masks are given by SequenceReplayBuffer
training_masks = sample['training_masks']

with torch.no_grad():
    target_q, _, _ = agent.target_network(next_states, next_hidden_states, next_dones)
    target_max, _ = target_q.max(dim=2)
    td_target = rewards + agent.gamma * target_max * (1 - dones)
old_q, _, _ = agent.q_network(states, hidden_states, dones)
# Only drop the trailing action axis: a bare squeeze() would also drop the
#  batch axis when a single sequence was sampled
old_val = old_q.gather(2, actions.long()).squeeze(-1)

# burn-in steps are excluded from the loss
weights = sample['weights']
elementwise_loss = F.smooth_l1_loss(td_target[:, agent.burn_in_length:],
                                    old_val[:, agent.burn_in_length:], reduction='none')
loss = torch.mean(elementwise_loss * weights * training_masks)

if agent.writer is not None and agent.global_update_step % 10 == 0:
    agent.writer.add_scalar('losses/td_loss', loss, agent.global_step)
    agent.writer.add_scalar('losses/q_values', old_val.mean().item(), agent.global_step)
    sps = int(agent.global_step / (time.time() - agent.start_time))
    agent.writer.add_scalar('charts/SPS', sps, agent.global_step)

agent.optimizer.zero_grad()
loss.backward()
agent.optimizer.step()

# PER: update priorities (small constant keeps them strictly positive)
td_priorities = elementwise_loss.mean(dim=1).detach().cpu().numpy() + 1e-6
agent.rb.update_priorities(sample['idxs'], td_priorities)

In [282]:
dones

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [281]:
training_masks

tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [356]:
agent.collect(1)

In [357]:
agent.rb.cur_dones

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
      dtype=float32)

In [358]:
agent.rb.cur_observations[:, :, 5:10]

array([[[0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        ,

In [359]:
agent.rb.cur_pos

array([ 0,  8, 12, 12])

In [360]:
agent.rb.pos

362

In [363]:
agent.rb.observations[360][:, 5:10]

array([[0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667],
       [0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667],
       [0.16666667, 0.16666667, 0.33333334, 0.33333334, 0.33333334],
       [0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667],
       [0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667],
       [0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667],
       [0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667],
       [0.16666667, 0.16666667, 0.16666667, 0.33333334, 0.33333334],
       [0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667],
       [0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667],
       [0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667],
       [0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667],
       [0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667],
       [0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667],
       [0.        , 0.        , 0.

In [342]:
agent.rb.dones[367]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
      dtype=float32)

In [267]:
agent.rb.training_masks[547]

array([1., 1., 1., 1., 1., 1., 0., 0.], dtype=float32)

In [257]:
agent.rb.burn_in_length

8

In [199]:
states[-1]

tensor([[0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.8333, 0.8333, 0.8333, 0.8333,
         0.8333, 0.8333, 0.8333, 0.3611, 0.3736, 0.3897, 0.4103, 0.4364, 0.4024,
         0.3713, 0.3468, 0.3275, 0.3123, 0.3005, 0.2914],
        [0.5000, 0.5000, 0.8333, 0.8333, 0.8333, 0.8333, 0.8333, 0.8333, 0.8333,
         0.8333, 0.8333, 0.8333, 0.3974, 0.4200, 0.4253, 0.3890, 0.3608, 0.3386,
         0.3210, 0.3072, 0.2965, 0.2885, 0.2828, 0.2792],
        [0.5000, 0.5000, 0.8333, 0.8333, 0.8333, 0.8333, 0.8333, 0.8333, 0.8333,
         0.8333, 0.8333, 0.8333, 0.3974, 0.4200, 0.4253, 0.3890, 0.3608, 0.3386,
         0.3210, 0.3072, 0.2965, 0.2885, 0.2828, 0.2792],
        [0.5000, 0.5000, 0.8333, 0.8333, 0.8333, 0.8333, 0.8333, 0.8333, 0.8333,
         0.8333, 0.8333, 0.8333, 0.3835, 0.4054, 0.3941, 0.3605, 0.3343, 0.3137,
         0.2975, 0.2847, 0.2748, 0.2673, 0.2621, 0.2587],
        [0.8333, 0.8333, 0.8333, 0.8333, 0.8333, 0.8333, 0.8333, 0.8333, 0.8333,
         0.8333, 0.8333, 0.8333, 0.3796

In [192]:
training_masks * elementwise_loss

tensor([[2.9684e-04, 3.5871e-04, 3.9544e-04, 6.3365e-07, 4.3067e-04, 4.5668e-04,
         4.9324e-04, 3.5182e-08],
        [3.0544e-03, 3.2638e-03, 1.2259e-04, 1.1976e-04, 3.3821e-03, 5.0191e-04,
         3.3470e-03, 6.1449e-07],
        [1.9314e-04, 1.4818e-04, 7.3708e-04, 1.5522e-07, 6.0924e-03, 5.0173e-03,
         0.0000e+00, 0.0000e+00],
        [7.9361e-04, 3.8188e-07, 5.1485e-03, 5.1389e-03, 7.6737e-07, 4.6772e-03,
         0.0000e+00, 0.0000e+00],
        [9.4599e-05, 8.6761e-05, 1.0171e-04, 1.2294e-04, 8.1208e-04, 4.8006e-03,
         0.0000e+00, 0.0000e+00]], grad_fn=<MulBackward0>)

In [191]:
elementwise_loss

tensor([[2.9684e-04, 3.5871e-04, 3.9544e-04, 6.3365e-07, 4.3067e-04, 4.5668e-04,
         4.9324e-04, 3.5182e-08],
        [3.0544e-03, 3.2638e-03, 1.2259e-04, 1.1976e-04, 3.3821e-03, 5.0191e-04,
         3.3470e-03, 6.1449e-07],
        [1.9314e-04, 1.4818e-04, 7.3708e-04, 1.5522e-07, 6.0924e-03, 5.0173e-03,
         1.4288e-04, 1.4209e-04],
        [7.9361e-04, 3.8188e-07, 5.1485e-03, 5.1389e-03, 7.6737e-07, 4.6772e-03,
         1.7489e-04, 1.5157e-04],
        [9.4599e-05, 8.6761e-05, 1.0171e-04, 1.2294e-04, 8.1208e-04, 4.8006e-03,
         1.3216e-04, 1.3028e-04]], grad_fn=<SmoothL1LossBackward0>)

In [104]:
len(sample['idxs'])

16

In [67]:
sample['idxs']

[123, 19, 16, 46, 43, 14, 93, 73, 30, 127, 123, 68, 122, 39, 13, 84]

In [69]:
agent.rb.pos

127

In [79]:
# Inspect the raw sum-tree entries around the replay buffer's write position.
# NOTE(review): indexing at `cap + i` presumably addresses the leaf holding the
# priority of buffer slot i (leaves stored after the internal nodes) — verify
# against SumSegmentTree. The slice covers slots 126..129.
cap = agent.rb.sum_tree.capacity
st = agent.rb.sum_tree
st.tree[cap+126:130+cap]

[0.018921368084995137, 1.0, 0.0, 0.0]

In [50]:
elementwise_loss.mean(dim=1)

torch.Size([16])

In [39]:
agent.rb.update_priorities

AttributeError: 'SequenceReplayBuffer' object has no attribute 'update_priorities'