In [None]:
#Playing Atari Gravitar with a Double Dueling DQN and a Prioritized Experience Replay (PER) buffer

#References to code are included as single line comments above the appropriate code block

In [None]:
import sys

!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip install gym

import os
import random
from typing import Dict, List, Tuple
import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

if not os.path.exists("segment_tree.py"):
    #Fetch the segment tree module used by the PER buffer; skipped when the file is already in the working directory
    #Source notebook: https://github.com/Curt-Park/rainbow-is-all-you-need/blob/master/03.per.ipynb
    !wget https://raw.githubusercontent.com/curt-park/rainbow-is-all-you-need/master/segment_tree.py
        
from segment_tree import MinSegmentTree, SumSegmentTree

Reading package lists... Done
Building dependency tree       
Reading state information... Done
python-opengl is already the newest version (3.1.0+dfsg-1).
0 upgraded, 0 newly installed, 0 to remove and 10 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 10 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
xvfb is already the newest version (2:1.19.6-1ubuntu4.8).
0 upgraded, 0 newly installed, 0 to remove and 10 not upgraded.


Prioritized Experience Replay (PER) - an extension of the standard replay buffer


In [None]:
#Merged with standard Replay Buffer into a single PER class
#https://github.com/Curt-Park/rainbow-is-all-you-need/blob/master/03.per.ipynb
class PrioritizedReplayBuffer():
    """Circular replay buffer with priority-based sampling (PER).

    Transitions are kept in preallocated NumPy arrays. Each slot's priority,
    raised to ``alpha``, is mirrored in a ``SumSegmentTree`` and a
    ``MinSegmentTree`` (both from the downloaded ``segment_tree`` module),
    which are used to pick indices and to compute importance-sampling weights.
    """

    def __init__(self, obs_dim, size, batch_size = 32,alpha = 0.6):
        """Allocate the storage arrays and the two priority trees.

        Args:
            obs_dim: length of one flat observation vector.
            size: maximum number of transitions held; older ones are overwritten.
            batch_size: number of transitions returned by ``sample_batch``.
            alpha: exponent applied to a priority before it is written to the trees.
        """
        super(PrioritizedReplayBuffer, self).__init__()

        self.obs_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.next_obs_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.acts_buf = np.zeros([size], dtype=np.float32)
        self.rews_buf = np.zeros([size], dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.max_size, self.batch_size = size, batch_size
        self.ptr, self.size, = 0, 0

        #New transitions enter with the largest priority seen so far
        self.max_priority, self.tree_ptr = 1.0, 0
        self.alpha = alpha

        #Capacity must be positive and a power of 2
        tree_capacity = 1
        while tree_capacity < self.max_size:
            tree_capacity *= 2

        self.sum_tree = SumSegmentTree(tree_capacity)
        self.min_tree = MinSegmentTree(tree_capacity)

    #For checking buffer size
    def __len__(self):
        """Return the number of transitions currently stored."""
        return self.size

    #Write to the buffer
    def store(self, obs, act, rew, next_obs, done):
        """Write one transition at the current slot and give it the maximum priority."""
        self.obs_buf[self.ptr] = obs
        self.next_obs_buf[self.ptr] = next_obs
        self.acts_buf[self.ptr] = act
        self.rews_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

        initial_priority = self.max_priority ** self.alpha
        self.sum_tree[self.tree_ptr] = initial_priority
        self.min_tree[self.tree_ptr] = initial_priority
        self.tree_ptr = (self.tree_ptr + 1) % self.max_size

    #Sample a batch of experiences
    def sample_batch(self, beta = 0.4):
        """Sample ``batch_size`` transitions together with their importance weights.

        Args:
            beta: exponent used for the importance-sampling weights.

        Returns:
            dict with keys ``obs``, ``next_obs``, ``acts``, ``rews``, ``done``,
            ``weights`` and ``indices`` (the sampled slot indices, as a list).
        """
        indices = self._sample_proportional()

        #The normaliser is the same for every index, so compute it once per batch
        max_weight = self._max_weight(beta)
        weights = np.array(
            [self._calculate_weight(i, beta, max_weight) for i in indices]
        )

        return dict(
            obs=self.obs_buf[indices],
            next_obs=self.next_obs_buf[indices],
            acts=self.acts_buf[indices],
            rews=self.rews_buf[indices],
            done=self.done_buf[indices],
            weights=weights,
            indices=indices,
        )

    #Update transition priorities
    def update_priorities(self, indices, priorities):
        """Overwrite the priorities of the given slots.

        Args:
            indices: slot indices, as returned by ``sample_batch``.
            priorities: one positive priority per index. Any array shape is
                accepted (e.g. a ``(batch, 1)`` column); it is flattened first.

        Raises:
            ValueError: if a priority is not strictly positive (NaN included).
        """
        #Flatten so the trees and max_priority hold plain floats, never 1-element arrays
        flat_priorities = np.asarray(priorities, dtype=np.float64).reshape(-1)

        for idx, raw_priority in zip(indices, flat_priorities):
            priority = float(raw_priority)
            if not priority > 0:
                raise ValueError(
                    "priorities must be strictly positive, got {}".format(priority)
                )
            scaled = priority ** self.alpha
            self.sum_tree[idx] = scaled
            self.min_tree[idx] = scaled
            self.max_priority = max(self.max_priority, priority)

    #Sample indices
    def _sample_proportional(self):
        """Draw one index from each of ``batch_size`` equal-width slices of the priority mass."""
        indices = []
        p_total = self.sum_tree.sum(0, len(self) - 1)
        segment = p_total / self.batch_size

        for i in range(self.batch_size):
            a = segment * i
            b = segment * (i + 1)
            upperbound = random.uniform(a, b)
            idx = self.sum_tree.retrieve(upperbound)
            indices.append(idx)

        return indices

    #Largest possible weight, used to normalise the weights of a batch
    def _max_weight(self, beta):
        """Return the weight of the lowest-priority entry, i.e. the largest weight."""
        p_min = self.min_tree.min() / self.sum_tree.sum()
        return (p_min * len(self)) ** (-beta)

    #Calculate the weight of an experience at the specified index
    def _calculate_weight(self, idx, beta, max_weight = None):
        """Return the normalised importance-sampling weight of slot ``idx``.

        Args:
            idx: slot index.
            beta: importance-sampling exponent.
            max_weight: optional precomputed normaliser; computed here when omitted.
        """
        if max_weight is None:
            max_weight = self._max_weight(beta)

        # calculate weights
        p_sample = self.sum_tree[idx] / self.sum_tree.sum()
        weight = (p_sample * len(self)) ** (-beta)

        return weight / max_weight

Dueling network 

In [None]:
#https://github.com/higgsfield/RL-Adventure/blob/master/3.dueling%20dqn.ipynb
class DuelingCNN_Network(nn.Module):
    """Dueling Q-network built from fully connected layers only.

    A shared feature layer feeds two heads: a scalar state value and one
    advantage per action. The heads are combined into Q-values in ``forward``.
    """

    def __init__(self, in_dim: int, out_dim: int):
        """Create the shared layer and both heads.

        Args:
            in_dim: size of the input observation vector.
            out_dim: number of actions (width of the advantage head).
        """
        super().__init__()
        hidden = 128

        # shared trunk
        self.feature_layer = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU())

        # per-action advantage head
        advantage_modules = [
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, out_dim),
        ]
        self.advantage_layer = nn.Sequential(*advantage_modules)

        # scalar state-value head
        value_modules = [
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        ]
        self.value_layer = nn.Sequential(*value_modules)

    def forward(self, x):
        """Return Q-values: value plus advantage, minus the mean advantage over actions."""
        shared = self.feature_layer(x)

        state_value = self.value_layer(shared)
        action_advantage = self.advantage_layer(shared)
        mean_advantage = action_advantage.mean(dim=-1, keepdim=True)

        return state_value + action_advantage - mean_advantage
       

Agent

In [None]:
#Adapted from #https://github.com/Curt-Park/rainbow-is-all-you-need/blob/master/03.per.ipynb
class DQNAgent:
    """Double dueling DQN agent trained from a prioritized replay buffer.

    Holds an online network (``dqn``) and a periodically synchronised target
    network (``dqn_target``), an epsilon-greedy policy with linear epsilon
    decay, and a ``PrioritizedReplayBuffer`` whose priorities are refreshed
    from the per-sample loss after every update.
    """

    def __init__(self, env, memory_size, batch_size, target_update, epsilon_decay, max_epsilon = 1.0, min_epsilon = 0.05, gamma = 0.98,

        # PER parameters
        alpha = 0.2,
        beta = 0.6,
        prior_eps = 1e-6,
    ):
        """Build the networks, optimizer and replay memory.

        Args:
            env: gym-style environment with a flat observation space and a
                discrete action space.
            memory_size: capacity of the replay buffer.
            batch_size: number of transitions per model update.
            target_update: number of model updates between target-network syncs.
            epsilon_decay: fraction of ``max_epsilon - min_epsilon`` removed
                from epsilon after each model update.
            max_epsilon: initial exploration rate.
            min_epsilon: floor of the exploration rate.
            gamma: discount factor applied to the bootstrapped next-state value.
            alpha: priority exponent passed to the replay buffer.
            beta: initial importance-sampling exponent; annealed to 1.0 in ``train``.
            prior_eps: small constant added to the loss to keep priorities positive.
        """
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        self.env = env
        self.batch_size = batch_size
        self.epsilon = max_epsilon
        self.epsilon_decay = epsilon_decay #Rate of epsilon decay
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon #Epsilon decays down to min_epsilon
        self.target_update = target_update
        self.gamma = gamma #Discount factor

        #Run on GPU if available
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )

        # PER
        self.beta = beta
        self.prior_eps = prior_eps

        #Initialize prioritized memory replay (versus standard replay buffer)
        self.memory = PrioritizedReplayBuffer(
            obs_dim, memory_size, batch_size, alpha
        )

        #Dueling networks: dqn, dqn_target
        #Dueling architecture - #https://github.com/higgsfield/RL-Adventure/blob/master/3.dueling%20dqn.ipynb
        #https://github.com/Curt-Park/rainbow-is-all-you-need/blob/master/04.dueling.ipynb

        self.dqn = DuelingCNN_Network(obs_dim, action_dim).to(self.device)
        self.dqn_target = DuelingCNN_Network(obs_dim, action_dim).to(self.device)
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()

        #Adam optimizer
        self.optimizer = optim.Adam(self.dqn.parameters())

        #State transition to write to the memory buffer
        self.transition = list()

    #Agent selects an action
    def select_action(self, state):
        """Pick an action epsilon-greedily and start a new pending transition.

        Returns:
            The chosen action: a random sample from the action space with
            probability ``epsilon``, otherwise the argmax of the online
            network's Q-values as a Python int.
        """
        #Explore vs. follow policy
        if self.epsilon > np.random.random():
            selected_action = self.env.action_space.sample()
        else:
            #Inference only: no autograd graph is needed to choose an action
            with torch.no_grad():
                q_values = self.dqn(torch.FloatTensor(state).to(self.device))
            selected_action = int(q_values.argmax().item())

        self.transition = [state, selected_action]

        return selected_action

    #Agent takes an action, stores it to memory, and returns environment response (reward)
    def step(self, action):
        """Apply ``action`` to the environment and store the completed transition.

        Returns:
            Tuple ``(next_state, reward, done)``.
        """
        next_state, reward, done, _ = self.env.step(action)
        self.transition += [reward, next_state, done]
        self.memory.store(*self.transition)

        return next_state, reward, done

    def update_model(self):
        """Run one gradient step on a prioritized batch and refresh its priorities.

        Returns:
            The importance-weighted mean loss as a Python float.
        """
        # PER needs beta to calculate weights
        samples = self.memory.sample_batch(self.beta)
        weights = torch.FloatTensor(
            samples["weights"].reshape(-1, 1)
        ).to(self.device)
        indices = samples["indices"]

        #PER: importance sampling before average
        elementwise_loss = self._compute_dqn_loss(samples)
        loss = torch.mean(elementwise_loss * weights)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # PER: update priorities
        #Flatten the (batch, 1) loss so the buffer receives one scalar per index
        loss_for_prior = elementwise_loss.detach().cpu().numpy().reshape(-1)
        new_priorities = loss_for_prior + self.prior_eps
        self.memory.update_priorities(indices, new_priorities)

        return loss.item()

    #Main training loop
    def train(self, num_frames):
        """Interact with the environment for ``num_frames`` steps, learning as it goes.

        Beta is annealed linearly from its value at the start of this call to
        1.0 at the final frame. Episode scores are printed as training runs,
        and the environment is closed at the end.
        """
        #Printing episode data to console
        print_every = 1

        state = self.env.reset()
        update_cnt = 0
        #Initialize array for log printing
        marking  = []
        score = 0
        episode_count = 0

        #Anchor the schedule on the starting value; updating beta from its own
        #current value each frame would compound and saturate at 1.0 almost immediately
        initial_beta = self.beta

        for frame_idx in range(1, num_frames + 1):
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward

            # PER: increase beta
            fraction = min(frame_idx / num_frames, 1.0)
            self.beta = initial_beta + fraction * (1.0 - initial_beta)

            #When episode is completed log scores and reset variables
            if done:
                state = self.env.reset()
                marking.append(score)
                #Printing to logs
                if episode_count%100 == 0:
                    print("marking, episode: {}, score: {:.1f}, mean_score: {:.2f}, std_score: {:.2f}".format(
                        episode_count, score, np.array(marking).mean(), np.array(marking).std()))
                    marking = []

                if episode_count%print_every==0 and episode_count!=0:
                    print("episode: {}, score: {:.1f}, epsilon: {:.2f}".format(episode_count, score, self.epsilon))

                score = 0
                episode_count +=1

            # if training is ready
            if len(self.memory) >= self.batch_size:
                self.update_model()
                update_cnt += 1

                #Decrease epsilon
                self.update_epsilon()

                # if hard update is needed
                if update_cnt % self.target_update == 0:
                    self.dqn_target.load_state_dict(self.dqn.state_dict())

        self.env.close()

    #Whether to pursue exploration
    def update_epsilon(self):
        """Lower epsilon by one linear decay step, never below ``min_epsilon``."""
        decay_step = (self.max_epsilon - self.min_epsilon) * self.epsilon_decay
        self.epsilon = max(self.min_epsilon, self.epsilon - decay_step)

    def _compute_dqn_loss(self, samples):
        """Return the per-sample Double DQN smooth-L1 loss, shape ``(batch, 1)``.

        Args:
            samples: dict with ``obs``, ``next_obs``, ``acts``, ``rews`` and
                ``done`` arrays, as produced by the replay buffer.
        """
        state = torch.FloatTensor(samples["obs"]).to(self.device)
        next_state = torch.FloatTensor(samples["next_obs"]).to(self.device)
        action = torch.LongTensor(samples["acts"].reshape(-1, 1)).to(self.device)
        reward = torch.FloatTensor(samples["rews"].reshape(-1, 1)).to(self.device)
        done = torch.FloatTensor(samples["done"].reshape(-1, 1)).to(self.device)

        # Q-value of the action that was actually taken
        curr_q_value = self.dqn(state).gather(1, action)

        #Double DQN - updating the DQN loss target as per https://github.com/Curt-Park/rainbow-is-all-you-need/blob/master/02.double_q.ipynb
        #The online network chooses the next action, the target network evaluates it;
        #the target is a constant for the gradient step, so build it without autograd
        with torch.no_grad():
            best_next_action = self.dqn(next_state).argmax(dim=1, keepdim=True)
            next_q_value = self.dqn_target(next_state).gather(1, best_next_action)
            target = reward + self.gamma * next_q_value * (1 - done)

        # calculate element-wise dqn loss
        elementwise_loss = F.smooth_l1_loss(curr_q_value, target, reduction="none")

        return elementwise_loss
 

Set up environment and random seed


In [None]:
#Configure the Gravitar environment
env_id = "Gravitar-ram-v0"
env = gym.make(env_id)

video_every = 3 #Record a video every `video_every` episodes to collect performance results; frequent recording significantly slows the run time
env = gym.wrappers.Monitor(env, "./video", video_callable=lambda episode_id: (episode_id%video_every)==0,force=True)

#Reproducible environment and action spaces, do not change lines 6-11 here (tools > settings > editor > show line numbers)
seed = 742
torch.manual_seed(seed)
env.seed(seed)
random.seed(seed)
np.random.seed(seed)
env.action_space.seed(seed)

[742]

Set parameters and begin training


In [None]:
#Training hyper-parameters
#Total environment steps: an episode averages ~1000 frames; training for approx. 3000 episodes until a Google Colab timeout is encountered
num_frames = 6_500_000
#Replay buffer capacity (transitions) and number of transitions per model update
memory_size = 1_000_000
batch_size = 32
#Number of model updates between target-network synchronisations
target_update = 2_000
#Epsilon initially decays slowly so as to encourage exploration of alternative game strategies
epsilon_decay = 1e-4

In [None]:
#Build the agent from the parameters set above, then run the training loop
agent = DQNAgent(
    env=env,
    memory_size=memory_size,
    batch_size=batch_size,
    target_update=target_update,
    epsilon_decay=epsilon_decay,
)
agent.train(num_frames)

marking, episode: 0, score: 500.0, mean_score: 500.00, std_score: 0.00
episode: 1, score: 250.0, epsilon: 0.82
episode: 2, score: 350.0, epsilon: 0.75
episode: 3, score: 450.0, epsilon: 0.64
episode: 4, score: 100.0, epsilon: 0.56
episode: 5, score: 100.0, epsilon: 0.49
episode: 6, score: 100.0, epsilon: 0.41
episode: 7, score: 0.0, epsilon: 0.34
episode: 8, score: 350.0, epsilon: 0.25
episode: 9, score: 100.0, epsilon: 0.17
episode: 10, score: 500.0, epsilon: 0.10
episode: 11, score: 250.0, epsilon: 0.05
episode: 12, score: 500.0, epsilon: 0.05
episode: 13, score: 0.0, epsilon: 0.05
episode: 14, score: 0.0, epsilon: 0.05
episode: 15, score: 0.0, epsilon: 0.05
episode: 16, score: 250.0, epsilon: 0.05
episode: 17, score: 350.0, epsilon: 0.05
episode: 18, score: 500.0, epsilon: 0.05
episode: 19, score: 0.0, epsilon: 0.05
episode: 20, score: 0.0, epsilon: 0.05
episode: 21, score: 250.0, epsilon: 0.05
episode: 22, score: 0.0, epsilon: 0.05
episode: 23, score: 0.0, epsilon: 0.05
episode: 24

In [None]:
#Mount Google Drive at /content/drive; requires the google.colab package, so this cell is for Colab sessions
from google.colab import drive
drive.mount('/content/drive')