In [None]:
# Import all gym related libraries
import gym
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
# Take action
import random
import math

In [None]:
# Create the Space Invaders environment that the rest of the notebook interacts with
env = gym.make('SpaceInvaders-v0')

In [None]:
# Keep handles to the environment's observation and action spaces for later cells
observation_space = env.observation_space
action_space = env.action_space

In [None]:
# Print the observation space and the number of discrete actions
print("Observation Space is :",observation_space)
print("Number of actions is :",action_space.n)

In [None]:
import torch

In [None]:
# Check for a CUDA device: the training cells below move models and tensors with .cuda()
torch.cuda.is_available()

In [None]:
# define libraries to pre-process image
from skimage import transform
from skimage.color import rgb2gray

In [None]:
def process_frame(frame) :
    """Turn a raw RGB game frame into a small grayscale image.

    Args:
        frame: RGB image array; the callers in this notebook pass
            (210, 160, 3) frames.

    Returns:
        2-D float array of shape (110, 84).
    """

    # 1. Convert the image from RGB to grayscale.
    #    rgb2gray converts integer images to floating point itself, so the
    #    result for a uint8 frame already lies in [0, 1]. Dividing by 255
    #    again (as was done before) squashed every pixel into [0, 1/255].
    gray_image = rgb2gray(frame)

    # 2. Crop: drop the bottom 12 rows, the left 4 and the right 12 columns
    cropped_image = gray_image[0:-12,4:-12]

    # 3. Resize to the network input resolution
    resize_image = transform.resize(cropped_image,[110,84])

    return resize_image

In [None]:
def create_frame_stack(frame,existing_stack,max_len=16) :
    """Push a new frame onto the frame history and build the network input.

    Args:
        frame: raw frame, preprocessed with ``process_frame`` before storing.
        existing_stack: list of processed frames, oldest first. The new frame
            is appended to this list in place.
        max_len: maximum number of frames kept in the history.

    Returns:
        (np_frame, existing_stack): ``np_frame`` stacks the history entries at
        positions 15, 10, 5 and 1 (newest first); ``existing_stack`` is the
        history, which is a new list whenever the oldest frame was dropped.

    Note:
        The fixed positions require the history to hold at least 16 frames.
    """
    # Preprocess and record the newest frame
    existing_stack.append(process_frame(frame))

    # Drop the oldest frame once the history grows beyond max_len
    if len(existing_stack) > max_len :
        existing_stack = existing_stack[1:]

    # Pick four frames spread across the history, newest first
    picked_frames = [existing_stack[position] for position in (15, 10, 5, 1)]

    return np.array(picked_frames), existing_stack

In [None]:
# Sanity check: preprocess the first frame of an episode and print the resulting shape
sample_frame = env.reset()
sample_processed = process_frame(sample_frame)
print(sample_processed.shape)

In [None]:
# Create a pytorch DQN
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [None]:
# Reference for this class - https://github.com/Curt-Park/rainbow-is-all-you-need

class NoisyLinear(nn.Module):
    """Linear layer with learnable, factorized Gaussian noise (NoisyNet).

    The effective weight is ``weight_mu + weight_sigma * weight_epsilon`` and
    the effective bias is ``bias_mu + bias_sigma * bias_epsilon``. The epsilon
    tensors are buffers that are redrawn by ``reset_noise``.

    Attributes:
        in_features (int): input size of the layer
        out_features (int): output size of the layer
        std_init (float): initial std value
        weight_mu (nn.Parameter): mean of the weights
        weight_sigma (nn.Parameter): std of the weights
        bias_mu (nn.Parameter): mean of the bias
        bias_sigma (nn.Parameter): std of the bias
    """

    def __init__(self, in_features: int, out_features: int, std_init: float = 0.5):
        """Allocate parameters and noise buffers, then initialise both."""
        super().__init__()

        self.in_features = in_features
        self.out_features = out_features
        self.std_init = std_init

        weight_shape = (out_features, in_features)

        self.weight_mu = nn.Parameter(torch.empty(*weight_shape))
        self.weight_sigma = nn.Parameter(torch.empty(*weight_shape))
        self.register_buffer("weight_epsilon", torch.empty(*weight_shape))

        self.bias_mu = nn.Parameter(torch.empty(out_features))
        self.bias_sigma = nn.Parameter(torch.empty(out_features))
        self.register_buffer("bias_epsilon", torch.empty(out_features))

        self.reset_parameters()
        self.reset_noise()

    def reset_parameters(self):
        """Initialise the trainable means and standard deviations."""
        bound = 1 / math.sqrt(self.in_features)
        weight_std = self.std_init / math.sqrt(self.in_features)
        bias_std = self.std_init / math.sqrt(self.out_features)

        self.weight_mu.data.uniform_(-bound, bound)
        self.weight_sigma.data.fill_(weight_std)
        self.bias_mu.data.uniform_(-bound, bound)
        self.bias_sigma.data.fill_(bias_std)

    def reset_noise(self):
        """Draw fresh factorized noise for the weights and the bias."""
        noise_in = self.scale_noise(self.in_features)
        noise_out = self.scale_noise(self.out_features)

        # The weight noise is the outer product of the two noise vectors
        self.weight_epsilon.copy_(noise_out.ger(noise_in))
        self.bias_epsilon.copy_(noise_out)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the noisy affine map to ``x``.

        The same noisy weights are used in train and eval mode.
        """
        noisy_weight = self.weight_mu + self.weight_sigma * self.weight_epsilon
        noisy_bias = self.bias_mu + self.bias_sigma * self.bias_epsilon
        return F.linear(x, noisy_weight, noisy_bias)

    @staticmethod
    def scale_noise(size: int) -> torch.Tensor:
        """Return ``size`` samples of sign(x) * sqrt(|x|) with x ~ N(0, 1)."""
        gaussian = np.random.normal(loc=0.0, scale=1.0, size=size)
        samples = torch.FloatTensor(gaussian)

        return samples.sign().mul(samples.abs().sqrt())


In [None]:
# This now implements the Dueling Architecture
class DQN(nn.Module) :
    """Convolutional Q-network with noisy fully-connected layers and a dueling head.

    Two conv + max-pool stages feed two ``NoisyLinear`` layers; the output of
    the second one is passed to separate advantage and value layers that are
    combined into Q-values.
    """

    def __init__(self,actions_size) :
        """Build the layers for a network that scores ``actions_size`` actions."""
        super().__init__()

        self.actions_size = actions_size

        # Convolutional feature extractor (input has 4 channels)
        self.conv1 = nn.Conv2d(4,20,(4,4),4)
        self.pool1 = nn.MaxPool2d(4,1)
        self.conv2 = nn.Conv2d(20,32,(2,2),2)
        self.pool2 = nn.MaxPool2d((2,2),1)

        # Noisy layers are used instead of plain linear layers
        self.linear = NoisyLinear(32*11*8, 512)
        self.linear2 = NoisyLinear(512, actions_size)

        # Dueling streams
        self.adv_layer = nn.Linear(actions_size,actions_size)
        self.val_layer = nn.Linear(actions_size,1)

    def forward(self,x) :
        """Return Q-values of shape (batch, actions_size).

        ``x`` is expected to be (batch, 4, 110, 84): that input size produces
        the 32x11x8 feature map that is flattened below.
        """
        n_samples = x.size(0)
        q_shape = (n_samples, self.actions_size)

        features = self.pool1(F.relu(self.conv1(x)))
        features = self.pool2(F.relu(self.conv2(features)))

        flat = features.view(-1,32*11*8)
        hidden = F.relu(self.linear(flat))
        hidden = F.relu(self.linear2(hidden))

        # Dueling combination: Q = V + A - mean(A)
        advantage = self.adv_layer(hidden)
        value = self.val_layer(hidden).expand(*q_shape)
        mean_advantage = advantage.mean(1).unsqueeze(1).expand(*q_shape)

        return value + advantage - mean_advantage

    def reset_noise(self):
        """Redraw the noise of both noisy layers."""
        for noisy_layer in (self.linear, self.linear2) :
            noisy_layer.reset_noise()

In [None]:
# Create a buffer
class Buffer(object) :
    """Fixed-capacity replay buffer with uniform sampling; oldest entries are evicted first."""

    def __init__(self,max_size) :
        """Create an empty buffer holding at most ``max_size`` experiences."""
        self.max_size = max_size
        self.buffer = []

    def add(self,experience) :
        """Append an experience and evict old ones if the buffer overflows."""
        self.buffer.append(experience)
        self.resize()

    def resize(self) :
        """Remove the oldest experiences until at most ``max_size`` remain."""
        overflow = len(self.buffer) - self.max_size
        if overflow > 0 :
            del self.buffer[:overflow]

    def sample(self,batch_size) :
        """Return up to ``batch_size`` distinct experiences chosen uniformly at random."""
        stored = len(self.buffer)
        shuffled = np.random.permutation(stored).tolist()
        chosen = shuffled[:min(stored,batch_size)]

        return [self.buffer[position] for position in chosen]

    def __len__(self) :
        """Number of stored experiences."""
        return len(self.buffer)

In [None]:
# Define the SumTree Class
class SumTree(object) :
    """Binary sum tree stored in a flat array.

    The last ``capacity`` entries of ``tree`` are leaves holding priorities;
    every internal node holds the sum of its two children, so the root is the
    total priority. ``data`` holds the experience belonging to each leaf.
    """

    def __init__(self,capacity) :
        """Create an empty tree with ``capacity`` leaves."""
        self.capacity = capacity
        self.tree = np.zeros(2*capacity - 1)
        self.data = np.zeros(capacity,dtype=object)
        self.data_pointer = 0

    def add(self,priority,data) :
        """Store ``data`` with ``priority`` in the next slot, overwriting the oldest when full."""
        slot = self.data_pointer
        leaf_index = slot + self.capacity - 1

        self.data[slot] = data
        self.update(leaf_index,priority)

        # Advance the write position and wrap around at capacity
        self.data_pointer = slot + 1
        if self.data_pointer >= self.capacity :
            self.data_pointer = 0

    def update(self,tree_index,priority) :
        """Set the priority at ``tree_index`` and propagate the difference up to the root."""
        delta = priority - self.tree[tree_index]
        self.tree[tree_index] = priority

        node = tree_index
        while node != 0 :
            node = (node - 1)//2
            self.tree[node] += delta

    def get_leaf(self,v) :
        """Find the leaf whose cumulative priority interval contains ``v``.

        Returns:
            (leaf_index, priority, data) for the selected leaf.
        """
        node_count = len(self.tree)
        node = 0
        left = 1

        # Descend until the current node has no children
        while left < node_count :
            left_sum = self.tree[left]
            if v <= left_sum :
                node = left
            else :
                v -= left_sum
                node = left + 1
            left = 2*node + 1

        return node, self.tree[node], self.data[node - self.capacity + 1]

    @property
    def total_priority(self) :
        """Sum of all leaf priorities (the value of the root node)."""
        return self.tree[0]

In [None]:
class PERBuffer(object) :
    """Prioritized experience replay buffer backed by a ``SumTree``.

    Attributes:
        PER_e: small constant added to every error so no priority is zero.
        PER_a: exponent that controls how strongly priorities skew sampling.
        PER_b: importance-sampling exponent, annealed towards 1 while sampling.
        PER_b_increment_per_sampling: amount added to ``PER_b`` per ``sample`` call.
        absolute_error_upper: upper clip applied to (error + PER_e).
        tree: the sum tree holding priorities and experiences.
    """

    def __init__(self,capacity) :
        """Create an empty buffer that keeps at most ``capacity`` experiences."""
        self.PER_e = 0.01
        self.PER_a = 0.6
        self.PER_b = 0.4
        self.PER_b_increment_per_sampling = 0.001
        self.absolute_error_upper = 1 # clipped abs error

        self.tree = SumTree(capacity)

    def add(self,experience) :
        """Insert ``experience`` with the current maximum priority."""
        max_priority = np.max(self.tree.tree[-self.tree.capacity:])

        # A priority of 0 would never be sampled, so fall back to the upper bound
        if max_priority == 0 :
            max_priority = self.absolute_error_upper

        self.tree.add(max_priority,experience)

    def sample(self,batch_size) :
        """Draw ``batch_size`` experiences with probability proportional to priority.

        Returns:
            (experiences, tree_indexes, importance_weights) where the weights
            are a numpy array normalised by the largest possible weight.

        Raises:
            ValueError: if the buffer holds no experience with positive priority.
        """
        total_priority = self.tree.total_priority
        if total_priority <= 0 :
            raise ValueError("cannot sample from an empty PERBuffer")

        sample_experiences = list()
        indexes = list()
        importance_weights = list()

        # The priority range is split into batch_size equal segments
        priority_segment = total_priority / batch_size

        # PER_b is increased every time a new batch is sampled
        self.PER_b = np.min([1.,self.PER_b + self.PER_b_increment_per_sampling])

        # The largest weight belongs to the smallest *stored* priority. Unused
        # leaves hold 0 and must be ignored: including them made p_min zero,
        # max_weight infinite and therefore every importance weight zero
        # until the tree was completely full.
        leaf_priorities = self.tree.tree[-self.tree.capacity:]
        stored_priorities = leaf_priorities[leaf_priorities > 0]
        p_min = np.min(stored_priorities) / total_priority
        max_weight = (p_min*batch_size) ** (-self.PER_b)

        for i in range(batch_size) :
            # Sample one value uniformly from each segment
            a, b  = priority_segment*i, priority_segment*(i + 1)
            value = np.random.uniform(a,b)

            index, priority, experience = self.tree.get_leaf(value)

            # P(j)
            sample_probability = priority / total_priority

            importance_weight = np.power(batch_size*sample_probability,-self.PER_b) / max_weight

            importance_weights.append(importance_weight)
            indexes.append(index)
            sample_experiences.append(experience)

        return sample_experiences, indexes, np.array(importance_weights)

    def set_priorities(self,indexes,errors) :
        """Update the priorities of sampled experiences from their TD errors.

        Each priority becomes ``min(|error| + PER_e, absolute_error_upper) ** PER_a``.
        Previously the raw error was written, which left ``PER_e``, ``PER_a``
        and the clip unused and let a zero error make a transition
        unsampleable.

        Args:
            indexes: tree indexes returned by ``sample``.
            errors: TD errors, one per index.
        """
        for index, error in zip(indexes,errors) :
            clipped_error = min(abs(float(error)) + self.PER_e,self.absolute_error_upper)
            self.tree.update(index,clipped_error ** self.PER_a)

In [None]:
# Create a buffer
class PERBuffer_OLD :
    """List-based prioritized replay buffer (older implementation, kept for reference).

    Priorities are stored in ``probabilities``, parallel to ``buffer``.
    """

    def __init__(self,max_size) :
        """Create an empty buffer holding at most ``max_size`` experiences."""
        self.max_size = max_size
        self.buffer = list()
        self.probabilities = list()

    def add(self,experience) :
        """Append an experience with the current maximum priority (1 if empty)."""
        self.buffer.append(experience)
        self.probabilities.append(max(self.probabilities,default=1))
        self.resize()

    def resize(self) :
        """Evict the oldest experiences (and their priorities) beyond ``max_size``."""
        n = max(len(self.buffer) - self.max_size,0)
        while n > 0 :
            self.buffer.pop(0)
            self.probabilities.pop(0)
            n -= 1

    def get_proabilities(self,alpha=1) :
        """Return sampling probabilities: priorities ** alpha, normalised to sum to 1."""
        np_probs = np.array(self.probabilities) ** alpha
        sample_probs = np_probs / np_probs.sum()
        return sample_probs

    def get_importance(self,sample_probs,indexes,beta=1) :
        """Return importance weights for ``indexes``, normalised so the largest is 1.

        The weight of item i is ``(1 / (N * P(i))) ** beta`` with N fixed to
        ``max_size``.
        """
        # N is deliberately the constant capacity rather than the current length
        N = self.max_size
        importance_weights = list()
        for i in indexes :
            importance_weights.append(1.0/sample_probs[i])

        np_importance = (np.array(importance_weights)/N) ** beta
        # Normalise by the maximum of the *powered* weights; dividing by the
        # un-powered maximum only gave a max of 1 when beta == 1.
        np_importance_normalized = np_importance / np.max(np_importance)
        return np_importance_normalized

    def set_priorities(self,indexes,error) :
        """Set the priority of each indexed experience to its error plus a small offset."""
        offset = 0.1
        for i in range(len(indexes)) :
            self.probabilities[indexes[i]] = error[i] + offset

    def sample(self,batch_size) :
        """Sample experiences proportionally to their priority.

        Returns:
            (experiences, indexes, importance_weights)
        """
        sample_size = min(len(self.buffer),batch_size)
        sample_probs = self.get_proabilities(1)

        random_perm_list = np.random.choice(np.arange(len(self.buffer)),size=sample_size,p=sample_probs).tolist()
        importance = self.get_importance(sample_probs,random_perm_list,1)

        return [self.buffer[i] for i in random_perm_list], random_perm_list, importance

In [None]:
import copy

In [None]:
# Create an empty stack
def create_new_stack(name_env) :
    """Return a fresh frame history of 16 all-zero (110, 84) frames.

    Args:
        name_env: environment name; currently unused, kept so existing calls
            keep working.
    """
    # np.int was only an alias of the builtin int and was removed in NumPy 1.24
    return [np.zeros((110,84),dtype=int) for _ in range(16)]

In [None]:
# Fill up the buffer
def fill_up_buffer(num_exp) :
    """Seed both replay buffers with transitions from a random policy.

    Plays ``num_exp`` environment steps, choosing a new random action every
    4th step and repeating it in between. Every 13th step is stored in both
    ``existing_buffer`` and ``per_existing_buffer``. Losing a life counts as a
    terminal transition and is stored with reward -2.

    Args:
        num_exp: number of environment steps to play.
    """
    existing_stack = create_new_stack('SpaceInvaders-v0')
    state = env.reset()
    state, existing_stack = create_frame_stack(state,existing_stack)

    prev_action = action_space.sample()
    num_lives = env.ale.lives()

    for step in range(num_exp) :

        if step % 4 == 0 :
            action = action_space.sample()
        else :
            action = prev_action
        # Remember the action so the in-between steps really repeat it
        # (previously prev_action stayed at its initial value forever)
        prev_action = action

        next_frame, reward, game_over, info = env.step(action)

        # Losing a life is treated like the end of an episode
        lives_remaining = env.ale.lives()
        life_lost = lives_remaining < num_lives
        if life_lost :
            num_lives = lives_remaining
        done = bool(game_over) or life_lost

        # Reward clipping, same as in the collect functions: at least 1 for
        # surviving, at most 4 for a hit. (The second clip used to be
        # max(reward,4), which turned every reward into >= 4.)
        reward = max(reward,1)
        reward = min(reward,4)

        store_step = step % 13 == 0

        if done :
            blank_frame = np.zeros((210,160,3),dtype=int)
            terminal_state, existing_stack = create_frame_stack(blank_frame,existing_stack)

            if store_step :
                existing_buffer.add((state,action,-2,terminal_state,done))
                per_existing_buffer.add((state,action,-2,terminal_state,done))

            # Start the next life (or a whole new game) from a clean history
            # instead of carrying on with a stale state
            existing_stack = create_new_stack('SpaceInvaders-v0')
            if game_over :
                next_frame = env.reset()
                num_lives = env.ale.lives()
            state, existing_stack = create_frame_stack(next_frame,existing_stack)
        else :
            next_state, existing_stack = create_frame_stack(next_frame,existing_stack)
            if store_step :
                existing_buffer.add((state,action,reward,next_state,done))
                per_existing_buffer.add((state,action,reward,next_state,done))
            state = next_state

In [None]:
# This is train function with Dueling DDQN
def train(batch_size,gamma) :
    """Run one Double-DQN gradient step on a uniformly sampled batch.

    Uses the notebook globals ``existing_buffer``, ``dqn``, ``target_dqn``,
    ``criterion`` and ``optimizer``.

    Args:
        batch_size: number of transitions to sample from ``existing_buffer``.
        gamma: discount factor.

    Returns:
        The loss of this step as a Python float.
    """
    batch = existing_buffer.sample(batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    # Put the batch wherever the online network lives
    device = next(dqn.parameters()).device

    torch_states = torch.from_numpy(np.asarray(states)).float().to(device)
    torch_actions = torch.from_numpy(np.asarray(actions)).long().to(device)
    torch_next_state = torch.from_numpy(np.asarray(next_states)).float().to(device)
    # Rewards are kept as floats (they used to be truncated to integers)
    torch_rewards = torch.from_numpy(np.asarray(rewards)).float().to(device)
    # 0 for terminal transitions so they get no bootstrap term, 1 otherwise
    torch_not_done = torch.tensor([0.0 if done else 1.0 for done in dones],dtype=torch.float,device=device)

    all_q_vals = dqn(torch_states)
    pred_q_val = all_q_vals.gather(1,torch_actions.unsqueeze(1)).squeeze(1)

    # Double DQN: the online network chooses the next action, the target
    # network evaluates it. (The two roles were swapped before.)
    with torch.no_grad() :
        next_state_q_actions = dqn(torch_next_state).max(1)[1]
        next_state_target_q_vals = target_dqn(torch_next_state)
        next_state_q_val = next_state_target_q_vals.gather(1,next_state_q_actions.unsqueeze(1)).squeeze(1)
        expected_q_val = torch_rewards + gamma * next_state_q_val * torch_not_done

    loss = criterion(pred_q_val,expected_q_val).mean()
    optimizer.zero_grad()
    loss.backward()

    # Draw fresh exploration noise for the next forward passes
    dqn.reset_noise()
    target_dqn.reset_noise()

    # Clip each gradient element to [-1, 1]
    for param in dqn.parameters() :
        if param.grad is not None :
            param.grad.data.clamp_(-1,1)
    optimizer.step()

    return loss.item()

In [None]:
def per_train(batch_size,gamma) :
    """Run one Double-DQN gradient step on a prioritized batch.

    Uses the notebook globals ``per_existing_buffer``, ``per_dqn``,
    ``per_target_dqn``, ``per_criterion`` and ``per_optimizer``. After the
    step, the absolute TD errors are written back as new priorities.

    Args:
        batch_size: number of transitions to sample from ``per_existing_buffer``.
        gamma: discount factor.

    Returns:
        The importance-weighted loss of this step as a Python float.
    """
    batch, indexes, importance = per_existing_buffer.sample(batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    # Put the batch wherever the online network lives
    device = next(per_dqn.parameters()).device

    torch_states = torch.from_numpy(np.asarray(states)).float().to(device)
    torch_actions = torch.from_numpy(np.asarray(actions)).long().to(device)
    torch_next_state = torch.from_numpy(np.asarray(next_states)).float().to(device)
    # Rewards are kept as floats (they used to be truncated to integers)
    torch_rewards = torch.from_numpy(np.asarray(rewards)).float().to(device)
    # 0 for terminal transitions so they get no bootstrap term, 1 otherwise
    torch_not_done = torch.tensor([0.0 if done else 1.0 for done in dones],dtype=torch.float,device=device)
    torch_importance = torch.from_numpy(np.asarray(importance)).float().to(device)

    all_q_vals = per_dqn(torch_states)
    pred_q_val = all_q_vals.gather(1,torch_actions.unsqueeze(1)).squeeze(1)

    # Double DQN: the online network chooses the next action, the target
    # network evaluates it. (The two roles were swapped before.)
    with torch.no_grad() :
        next_state_q_actions = per_dqn(torch_next_state).max(1)[1]
        next_state_target_q_vals = per_target_dqn(torch_next_state)
        next_state_q_val = next_state_target_q_vals.gather(1,next_state_q_actions.unsqueeze(1)).squeeze(1)
        expected_q_val = torch_rewards + gamma * next_state_q_val * torch_not_done

    per_optimizer.zero_grad()
    # reduction='none' keeps one loss value per sample so each one is scaled
    # by its own importance weight. With the default reduction the criterion
    # returns a single mean, and the weights only rescaled that scalar.
    loss_vec = torch_importance*per_criterion(pred_q_val,expected_q_val,reduction='none')
    loss = loss_vec.mean()
    loss.backward()

    # Draw fresh exploration noise for the next forward passes
    per_dqn.reset_noise()
    per_target_dqn.reset_noise()

    # Clip each gradient element to [-1, 1]
    for param in per_dqn.parameters() :
        if param.grad is not None :
            param.grad.data.clamp_(-1,1)

    per_optimizer.step()

    # The absolute TD error of every sampled transition becomes its new priority
    error = (expected_q_val - pred_q_val).abs().detach().cpu().numpy()
    per_existing_buffer.set_priorities(indexes,error)

    return loss.item()

In [None]:
def take_action(state) :
    """Return the greedy action of the global ``dqn`` for a single stacked state.

    Args:
        state: one stacked observation; a batch dimension of 1 is added here.

    Returns:
        Index of the action with the highest predicted Q-value (Python int).
    """
    # Add the batch dimension and move the observation to the GPU as float
    single_batch = np.array([state])
    state_tensor = Variable(torch.from_numpy(single_batch)).type(torch.float).cuda()

    with torch.no_grad() :
        q_values = dqn(state_tensor)

    best_action = q_values.max(1)[1]
    return best_action.item()

In [None]:
def per_take_action(state) :
    """Return the greedy action of the global ``per_dqn`` for a single stacked state.

    Args:
        state: one stacked observation; a batch dimension of 1 is added here.

    Returns:
        Index of the action with the highest predicted Q-value (Python int).
    """
    # Add the batch dimension and move the observation to the GPU
    state_tensor = torch.from_numpy(np.array([state])).cuda()

    with torch.no_grad() :
        q_values = per_dqn(state_tensor.float())

    return q_values.max(1)[1].item()

In [None]:
# Time to collect the buffer
def collect_experience() :
    """Play one full game with the global ``dqn`` and train on every lost life.

    A new action is chosen every 4th step and repeated in between. Every 13th
    step is stored in ``existing_buffer``. Whenever a life is lost (or the
    game ends) the transition counts as terminal with reward -2, ten training
    steps are run and the frame history is restarted.

    Returns:
        (total_reward, mean_loss): the unclipped game score and the loss
        averaged over the training rounds of this game (0.0 if none ran).
    """
    existing_stack = create_new_stack('SpaceInvaders-v0')
    state = env.reset()
    state, existing_stack = create_frame_stack(state,existing_stack)

    step = 0
    total_reward = 0.0
    prev_action = action_space.sample()
    loss = 0.0
    train_step = 0
    num_lives = env.ale.lives()
    game_over = False

    # Also stop when the environment itself reports the end of the game.
    # Looping on the life counter alone kept stepping a finished environment
    # whenever the game ended without a life being lost (e.g. a step limit).
    while num_lives > 0 and not game_over :
        if step % 4 == 0:
            action = take_action(state)
        else :
            action = prev_action

        next_state, reward, game_over, info = env.step(action)

        # Losing a life is treated like the end of an episode
        lives_remaining = env.ale.lives()
        life_lost = lives_remaining < num_lives
        if life_lost :
            num_lives = lives_remaining
        done = bool(game_over) or life_lost

        prev_action = action
        total_reward += reward

        # Reward clipping
        reward = max(reward,1) # At least one point if not died
        reward = min(reward,4) # Additional 3 points for every hit

        if done :
            # np.int was removed in NumPy 1.24; the builtin int is the same dtype
            terminal_state = np.zeros((210,160,3),dtype=int)
            terminal_state, existing_stack = create_frame_stack(terminal_state,existing_stack)

            if step % 13 == 0 :
                existing_buffer.add((state,action,-2,terminal_state,done))

            # Ten training steps per lost life; record their mean loss
            t_loss = 0.0
            for _ in range(10) :
                t_loss += train(32,0.95)

            t_loss /= 10
            loss += t_loss
            train_step += 1

            # Continue with a clean frame history
            existing_stack = create_new_stack('SpaceInvaders-v0')
            next_state, existing_stack = create_frame_stack(next_state,existing_stack)
            state = next_state

        else :
            next_state, existing_stack = create_frame_stack(next_state,existing_stack)

            if step % 13 == 0 :
                existing_buffer.add((state,action,reward,next_state,done))

            state = next_state
        step += 1

    mean_loss = loss/train_step if train_step else 0.0
    return total_reward, mean_loss

In [None]:
# Buffer but with Prioritized Experience Replay
def per_collect_experience() :
    """Play one full game with the global ``per_dqn`` and train on every lost life.

    A new action is chosen every 4th step and repeated in between. Every 13th
    step is stored in ``per_existing_buffer``. Whenever a life is lost (or the
    game ends) the transition counts as terminal with reward -2, ten
    prioritized training steps are run and the frame history is restarted.

    Returns:
        (total_reward, mean_loss): the unclipped game score and the loss
        averaged over the training rounds of this game (0.0 if none ran).
    """
    existing_stack = create_new_stack('SpaceInvaders-v0')
    state = env.reset()
    state, existing_stack = create_frame_stack(state,existing_stack)

    step = 0
    total_reward = 0.0
    prev_action = action_space.sample()
    loss = 0.0
    train_step = 0
    num_lives = env.ale.lives()
    game_over = False

    # Also stop when the environment itself reports the end of the game.
    # Looping on the life counter alone kept stepping a finished environment
    # whenever the game ended without a life being lost (e.g. a step limit).
    while num_lives > 0 and not game_over :
        if step % 4 == 0:
            action = per_take_action(state)
        else :
            action = prev_action

        next_state, reward, game_over, info = env.step(action)

        # Losing a life is treated like the end of an episode
        lives_remaining = env.ale.lives()
        life_lost = lives_remaining < num_lives
        if life_lost :
            num_lives = lives_remaining
        done = bool(game_over) or life_lost

        prev_action = action
        total_reward += reward

        # Reward clipping
        reward = max(reward,1) # At least one point if not died
        reward = min(reward,4) # Additional 3 points for every hit

        if done :
            # np.int was removed in NumPy 1.24; the builtin int is the same dtype
            terminal_state = np.zeros((210,160,3),dtype=int)
            terminal_state, existing_stack = create_frame_stack(terminal_state,existing_stack)

            if step % 13 == 0 :
                per_existing_buffer.add((state,action,-2,terminal_state,done))

            # Ten training steps per lost life; record their mean loss
            t_loss = 0.0
            for _ in range(10) :
                t_loss += per_train(32,0.95)

            t_loss /= 10
            loss += t_loss
            train_step += 1

            # Continue with a clean frame history
            existing_stack = create_new_stack('SpaceInvaders-v0')
            next_state, existing_stack = create_frame_stack(next_state,existing_stack)
            state = next_state

        else :
            next_state, existing_stack = create_frame_stack(next_state,existing_stack)

            if step % 13 == 0 :
                per_existing_buffer.add((state,action,reward,next_state,done))

            state = next_state
        step += 1

    mean_loss = loss/train_step if train_step else 0.0
    return total_reward, mean_loss

In [None]:
# Uniform replay buffer for the plain agent and prioritized buffer for the PER agent,
# both limited to 10000 transitions
existing_buffer = Buffer(10000)
per_existing_buffer = PERBuffer(10000)
# Seed both buffers with 100 random-policy steps. fill_up_buffer only stores
# every 13th step, so this yields at most 8 transitions per buffer.
fill_up_buffer(100)

In [None]:
# Here are the steps we need to follow
# 1. Create a buffer
# 2. Fill it up
# 3. for episode in episodes
#   3a Collect experience
#   3b Learning from those experience

# Score of every episode played by the agent without prioritized replay
total_rewards = list()

decay_factor = 0.995

# Online network and a target network that starts as an exact copy
dqn = DQN(action_space.n)
dqn = dqn.cuda()
target_dqn = DQN(action_space.n)
target_dqn = target_dqn.cuda()
target_dqn.load_state_dict(dqn.state_dict())

criterion = F.smooth_l1_loss
optimizer = optim.Adam(dqn.parameters(),lr=1e-6)
update_interval = 8   # soft-update the target network every this many episodes
tau = 1e-3            # interpolation factor of the soft update


max_collected_reward = 0.0
batch_updates = 20

# Mean training loss of every episode
loss_list = list()


loss = 0.0
running_average = list()   # scores of the most recent (up to 10) episodes
for episode in range(500) :

    total_reward, loss = collect_experience()

    running_average.append(total_reward)

    if len(running_average) > 10 :
        running_average.pop(0)

    print("---------------------------------")
    print("Episode :",episode)
    print("Loss : ",loss)
    print("Total Reward :",total_reward)
    print("Max Rewards Seen Untill Now :",max_collected_reward)
    print("Length of Buffer is :",len(existing_buffer.buffer))

    # Divide by the number of episodes actually in the window; dividing by a
    # fixed 10 under-reported the average during the first nine episodes
    print("Running Average is :",sum(running_average)/len(running_average))
    print("-----------------------------")
    loss_list.append(loss)

    total_rewards.append(total_reward)

    # Keep the weights of the best-scoring episode so far
    if total_reward > max_collected_reward :
        max_collected_reward = total_reward
        torch.save(dqn.state_dict(),'best_rl_model.pt')

    # Soft update of the target network every update_interval episodes.
    # The previous truthiness test (`episode % update_interval`) ran on every
    # episode EXCEPT the multiples of update_interval.
    # NOTE(review): with updates this sparse, tau = 1e-3 moves the target very
    # slowly - consider a larger tau.
    if episode % update_interval == 0 :
      for target_dqn_param, dqn_param in zip(target_dqn.parameters(),dqn.parameters()) :
        target_dqn_param.data.copy_(tau*dqn_param.data + (1.0-tau)*target_dqn_param.data)

In [None]:
# Smoothed episode rewards of the agent trained without prioritized replay
window = 100
window_starts = range(window, len(total_rewards)-window)
smoothed_rewards = [np.mean(total_rewards[start:start+window]) for start in window_starts]

plt.plot(smoothed_rewards,'b',label='NoPER')
plt.xlabel('Episode')
plt.ylabel('Total Reward (SMA 100)')
plt.legend(loc='upper left')
plt.savefig('RewardChartNoPERSpaceInvaders.png')
plt.show()

In [None]:
# THIS IS THE PRIORITIZED EXPERIENCE REPLAY TRAIN LOOP
# Score of every episode played by the agent with prioritized replay
per_total_rewards = list()

decay_factor = 0.995

# Online network and a target network that starts as an exact copy
per_dqn = DQN(action_space.n)
per_dqn = per_dqn.cuda()

per_target_dqn = DQN(action_space.n)
per_target_dqn = per_target_dqn.cuda()

per_target_dqn.load_state_dict(per_dqn.state_dict())

per_criterion = F.smooth_l1_loss
per_optimizer = optim.Adam(per_dqn.parameters(),lr=1e-6)
update_interval = 8   # soft-update the target network every this many episodes
tau = 1e-3            # interpolation factor of the soft update
per_max_collected_reward = 0.0
batch_updates = 50

# Mean training loss of every episode
per_loss_list = list()

running_average = list()   # scores of the most recent (up to 10) episodes

for episode in range(500) :

    total_reward, loss = per_collect_experience()

    running_average.append(total_reward)

    if len(running_average) > 10 :
        running_average.pop(0)

    print("------------PER TRAIN LOOP---------------------")
    print("Episode :",episode)
    print("Loss : ",loss)
    print("Total Reward :",total_reward)
    print("Max Rewards Seen Untill Now :",per_max_collected_reward)
    # data_pointer is the next write position of the tree, so it wraps to 0 at capacity
    print("Length of Buffer is :",per_existing_buffer.tree.data_pointer)

    # Divide by the number of episodes actually in the window; dividing by a
    # fixed 10 under-reported the average during the first nine episodes
    print("Running Average is :",sum(running_average)/len(running_average))
    print("-----------------------------")
    per_loss_list.append(loss)

    per_total_rewards.append(total_reward)

    # Keep the weights of the best-scoring episode so far
    if total_reward > per_max_collected_reward :
        per_max_collected_reward = total_reward
        torch.save(per_dqn.state_dict(),'best_per_rl_model.pt')

    # Soft update of the target network every update_interval episodes.
    # The previous truthiness test (`episode % update_interval`) ran on every
    # episode EXCEPT the multiples of update_interval.
    # NOTE(review): with updates this sparse, tau = 1e-3 moves the target very
    # slowly - consider a larger tau.
    if episode % update_interval == 0 :
      for per_target_dqn_param, per_dqn_param in zip(per_target_dqn.parameters(),per_dqn.parameters()) :
        per_target_dqn_param.data.copy_(tau*per_dqn_param.data + (1.0-tau)*per_target_dqn_param.data)

In [None]:
# Smoothed episode rewards of the agent trained with prioritized replay
window = 100
per_window_starts = range(window, len(per_total_rewards)-window)
per_smoothed_rewards = [np.mean(per_total_rewards[start:start+window]) for start in per_window_starts]

plt.plot(per_smoothed_rewards,'b',label='With PER Dueling DDQN')
plt.xlabel('Episode')
plt.ylabel('Total Reward (SMA 100)')
plt.legend(loc='upper left')
plt.savefig('RewardChartPERSpaceInvaders.png')
plt.show()

In [None]:
# Smoothed episode rewards of both agents in one chart
window = 100
reward_curves = (
    (total_rewards,'b','Dueling DQN'),
    (per_total_rewards,'r','Dueling DQN with Prioritized Experience Replay'),
)

plt.xlabel('Episode')
plt.ylabel('Total Rewards (SMA 100)')
for episode_rewards, line_style, curve_label in reward_curves :
    smoothed_curve = [np.mean(episode_rewards[start:start+window]) for start in range(window, len(episode_rewards) - window)]
    plt.plot(smoothed_curve,line_style,label=curve_label)
plt.legend(loc='upper left')
plt.savefig('RewardChartWithPERSpaceInvaders.png')
plt.show()

In [None]:
# This is the next step in the evolution of the PER agent

In [None]:
# Smoothed training loss of both agents
window = 10
loss_curves = (
    (loss_list,'Loss for Dueling Double DQN'),
    (per_loss_list,'Loss for Dueling Double DQN with PER'),
)

for episode_losses, curve_label in loss_curves :
    smoothed_losses = [np.mean(episode_losses[start:start+window]) for start in range(window, len(episode_losses) - window)]
    plt.plot(smoothed_losses,label=curve_label)
plt.legend(loc='upper left')