In [None]:
from __future__ import print_function
import os
import mxnet as mx
import random
from mxnet import nd, autograd
from mxnet import gluon
import numpy as np
import matplotlib.pyplot as plt
import pylab as pl
from IPython import display
import scipy.misc
import gym
import math
from collections import namedtuple
import time


## Set the hyperparameters

In [None]:
class Options:
    """Container for every tunable setting used by this notebook.

    All values are plain attributes set in ``__init__``; the rest of the
    notebook reads them through the module-level ``opt`` instance.
    """
    def __init__(self):
        # Architecture
        self.batch_size = 32  # transitions sampled from the replay buffer per update
        self.image_size = 80  # frames are resized to image_size x image_size
        # Tricks
        self.replay_buffer_size = 1000000  # capacity handed to Replay_Buffer
        self.start_learning = 50000  # NOTE(review): not referenced anywhere in the visible code
        self.learning_frequency = 4  # a gradient step runs when frame_counter % learning_frequency == 0
        self.skip_frame = 4  # number of env steps each selected action is repeated for
        self.frame_len = 4  # number of stacked frames that make up one state
        self.Target_update = 10000  # the target network is re-synced every Target_update frames
        self.epsilon_min = 0.1  # lower bound of the annealed exploration rate
        self.annealing_end = 100000.  # epsilon is 1 - annealing_count / annealing_end until it hits epsilon_min
        self.gamma = 0.99  # discount factor applied to the bootstrapped target
        self.replay_start_size = 500  # annealing and learning start once frame_counter exceeds this
        self.no_op_max = 30  # for the first no_op_max steps of an episode epsilon is forced to 1
        self.termination_reward = -10.  # NOTE(review): not referenced anywhere in the visible code
        # Optimization
        self.num_episode = 1000  # number of training episodes
        self.lr = 0.00025  # RMSProp learning rate
        self.gamma1 = 0.95  # RMSProp 'gamma1' argument
        self.gamma2 = 0.95  # RMSProp 'gamma2' argument
        self.rms_eps = 0.01  # RMSProp 'epsilon' argument
        self.ctx = mx.gpu() # device context used for parameters and batches (GPU)
# Instantiate the settings and the Atari environment used for training.
opt = Options()
env_name = 'Assault-v0'
env = gym.make(env_name)
num_action = env.action_space.n

# Draw one seed and apply it to every RNG this notebook relies on:
# Python's `random` drives epsilon-greedy exploration and replay sampling,
# MXNet's RNG drives parameter initialisation. NumPy is seeded as well so
# that any NumPy-based randomness is covered by the same seed.
manualSeed = random.randint(1, 10000)
random.seed(manualSeed)
np.random.seed(manualSeed)
mx.random.seed(manualSeed)
# Log the seed so a run can be repeated by assigning this value to manualSeed.
print('manualSeed: %d' % manualSeed)
attrs = vars(opt)
print(', '.join("%s: %s" % item for item in attrs.items()))

## Define the DQN network

In [None]:
# Online Q-network: three Conv2D + BatchNorm + ReLU stages, a 512-unit hidden
# dense layer, and one output unit per action.
DQN = gluon.nn.Sequential()
with DQN.name_scope():
    #first layer
    DQN.add(gluon.nn.Conv2D(channels=32, kernel_size=8,strides = 4,padding = 0))
    DQN.add(gluon.nn.BatchNorm(axis = 1, momentum = 0.1,center=True))
    DQN.add(gluon.nn.Activation('relu'))
    #second layer
    DQN.add(gluon.nn.Conv2D(channels=64, kernel_size=4,strides = 2))
    DQN.add(gluon.nn.BatchNorm(axis = 1, momentum = 0.1,center=True))
    DQN.add(gluon.nn.Activation('relu'))
    #third layer
    DQN.add(gluon.nn.Conv2D(channels=64, kernel_size=3,strides = 1))
    DQN.add(gluon.nn.BatchNorm(axis = 1, momentum = 0.1,center=True))
    DQN.add(gluon.nn.Activation('relu'))
    DQN.add(gluon.nn.Flatten())
    #fourth layer
    DQN.add(gluon.nn.Dense(512,activation ='relu'))
    #fifth layer
    # The Q-value head is linear: a ReLU here would clamp every Q-value at
    # zero and stop gradients for any action whose pre-activation is negative.
    DQN.add(gluon.nn.Dense(num_action))

dqn = DQN
dqn.collect_params().initialize(mx.init.Normal(0.02), ctx=opt.ctx)
# Centered RMSProp over the online network's parameters only.
DQN_trainer = gluon.Trainer(
    dqn.collect_params(), 'RMSProp',
    {'learning_rate': opt.lr, 'gamma1': opt.gamma1, 'gamma2': opt.gamma2,
     'epsilon': opt.rms_eps, 'centered': True})
dqn.collect_params().zero_grad()


In [None]:
# Target Q-network: provides the bootstrapped next-state values. Its
# parameters are overwritten from a saved copy of `dqn` during training, so
# the layer sequence must keep the same parameter shapes as `dqn`.
Target_DQN = gluon.nn.Sequential()
with Target_DQN.name_scope():
    #first layer
    Target_DQN.add(gluon.nn.Conv2D(channels=32, kernel_size=8,strides = 4,padding = 0))
    Target_DQN.add(gluon.nn.BatchNorm(axis = 1, momentum = 0.1,center=True))
    Target_DQN.add(gluon.nn.Activation('relu'))
    #second layer
    Target_DQN.add(gluon.nn.Conv2D(channels=64, kernel_size=4,strides = 2))
    Target_DQN.add(gluon.nn.BatchNorm(axis = 1, momentum = 0.1,center=True))
    Target_DQN.add(gluon.nn.Activation('relu'))
    #third layer
    Target_DQN.add(gluon.nn.Conv2D(channels=64, kernel_size=3,strides = 1))
    Target_DQN.add(gluon.nn.BatchNorm(axis = 1, momentum = 0.1,center=True))
    Target_DQN.add(gluon.nn.Activation('relu'))
    Target_DQN.add(gluon.nn.Flatten())
    #fourth layer
    Target_DQN.add(gluon.nn.Dense(512,activation ='relu'))
    #fifth layer
    # Linear Q-value head: a ReLU on the output would clamp the bootstrapped
    # targets at zero. Dropping the activation adds or removes no parameters.
    Target_DQN.add(gluon.nn.Dense(num_action))
target_dqn = Target_DQN
# Random initial weights; they are replaced the first time the training loop
# loads a checkpoint of the online network into this one.
target_dqn.collect_params().initialize(mx.init.Normal(0.02), ctx=opt.ctx)


## Replay buffer class

In [None]:
# One stored experience: (state, action, next_state, reward, done).
Transition = namedtuple('Transition', 'state action next_state reward done')


class Replay_Buffer():
    """Fixed-capacity ring buffer of ``Transition`` records.

    The list grows until it holds ``replay_buffer_size`` entries; after that
    each new record overwrites the oldest one.
    """

    def __init__(self, replay_buffer_size):
        self.replay_buffer_size = replay_buffer_size
        self.memory = []
        self.position = 0  # index the next push writes to

    def push(self, *args):
        """Store one transition built from ``args`` at the current write index."""
        slot = self.position
        capacity = self.replay_buffer_size
        # Make room for the new entry while the buffer is still filling up.
        if len(self.memory) < capacity:
            self.memory.append(None)
        self.memory[slot] = Transition(*args)
        # Advance the write index, wrapping around at capacity.
        self.position = (slot + 1) % capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        return random.sample(self.memory, batch_size)


## Preprocess frames

In [None]:
def preprocess(raw_frame, currentState = None, initial_state = False):
    """Turn a raw frame into the next stacked state.

    The frame is averaged over its third axis, resized to
    ``opt.image_size`` x ``opt.image_size``, and scaled by 1/255 as float32.

    Parameters
    ----------
    raw_frame : array with at least three axes (averaged over axis 2).
    currentState : previous stacked state; read only when ``initial_state``
        is not ``True``.
    initial_state : when ``True`` the processed frame is repeated
        ``opt.frame_len`` times to form the first state of an episode.

    Returns
    -------
    The new stacked state: either ``opt.frame_len`` copies of the processed
    frame, or ``currentState`` without its first frame and with the processed
    frame appended at the end.
    """
    # NOTE(review): scipy.misc.imresize is not available in newer SciPy
    # releases -- confirm the SciPy version this notebook is pinned to.
    side = opt.image_size
    grayscale = raw_frame.mean(2)
    resized = scipy.misc.imresize(grayscale, (side, side))
    frame = resized.reshape([1, side, side]).astype(np.float32) / 255.
    if initial_state == True:
        return np.concatenate([frame] * opt.frame_len, axis=0)
    # Slide the window: drop the oldest frame, add the newest at the end.
    return np.concatenate((currentState[1:, :, :], frame), axis=0)
def rew_clipper(rew):
    """Clip a reward to its sign.

    Returns ``1.`` for a positive reward, ``-1.`` for a negative reward and
    ``0.`` otherwise. The result is always a float, so accumulated clipped
    rewards keep one numeric type whichever branch was taken.
    """
    if rew > 0.:
        return 1.
    if rew < 0.:
        return -1.
    return 0.
# Gluon L2 loss with the batch on axis 0; the training loop applies it to the
# predicted Q-values and their bootstrapped targets.
l2loss = gluon.loss.L2Loss(batch_axis=0)


## Initialize arrays

In [None]:
# Set to True to draw each frame inline while training.
render_image = False

# Running counters; they start as floats and are advanced by the training loop.
frame_counter = annealing_count = epis_count = 0.

# Moving averages of the per-episode (clipped) return, reported in the log line.
moving_average_clipped = moving_average = 0.

# Experience replay storage.
replay_memory = Replay_Buffer(opt.replay_buffer_size)
value = nd.zeros(opt.batch_size, opt.ctx)

# Per-episode totals, one slot per planned episode.
tot_clipped_reward, tot_reward = (np.zeros(opt.num_episode) for _ in range(2))

## Train the DQN 

In [None]:
# Main training loop. For every episode: act epsilon-greedily with frame
# skipping, store transitions in the replay buffer, periodically take a
# gradient step on the online network, and periodically copy the online
# network into the target network through a checkpoint file.
for i in range(opt.num_episode):
    cum_clipped_reward = 0
    cum_reward = 0
    next_frame = env.reset()
    state = preprocess(next_frame, initial_state = True)
    t = 0.
    done = False
    while not done:
        previous_state = state
        # show the frame
        if render_image:
            plt.imshow(next_frame);
            plt.show()
            display.clear_output(wait=True)
            time.sleep(.01)
        sample = random.random()
        if frame_counter > opt.replay_start_size:
            annealing_count += 1
        if frame_counter == opt.replay_start_size:
            print('annealing and laerning are started ')

        # Epsilon decays linearly from 1 towards epsilon_min; during the first
        # no_op_max steps of an episode the action is always random.
        eps = np.maximum(1.-annealing_count/opt.annealing_end,opt.epsilon_min)
        effective_eps = eps
        if t < opt.no_op_max:
            effective_eps = 1.
        if sample < effective_eps:
            action = random.randint(0, num_action - 1)
        else:
            data = nd.array(state.reshape([1,opt.frame_len,opt.image_size,opt.image_size]),opt.ctx)
            action = int(nd.argmax(dqn(data),axis=1).as_in_context(mx.cpu()).asscalar())

        # Frame skipping: repeat the action, accumulate the reward and keep the
        # two most recent frames. Stop as soon as the episode ends so the
        # environment is never stepped after it reported done.
        rew = 0
        older_frame = next_frame
        newest_frame = next_frame
        for _ in range(max(opt.skip_frame, 1)):
            older_frame = newest_frame
            newest_frame, reward, done, _info = env.step(action)
            cum_clipped_reward += rew_clipper(reward)
            rew += reward
            if done:
                break
        cum_reward += rew
        #reward clipping
        reward = rew_clipper(rew)
        # Pixel-wise maximum over the last two frames.
        next_frame = np.maximum(newest_frame, older_frame)
        state = preprocess(next_frame, state)
        replay_memory.push(previous_state,action,state,reward,done)

        if frame_counter > opt.replay_start_size:
            if frame_counter % opt.learning_frequency == 0:
                transitions = replay_memory.sample(opt.batch_size)
                batch = Transition(*zip(*transitions))
                batch_state = nd.array(batch.state,opt.ctx)
                batch_state_next = nd.array(batch.next_state,opt.ctx)
                batch_reward = nd.array(batch.reward,opt.ctx)
                batch_action = nd.array(batch.action,opt.ctx).astype('int32')
                batch_done = nd.array(batch.done,opt.ctx)
                # NOTE(review): the target network is evaluated inside the
                # recorded scope although DQN_trainer only updates `dqn`;
                # consider computing Q_sp outside autograd.record().
                with autograd.record():
                    Q_sp = nd.max(target_dqn(batch_state_next),axis = 1)
                    # Zero the bootstrap term for terminal transitions.
                    Q_sp = Q_sp*(nd.ones(opt.batch_size,ctx = opt.ctx)-batch_done)
                    Q_s_array = dqn(batch_state)
                    # Q-value of the action that was actually taken.
                    Q_s = nd.pick(Q_s_array,batch_action,1)
                    loss = nd.mean(l2loss(Q_s ,  (batch_reward + opt.gamma *Q_sp)))
                loss.backward()
                DQN_trainer.step(opt.batch_size)

        t += 1
        frame_counter += 1
        if frame_counter > opt.replay_start_size:
            if frame_counter % opt.Target_update == 0 :
                # Sync the target network by saving the online parameters and
                # loading them back into target_dqn.
                check_point = frame_counter / (opt.Target_update *100)
                fdqn = './saved_dqn/target_%s_%d' % (env_name,int(check_point))
                # Create the checkpoint directory on first use; saving fails
                # if it does not exist.
                checkpoint_dir = os.path.dirname(fdqn)
                if not os.path.isdir(checkpoint_dir):
                    os.makedirs(checkpoint_dir)
                dqn.save_params(fdqn)
                target_dqn.load_params(fdqn, opt.ctx)
        if done:
            print('epis[%d],eps[%f],durat[%d],fnum=%d, cum_cl_rew = %d, cum_rew = %d,tot_cl = %d , tot = %d'\
                  %(epis_count,eps,t+1,frame_counter,cum_clipped_reward,cum_reward,moving_average_clipped,moving_average))

    # Record this episode at its zero-based index *before* advancing the
    # counter, so slot 0 is used and the final episode stays inside the arrays.
    episode_index = int(epis_count)
    tot_clipped_reward[episode_index] = cum_clipped_reward
    tot_reward[episode_index] = cum_reward
    epis_count += 1

    # Moving average over the most recent `window` finished episodes,
    # including the one that just ended.
    window = 50
    finished = int(epis_count)
    if finished >= window:
        moving_average_clipped = np.mean(tot_clipped_reward[finished - window:finished])
        moving_average = np.mean(tot_reward[finished - window:finished])
    
    
    

In [None]:
# Plot the moving average of the per-episode return.
bandwidth = 100  # number of consecutive episodes averaged per plotted point

# Never ask NumPy for a negative-sized array when fewer than `bandwidth`
# episodes have finished.
num_windows = max(int(epis_count) - bandwidth, 0)

total_clipped = np.zeros(num_windows)
total_rew = np.zeros(num_windows)
for i in range(num_windows):
    total_clipped[i] = np.mean(tot_clipped_reward[i:i+bandwidth])
    total_rew[i] = np.mean(tot_reward[i:i+bandwidth])
t = np.arange(num_windows)

if num_windows == 0:
    print('Not enough finished episodes to plot: have %d, need more than %d.'
          % (int(epis_count), bandwidth))
else:
    belplt = plt.plot(t, total_rew, "r", label = "Return")
    plt.legend()
    # Each x position is the first episode of an averaging window.
    plt.xlabel("Episode (start of averaging window)")
    plt.ylabel("Average return over %d episodes" % bandwidth)
    plt.show()
           