In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import time
import random
import gym

  from ._conv import register_converters as _register_converters


In [3]:
class Episode_experience():
    """Scratch buffer holding the transitions collected since the last clear()."""

    def __init__(self):
        # Transitions are kept in insertion order.
        self.memory = list()

    def add(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def clear(self):
        """Drop all stored transitions by rebinding `memory` to a new empty list."""
        self.memory = list()
        
class DQNAgent():
    """Deep Q-Network agent (optionally Double DQN) with epsilon-greedy exploration.

    The agent builds a TensorFlow 1.x graph containing an online ("eval")
    Q-network and a target Q-network, and keeps a replay buffer of
    ``(state, action, reward, next_state, done)`` transitions.

    Args:
        state_size: Length of a state vector (width of the network input).
        action_size: Number of discrete actions (width of the network output).
        use_double_dqn: If truthy, the next action is picked by the eval
            network and evaluated by the target network; otherwise the target
            network's maximum Q-value is used.
        clip_target_value: If truthy, clip the TD target to
            ``[clip_target_min, clip_target_max]``.
        use_layer_norm: If truthy, the hidden layer is followed by layer
            normalization with a ReLU instead of being a plain ReLU layer.
        clip_target_min: Lower clipping bound for the TD target.
        clip_target_max: Upper clipping bound for the TD target.
    """

    def __init__(self, state_size, action_size, use_double_dqn=True, clip_target_value=False,
                 use_layer_norm=True, clip_target_min=-np.inf, clip_target_max=np.inf):
        self.state_size = state_size
        self.action_size = action_size
        self.use_double_dqn = use_double_dqn
        self.clip_target_value = clip_target_value
        self.clip_target_min = clip_target_min
        self.clip_target_max = clip_target_max
        self.memory = []  # replay buffer, oldest transition first
        self.epsilon = 1.0  # exploration
        self.epsilon_min = 0.1  # min exploration
        self.epsilon_decay = 0.9  # applied once per update_target_net(decay=True)
        self.gamma = 0.98  # discount factor used in the TD target
        self.batch_size = 32
        self.learning_rate = 1e-3
        self.gradient_norm_clip = 10  # clip the gradient
        self.buffer_size = int(5e4)  # maximum number of stored transitions
        self._set_model(use_layer_norm=use_layer_norm)

    def _set_model(self, use_layer_norm):  # set value network
        """Build the Q-networks, loss and training ops, then initialise variables."""
        # Start from a fresh default graph and open the session on it.
        tf.reset_default_graph()
        self.sess = tf.Session()

        self.tfs = tf.placeholder(tf.float32, [None, self.state_size], 'state')
        self.tfs_ = tf.placeholder(tf.float32, [None, self.state_size], 'next_state')
        self.tfa = tf.placeholder(tf.int32, [None, ], 'action')
        self.tfr = tf.placeholder(tf.float32, [None, ], 'reward')
        self.tfd = tf.placeholder(tf.float32, [None, ], 'done')

        def _build_qnet(state, scope, reuse=False):
            # One hidden layer of 64 units followed by a linear Q-value head.
            with tf.variable_scope(scope, reuse=reuse):
                if use_layer_norm:
                    net = tf.layers.dense(state, 64)
                    net = tf.contrib.layers.layer_norm(net, activation_fn=tf.nn.relu)
                else:
                    net = tf.layers.dense(state, 64, activation=tf.nn.relu)
                q = tf.layers.dense(net, self.action_size)
            return q, tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)

        self.q_eval, self.e_params = _build_qnet(self.tfs, 'eval')
        self.q_targ, self.t_params = _build_qnet(self.tfs_, 'target')
        # Hard update: copy every eval variable onto its target counterpart.
        self.update_op = [tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]  # update target network op

        if self.use_double_dqn:
            # Double DQN: the eval net selects the next action, the target net scores it.
            q_eval_next, _ = _build_qnet(self.tfs_, 'eval', reuse=True)  # reuse the same eval net
            q_eval_next_best_action = tf.argmax(q_eval_next, 1)
            self.q_target_value = tf.reduce_sum(self.q_targ * tf.one_hot(q_eval_next_best_action, self.action_size), 1)
        else:
            self.q_target_value = tf.reduce_max(self.q_targ, 1)

        # TD target; (1 - done) removes the bootstrap term on terminal transitions.
        self.q_target_value = self.tfr + self.gamma * (1-self.tfd) * self.q_target_value

        if self.clip_target_value:
            self.q_target_value = tf.clip_by_value(self.q_target_value, self.clip_target_min, self.clip_target_max)

        # Q-value of the action that was actually taken.
        self.q_eval_value = tf.reduce_sum(self.q_eval * tf.one_hot(self.tfa, self.action_size), 1)

        self.loss = tf.losses.huber_loss(self.q_target_value, self.q_eval_value)  # define loss

        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        if self.gradient_norm_clip is not None:
            gradients = optimizer.compute_gradients(self.loss, var_list=self.e_params)  # only update e_params
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    # Each gradient tensor is clipped on its own norm.
                    gradients[i] = (tf.clip_by_norm(grad, self.gradient_norm_clip), var)
            self.train_op = optimizer.apply_gradients(gradients)
        else:
            self.train_op = optimizer.minimize(self.loss, var_list=self.e_params)  # only update e_params

        self.saver = tf.train.Saver()

        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: A batch holding one state (fed to the ``[None, state_size]``
                placeholder); only the first row of the result is used.

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the largest Q-value predicted by the eval network.
        """
        if np.random.rand() <= self.epsilon:
            return np.random.randint(self.action_size)
        act_values = self.sess.run(self.q_eval, {self.tfs: state})
        return np.argmax(act_values[0])

    def remember(self, ep_experience):
        """Append the transitions in ``ep_experience.memory`` to the replay buffer.

        When the buffer grows beyond ``buffer_size`` the oldest transitions are
        dropped so that only the most recent ``buffer_size`` remain.
        """
        self.memory.extend(ep_experience.memory)
        overflow = len(self.memory) - self.buffer_size
        if overflow > 0:
            # Trim in place rather than rebuilding the whole list on every step.
            del self.memory[:overflow]

    def replay(self, optimization_steps=1):
        """Run gradient steps on uniformly sampled minibatches.

        Args:
            optimization_steps: Number of minibatch updates to perform.

        Returns:
            The mean loss over the performed updates, or 0 when nothing was
            done (fewer than ``batch_size`` stored transitions, or a
            non-positive ``optimization_steps``).
        """
        if optimization_steps < 1 or len(self.memory) < self.batch_size:
            return 0

        total_loss = 0.0
        for _ in range(optimization_steps):
            batch = random.sample(self.memory, self.batch_size)
            # Split the transitions column-wise and build one typed array per
            # placeholder. Stacking the raw tuples would create a ragged
            # object array, which recent NumPy versions refuse to build.
            states, actions, rewards, next_states, dones = zip(*batch)
            feed = {
                self.tfs: np.vstack(states),
                self.tfa: np.asarray(actions, dtype=np.int32),
                self.tfr: np.asarray(rewards, dtype=np.float32),
                self.tfs_: np.vstack(next_states),
                self.tfd: np.asarray(dones, dtype=np.float32),
            }
            loss, _ = self.sess.run([self.loss, self.train_op], feed)
            total_loss += loss

        return total_loss / optimization_steps  # return mean loss

    def update_target_net(self, decay=True):
        """Copy the eval network into the target network.

        Args:
            decay: If truthy, also multiply ``epsilon`` by ``epsilon_decay``,
                never going below ``epsilon_min``.
        """
        self.sess.run(self.update_op)
        if decay:
            self.epsilon = max(self.epsilon*self.epsilon_decay, self.epsilon_min)

# Create agent and environment

In [8]:
env = gym.make('MountainCar-v0')
# Read the network sizes from the environment instead of hard-coding them, and
# pass the flags by keyword: the old third positional argument (1) was silently
# bound to `use_double_dqn`, which is kept enabled here.
agent = DQNAgent(state_size=env.observation_space.shape[0],
                 action_size=int(env.action_space.n),
                 use_double_dqn=True,
                 clip_target_value=False)

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


____
# Training

In [9]:
# Train with plain epsilon-greedy exploration (no param noise).
TOTAL_STEPS = int(1e5)       # environment steps to run
WARMUP_STEPS = 1000          # steps collected before any learning happens
TARGET_UPDATE_EVERY = 500    # steps between target-network syncs (also decays epsilon)

losses = []
rs = [0.0]  # per-episode returns; the last entry is the episode in progress

ep_experience = Episode_experience()
state = env.reset()

# time.clock() was removed in Python 3.8; perf_counter() is the replacement timer.
start_time = time.perf_counter()
for t in range(TOTAL_STEPS):
    # The agent is not goal-conditioned: choose_action() and add() take no goal.
    action = agent.choose_action([state])
    next_state, reward, done, _ = env.step(action)
    ep_experience.add(state, action, reward, next_state, done)
    agent.remember(ep_experience)
    ep_experience.clear()
    rs[-1] += reward
    state = next_state
    if done:
        state = env.reset()
        rs += [0.0]
    if t > WARMUP_STEPS:
        losses += [agent.replay(1)]
        if t % TARGET_UPDATE_EVERY == 0:
            agent.update_target_net()
    if done and len(rs) % 10 == 0:
        print("steps", t, "mean r %.2f" % np.mean(rs[-101:-1]))  # mean over last 100 episodes

print("Training time : %.2f" % (time.perf_counter() - start_time), "s")

steps 1799 mean r -200.00
steps 3799 mean r -200.00
steps 5799 mean r -200.00
steps 7799 mean r -200.00
steps 9799 mean r -200.00
steps 11799 mean r -200.00
steps 13768 mean r -199.55
steps 15746 mean r -199.33
steps 17536 mean r -197.04
steps 19366 mean r -195.63
steps 21263 mean r -194.64
steps 22818 mean r -190.19
steps 24649 mean r -188.50
steps 26565 mean r -187.66
steps 28462 mean r -186.63
steps 30316 mean r -185.17
steps 32225 mean r -184.57
steps 34117 mean r -183.71
steps 35974 mean r -184.38
steps 37816 mean r -184.50
steps 39716 mean r -184.53
steps 41635 mean r -188.17
steps 43446 mean r -187.97
steps 45160 mean r -185.95
steps 46798 mean r -183.36
steps 48242 mean r -179.26
steps 49840 mean r -176.15
steps 51335 mean r -172.18
steps 52759 mean r -167.85
steps 54200 mean r -163.84
steps 55819 mean r -161.03
steps 57669 mean r -160.34
steps 59532 mean r -160.86
steps 61101 mean r -159.41
steps 62548 mean r -157.50
steps 64133 mean r -158.91
steps 65676 mean r -158.36
steps 

____
# Testing

In [11]:
# Roll out 5 rendered episodes with the trained agent and print each return.
# NOTE: choose_action() still takes a random action with probability
# agent.epsilon, so this is not a purely greedy evaluation.
for _ in range(5):
    state = env.reset()
    r = 0
    while True:
        env.render()
        # The agent is not goal-conditioned: choose_action() takes only the state.
        action = agent.choose_action([state])
        next_state, reward, done, _ = env.step(action)
        r += reward
        state = next_state
        if done:
            break
    print("rew :", r)

rew : -141.0
rew : -105.0
rew : -139.0
rew : -146.0
rew : -142.0


In [12]:
# Shut the environment down now that evaluation is finished (render() was used
# in the testing loop, so this presumably also closes its viewer window).
env.close()