In [1]:
import tensorflow as tf
import numpy as np
import collections
import gym
import random
import tensorflow.contrib.layers as layers

In [2]:
ENV = "CartPole-v0"

In [3]:
MEMORY_SIZE = 10000
EPISODES = 1000
MAX_STEP = 500
BATCH_SIZE = 32
UPDATE_PERIOD = 200  # update target network parameters

CLIP_NORM = None

In [4]:
##built class for the DQN
class DeepQNetwork():
    def __init__(self , scope_main ,  env , sess=None , gamma = 0.8, epsilon = 0.8 , dueling = True , out_graph = False , out_dqn = True):
        self.gamma = gamma
        self.epsilon = epsilon
        self.loss_his = []
        
        self.scope_main = scope_main
        self.dueling = dueling
        self.out_dqn = out_dqn
        
        self.action_dim = env.action_space.n
        self.state_dim = env.observation_space.shape[0]
        self.network()
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        tf.summary.FileWriter("DQN/summaries" , sess.graph )
        
    # net_frame using for creating Q & target network
    def net_frame(self , hiddens, inpt, num_actions, scope, hiddens_a , hiddens_v , reuse=None):
        with tf.variable_scope(scope, reuse=reuse):
            out = inpt  
            for hidden in hiddens:
                out = layers.fully_connected(out,  num_outputs=hidden, activation_fn=tf.nn.relu)
            
            if self.dueling == True :
                # value_stream
                with tf.variable_scope("value_stream"):
                    value = out
                    for hidden in hiddens_v:
                        value = layers.fully_connected(value, num_outputs= hidden , activation_fn=None) 
                    value = layers.fully_connected(value, num_outputs= 1 , activation_fn=None) 

                # advantage_stream
                with tf.variable_scope("advantage_stream"):
                    advantage = out
                    for hidden in hiddens_a:
                        advantage = layers.fully_connected(advantage , num_outputs = hidden , activation_fn=None) 
                    advantage = layers.fully_connected(advantage , num_outputs= num_actions , activation_fn=None) 

                # aggregating_moudle
                with tf.variable_scope("aggregating_moudle"):
                    q_out = value + advantage - tf.reduce_mean(advantage , axis = 1 , keep_dims = True )  # ***keep_dims

            elif self.out_dqn:
                with tf.variable_scope("dqn_out"):
                    q_out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 

            return q_out
        
    # create q_network & target_network     
    def network(self):       
        # q_network
        self.inputs_q = tf.placeholder(dtype = tf.float32 , shape = [None , self.state_dim] , name = "inputs_q")
        scope_var = "q_network"    
        self.q_value = self.net_frame([64] , self.inputs_q , self.action_dim , scope_var , [20] , [20] , reuse = True )
            
        # target_network
        self.inputs_target = tf.placeholder(dtype = tf.float32 , shape = [None , self.state_dim] , name = "inputs_target")
        scope_tar = "target_network"    
        self.q_target = self.net_frame([64] , self.inputs_target , self.action_dim , scope_tar , [20] , [20] )
               
        with tf.variable_scope("loss"):
            self.action = tf.placeholder(dtype = tf.int32 , shape = [ None ] , name = "action")
            action_one_hot = tf.one_hot(self.action , self.action_dim )
            q_action = tf.reduce_sum( tf.multiply(self.q_value , action_one_hot) , axis = 1 ) 
            
            self.target =  tf.placeholder(dtype = tf.float32 , shape =  [None ] , name = "target")
            self.loss = tf.reduce_mean( tf.square(q_action - self.target))

        with tf.variable_scope("train"):
            optimizer = tf.train.RMSPropOptimizer(0.001)
            if CLIP_NORM :
                gradients = optimizer.compute_gradients(self.loss)
                for i , (g, v) in enumerate(gradients):
                    if g is not None:
                        gradients[i] = (tf.clip_by_norm(g , 10) , v)
                self.train_op = optimizer.apply_gradients(gradients)
            else:
                self.train_op = optimizer.minimize(self.loss)    
    
    # training
    def train(self , state , reward , action , state_next , done):
        q , q_target = self.sess.run([self.q_value , self.q_target] , 
                                     feed_dict={self.inputs_q : state , self.inputs_target : state_next } )

        q_target_best = np.max(q_target , axis = 1)
        q_target_best_mask = ( 1.0 - done) * q_target_best
        
        target = reward + self.gamma * q_target_best_mask
        
        loss , _ = self.sess.run([self.loss , self.train_op] , 
                                 feed_dict={self.inputs_q: state , self.target:target , self.action:action} )   
        self.loss_his.append(loss)
        
    # chose action
    def chose_action(self , current_state):
        current_state = current_state[np.newaxis , :]  #*** array dim: (xx,)  --> (1 , xx) ***
        q = self.sess.run(self.q_value , feed_dict={self.inputs_q : current_state} )
        
        # e-greedy
        if np.random.random() < self.epsilon:
            action_chosen = np.random.randint(0 , self.action_dim)
        else:
            action_chosen = np.argmax(q)
        
        return action_chosen
         
    #upadate parmerters
    def update_prmt(self):
        q_prmts = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES ,  self.scope_main + "/q_network"  )
        target_prmts = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, self.scope_main + "/target_network"  )
        self.sess.run( [tf.assign(t , q)for t,q in zip(target_prmts , q_prmts)])  #***
        print("updating target-network parmeters...")
        
    def decay_epsilon(self):
        if self.epsilon > 0.03:
            self.epsilon = self.epsilon - 0.02
        
    def greedy_action(self , current_state):
        current_state = current_state[np.newaxis , :]  
        q = self.sess.run(self.q_value , feed_dict={self.inputs_q : current_state} ) 
        action_greedy = np.argmax(q)
        return action_greedy

In [5]:
# memory for momery replay
memory = []
Transition = collections.namedtuple("Transition" , ["state", "action" , "reward" , "next_state" , "done"])

In [9]:
def train( DQN , env ):
    reward_his = []
    all_reward = 0
    step_his = []
    update_iter = 0
    for episode in range(EPISODES):
        state = env.reset()
#         env.render() 
#         reward_all = 0
        #training
        for step in range(MAX_STEP):
            action = DQN.chose_action(state)
            next_state , reward , done , _ = env.step(action)
            all_reward += reward 

            if len(memory) > MEMORY_SIZE:
                memory.pop(0)
            memory.append(Transition(state, action , reward , next_state , float(done)))

            if len(memory) > BATCH_SIZE * 4:
                batch_transition = random.sample(memory , BATCH_SIZE)
                #***
                batch_state, batch_action, batch_reward, batch_next_state, batch_done = map(np.array , zip(*batch_transition))  
                DQN.train(state = batch_state ,
                          reward = batch_reward , 
                          action = batch_action , 
                          state_next = batch_next_state,
                          done = batch_done
                         )
                update_iter += 1

            if update_iter % UPDATE_PERIOD == 0:
                DQN.update_prmt()

            if update_iter % 200 == 0:
                DQN.decay_epsilon()

            if done:
                step_his.append(step)
                reward_his.append(all_reward)
                print("[episode= {} ] step = {}".format(episode , step))
                break

            state = next_state
            
    loss_his = DQN.loss_his
    return [step_his , reward_his , loss_his]

In [10]:
if __name__ == "__main__":
    env = gym.make(ENV)
    with tf.Session() as sess:
        with tf.variable_scope("DQN"):
            DQN = DeepQNetwork( "DQN" , env , sess , dueling = False , out_graph = False , out_dqn = True )
        with tf.variable_scope("Deuling"):
            Dueling = DeepQNetwork("Deuling" , env , sess , dueling = True , out_graph = False , out_dqn = False )
        
        step_dqn , reward_dqn , loss_dqn = train(DQN , env)
        step_dueling , reward_dueling , loss_dueling = train(Dueling , env)

[2017-06-25 19:05:12,701] Making new env: CartPole-v0


updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
[episode= 0 ] step = 12
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating targe

[episode= 140 ] step = 8
[episode= 141 ] step = 8
[episode= 142 ] step = 8
[episode= 143 ] step = 9
[episode= 144 ] step = 8
[episode= 145 ] step = 9
[episode= 146 ] step = 9
[episode= 147 ] step = 8
[episode= 148 ] step = 10
[episode= 149 ] step = 11
[episode= 150 ] step = 9
[episode= 151 ] step = 9
[episode= 152 ] step = 9
[episode= 153 ] step = 8
[episode= 154 ] step = 8
[episode= 155 ] step = 16
[episode= 156 ] step = 13
updating target-network parmeters...
[episode= 157 ] step = 11
[episode= 158 ] step = 13
[episode= 159 ] step = 13
[episode= 160 ] step = 19
[episode= 161 ] step = 14
[episode= 162 ] step = 11
[episode= 163 ] step = 12
[episode= 164 ] step = 11
[episode= 165 ] step = 16
[episode= 166 ] step = 11
[episode= 167 ] step = 15
[episode= 168 ] step = 16
[episode= 169 ] step = 13
[episode= 170 ] step = 8
updating target-network parmeters...
[episode= 171 ] step = 9
[episode= 172 ] step = 9
[episode= 173 ] step = 7
[episode= 174 ] step = 12
[episode= 175 ] step = 9
[episode

[episode= 390 ] step = 150
updating target-network parmeters...
[episode= 391 ] step = 199
updating target-network parmeters...
[episode= 392 ] step = 194
updating target-network parmeters...
[episode= 393 ] step = 164
updating target-network parmeters...
[episode= 394 ] step = 142
updating target-network parmeters...
[episode= 395 ] step = 199
updating target-network parmeters...
[episode= 396 ] step = 148
updating target-network parmeters...
[episode= 397 ] step = 181
updating target-network parmeters...
[episode= 398 ] step = 180
[episode= 399 ] step = 195
updating target-network parmeters...
[episode= 400 ] step = 156
updating target-network parmeters...
[episode= 401 ] step = 153
updating target-network parmeters...
[episode= 402 ] step = 143
[episode= 403 ] step = 142
updating target-network parmeters...
[episode= 404 ] step = 150
updating target-network parmeters...
[episode= 405 ] step = 146
updating target-network parmeters...
[episode= 406 ] step = 173
updating target-network

updating target-network parmeters...
[episode= 566 ] step = 188
updating target-network parmeters...
[episode= 567 ] step = 187
updating target-network parmeters...
[episode= 568 ] step = 199
updating target-network parmeters...
[episode= 569 ] step = 199
updating target-network parmeters...
[episode= 570 ] step = 199
[episode= 571 ] step = 131
updating target-network parmeters...
[episode= 572 ] step = 158
updating target-network parmeters...
[episode= 573 ] step = 122
updating target-network parmeters...
[episode= 574 ] step = 126
[episode= 575 ] step = 121
updating target-network parmeters...
[episode= 576 ] step = 120
[episode= 577 ] step = 122
updating target-network parmeters...
[episode= 578 ] step = 131
updating target-network parmeters...
[episode= 579 ] step = 127
[episode= 580 ] step = 142
updating target-network parmeters...
[episode= 581 ] step = 135
updating target-network parmeters...
[episode= 582 ] step = 182
updating target-network parmeters...
[episode= 583 ] step = 

[episode= 726 ] step = 87
updating target-network parmeters...
[episode= 727 ] step = 137
updating target-network parmeters...
[episode= 728 ] step = 119
updating target-network parmeters...
[episode= 729 ] step = 151
[episode= 730 ] step = 163
updating target-network parmeters...
[episode= 731 ] step = 199
updating target-network parmeters...
[episode= 732 ] step = 184
updating target-network parmeters...
[episode= 733 ] step = 199
updating target-network parmeters...
[episode= 734 ] step = 199
updating target-network parmeters...
[episode= 735 ] step = 52
[episode= 736 ] step = 87
updating target-network parmeters...
[episode= 737 ] step = 199
updating target-network parmeters...
[episode= 738 ] step = 199
updating target-network parmeters...
[episode= 739 ] step = 199
updating target-network parmeters...
[episode= 740 ] step = 199
[episode= 741 ] step = 97
updating target-network parmeters...
[episode= 742 ] step = 199
updating target-network parmeters...
[episode= 743 ] step = 118


[episode= 904 ] step = 199
updating target-network parmeters...
[episode= 905 ] step = 193
updating target-network parmeters...
[episode= 906 ] step = 197
updating target-network parmeters...
[episode= 907 ] step = 168
updating target-network parmeters...
[episode= 908 ] step = 199
updating target-network parmeters...
[episode= 909 ] step = 171
updating target-network parmeters...
[episode= 910 ] step = 199
updating target-network parmeters...
[episode= 911 ] step = 199
updating target-network parmeters...
[episode= 912 ] step = 189
updating target-network parmeters...
[episode= 913 ] step = 150
updating target-network parmeters...
[episode= 914 ] step = 153
[episode= 915 ] step = 154
updating target-network parmeters...
[episode= 916 ] step = 199
updating target-network parmeters...
[episode= 917 ] step = 199
updating target-network parmeters...
[episode= 918 ] step = 199
[episode= 919 ] step = 30
updating target-network parmeters...
[episode= 920 ] step = 89
updating target-network p

[episode= 90 ] step = 199
updating target-network parmeters...
[episode= 91 ] step = 199
updating target-network parmeters...
[episode= 92 ] step = 199
updating target-network parmeters...
[episode= 93 ] step = 199
updating target-network parmeters...
[episode= 94 ] step = 199
updating target-network parmeters...
[episode= 95 ] step = 199
updating target-network parmeters...
[episode= 96 ] step = 199
updating target-network parmeters...
[episode= 97 ] step = 199
updating target-network parmeters...
[episode= 98 ] step = 199
updating target-network parmeters...
[episode= 99 ] step = 199
updating target-network parmeters...
[episode= 100 ] step = 199
updating target-network parmeters...
[episode= 101 ] step = 199
updating target-network parmeters...
[episode= 102 ] step = 199
updating target-network parmeters...
[episode= 103 ] step = 199
updating target-network parmeters...
[episode= 104 ] step = 199
updating target-network parmeters...
[episode= 105 ] step = 199
updating target-network

[episode= 253 ] step = 26
updating target-network parmeters...
[episode= 254 ] step = 94
[episode= 255 ] step = 116
updating target-network parmeters...
[episode= 256 ] step = 121
updating target-network parmeters...
[episode= 257 ] step = 98
[episode= 258 ] step = 109
updating target-network parmeters...
[episode= 259 ] step = 118
[episode= 260 ] step = 108
updating target-network parmeters...
[episode= 261 ] step = 121
[episode= 262 ] step = 133
updating target-network parmeters...
[episode= 263 ] step = 116
updating target-network parmeters...
[episode= 264 ] step = 126
updating target-network parmeters...
[episode= 265 ] step = 167
updating target-network parmeters...
[episode= 266 ] step = 199
updating target-network parmeters...
[episode= 267 ] step = 199
updating target-network parmeters...
[episode= 268 ] step = 199
updating target-network parmeters...
[episode= 269 ] step = 199
[episode= 270 ] step = 170
updating target-network parmeters...
[episode= 271 ] step = 199
updating 

updating target-network parmeters...
[episode= 424 ] step = 199
[episode= 425 ] step = 139
updating target-network parmeters...
[episode= 426 ] step = 168
updating target-network parmeters...
[episode= 427 ] step = 137
updating target-network parmeters...
[episode= 428 ] step = 123
[episode= 429 ] step = 117
updating target-network parmeters...
[episode= 430 ] step = 165
updating target-network parmeters...
[episode= 431 ] step = 120
[episode= 432 ] step = 116
updating target-network parmeters...
[episode= 433 ] step = 108
[episode= 434 ] step = 150
updating target-network parmeters...
[episode= 435 ] step = 121
updating target-network parmeters...
[episode= 436 ] step = 100
[episode= 437 ] step = 88
updating target-network parmeters...
[episode= 438 ] step = 196
[episode= 439 ] step = 98
updating target-network parmeters...
[episode= 440 ] step = 127
updating target-network parmeters...
[episode= 441 ] step = 95
[episode= 442 ] step = 66
[episode= 443 ] step = 93
updating target-netwo

updating target-network parmeters...
[episode= 587 ] step = 187
updating target-network parmeters...
[episode= 588 ] step = 158
updating target-network parmeters...
[episode= 589 ] step = 89
[episode= 590 ] step = 118
updating target-network parmeters...
[episode= 591 ] step = 140
updating target-network parmeters...
[episode= 592 ] step = 171
updating target-network parmeters...
[episode= 593 ] step = 199
updating target-network parmeters...
[episode= 594 ] step = 144
updating target-network parmeters...
[episode= 595 ] step = 199
[episode= 596 ] step = 123
updating target-network parmeters...
[episode= 597 ] step = 199
updating target-network parmeters...
[episode= 598 ] step = 156
updating target-network parmeters...
[episode= 599 ] step = 145
[episode= 600 ] step = 113
updating target-network parmeters...
[episode= 601 ] step = 137
updating target-network parmeters...
[episode= 602 ] step = 168
updating target-network parmeters...
[episode= 603 ] step = 166
[episode= 604 ] step = 1

[episode= 743 ] step = 138
[episode= 744 ] step = 51
updating target-network parmeters...
[episode= 745 ] step = 115
[episode= 746 ] step = 59
updating target-network parmeters...
[episode= 747 ] step = 84
[episode= 748 ] step = 29
[episode= 749 ] step = 28
[episode= 750 ] step = 28
[episode= 751 ] step = 29
updating target-network parmeters...
[episode= 752 ] step = 27
[episode= 753 ] step = 27
[episode= 754 ] step = 25
[episode= 755 ] step = 20
[episode= 756 ] step = 88
[episode= 757 ] step = 22
updating target-network parmeters...
[episode= 758 ] step = 118
[episode= 759 ] step = 11
updating target-network parmeters...
[episode= 760 ] step = 121
[episode= 761 ] step = 125
updating target-network parmeters...
[episode= 762 ] step = 49
updating target-network parmeters...
[episode= 763 ] step = 183
[episode= 764 ] step = 125
updating target-network parmeters...
[episode= 765 ] step = 91
[episode= 766 ] step = 100
updating target-network parmeters...
[episode= 767 ] step = 110
updating

[episode= 910 ] step = 124
updating target-network parmeters...
[episode= 911 ] step = 15
[episode= 912 ] step = 128
updating target-network parmeters...
[episode= 913 ] step = 196
updating target-network parmeters...
[episode= 914 ] step = 199
updating target-network parmeters...
[episode= 915 ] step = 99
[episode= 916 ] step = 42
[episode= 917 ] step = 82
updating target-network parmeters...
[episode= 918 ] step = 84
updating target-network parmeters...
[episode= 919 ] step = 189
[episode= 920 ] step = 60
[episode= 921 ] step = 29
updating target-network parmeters...
[episode= 922 ] step = 105
[episode= 923 ] step = 45
[episode= 924 ] step = 110
updating target-network parmeters...
[episode= 925 ] step = 112
updating target-network parmeters...
[episode= 926 ] step = 199
updating target-network parmeters...
[episode= 927 ] step = 126
[episode= 928 ] step = 80
updating target-network parmeters...
[episode= 929 ] step = 176
updating target-network parmeters...
[episode= 930 ] step = 16