In [2]:
import tensorflow as tf
import numpy as np
import collections
import gym
import random

In [3]:
# Gym environment id handed to gym.make() by the training cell.
ENV = "CartPole-v0"

In [4]:
# --- episode schedule ---
EPISODES = 2000        # number of episodes the training loop iterates over
MAX_STEP = 500         # upper bound on environment steps taken within one episode

# --- experience replay ---
MEMORY_SIZE = 1000     # size threshold at which the oldest stored transitions get evicted
BATCH_SIZE = 32        # number of transitions sampled for each training step

# --- target network ---
UPDATE_PERIOD = 50     # training steps between copies of the Q-network into the target network

In [5]:
# Deep Q-Network agent: an online Q-network plus a separately parameterised target network.
class DeepQNetwork():
    """DQN agent built with the TensorFlow 1.x graph API.

    Two identically shaped three-layer MLPs are created in the default graph:
    ``q_network`` (the one being trained) and ``target_network`` (only changed
    by ``update_prmt``, which copies the Q-network weights into it).

    Args:
        env: object exposing ``action_space.n`` and ``observation_space.shape[0]``.
        sess: TensorFlow session used for every ``run`` call. When ``None`` a new
            ``tf.Session()`` on the default graph is created.
        gamma: discount factor applied to the bootstrapped next-state value.
        epsilon: initial probability of taking a random action in ``chose_action``.
    """

    def __init__(self , env , sess=None , gamma = 0.8, epsilon = 0.8 ):
        self.gamma = gamma
        self.epsilon = epsilon
        self.action_dim = env.action_space.n
        self.state_dim = env.observation_space.shape[0]
        self.network()
        # The signature advertises sess=None, so make that default usable.
        self.sess = sess if sess is not None else tf.Session()
        self.sess.run(tf.global_variables_initializer())
        # Keep a handle on the writer and flush so the graph event reaches disk.
        self.summary_writer = tf.summary.FileWriter("DQN/summaries" , self.sess.graph )
        self.summary_writer.flush()

    # net_frame is used for creating both the Q-network and the target network
    def net_frame(self , scope , collections_name , inputs):
        """Build one 3-layer MLP (state_dim -> 64 -> 32 -> action_dim) under `scope`.

        Args:
            scope: variable scope name that prefixes every variable of this network.
            collections_name: graph collections every variable is added to.
            inputs: tensor the first layer is applied to.

        Returns:
            The un-activated output of the last layer (one value per action).
        """
        weights_init = tf.truncated_normal_initializer(0 , 0.3)
        bias_init = tf.constant_initializer(0.1)

        with tf.variable_scope(scope):
            with tf.variable_scope("layer1"):
                weights1 = tf.get_variable(name = "weights" , dtype = tf.float32 , shape = [self.state_dim , 64] ,
                                           initializer = weights_init , collections = collections_name )
                bias1 = tf.get_variable(name = "bias" , dtype = tf.float32 , shape = [64] , initializer = bias_init ,
                                         collections = collections_name)
                # Use the `inputs` argument: each network must read its own placeholder.
                wx_b = tf.matmul(inputs , weights1) + bias1
                h1 = tf.nn.relu( wx_b )

            with tf.variable_scope("layer2"):
                weights2 = tf.get_variable(name = "weights" , dtype = tf.float32 , shape = [64 , 32] ,
                                           initializer = weights_init , collections = collections_name )
                bias2 = tf.get_variable(name = "bias" , dtype = tf.float32 , shape = [32] , initializer = bias_init ,
                                         collections = collections_name)
                wx_b = tf.matmul( h1 , weights2) + bias2
                h2 = tf.nn.relu( wx_b )

            with tf.variable_scope("layer3"):
                weights3 = tf.get_variable(name = "weights" , dtype = tf.float32 , shape = [32 , self.action_dim] ,
                                           initializer = weights_init , collections = collections_name )
                bias3 = tf.get_variable(name = "bias" , dtype = tf.float32 , shape = [self.action_dim],
                                        initializer = bias_init , collections = collections_name)
                q_out = tf.matmul(h2 , weights3 ) + bias3

            return q_out

    # create q_network & target_network
    def network(self):
        """Create placeholders, both networks, the loss, the train op and the sync ops."""
        # q_network
        self.inputs_q = tf.placeholder(dtype = tf.float32 , shape = [None , self.state_dim] , name = "inputs_q")
        scope_var = "q_network"
        clt_name_var = ["q_net_prmt" , tf.GraphKeys.GLOBAL_VARIABLES]
        self.q_value = self.net_frame(scope_var , clt_name_var , self.inputs_q )

        # target_network
        self.inputs_target = tf.placeholder(dtype = tf.float32 , shape = [None , self.state_dim] , name = "inputs_target")
        scope_tar = "target_network"
        clt_name_tar = ["target_net_prmt" , tf.GraphKeys.GLOBAL_VARIABLES]
        self.q_target = self.net_frame(scope_tar , clt_name_tar , self.inputs_target )

        with tf.variable_scope("loss"):
            self.target = tf.placeholder(dtype = tf.float32 , shape = [None , self.action_dim] , name = "target")
            self.loss = tf.reduce_mean( tf.square(self.q_value - self.target))

        with tf.variable_scope("train"):
            self.train_op = tf.train.RMSPropOptimizer(0.01).minimize(self.loss)

        # Build the target <- Q copy ops once; creating them inside update_prmt
        # would add new nodes to the graph on every call.
        q_prmts = tf.get_collection("q_net_prmt")
        target_prmts = tf.get_collection("target_net_prmt")
        self.sync_ops = [tf.assign(t , q) for t , q in zip(target_prmts , q_prmts)]

    # training
    def train(self , state , reward , action , state_next , done = None):
        """Run one gradient step on a batch of transitions.

        Args:
            state: batch of states fed to the Q-network.
            reward: per-transition rewards.
            action: per-transition integer action indices.
            state_next: batch of successor states fed to the target network.
            done: optional per-transition terminal flags. Where true, the
                bootstrapped next-state value is dropped from the target.
                ``None`` (default) bootstraps every transition.

        Returns:
            The loss value computed for this batch.
        """
        q , q_target = self.sess.run([self.q_value , self.q_target] ,
                                     feed_dict={self.inputs_q : state , self.inputs_target : state_next } )
        bootstrap = np.max(q_target , axis = 1)
        if done is not None:
            bootstrap = bootstrap * (1.0 - np.asarray(done , dtype = np.float32))
        target = np.asarray(reward , dtype = np.float32) + self.gamma * bootstrap

        # Start from the current predictions so only the taken action's entry
        # contributes to the squared error.
        self.reform_target = np.array(q)
        # Size the row index from the batch itself rather than a module global.
        batch_index = np.arange(len(self.reform_target) , dtype = np.int32)
        self.reform_target[batch_index , action] = target

        loss , _ = self.sess.run([self.loss , self.train_op] , feed_dict={self.inputs_q: state , self.target: self.reform_target} )
        return loss

    # chose action
    def chose_action(self , current_state):
        """Epsilon-greedy action: random with probability `epsilon`, else greedy."""
        # e-greedy; the network is only evaluated when the greedy branch is taken
        if np.random.random() < self.epsilon:
            return np.random.randint(0 , self.action_dim)
        return self.greedy_action(current_state)

    def greedy_action(self , current_state):
        """Return the index of the largest Q-value for a single state vector."""
        current_state = current_state[np.newaxis , :]  #*** array dim: (xx,)  --> (1 , xx) ***
        q = self.sess.run(self.q_value , feed_dict={self.inputs_q : current_state} )
        return int(np.argmax(q))

    # update parameters
    def update_prmt(self):
        """Copy every Q-network variable into the matching target-network variable."""
        self.sess.run(self.sync_ops)
        print("updating target-network parmeters...")

    def decay_epsilon(self , step = 0.1 , minimum = 0.0):
        """Lower epsilon by `step`, never going below `minimum`."""
        self.epsilon = max(minimum , self.epsilon - step)

In [6]:
# Experience-replay storage: one Transition record per environment step,
# collected in a plain list that starts out empty.
Transition = collections.namedtuple("Transition" , "state action reward next_state done")
memory = []

In [7]:
if __name__ == "__main__":
    # Training steps between epsilon decays.
    EPSILON_DECAY_PERIOD = 200

    env = gym.make(ENV)
    # Per-episode histories, kept at notebook scope so later cells can inspect them.
    reward_his = []
    step_his = []
    try:
        with tf.Session() as sess:
            DQN = DeepQNetwork(env , sess )
            # Start with the target network equal to the Q-network.
            DQN.update_prmt()
            update_iter = 0
            for episode in range(EPISODES):
                state = env.reset()
                env.render()
                reward_all = 0
                episode_steps = 0
                # training
                for step in range(MAX_STEP):
                    action = DQN.chose_action(state)
                    next_state , reward , done , _ = env.step(action)
                    reward_all += reward
                    episode_steps += 1

                    # Evict the oldest transitions first so the buffer never
                    # holds more than MEMORY_SIZE entries.
                    while len(memory) >= MEMORY_SIZE:
                        memory.pop(0)
                    memory.append(Transition(state, action , reward , next_state , done))

                    if len(memory) > BATCH_SIZE:
                        batch_transition = random.sample(memory , BATCH_SIZE)
                        # Transpose the list of Transition records into one array per field.
                        batch_state, batch_action, batch_reward, batch_next_state, batch_done = map(np.array , zip(*batch_transition))
                        DQN.train(state = batch_state ,
                                  reward = batch_reward ,
                                  action = batch_action ,
                                  state_next = batch_next_state
                                 )
                        update_iter += 1

                        # Only evaluate the periodic hooks right after a training
                        # step. Checked outside this branch they would fire on
                        # every step while update_iter is still 0 (replay warm-up).
                        if update_iter % UPDATE_PERIOD == 0:
                            DQN.update_prmt()

                        if update_iter % EPSILON_DECAY_PERIOD == 0:
                            DQN.decay_epsilon()

                    if done:
                        break

                    state = next_state

                reward_his.append(reward_all)
                step_his.append(episode_steps)
    finally:
        # Release the environment (and its render window) even on interrupt.
        env.close()
            

               

[2017-05-25 19:18:47,290] Making new env: CartPole-v0


updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
u

updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
updating target-network parmeters...
u

KeyboardInterrupt: 

In [None]:
# Display the per-episode reward history. This cell only works if an earlier cell
# has defined `reward_his` -- TODO(review): confirm the training cell records it.
reward_his
