In [1]:
import numpy as np
import matplotlib.pyplot as plt
from time import sleep

class ENV:
    def __init__(self, map_size, obs_pos1, obs_pos2, robot_start_pos, goal_pos):
        self.map_size = map_size # integer
        self.obs_pos1 = obs_pos1 # [a,b], 2by1 list
        self.obs_pos2 = obs_pos2 # [a,b], 2by1 list
        self.goal_pos = goal_pos
        self.robot_pos1 = robot_start_pos[0:2] #[a,b] 2by1 list
        self.robot_pos2 = robot_start_pos[2:4]
        # set the walls
        self.fig = plt.figure()
        ax = plt.axes(xlim=(-0.5,self.map_size), ylim=(-0.5,self.map_size))  
        #self.render_env()
    def render_env(self):        
        # draw the obstacles and goal
        obs1 = plt.scatter(self.obs_pos1[0], self.obs_pos1[1], c='r', marker = 's', linewidths = 5) # have to check whether we can receive <list or np.array>        
        obs2 = plt.scatter(self.obs_pos2[0], self.obs_pos2[1], c='r', marker = 's', linewidths = 5)
        goal = plt.scatter(self.goal_pos[0], self.goal_pos[1], c='g', marker='x', linewidths = 4)
        # draw the robot                
        ro1 = plt.scatter(self.robot_pos1[0], self.robot_pos1[1], c='b', linewidths = 3)
        ro2 = plt.scatter(self.robot_pos2[0], self.robot_pos2[1], c='b', linewidths = 3)        
        self.fig.canvas.draw()   
        sleep(0.2)
        ro1.remove()
        ro2.remove()        
    def update(self, robot_current_pos):
        self.robot_pos1 = robot_current_pos[0:2]
        self.robot_pos2 = robot_current_pos[2:4]        

    

In [2]:
# Plotting setting
%matplotlib notebook
import matplotlib.pyplot as plt
from matplotlib import animation
from time import sleep

import numpy as np
import tensorflow as tf
import random
import dqn_cooperation
from collections import deque

import time
start = time.time()


# Create New environment with transition law
ACTION_NUM = 5
INPUT_SIZE = 10
OUTPUT_SIZE = ACTION_NUM**2
VEL = 0.5
TIME_GAP = 1
MAP_SIZE = 7
def annealing_epsilon(episode, min_e, max_e, target_episode):

    slope = (min_e - max_e) / (target_episode)
    intercept = max_e

    return max(min_e, slope * episode + intercept)

class new_env:     
    def create_env(self, arg_state=[1.,2.,1.,3.], g_pos=[5.,5.], obs_pos1=[2.,2.], obs_pos2=[3.,4.], obs_size=5):
        self.state = np.array(arg_state+g_pos+obs_pos1+obs_pos2) # reset
        self.n_state = np.array(arg_state+g_pos+obs_pos1+obs_pos2)
        self.obstacle_size = obs_size
        return self.state, self.obstacle_size
    #def add_obs(self, obs_pos), we postpone this 
    
    def next_step(self, arg_state, arg_action):

        self._fail = False
        self.reward = 0
        # convert to each action
        arg_action1 = arg_action // ACTION_NUM
        arg_action2 = arg_action - ACTION_NUM*arg_action1
        '''position update through action
        UP = 0, DOWN = 1, LEFT = 2, RIGHT = 3'''        
        # agent 1
        if arg_action1 == 0:
            self.n_state[0:4] = arg_state[0:4] + np.array([0,1,0,0])*VEL*TIME_GAP
        elif arg_action1 == 1:
            self.n_state[0:4] = arg_state[0:4] + np.array([0,-1,0,0])*VEL*TIME_GAP
        elif arg_action1 == 2:
            self.n_state[0:4] = arg_state[0:4] + np.array([-1,0,0,0])*VEL*TIME_GAP
        elif arg_action1 == 3:
            self.n_state[0:4] = arg_state[0:4] + np.array([1,0,0,0])*VEL*TIME_GAP
        else:
            self.n_state[0:4] = arg_state[0:4] # stop        
        # agent 2  
        if arg_action2 == 0:
            self.n_state[0:4] = arg_state[0:4] + np.array([0,0,0,1])*VEL*TIME_GAP
        elif arg_action2 == 1:
            self.n_state[0:4] = arg_state[0:4] + np.array([0,0,0,-1])*VEL*TIME_GAP
        elif arg_action2 == 2:
            self.n_state[0:4] = arg_state[0:4] + np.array([0,0,-1,0])*VEL*TIME_GAP
        elif arg_action1 == 3:
            self.n_state[0:4] = arg_state[0:4] + np.array([0,0,1,0])*VEL*TIME_GAP      
        else:
            self.n_state[0:4] = arg_state[0:4] # stop   
            
        '''get the reward'''
        if np.linalg.norm((self.n_state[0:2]+self.n_state[2:4])/2-self.n_state[4:6])!=0:
            self.reward = (1/np.linalg.norm((self.n_state[0:2]+self.n_state[2:4])/2-self.n_state[4:6])-\
            1/np.linalg.norm((arg_state[0:2]+arg_state[2:4])/2-self.n_state[4:6]))*100
        if np.linalg.norm(self.n_state[0:2]-self.n_state[6:8])<1 or np.linalg.norm(self.n_state[2:4]-self.n_state[6:8])<1:
            self.reward = self.reward-1 # collision
        if np.linalg.norm(self.n_state[0:2]-self.n_state[8:10])<1 or np.linalg.norm(self.n_state[2:4]-self.n_state[8:10])<1:
            self.reward = self.reward-1 # collision
        if np.linalg.norm((self.n_state[0:2]+self.n_state[2:4])/2-self.n_state[4:6])<1 and np.linalg.norm(self.n_state[0:2]-self.n_state[2:4])<3: # approximately set condition
            self.reward = self.reward + 1000 # achieve goal
            self._fail = True
        if np.linalg.norm(self.n_state[0:2]-self.n_state[2:4])>2.5:
            self.reward = self.reward-2 # drop the object
            #self._fail = True     
        if any(x<0 for x in self.n_state):            
            self.reward = self.reward-10 # away from the map
            #self._fail = True   
        if any(x>MAP_SIZE for x in self.n_state):            
            self.reward = self.reward-10 # away from the map
            #self._fail = True     
        return self.n_state, self.reward, self._fail
    
#env = new_env() 
#state, g_pos, o_pos, o_size = env.create_env() # set the enviornment
DISCOUNT_RATE = 0.98
REPLAY_MEMORY = 10000
BATCH_SIZE = 50
MAX_EPI = 6000
MAX_STEP = 3000
# minimum epsilon for epsilon greedy
MIN_E = 0.1
# epsilon will be `MIN_E` at `EPSILON_DECAYING_STEP`
EPSILON_DECAYING_STEP = MAX_STEP * 0.2
TARGET_UPDATE_FQ = 20

def train_minibatch(mainDQN, targetDQN, minibatch):
    state_array = np.array([x[0] for x in minibatch])
    action_array = np.array([x[1] for x in minibatch]) # [ x among 0~24] * BATCH_SIZE
    reward_array = np.array([x[2] for x in minibatch])
    n_state_array = np.array([x[3] for x in minibatch])
    _fail_array = np.array([x[4] for x in minibatch])    
    
    X_batch = state_array   
    Y_batch = mainDQN.predict(state_array) # 25 elements * BATCH_SIZE 
    
    Q_target = reward_array + DISCOUNT_RATE*np.max(targetDQN.predict(n_state_array),axis=1)*~_fail_array # if fail, Q = reward
    
    Y_batch[np.arange(len(X_batch)), action_array] = Q_target
    
    # Train
    cost_batch, _ = mainDQN.update(X_batch, Y_batch)
    return cost_batch

def get_copy_var_ops(dest_scope_name = "target", src_scope_name = "main"):
    op_holder = []
    
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)

    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))

    return op_holder


def main():
    replay_buffer = deque(maxlen=REPLAY_MEMORY) # detract element from both sides    
    total_reward_buffer = []
    with tf.Session() as sess:
        mainDQN = dqn_cooperation.DQN(sess, INPUT_SIZE, OUTPUT_SIZE, name = "main")
        mainDQN.build_network(32,64,0.005)
        targetDQN = dqn_cooperation.DQN(sess, INPUT_SIZE, OUTPUT_SIZE, name = "target")
        targetDQN.build_network(32,64,0.005)
        init = tf.global_variables_initializer()
        sess.run(init)
        
        # initial copy main q -> target q
        copy_ops = get_copy_var_ops(dest_scope_name = "target", src_scope_name = "main")
        sess.run(copy_ops)
        
        reward_accum_last100 = 0
        reward_sum = 0        

        game = ENV(MAP_SIZE, [2,2], [3,4], [1,2,1,3], [5,5])
        game.render_env()
        
        for episode in range(MAX_EPI):
            
            '''
            if episode < 100:
                e = 0.3
            e = 0.05
            '''
            _fail = False
            step_count = 0 # how many moves included in an episode
            env1 = new_env()
            state, _= env1.create_env() # get only state
            
            
            reward_sum = 0      
            goal_ = True
            while not _fail:
                e = annealing_epsilon(step_count, MIN_E, 1.0, EPSILON_DECAYING_STEP)
                # after sufficient learning, we present the game scene
                if episode > MAX_EPI-2:
                    game.update(state)
                    game.render_env()
                    
                step_count += 1
                if np.random.rand()< e:
                    action_l = random.sample(np.arange(ACTION_NUM),2) # choose up, down, left, right, stop
                    action = action_l[0]*(ACTION_NUM) + action_l[1] # convert to index
                else:
                    act_candi = mainDQN.predict(state)
                    action = np.argmax(act_candi)   
                    '''
                    dd_predict = mainDQN.predict(state).flatten()
                    aa = np.max(dd_predict)
                    max_indx, = np.where(dd_predict==aa)                    
                    action = random.sample(max_indx,1)[0]
                    '''
                
                n_state, reward, _fail = env1.next_step(state, action) # have to input the action 
                # if count >30, stop that episode and start new episode
                if step_count >MAX_STEP-1:
                    #reward = -30
                    _fail = True
                    goal_ = False
                    
                reward_sum += reward    # sum total reward and penalty about long time(-0.5)                                  
                replay_buffer.append((state, action, reward, n_state, _fail)) #resolve the correlation                
                state = n_state
                
                # train minibatch of main Q-NET and update the target Q-network from main Q-NET
                if len(replay_buffer)>BATCH_SIZE*4:
                    minibatch = random.sample(replay_buffer, BATCH_SIZE)                    
                    train_minibatch(mainDQN, targetDQN, minibatch)
                if step_count % TARGET_UPDATE_FQ == 0:
                    sess.run(copy_ops)
                    
            total_reward_buffer.append(reward_sum)  
            if goal_ == True:
                print("[Episode {:>5}]  total reward: {:>5}    steps: {:>5}, success".format(episode, reward_sum, step_count))
            else:
                print("[Episode {:>5}]  total reward: {:>5}     steps: {:>5}, failure".format(episode, reward_sum, step_count))
    #print("Success ratio: {}".format(reward_accum_last100/100))
        fig1 =plt.figure()
        plt.plot(range(MAX_EPI), total_reward_buffer)
        plt.show()
    # save model
    '''
    saver = tf.train.Saver()
    save_path = saver.save(sess, "./dqn_multi_action.ckpt")
    '''
    
            
        
if __name__ == "__main__":
    main()    
    end = time.time()-start
    print(end)       



  from ._conv import register_converters as _register_converters


<IPython.core.display.Javascript object>

[Episode     0]  total reward: -55214.0     steps:  3000, failure
[Episode     1]  total reward: -51734.0     steps:  3000, failure
[Episode     2]  total reward: -60483.0     steps:  3000, failure
[Episode     3]  total reward: -50787.9191525     steps:  3000, failure
[Episode     4]  total reward: -59199.0     steps:  3000, failure
[Episode     5]  total reward: -56296.0     steps:  3000, failure
[Episode     6]  total reward: -63469.4105692     steps:  3000, failure
[Episode     7]  total reward: -62842.4105692     steps:  3000, failure
[Episode     8]  total reward: -49098.0     steps:  3000, failure
[Episode     9]  total reward: -59281.4105692     steps:  3000, failure
[Episode    10]  total reward: -52671.0     steps:  3000, failure
[Episode    11]  total reward: -40802.0     steps:  3000, failure
[Episode    12]  total reward: -53797.9191525     steps:  3000, failure
[Episode    13]  total reward: -58934.0     steps:  3000, failure
[Episode    14]  total reward: -64894.9191525 

[Episode   118]  total reward: -63323.9191525     steps:  3000, failure
[Episode   119]  total reward: -43824.9191525     steps:  3000, failure
[Episode   120]  total reward: -50534.9191525     steps:  3000, failure
[Episode   121]  total reward: -5738.0    steps:   430, success
[Episode   122]  total reward: -62550.5989095     steps:  3000, failure
[Episode   123]  total reward: -64573.4105692     steps:  3000, failure
[Episode   124]  total reward: -37661.5989095     steps:  3000, failure
[Episode   125]  total reward: -54633.0     steps:  3000, failure
[Episode   126]  total reward: -40446.9191525     steps:  3000, failure
[Episode   127]  total reward: -37747.9191525     steps:  3000, failure
[Episode   128]  total reward: -62963.9191525     steps:  3000, failure
[Episode   129]  total reward: -65069.5989095     steps:  3000, failure
[Episode   130]  total reward: -60589.9191525     steps:  3000, failure
[Episode   131]  total reward: -53906.5989095     steps:  3000, failure
[Episo

[Episode   235]  total reward: -58956.5989095     steps:  3000, failure
[Episode   236]  total reward: -38795.0     steps:  3000, failure
[Episode   237]  total reward: -63158.0     steps:  3000, failure
[Episode   238]  total reward: -39296.0     steps:  3000, failure
[Episode   239]  total reward: -58214.0     steps:  3000, failure
[Episode   240]  total reward: -64124.5989095     steps:  3000, failure
[Episode   241]  total reward: -45581.0119498     steps:  3000, failure
[Episode   242]  total reward: -1128.01194975    steps:   133, success
[Episode   243]  total reward: -36056.5989095     steps:  3000, failure
[Episode   244]  total reward: -35924.0     steps:  3000, failure
[Episode   245]  total reward: -49216.9191525     steps:  3000, failure
[Episode   246]  total reward: -62177.0119498     steps:  3000, failure
[Episode   247]  total reward: -64901.0     steps:  3000, failure
[Episode   248]  total reward: -35808.0     steps:  3000, failure
[Episode   249]  total reward: -630

[Episode   353]  total reward: -65708.9191525     steps:  3000, failure
[Episode   354]  total reward: -6880.41056917    steps:   388, success
[Episode   355]  total reward: -65552.4105692     steps:  3000, failure
[Episode   356]  total reward: 825.988050249    steps:    33, success
[Episode   357]  total reward: 942.0    steps:    25, success
[Episode   358]  total reward: -65368.5989095     steps:  3000, failure
[Episode   359]  total reward: 944.0    steps:    21, success
[Episode   360]  total reward: 959.0    steps:    17, success
[Episode   361]  total reward: 917.589430828    steps:    22, success
[Episode   362]  total reward: 939.0    steps:    23, success
[Episode   363]  total reward: 937.988050249    steps:    25, success
[Episode   364]  total reward: 967.988050249    steps:    16, success
[Episode   365]  total reward: 962.589430828    steps:    16, success
[Episode   366]  total reward: -65232.0119498     steps:  3000, failure
[Episode   367]  total reward: 917.58943082

[Episode   471]  total reward: 976.080847462    steps:    30, success
[Episode   472]  total reward: 997.988050249    steps:    30, success
[Episode   473]  total reward: 987.988050249    steps:    18, success
[Episode   474]  total reward: 956.988050249    steps:    18, success
[Episode   475]  total reward: 952.080847462    steps:    23, success
[Episode   476]  total reward: 989.0    steps:    15, success
[Episode   477]  total reward: 962.0    steps:    20, success
[Episode   478]  total reward: 972.988050249    steps:    17, success
[Episode   479]  total reward: 986.988050249    steps:    16, success
[Episode   480]  total reward: 981.988050249    steps:    24, success
[Episode   481]  total reward: 967.401090505    steps:    27, success
[Episode   482]  total reward: 981.988050249    steps:    21, success
[Episode   483]  total reward: 988.988050249    steps:    18, success
[Episode   484]  total reward: 942.988050249    steps:    22, success
[Episode   485]  total reward: 996.9

[Episode   589]  total reward: -65865.4105692     steps:  3000, failure
[Episode   590]  total reward: -54734.0     steps:  3000, failure
[Episode   591]  total reward: -56022.0119498     steps:  3000, failure
[Episode   592]  total reward: -35852.5989095     steps:  3000, failure
[Episode   593]  total reward: -35886.9191525     steps:  3000, failure
[Episode   594]  total reward: -35916.9191525     steps:  3000, failure
[Episode   595]  total reward: -54048.9191525     steps:  3000, failure
[Episode   596]  total reward: -35810.5989095     steps:  3000, failure
[Episode   597]  total reward: -65493.4105692     steps:  3000, failure
[Episode   598]  total reward: -39216.9191525     steps:  3000, failure
[Episode   599]  total reward: -35914.9191525     steps:  3000, failure
[Episode   600]  total reward: -35950.5989095     steps:  3000, failure
[Episode   601]  total reward: -65885.4105692     steps:  3000, failure
[Episode   602]  total reward: -42305.4105692     steps:  3000, failur

[Episode   706]  total reward: -65715.4105692     steps:  3000, failure
[Episode   707]  total reward: -62768.9191525     steps:  3000, failure
[Episode   708]  total reward: -65342.5989095     steps:  3000, failure
[Episode   709]  total reward: -61917.4105692     steps:  3000, failure
[Episode   710]  total reward: -64372.5989095     steps:  3000, failure
[Episode   711]  total reward: -60392.5989095     steps:  3000, failure
[Episode   712]  total reward: -1670.0    steps:   174, success
[Episode   713]  total reward: -45715.0     steps:  3000, failure
[Episode   714]  total reward: -49842.0     steps:  3000, failure
[Episode   715]  total reward: -65680.9191525     steps:  3000, failure
[Episode   716]  total reward: -65354.0119498     steps:  3000, failure
[Episode   717]  total reward: -65751.4105692     steps:  3000, failure
[Episode   718]  total reward: -65588.0     steps:  3000, failure
[Episode   719]  total reward: -64777.0     steps:  3000, failure
[Episode   720]  total r

[Episode   823]  total reward: -55790.0     steps:  3000, failure
[Episode   824]  total reward: -35912.5989095     steps:  3000, failure
[Episode   825]  total reward: -36161.5989095     steps:  3000, failure
[Episode   826]  total reward: -55742.0     steps:  3000, failure
[Episode   827]  total reward: -64539.5989095     steps:  3000, failure
[Episode   828]  total reward: -36567.0     steps:  3000, failure
[Episode   829]  total reward: -36120.0     steps:  3000, failure
[Episode   830]  total reward: -39336.9191525     steps:  3000, failure
[Episode   831]  total reward: -59774.5989095     steps:  3000, failure
[Episode   832]  total reward: -63827.9191525     steps:  3000, failure
[Episode   833]  total reward: -58502.0     steps:  3000, failure
[Episode   834]  total reward: -35912.0     steps:  3000, failure
[Episode   835]  total reward: -54218.0     steps:  3000, failure
[Episode   836]  total reward: -44382.0     steps:  3000, failure
[Episode   837]  total reward: -64990.01

[Episode   941]  total reward: -63462.5989095     steps:  3000, failure
[Episode   942]  total reward: -41122.9191525     steps:  3000, failure
[Episode   943]  total reward: -50071.9191525     steps:  3000, failure
[Episode   944]  total reward: -65783.4105692     steps:  3000, failure
[Episode   945]  total reward: -64962.5989095     steps:  3000, failure
[Episode   946]  total reward: -45358.0     steps:  3000, failure
[Episode   947]  total reward: -56991.9191525     steps:  3000, failure
[Episode   948]  total reward: -63614.0     steps:  3000, failure
[Episode   949]  total reward: -8771.0    steps:   487, success
[Episode   950]  total reward: -64385.9191525     steps:  3000, failure
[Episode   951]  total reward: -65501.9191525     steps:  3000, failure
[Episode   952]  total reward: -65305.0119498     steps:  3000, failure
[Episode   953]  total reward: -63129.9191525     steps:  3000, failure
[Episode   954]  total reward: 754.589430828    steps:    36, success
[Episode   955

[Episode  1058]  total reward: -62105.0     steps:  3000, failure
[Episode  1059]  total reward: -50729.0     steps:  3000, failure
[Episode  1060]  total reward: -43610.0     steps:  3000, failure
[Episode  1061]  total reward: -36808.0     steps:  3000, failure
[Episode  1062]  total reward: -63680.9191525     steps:  3000, failure
[Episode  1063]  total reward: -55381.0     steps:  3000, failure
[Episode  1064]  total reward: -63760.9191525     steps:  3000, failure
[Episode  1065]  total reward: -59536.9191525     steps:  3000, failure
[Episode  1066]  total reward: -43745.5989095     steps:  3000, failure
[Episode  1067]  total reward: -57986.5989095     steps:  3000, failure
[Episode  1068]  total reward: -64778.5989095     steps:  3000, failure
[Episode  1069]  total reward: -35814.9191525     steps:  3000, failure
[Episode  1070]  total reward: -63630.9191525     steps:  3000, failure
[Episode  1071]  total reward: -37274.9191525     steps:  3000, failure
[Episode  1072]  total

[Episode  1176]  total reward: -57927.0     steps:  3000, failure
[Episode  1177]  total reward: -41942.0119498     steps:  3000, failure
[Episode  1178]  total reward: -55004.4105692     steps:  3000, failure
[Episode  1179]  total reward: -60663.5989095     steps:  3000, failure
[Episode  1180]  total reward: -62517.9191525     steps:  3000, failure
[Episode  1181]  total reward: -46219.4105692     steps:  3000, failure
[Episode  1182]  total reward: -48324.5989095     steps:  3000, failure
[Episode  1183]  total reward: -62076.0119498     steps:  3000, failure
[Episode  1184]  total reward: -64093.4105692     steps:  3000, failure
[Episode  1185]  total reward: -50889.0     steps:  3000, failure
[Episode  1186]  total reward: -62278.9191525     steps:  3000, failure
[Episode  1187]  total reward: -65290.0     steps:  3000, failure
[Episode  1188]  total reward: -63246.9191525     steps:  3000, failure
[Episode  1189]  total reward: -64966.0     steps:  3000, failure
[Episode  1190] 

KeyboardInterrupt: 