In [1]:
import numpy as np
import matplotlib.pyplot as plt
from time import sleep

class ENV:
    def __init__(self, map_size, obs_pos1, obs_pos2, robot_start_pos, goal_pos):
        self.map_size = map_size # integer
        self.obs_pos1 = obs_pos1 # [a,b], 2by1 list
        self.obs_pos2 = obs_pos2 # [a,b], 2by1 list
        self.goal_pos = goal_pos
        self.robot_pos1 = robot_start_pos[0:2] #[a,b] 2by1 list
        self.robot_pos2 = robot_start_pos[2:4]
        # set the walls
        self.fig = plt.figure()
        ax = plt.axes(xlim=(-0.5,self.map_size), ylim=(-0.5,self.map_size))  
        #self.render_env()
    def render_env(self):        
        # draw the obstacles and goal
        obs1 = plt.scatter(self.obs_pos1[0], self.obs_pos1[1], c='r', marker = 's', linewidths = 5) # have to check whether we can receive <list or np.array>        
        obs2 = plt.scatter(self.obs_pos2[0], self.obs_pos2[1], c='r', marker = 's', linewidths = 5)
        goal = plt.scatter(self.goal_pos[0], self.goal_pos[1], c='g', marker='x', linewidths = 4)
        # draw the robot                
        ro1 = plt.scatter(self.robot_pos1[0], self.robot_pos1[1], c='b', linewidths = 3)
        ro2 = plt.scatter(self.robot_pos2[0], self.robot_pos2[1], c='b', linewidths = 3)        
        self.fig.canvas.draw()   
        sleep(0.2)
        ro1.remove()
        ro2.remove()        
    def update(self, robot_current_pos):
        self.robot_pos1 = robot_current_pos[0:2]
        self.robot_pos2 = robot_current_pos[2:4]        

    

In [2]:
# Plotting setting
%matplotlib notebook
import matplotlib.pyplot as plt
from matplotlib import animation
from time import sleep

import numpy as np
import tensorflow as tf
import random
import dqn_cooperation
from collections import deque

import time
start = time.time()


# Create New environment with transition law
ACTION_NUM = 5
INPUT_SIZE = 10
OUTPUT_SIZE = ACTION_NUM**2
VEL = 0.5
TIME_GAP = 1
MAP_SIZE = 7

def annealing_epsilon(episode, min_e, max_e, target_episode):

    slope = (min_e - max_e) / (target_episode)
    intercept = max_e

    return max(min_e, slope * episode + intercept)

class new_env:     
    def create_env(self, arg_state=[2.,3.,2.,4.], g_pos=[5.,5.], obs_pos1=[2.,2.], obs_pos2=[3.,4.], obs_size=5):
        self.state = np.array(arg_state+g_pos+obs_pos1+obs_pos2) # reset
        self.n_state = np.array(arg_state+g_pos+obs_pos1+obs_pos2)
        self.obstacle_size = obs_size
        return self.state, self.obstacle_size
    #def add_obs(self, obs_pos), we postpone this 
    
    def next_step(self, arg_state, arg_action):

        self._fail = False
        self.reward = 0
        # convert to each action
        arg_action1 = arg_action // ACTION_NUM
        arg_action2 = arg_action - ACTION_NUM*arg_action1
        '''position update through action
        UP = 0, DOWN = 1, LEFT = 2, RIGHT = 3'''        
        # agent 1
        if arg_action1 == 0:
            self.n_state[0:4] = arg_state[0:4] + np.array([0,1,0,0])*VEL*TIME_GAP
        elif arg_action1 == 1:
            self.n_state[0:4] = arg_state[0:4] + np.array([0,-1,0,0])*VEL*TIME_GAP
        elif arg_action1 == 2:
            self.n_state[0:4] = arg_state[0:4] + np.array([-1,0,0,0])*VEL*TIME_GAP
        elif arg_action1 == 3:
            self.n_state[0:4] = arg_state[0:4] + np.array([1,0,0,0])*VEL*TIME_GAP
        else:
            self.n_state[0:4] = arg_state[0:4] # stop        
        # agent 2  
        if arg_action2 == 0:
            self.n_state[0:4] = arg_state[0:4] + np.array([0,0,0,1])*VEL*TIME_GAP
        elif arg_action2 == 1:
            self.n_state[0:4] = arg_state[0:4] + np.array([0,0,0,-1])*VEL*TIME_GAP
        elif arg_action2 == 2:
            self.n_state[0:4] = arg_state[0:4] + np.array([0,0,-1,0])*VEL*TIME_GAP
        elif arg_action1 == 3:
            self.n_state[0:4] = arg_state[0:4] + np.array([0,0,1,0])*VEL*TIME_GAP      
        else:
            self.n_state[0:4] = arg_state[0:4] # stop   
            
        '''get the reward'''
        if np.linalg.norm((self.n_state[0:2]+self.n_state[2:4])/2-self.n_state[4:6])!=0:
            self.reward = (1/np.linalg.norm((self.n_state[0:2]+self.n_state[2:4])/2-self.n_state[4:6])-\
            1/np.linalg.norm((arg_state[0:2]+arg_state[2:4])/2-self.n_state[4:6]))*100
        if np.linalg.norm(self.n_state[0:2]-self.n_state[6:8])<1 or np.linalg.norm(self.n_state[2:4]-self.n_state[6:8])<1:
            self.reward = self.reward-1 # collision
        if np.linalg.norm(self.n_state[0:2]-self.n_state[8:10])<1 or np.linalg.norm(self.n_state[2:4]-self.n_state[8:10])<1:
            self.reward = self.reward-1 # collision
        if np.linalg.norm((self.n_state[0:2]+self.n_state[2:4])/2-self.n_state[4:6])<1 and np.linalg.norm(self.n_state[0:2]-self.n_state[2:4])<3: # approximately set condition
            self.reward = self.reward + 1000 # achieve goal
            self._fail = True
        if np.linalg.norm(self.n_state[0:2]-self.n_state[2:4])>2.5:
            self.reward = self.reward-np.linalg.norm(self.n_state[0:2]-self.n_state[2:4])*2 # drop the object
            #self._fail = True     
        return self.n_state, self.reward, self._fail
    
#env = new_env() 
#state, g_pos, o_pos, o_size = env.create_env() # set the enviornment
DISCOUNT_RATE = 0.98
REPLAY_MEMORY = 10000
BATCH_SIZE = 50
MAX_EPI = 500
MAX_STEP = 500

# minimum epsilon for epsilon greedy
MIN_E = 0.1
# epsilon will be `MIN_E` at `EPSILON_DECAYING_STEP`
EPSILON_DECAYING_EPI = MAX_EPI * 0.2
TARGET_UPDATE_FQ = 100

def train_minibatch(mainDQN, targetDQN, minibatch):
    state_array = np.array([x[0] for x in minibatch])
    action_array = np.array([x[1] for x in minibatch]) # [ x among 0~24] * BATCH_SIZE
    reward_array = np.array([x[2] for x in minibatch])
    n_state_array = np.array([x[3] for x in minibatch]) # [[1,2,3,4][1,2,3,4]...as much as BATCH_SIZE NUMBER]
    _fail_array = np.array([x[4] for x in minibatch])
    
    
    X_batch = state_array   
    Y_batch = mainDQN.predict(state_array) # 25 elements * BATCH_SIZE 
    
    # consideration for action constraint 
    target_q = targetDQN.predict(n_state_array) # [[1 ...25][1...25]...batch_size]
    j = 0
    for x in n_state_array:        
        t_dqn = targetDQN.predict(x) #[[1 2 3 ...]]
        t_dqn = t_dqn.flatten() # [1 2 3 ...]
        if x[0]<TIME_GAP*VEL:
            for i in range(ACTION_NUM):
                t_dqn[2*ACTION_NUM+i] = -float("inf") # put a large num on action 2(left)
        if x[1]<TIME_GAP*VEL:
            for i in range(ACTION_NUM):
                t_dqn[1*ACTION_NUM+i] = -float("inf") # put a large num on action 1(down)
        if x[0] > MAP_SIZE - TIME_GAP*VEL:
            for i in range(ACTION_NUM):
                t_dqn[3*ACTION_NUM+i] = -float("inf") # remove action 3(right)
        if x[1] > MAP_SIZE - TIME_GAP*VEL:
            for i in range(ACTION_NUM):
                t_dqn[0*ACTION_NUM+i] = -float("inf") # remove action 0(up)  
        if x[2]<TIME_GAP*VEL:
            for i in range(ACTION_NUM):
                t_dqn[i*ACTION_NUM+2] = -float("inf")# put a large num on action 2(left)
        if x[3]<TIME_GAP*VEL:
            for i in range(ACTION_NUM):
                t_dqn[i*ACTION_NUM+1] = -float("inf") # put a large num on action 1(down)
        if x[2] > MAP_SIZE - TIME_GAP*VEL:
            for i in range(ACTION_NUM):
                t_dqn[i*ACTION_NUM+3] = -float("inf")# remove action 3(right)
        if x[3] > MAP_SIZE - TIME_GAP*VEL:
            for i in range(ACTION_NUM):
                t_dqn[i*ACTION_NUM+0] = -float("inf") # remove action 0(up)    
        target_q[j] = t_dqn
        j += 1
        
    Q_target = reward_array + DISCOUNT_RATE*np.max(target_q, axis=1)*~_fail_array # if fail, Q = reward
    
    Y_batch[np.arange(len(X_batch)), action_array] = Q_target
    
    # Train
    cost_batch, _ = mainDQN.update(X_batch, Y_batch)
    return cost_batch

def get_copy_var_ops(dest_scope_name = "target", src_scope_name = "main"):
    op_holder = []
    
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)

    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))

    return op_holder


def main():
    replay_buffer = deque(maxlen=REPLAY_MEMORY) # detract element from both sides    
    total_reward_buffer = []
    step_buffer = []
    avg_q_value = []
    new_graph = tf.Graph()
    with tf.Session(graph=new_graph) as sess:
        mainDQN = dqn_cooperation.DQN(sess, INPUT_SIZE, OUTPUT_SIZE, name = "main")
        mainDQN.build_network(32,64,0.005)
        targetDQN = dqn_cooperation.DQN(sess, INPUT_SIZE, OUTPUT_SIZE, name = "target")
        targetDQN.build_network(32,64,0.005)
        init = tf.global_variables_initializer()
        sess.run(init)
        #restore model
        
        new_saver = tf.train.import_meta_graph("./dqn_multi_reward_easy_500.ckpt.meta")        
        new_saver.restore(sess,"./dqn_multi_reward_easy_500.ckpt")
        
        # initial copy main q -> target q
        copy_ops = get_copy_var_ops(dest_scope_name = "target", src_scope_name = "main")
        sess.run(copy_ops)
        
        reward_accum_last100 = 0
        reward_sum = 0
        
        game = ENV(MAP_SIZE, [2,2], [3,4], [2.,3.,2.,4.], [5,5])
        game.render_env()
        
        for episode in range(MAX_EPI):
            
            '''
            if episode < 100:
                e = 0.3
            e = 0.05
            '''
            _fail = False
            step_count = 0 # how many moves included in an episode
            env1 = new_env()
            state, _= env1.create_env() # get only state            
            reward_sum = 0      
            goal_ = True
            max_qlist = []
            while not _fail:
                e = annealing_epsilon(episode, MIN_E, 1.0, EPSILON_DECAYING_EPI)
                # after sufficient learning, we present the game scene
                if episode > MAX_EPI:
                    game.update(state)
                    game.render_env()
                    
                step_count += 1
                if np.random.rand()< e:
                    act_candi1 = range(ACTION_NUM)
                    act_candi2 = range(ACTION_NUM)
                    if state[0]<TIME_GAP*VEL:
                        act_candi1.remove(2) # remove action 2(left)
                    if state[1]<TIME_GAP*VEL:
                        act_candi1.remove(1) # remove action 1(down)
                    if state[0] > MAP_SIZE - TIME_GAP*VEL:
                        act_candi1.remove(3) # remove action 3(right)
                    if state[1] > MAP_SIZE - TIME_GAP*VEL:
                        act_candi1.remove(0) # remove action 0(up)    
                    if state[2]<TIME_GAP*VEL:
                        act_candi2.remove(2) # remove action 2(left)
                    if state[3]<TIME_GAP*VEL:
                        act_candi2.remove(1) # remove action 1(down)
                    if state[2] > MAP_SIZE - TIME_GAP*VEL:
                        act_candi2.remove(3) # remove action 3(right)
                    if state[3] > MAP_SIZE - TIME_GAP*VEL:
                        act_candi2.remove(0) # remove action 0(up)    
                    act_candi1_ = np.array(act_candi1)
                    act_candi2_ = np.array(act_candi2)

                    action_l1 = random.sample(act_candi1, 1) # choose up, down, left, right, stop for agent 1
                    action_l2 = random.sample(act_candi2, 1) # for agent 2
                    action = action_l1[0]*(ACTION_NUM) + action_l2[0] # convert to index
                else:
                    act_candi = mainDQN.predict(state) # [[1 2 3 ... as much as OUTPUT_SIZE]]
                    act_candi = act_candi.flatten()

                    if state[0]<TIME_GAP*VEL:
                        for i in range(ACTION_NUM):
                            act_candi[2*ACTION_NUM+i] = -float("inf") # put a large num on action 2(left)
                    if state[1]<TIME_GAP*VEL:
                        for i in range(ACTION_NUM):
                            act_candi[1*ACTION_NUM+i] = -float("inf") # put a large num on action 1(down)
                    if state[0] > MAP_SIZE - TIME_GAP*VEL:
                        for i in range(ACTION_NUM):
                            act_candi[3*ACTION_NUM+i] = -float("inf") # remove action 3(right)
                    if state[1] > MAP_SIZE - TIME_GAP*VEL:
                        for i in range(ACTION_NUM):
                            act_candi[0*ACTION_NUM+i] = -float("inf") # remove action 0(up)  
                    if state[2]<TIME_GAP*VEL:
                        for i in range(ACTION_NUM):
                            act_candi[i*ACTION_NUM+2] = -float("inf")# put a large num on action 2(left)
                    if state[3]<TIME_GAP*VEL:
                        for i in range(ACTION_NUM):
                            act_candi[i*ACTION_NUM+1] = -float("inf") # put a large num on action 1(down)
                    if state[2] > MAP_SIZE - TIME_GAP*VEL:
                        for i in range(ACTION_NUM):
                            act_candi[i*ACTION_NUM+3] = -float("inf")# remove action 3(right)
                    if state[3] > MAP_SIZE - TIME_GAP*VEL:
                        for i in range(ACTION_NUM):
                            act_candi[i*ACTION_NUM+0] = -float("inf") # remove action 0(up)    

                    action = np.argmax(act_candi)   
                    '''
                    dd_predict = mainDQN.predict(state).flatten()
                    aa = np.max(dd_predict)
                    max_indx, = np.where(dd_predict==aa)                    
                    action = random.sample(max_indx,1)[0]
                    '''
                
                n_state, reward, _fail = env1.next_step(state, action) # have to input the action 
                # if count >30, stop that episode and start new episode
                if step_count >MAX_STEP-1:
                    #reward = -30
                    _fail = True
                    goal_ = False
                    
                reward_sum += DISCOUNT_RATE**step_count * reward    # sum total reward and penalty about long time(-0.5)     
                
                replay_buffer.append((state, action, reward, n_state, _fail)) #resolve the correlation                
                if _fail == True and goal_ == True:
                    success_tuple = (state, action, reward, n_state, _fail)
                state = n_state
                
                q_values = mainDQN.predict(state) # [[1 2 3 ... as much as OUTPUT_SIZE]]
                q_values = q_values.flatten()
                max_q = np.max(np.array(q_values))
                max_qlist.append(max_q)
                # train minibatch of main Q-NET and update the  Q-network from main Q-NET
                if len(replay_buffer)>BATCH_SIZE*3:
                    minibatch = random.sample(replay_buffer, BATCH_SIZE) 
                    train_minibatch(mainDQN, targetDQN, minibatch) # training number = step number
                if step_count % TARGET_UPDATE_FQ == 0:
                    sess.run(copy_ops)
            
            avg_q_value.append(np.mean(max_qlist))
            total_reward_buffer.append(reward_sum)  
            step_buffer.append(step_count)
            if goal_ == True:       
                print("[Episode {:>5}]  total reward: {:>5}    steps: {:>5}    q:{:>5}, success".format(episode, reward_sum, step_count, np.mean(max_qlist)))
            else:
                print("[Episode {:>5}]  total reward: {:>5}     steps: {:>5}    q:{:>5}, failure".format(episode, reward_sum, step_count, np.mean(max_qlist)))
        #print("Success ratio: {}".format(reward_accum_last100/100))
        fig1 =plt.figure()
        plt.plot(range(MAX_EPI), total_reward_buffer)
        plt.show()
        
        # save model  
        #new_saver = tf.train.Saver()
        save_path = new_saver.save(sess, "./dqn_multi_reward_moderate_500.ckpt")   
        # save data (reward, step)
        f = open("multi_reward_moderate_500.txt", 'w')
        for i in range(len(total_reward_buffer)):
            f.write("{:>5}  {:>5}  {:>5}\n".format(total_reward_buffer[i], step_buffer[i], avg_q_value[i]))
        f.close
        
if __name__ == "__main__":
    main()    
    end = time.time()-start
    print(end)       



  from ._conv import register_converters as _register_converters


INFO:tensorflow:Restoring parameters from ./dqn_multi_reward_easy_500.ckpt


<IPython.core.display.Javascript object>

[Episode     0]  total reward: -419.744626502     steps:   500    q:-11.9557304382, failure
[Episode     1]  total reward: -67.0402747168     steps:   500    q:-28.2305870056, failure
[Episode     2]  total reward: -330.717790742     steps:   500    q:-26.8924407959, failure
[Episode     3]  total reward: -136.813156577     steps:   500    q:-44.4332199097, failure
[Episode     4]  total reward: -117.113903241     steps:   500    q:-64.8000030518, failure
[Episode     5]  total reward: -64.343982006     steps:   500    q:-53.1013298035, failure
[Episode     6]  total reward: -346.284768105     steps:   500    q:-90.561668396, failure
[Episode     7]  total reward: -309.432172739    steps:   463    q:-79.864151001, success
[Episode     8]  total reward: -162.782653411     steps:   500    q:-64.3104171753, failure
[Episode     9]  total reward: -98.443486299     steps:   500    q:-70.1088943481, failure
[Episode    10]  total reward: -241.760195027     steps:   500    q:-80.8323974609, f

[Episode    90]  total reward: -392.173063676     steps:   500    q:-118.904579163, failure
[Episode    91]  total reward: -138.510663296     steps:   500    q:-95.3542785645, failure
[Episode    92]  total reward: -330.827598648     steps:   500    q:-93.028793335, failure
[Episode    93]  total reward: -529.552476873     steps:   500    q:-75.7586212158, failure
[Episode    94]  total reward: -402.290384006     steps:   500    q:-138.994827271, failure
[Episode    95]  total reward: -622.676494489     steps:   500    q:-123.739906311, failure
[Episode    96]  total reward: -435.72183739     steps:   500    q:-92.6995544434, failure
[Episode    97]  total reward: -303.462160313     steps:   500    q:-87.3437271118, failure
[Episode    98]  total reward: -428.236424026     steps:   500    q:-70.2235717773, failure
[Episode    99]  total reward: -440.825546691     steps:   500    q:-98.8034973145, failure
[Episode   100]  total reward: -306.56857408     steps:   500    q:-97.3634109497,

[Episode   180]  total reward: -320.008978081     steps:   500    q:-101.043563843, failure
[Episode   181]  total reward: -407.962235421     steps:   500    q:-147.774368286, failure
[Episode   182]  total reward: -522.677456628     steps:   500    q:-152.371139526, failure
[Episode   183]  total reward: -429.538529611     steps:   500    q:-163.859313965, failure
[Episode   184]  total reward: -550.869534409     steps:   500    q:-156.170150757, failure
[Episode   185]  total reward: -551.464277762     steps:   500    q:-195.62840271, failure
[Episode   186]  total reward: -366.255644196     steps:   500    q:-177.950042725, failure
[Episode   187]  total reward: -282.054227039     steps:   500    q:-112.253753662, failure
[Episode   188]  total reward: -567.398062353     steps:   500    q:-209.266555786, failure
[Episode   189]  total reward: -361.959073846     steps:   500    q:-154.42755127, failure
[Episode   190]  total reward: -318.47543107     steps:   500    q:-168.627639771,

[Episode   270]  total reward: 745.003636627    steps:    14    q:311.657928467, success
[Episode   271]  total reward: 34.2780772605    steps:    88    q:201.088973999, success
[Episode   272]  total reward: -155.557985831    steps:   294    q:-26.8141441345, success
[Episode   273]  total reward: 701.935038554    steps:    17    q:370.459625244, success
[Episode   274]  total reward: 643.88790836    steps:    20    q:293.881652832, success
[Episode   275]  total reward: 759.771537299    steps:    13    q:264.90814209, success
[Episode   276]  total reward: -264.985341956     steps:   500    q:-53.3384819031, failure
[Episode   277]  total reward: 814.486219523    steps:    10    q:457.897705078, success
[Episode   278]  total reward: -440.553361266    steps:   430    q:-152.910247803, success
[Episode   279]  total reward: -426.168495641     steps:   500    q:-106.005310059, failure
[Episode   280]  total reward: 447.778828364    steps:    31    q:394.6902771, success
[Episode   281]

[Episode   362]  total reward: -460.049973502     steps:   500    q:-111.42527771, failure
[Episode   363]  total reward: -599.951678976     steps:   500    q:-32.0063819885, failure
[Episode   364]  total reward: -310.359291577    steps:   195    q:174.060119629, success
[Episode   365]  total reward: -123.231794551    steps:   184    q:17.9128360748, success
[Episode   366]  total reward: -359.506917757    steps:   127    q:157.77116394, success
[Episode   367]  total reward: -353.895650418     steps:   500    q:-126.830474854, failure
[Episode   368]  total reward: -362.089541812     steps:   500    q:-106.32447052, failure
[Episode   369]  total reward: -280.743415585     steps:   500    q:-93.1070785522, failure
[Episode   370]  total reward: -376.02713769     steps:   500    q:22.1578598022, failure
[Episode   371]  total reward: -526.23851549     steps:   500    q:57.1808433533, failure
[Episode   372]  total reward: -143.755584835     steps:   500    q:-97.9694290161, failure
[

[Episode   453]  total reward: -102.167384247     steps:   500    q:-56.7544937134, failure
[Episode   454]  total reward: -76.1150785379     steps:   500    q:-69.1335449219, failure
[Episode   455]  total reward: -211.545991232     steps:   500    q:-77.1495437622, failure
[Episode   456]  total reward: -399.402245038     steps:   500    q:-27.2617263794, failure
[Episode   457]  total reward: -256.164342291     steps:   500    q:-43.0661087036, failure
[Episode   458]  total reward: -390.886819111     steps:   500    q:-126.803688049, failure
[Episode   459]  total reward: -440.532934348     steps:   500    q:-66.7080230713, failure
[Episode   460]  total reward: -285.915468306     steps:   500    q:-36.9842224121, failure
[Episode   461]  total reward: -420.754289555     steps:   500    q:-43.3149147034, failure
[Episode   462]  total reward: -77.2404040191     steps:   500    q:46.264087677, failure
[Episode   463]  total reward: -349.532457505     steps:   500    q:-0.12300781160

<IPython.core.display.Javascript object>

4830.51966095


In [3]:
'''
# two plot (reward, step)

file = open('multi_reward_easy_500.txt', 'r')    # hello.txt 파일을 읽기 모드(r)로 열기. 파일 객체 반환
s = file.read().split( )                  # 파일에서 문자열 읽기
reward = []
step = []
q = []
for i in range(0,len(s),3):
    reward.append(float(s[i]))
    step.append(int(s[i+1]))
    q.append(float(s[i+2]))
                         # Hello, world!
file.close()                     # 파일 객체 닫기

fig1 =plt.figure()
plt.plot(range(len(reward)),reward, lw =0.6)
fig2 =plt.figure()
plt.plot(range(len(step)),step, lw=0.3)
fig3 =plt.figure()
plt.plot(range(len(q)),q, lw=0.6)
plt.show()

SyntaxError: EOF while scanning triple-quoted string literal (<ipython-input-3-0faa2b419ff3>, line 22)