In [None]:
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import scipy.misc
import itertools
import os
from IPython import display
import time
import math
%matplotlib inline

In [None]:
class gameOb():
    """A single square entity placed on the grid (for example the hero, the goal or a fire).

    Attributes:
        x, y: grid position, read from coordinates[0] and coordinates[1].
        size: side length, in grid cells, used when the object is drawn.
        intensity: value written into the image when the object is drawn.
        channel: index of the image channel the object is drawn into.
        reward: reward associated with the object (None for the hero).
        name: label used to tell objects apart, e.g. 'hero', 'goal', 'fire'.
    """
    def __init__(self,coordinates,size,intensity,channel,reward,name):
        # Position first, then the drawing attributes, then the game attributes.
        self.x, self.y = coordinates[0], coordinates[1]
        self.size, self.intensity, self.channel = size, intensity, channel
        self.reward, self.name = reward, name
        
class gameEnv():
    """Grid world with one hero, one goal and two fires, observed as an 84x84x3 image.

    The hero is kept at self.objects[0]. Landing on the goal yields +10 and
    landing on a fire yields -2; either ends the episode. An action that does
    not move the hero (it is blocked by a wall) costs 1 and also ends the episode.
    """
    def __init__(self,partial,size):
        """Create a size x size grid and place the initial objects.

        Args:
            partial: if True, renderEnv crops the view to the 3x3 window around the hero.
            size: number of cells along each side of the grid.
        """
        self.sizeX = size
        self.sizeY = size
        self.actions = 4
        self.objects = []
        self.partial = partial
        self.a = self.reset()

    def reset(self):
        """Discard all objects, place a fresh hero, goal and two fires, and return the rendered state."""
        self.objects = []
        # Each object is appended before the next position is drawn so that
        # newPosition() can exclude the cells that are already taken.
        hero = gameOb(self.newPosition(),1,1,2,None,'hero')
        self.objects.append(hero)
        bug = gameOb(self.newPosition(),1,1,1,10,'goal')
        self.objects.append(bug)
        hole = gameOb(self.newPosition(),1,1,0,-2,'fire')
        self.objects.append(hole)
        hole2 = gameOb(self.newPosition(),1,1,0,-2,'fire')
        self.objects.append(hole2)
        self.a = self.renderEnv()
        return self.a

    def moveChar(self,direction):
        """Move the hero one cell and return the wall penalty.

        Args:
            direction: 0 - up, 1 - down, 2 - left, 3 - right.

        Returns:
            1 if the hero did not move (the move would leave the grid or the
            direction is not one of the four above), otherwise 0.0.
        """
        hero = self.objects[0]
        startX, startY = hero.x, hero.y
        if direction == 0 and hero.y >= 1:
            hero.y -= 1
        if direction == 1 and hero.y <= self.sizeY-2:
            hero.y += 1
        if direction == 2 and hero.x >= 1:
            hero.x -= 1
        if direction == 3 and hero.x <= self.sizeX-2:
            hero.x += 1
        self.objects[0] = hero
        if hero.x == startX and hero.y == startY:
            return 1
        return 0.00

    def newPosition(self):
        """Return a uniformly chosen (x, y) cell that no current object occupies.

        Raises:
            ValueError: if every cell of the grid is already occupied.
        """
        occupied = set((obj.x, obj.y) for obj in self.objects)
        # itertools.product keeps the x-major ordering of candidate cells.
        points = [p for p in itertools.product(range(self.sizeX), range(self.sizeY))
                  if p not in occupied]
        if not points:
            raise ValueError('no free cell left on the grid')
        location = np.random.choice(range(len(points)),replace=False)
        return points[location]

    def checkGoal(self):
        """Check whether the hero stands on another object.

        When it does, that object is removed and a new object of the same kind
        (goal or fire) is placed on a free cell.

        Returns:
            (reward, done): the touched object's reward and True, or
            (0.0, False) when the hero touches nothing.
        """
        hero = None
        others = []
        for obj in self.objects:
            if obj.name == 'hero':
                hero = obj
            else:
                others.append(obj)
        for other in others:
            if hero.x == other.x and hero.y == other.y:
                self.objects.remove(other)
                # Decide by name: the goal's reward is 10, so the former
                # `reward == 1` test never matched and a consumed goal was
                # respawned as a fire.
                if other.name == 'goal':
                    self.objects.append(gameOb(self.newPosition(),1,1,1,10,'goal'))
                else:
                    self.objects.append(gameOb(self.newPosition(),1,1,0,-2,'fire'))
                return other.reward,True
        return 0.0,False

    def renderEnv(self):
        """Draw the grid into self.a as an 84x84x3 image and return it."""
        # One-cell border of ones around a zeroed playing field.
        self.a = np.ones([self.sizeY+2,self.sizeX+2,3])
        self.a[1:-1,1:-1,:] = 0
        hero = None
        for item in self.objects:
            # +1 offsets account for the border.
            self.a[item.y+1:item.y+item.size+1,item.x+1:item.x+item.size+1,item.channel] = item.intensity
            if item.name == 'hero':
                hero = item
        if self.partial == True:
            # In bordered coordinates the hero sits at (y+1, x+1), so this
            # 3x3 window is centred on it.
            self.a = self.a[hero.y:hero.y+3,hero.x:hero.x+3,:]
        # Each channel is resized separately with nearest-neighbour interpolation.
        # NOTE(review): scipy.misc.imresize is deprecated upstream and absent
        # from recent SciPy releases - confirm the installed version provides it.
        resized = [scipy.misc.imresize(self.a[:,:,ch],[84,84,1],interp='nearest')
                   for ch in range(3)]
        self.a = np.stack(resized,axis=2)
        return self.a

    def step(self,action):
        """Apply one action.

        Returns:
            (state, reward, done): the rendered image, the object reward minus
            the wall penalty, and whether the episode ended (an object was
            touched or the hero failed to move).
        """
        penalty = self.moveChar(action)
        reward,done = self.checkGoal()
        if(penalty != 0):
            done = True
        state = self.renderEnv()
        return state,(reward-penalty),done

In [None]:
# Shared, fully observable 7x7 environment; Qnetwork reads env.actions from this global.
env = gameEnv(size=7, partial=False)

In [None]:
class Qnetwork():
    """Dueling Q-network: four convolutions followed by separate advantage and value streams.

    The number of actions is read from the module-level `env` (env.actions).

    Args:
        h_size: number of output channels of the last convolution. It is split
            evenly between the advantage and value streams, so it must be even.
        prev_states: accepted for interface compatibility; not used when
            building the graph.
    """
    def __init__(self,h_size,prev_states):
        #The network receives a batch of 84x84x3 frames from the game
        #and processes it through four convolutional layers.
        self.imageIn = tf.placeholder(shape=[None,84,84,3],dtype=tf.float32)
        self.conv1 = tf.contrib.layers.convolution2d( \
            inputs=self.imageIn,num_outputs=32,kernel_size=[8,8],stride=[4,4],padding='VALID', biases_initializer=None)
        self.conv2 = tf.contrib.layers.convolution2d( \
            inputs=self.conv1,num_outputs=64,kernel_size=[4,4],stride=[2,2],padding='VALID', biases_initializer=None)
        self.conv3 = tf.contrib.layers.convolution2d( \
            inputs=self.conv2,num_outputs=64,kernel_size=[3,3],stride=[1,1],padding='VALID', biases_initializer=None)
        #The last layer emits h_size channels (previously hard-coded to 512) so that
        #each half matches the [h_size//2, ...] weight matrices defined below.
        self.conv4 = tf.contrib.layers.convolution2d( \
            inputs=self.conv3,num_outputs=h_size,kernel_size=[7,7],stride=[1,1],padding='VALID', biases_initializer=None)
        
        #We take the output from the final convolutional layer and split it into separate advantage and value streams.
        self.streamAC,self.streamVC = tf.split(3,2,self.conv4)
        self.streamA = tf.contrib.layers.flatten(self.streamAC)
        self.streamV = tf.contrib.layers.flatten(self.streamVC)
        self.AW = tf.Variable(tf.random_normal([h_size//2,env.actions]))
        self.VW = tf.Variable(tf.random_normal([h_size//2,1]))
        self.Advantage = tf.matmul(self.streamA,self.AW)
        self.Value = tf.matmul(self.streamV,self.VW)
        
        #Then combine them together to get our final Q-values:
        #Q = V + (A - mean of A over the actions).
        self.Qout = self.Value + tf.sub(self.Advantage,tf.reduce_mean(self.Advantage,reduction_indices=1,keep_dims=True))
        self.predict = tf.argmax(self.Qout,1)
        
        #Below we obtain the loss as the mean squared difference between the target and predicted Q values.
        self.targetQ = tf.placeholder(shape=[None],dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions,env.actions,dtype=tf.float32)
        
        #The one-hot mask selects the Q-value of the action that was actually taken.
        self.Q = tf.reduce_sum(tf.mul(self.Qout, self.actions_onehot), reduction_indices=1)
        
        self.td_error = tf.square(self.targetQ - self.Q)
        self.loss = tf.reduce_mean(self.td_error)
        self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)
        self.updateModel = self.trainer.minimize(self.loss)

In [None]:
class experience_buffer():
    """Bounded first-in-first-out store of experiences with random mini-batch sampling.

    Four parallel lists are kept: states, actions, rewards and dones.
    sample() reads self.states[i+1] as the successor of entry i, so the
    states list is expected to hold one more element than the other lists.
    """
    def __init__(self, buffer_size=50000):
        self.buffer_size = buffer_size
        self.states, self.actions, self.rewards, self.dones = [], [], [], []

    def add(self, states, actions, rewards, dones):
        """Append one entry to each list, dropping the oldest entry of each once
        the actions list has reached buffer_size."""
        if len(self.actions) == self.buffer_size:
            self.states, self.actions = self.states[1:], self.actions[1:]
            self.rewards, self.dones = self.rewards[1:], self.dones[1:]

        pairs = ((self.states, states), (self.actions, actions),
                 (self.rewards, rewards), (self.dones, dones))
        for store, item in pairs:
            store.append(item)

    def sample(self, size, previous_states):
        """Draw up to `size` distinct entries in random order.

        Args:
            size: maximum number of entries to return.
            previous_states: number of consecutive stored states, ending at the
                sampled index, that are depth-stacked to form each state.

        Returns:
            (states, actions, rewards, states_, dones) as five lists, where
            states_ holds the single stored state following each sampled index.
        """
        offset = previous_states - 1
        # Indices below `offset` are skipped: they lack enough history to stack.
        order = np.random.permutation(len(self.actions) - offset) + offset
        chosen = order[:size]

        states = [np.dstack([self.states[i - offset + j] for j in range(previous_states)])
                  for i in chosen]
        actions = [self.actions[i] for i in chosen]
        rewards = [self.rewards[i] for i in chosen]
        states_ = [self.states[i + 1] for i in chosen]
        dones = [self.dones[i] for i in chosen]

        return states, actions, rewards, states_, dones

In [None]:
def updateTargetGraph(tfVars,tau):
    """Build the assign ops that blend the second half of tfVars toward the first half.

    The variable at position i in the first half is paired with the one at
    position i in the second half, and the latter is assigned
    tau * first + (1 - tau) * second. With an odd number of variables the last
    one is left unpaired.

    Returns:
        A list with one assign op per pair.
    """
    half = len(tfVars)//2
    return [
        target.assign((source.value()*tau) + ((1-tau)*target.value()))
        for source, target in zip(tfVars[0:half], tfVars[half:])
    ]

def updateTarget(op_holder,sess):
    """Run every op in op_holder, one sess.run call per op, in list order."""
    for assign_op in op_holder:
        sess.run(assign_op)

In [None]:
# --- Training configuration ---
batch_size = 32                 # Experiences drawn from the buffer for each training step.
update_freq = 8                 # A training step runs every `update_freq` environment steps.
y = 0.99                        # Discount factor applied to the target Q-values.
startE = 1                      # Initial value of the exploration rate e.
endE = 0.1                      # Value at which e stops being decreased.
anneling_steps = 75000          # Number of training steps over which e goes from startE to endE.
num_episodes = 100003           # Exclusive upper bound of the episode counter (the loop starts at 1).
pre_train_steps = 20000         # Environment steps taken before any training or annealing starts.
pre_train_steps_from_Q = False  # If true, initialize buffer with steps from Q instead of random actions.
max_epLength = 20               # Maximum number of steps in one episode.
load_model = False              # Whether to restore a saved model before training.
path = "./dqn/save_data/gridWorld/seven_by_seven/"  # Directory where checkpoints and statistics are written.
h_size = 512                    # Size of the final convolutional layer before the Advantage/Value split.
tau = 1e-4                      # Rate at which the target network moves toward the primary network.
previous_states = 1             # Number of stored frames stacked into one sampled state.

In [None]:
def distance(x1, y1, x2, y2):
    """Return the Euclidean distance between the points (x1, y1) and (x2, y2)."""
    dx = x2 - x1
    dy = y2 - y1
    return math.sqrt(dx**2 + dy**2)

In [None]:
# Build the primary and target networks in a fresh graph and train the primary
# network with Double-DQN targets computed from the target network.
tf.reset_default_graph()
mainQN = Qnetwork(h_size,previous_states)
targetQN = Qnetwork(h_size,previous_states)
t_start = time.time()

init = tf.initialize_all_variables()

saver = tf.train.Saver()

trainables = tf.trainable_variables()

#Ops that move the second half of the trainable variables (targetQN, built second)
#toward the first half (mainQN) at rate tau.
targetOps = updateTargetGraph(trainables,tau)

myBuffer = experience_buffer()

#Set the rate of random action decrease. 
e = startE
stepDrop = (startE - endE)/anneling_steps

#arrays to save
eps_arr = []
time_arr = []
err_arr = []
annel_arr = []

#create lists to contain total rewards and steps per episode
step_list = []
reward_list = []
total_steps = 0
hero_x = 0
hero_y = 0
least_distance = 100

#Make a path for our model to be saved in.
if not os.path.exists(path):
    os.makedirs(path)

with tf.Session() as sess:
    #Initialize first and restore afterwards: running the initializer after
    #saver.restore would overwrite the restored weights.
    sess.run(init)
    if load_model == True:
        print('Loading Model...')
        load = './dqn/awjuliani/algorithm_train1/model-70000.cptk'
        ckpt = tf.train.get_checkpoint_state(path)
        saver.restore(sess,load)
    updateTarget(targetOps,sess) #Move the target network toward the primary network by a factor tau.
    for i in range(1, num_episodes):
        #Reset environment and get first new observation
        s = env.reset()
        #NOTE(review): only the very first reset frame is stored. myBuffer.sample()
        #uses states[i] as the state of transition i, so the first transition of every
        #later episode is paired with the last frame of the previous episode - confirm
        #this is intended.
        if(i==1):
            myBuffer.states.append(s)
        d = False
        reward_sum = 0
        step = 0
        #The Q-Network
        while step < max_epLength: #If the agent takes max_epLength moves without the episode ending, end the trial.
            step+=1
            #With chance e (and always during pre-training) take a scripted step toward
            #the nearest goal; otherwise act greedily from the Q-network.
            if np.random.rand(1) < e or total_steps < pre_train_steps:       
                goal_x = 0
                goal_y = 0
                a = None
                #Reset for every decision so the nearest goal is searched from scratch.
                least_distance = float('inf')
                for obj in env.objects:
                    #The hero is stored first in env.objects, so hero_x/hero_y are
                    #up to date by the time a goal is examined.
                    if(obj.name == 'hero'):
                        hero_x = obj.x
                        hero_y = obj.y
                    if(obj.name == 'goal'):
                        dist = distance(hero_x,hero_y,obj.x,obj.y)
                        if(dist<least_distance):
                            least_distance = dist
                            goal_x = obj.x
                            goal_y = obj.y

                            x_dir = hero_x - goal_x
                            y_dir = hero_y - goal_y

                            #Close the vertical gap first, then the horizontal one.
                            if(y_dir > 0):
                                a = 0
                            elif(y_dir < 0):
                                a = 1
                            elif(x_dir > 0):
                                a = 2
                            elif(x_dir < 0):
                                a = 3
                if a is None:
                    #No goal on the grid (or the hero already stands on it):
                    #fall back to a uniformly random action instead of reusing a stale one.
                    a = np.random.randint(0,env.actions)
            else:
                a = sess.run(mainQN.predict,feed_dict={mainQN.imageIn:[s]})[0]
            s1,r,d = env.step(a)
            total_steps += 1
            myBuffer.add(s1,a,r,d) #Save the experience to our episode buffer.

            if total_steps > pre_train_steps:
                if e > endE:
                    e -= stepDrop

                if total_steps % (update_freq) == 0:
                    states, actions, rewards, state_, done = myBuffer.sample(batch_size,previous_states) #Get a random batch of experiences.
                    #Below we perform the Double-DQN update to the target Q-values:
                    #the primary network picks the action, the target network scores it.
                    Q1 = sess.run(mainQN.predict,feed_dict={mainQN.imageIn:state_})
                    Q2 = sess.run(targetQN.Qout,feed_dict={targetQN.imageIn:state_})
                    end_multiplier = -(np.array(done) - 1) #0 for terminal transitions, 1 otherwise.
                    doubleQ = Q2[range(batch_size),Q1]
                    targetQ = rewards + (y*doubleQ * end_multiplier)
                    #Update the network with our target values.
                    _ = sess.run(mainQN.updateModel, \
                        feed_dict={mainQN.imageIn:states,mainQN.targetQ:targetQ, mainQN.actions:actions})

                    updateTarget(targetOps,sess) #Move the target network toward the primary network by a factor tau.
            reward_sum += r
            s = s1

            if d == True:

                break

        #Record the length and total reward of this episode.
        step_list.append(step)
        reward_list.append(reward_sum)
        #Periodically log progress.
        if i % 1000 == 0:
            print(str(i), " -- avg steps ",np.mean(step_list[-1000:]), " -- %complete ", (len([k for k in reward_list[-1000:] if k > 0]) / len(reward_list[-1000:])), ', anneling -- ', e)
            eps_arr.append(i)
            time_arr.append(time.time() - t_start)
            err_arr.append((len([k for k in reward_list[-100:] if k > 0]) / len(reward_list[-100:])))
            annel_arr.append(e)
        #Periodically save the model. 
        if i % 10000 == 0:
            saver.save(sess,path+'model-'+str(i)+'.cptk')
            np.savez(path+'data' + str(i)+ '.npz', episode=eps_arr, time=time_arr, error = err_arr, anneling = annel_arr)
            print("Saved Model @ ", path+'model-'+str(i)+'.cptk')
            
    saver.save(sess,path+'model-'+str(i)+'.cptk')
    np.savez(path+'data' + str(i)+ '.npz', episode=eps_arr, time=time_arr, error = err_arr, anneling = annel_arr)
#An episode counts as successful when its total reward is positive, matching the
#periodic log above. The previous expression divided the summed reward by
#num_episodes, which is not a percentage.
successful = len([k for k in reward_list if k > 0])
print("Percent of succesful episodes: " + str(100.0*successful/max(len(reward_list),1)) + "%")

In [None]:
# Evaluate a saved model for 20 episodes on a 5x5 grid and report the share of
# episodes that finished with a positive total reward.
directions = ['up', 'down', 'left', 'right']
arr = []

# Start from an empty graph, as the training cell does. Building the networks on
# top of an existing graph would give their variables different names, which
# presumably no longer match the ones stored in the checkpoint - TODO confirm
# against the checkpoint being loaded.
tf.reset_default_graph()
mainQN = Qnetwork(h_size, previous_states)
targetQN = Qnetwork(h_size, previous_states)

init = tf.initialize_all_variables()
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(init)
    load = './dqn/save_data/gridWorld/one_go1/model-50000.cptk'
    saver.restore(sess, load)
    for i in range(20):
        complete = False
        # Named step_count rather than `iter` so the builtin is not shadowed
        # for the rest of the notebook.
        step_count = 0
        env = gameEnv(partial=False,size=5)
        s = env.reset()
        plt.imshow(env.a)
        r_all = 0
        while(complete == False):
            
            # predict returns one action per input frame; take the single entry,
            # as the training loop does, so env.step receives a scalar.
            action = sess.run(mainQN.predict, feed_dict={mainQN.imageIn:[s]})[0]
            s, reward, done = env.step(action)
            plt.imshow(env.a)
            r_all += reward
            # Cut the episode off if the agent keeps wandering.
            if(step_count > 20):
                done = True
            step_count += 1
            if(done):
                arr.append(r_all)
                complete = True
                print(step_count)
                print("Complete")
                print("reward: ", r_all)
            plt.show()

# Fraction of the (at most 100) most recent evaluation episodes with positive reward.
(len([k for k in arr[-100:] if k > 0]) / len(arr[-100:]))