In [1]:
import tensorflow as tf

from collections import deque

import numpy as np

%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

import random

  from ._conv import register_converters as _register_converters


In [2]:
# maze is a 2d Numpy array of floats between 0.0 to 1.0
# 1.0 corresponds to a free cell, and 0.0 an occupied cell
# rat = (row, col) initial rat position (defaults to (0,0))

class Qmaze(object):
    def __init__(self, maze, rat=(0,0)):
        self._maze = np.array(maze)
        nrows, ncols = self._maze.shape
        self.target = (nrows-1, ncols-1)   # target cell where the "cheese" is
        self.free_cells = [(r,c) for r in range(nrows) for c in range(ncols) if self._maze[r,c] == 1.0]
        self.free_cells.remove(self.target)
        if self._maze[self.target] == 0.0:
            raise Exception("Invalid maze: target cell cannot be blocked!")
        if not rat in self.free_cells:
            raise Exception("Invalid Rat Location: must sit on a free cell")
        self.reset(rat)

    def reset(self, rat):
        self.rat = rat
        self.maze = np.copy(self._maze)
        nrows, ncols = self.maze.shape
        row, col = rat
        self.maze[row, col] = rat_mark
        self.state = (row, col, 'start')
        self.min_reward = 2* self.maze.size
        self.total_reward = 0
        self.visited = set()

    def update_state(self, action):
        nrows, ncols = self.maze.shape
        nrow, ncol, nmode = rat_row, rat_col, mode = self.state

        if self.maze[rat_row, rat_col] > 0.0:
            self.visited.add((rat_row, rat_col))  # mark visited cell

        valid_actions = self.valid_actions()
                
        if not valid_actions:
            nmode = 'blocked'
        elif action in valid_actions:
            nmode = 'valid'
            if action == LEFT:
                ncol -= 1
            elif action == UP:
                nrow -= 1
            if action == RIGHT:
                ncol += 1
            elif action == DOWN:
                nrow += 1
        else:                  # invalid action, no change in rat position
            mode = 'invalid'

        # new state
        self.state = (nrow, ncol, nmode)

    def get_reward(self):
        rat_row, rat_col, mode = self.state
        nrows, ncols = self.maze.shape
        if rat_row == nrows-1 and rat_col == ncols-1:
            return 1
        return 0

    def act(self, action):
        self.update_state(action)
        reward = self.get_reward()
        self.total_reward += reward
        status = self.game_status()
        envstate = self.observe()
        return envstate, reward, status

    def observe(self):
        canvas = self.draw_env()
        envstate = canvas.reshape((1, -1))
        return envstate

    def draw_env(self):
        canvas = np.copy(self.maze)
        nrows, ncols = self.maze.shape
        # clear all visual marks
        for r in range(nrows):
            for c in range(ncols):
                if canvas[r,c] > 0.0:
                    canvas[r,c] = 1.0
        # draw the rat
        row, col, valid = self.state
        canvas[row, col] = rat_mark
        return canvas

    def game_status(self):
        rat_row, rat_col, mode = self.state
        nrows, ncols = self.maze.shape
        if rat_row == nrows-1 and rat_col == ncols-1:
            return 'win'

        return 'not_over'

    def valid_actions(self, cell=None):
        if cell is None:
            row, col, mode = self.state
        else:
            row, col = cell
        actions = [0, 1, 2, 3]
        nrows, ncols = self.maze.shape
        if row == 0:
            actions.remove(1)
        elif row == nrows-1:
            actions.remove(3)

        if col == 0:
            actions.remove(0)
        elif col == ncols-1:
            actions.remove(2)

        if row>0 and self.maze[row-1,col] == 0.0:
            actions.remove(1)
        if row<nrows-1 and self.maze[row+1,col] == 0.0:
            actions.remove(3)

        if col>0 and self.maze[row,col-1] == 0.0:
            actions.remove(0)
        if col<ncols-1 and self.maze[row,col+1] == 0.0:
            actions.remove(2)

        return actions

In [3]:
# Q = tf.constant([[1.,2.,3.],[2.,3.,1.]])
# a_new = tf.argmax(Q, axis=0)
# sess = tf.Session()
# sess.run(a_new)

In [4]:
# define neural net Q_\theta(s,a) as a class

class Qfunction(object):
    
    def __init__(self, obssize, actsize, sess, optimizer,lr=0.5, th = 1):
        """
        obssize: dimension of state space
        actsize: dimension of action space
        sess: sess to execute this Qfunction
        optimizer: 
        """
        # YOUR CODE HERE
        # build the prediction graph
        state = tf.placeholder(tf.float32, [None, obssize])
        
        #Construct a FNN with 1 hidden layer
        Layer1_nodes = 24
        W1 = tf.get_variable('W1',[obssize, Layer1_nodes],initializer = tf.truncated_normal_initializer(stddev=.1))
        b1 = tf.get_variable('b1',[Layer1_nodes],initializer = tf.truncated_normal_initializer(stddev=.1))
        h1 = tf.nn.relu(tf.matmul(state,W1) + b1)
        
#         Layer2_nodes = 24
#         W2 = tf.get_variable('W2',[Layer1_nodes, Layer2_nodes],initializer = tf.truncated_normal_initializer(stddev=.1))
#         b2 = tf.get_variable('b2',[Layer2_nodes],initializer = tf.truncated_normal_initializer(stddev=.1))
#         h2 = tf.nn.relu(tf.matmul(h1, W2) + b2)
        
        W2 = tf.get_variable('W2',[Layer1_nodes, actsize],initializer = tf.truncated_normal_initializer(stddev=.1))
        b2 = tf.get_variable('b2',[actsize],initializer = tf.truncated_normal_initializer(stddev=.1))
        h2 = tf.matmul(h1, W2) + b2        
        
        Qvalues = h2  # make sure it has size [None, actsize]
        
        # build the targets and actions
        # targets represent the terms E[r+gamma Q] in Bellman equations
        # actions represent a_t
        targets = tf.placeholder(tf.float32, [None])
        actions = tf.placeholder(tf.int32, [None])
        actions_one_hot = tf.one_hot(actions, actsize)
        Qpreds = tf.reduce_sum(tf.multiply(h2, actions_one_hot), axis=1) # make sure it has size [None]
        loss_function_1 = tf.reduce_mean(tf.square(Qpreds - targets))
        
        
#         a_new = tf.argmax(Qvalues, axis=1)
#         a_new_one_hot = tf.one_hot(a_new, actsize)
#         a_diff = a_new_one_hot - actions_one_hot
#         loss_function_2 = tf.norm(a_diff, ord = 'fro', axis = (0,1))
        
        
#         Q_olds = tf.placeholder(tf.float32, [None, actsize])
#         dist_new = tf.distributions.Categorical(probs = tf.nn.softmax(Qvalues))
#         dist_old = tf.distributions.Categorical(probs = tf.nn.softmax(Q_olds))
#         loss_function_2 = tf.reduce_sum(tf.distributions.kl_divergence(dist_new, dist_old))
        
#         self.lr = lr
#         self.th = th
        
#         l1 = tf.cast(tf.less(loss_function_2, self.th), tf.float32)
#         l2 = tf.cast(tf.greater(loss_function_2, self.th), tf.float32)
#         self.lr = self.lr *(1.01*l1+0.99*l2)
#         self.lr = 0.999
        
        loss = loss_function_1
        
        # optimization
        self.train_op = optimizer.minimize(loss)
        
        # some bookkeeping
        self.Qvalues = Qvalues
        self.state = state
        self.actions = actions
        self.targets = targets
        self.loss = loss
        self.sess = sess
#         self.a_new = a_new
#         self.Q_olds = Q_olds
        
        
    
    def compute_Qvalues(self, states):
        """
        """
        return self.sess.run(self.Qvalues, feed_dict={self.state: states})

    def train(self, states, actions, targets):
        """
        states: numpy array as input to compute loss (s)
        actions: numpy array as input to compute loss (a)
        targets: numpy array as input to compute loss (Q targets)
        """
        return self.sess.run([self.loss,self.train_op], feed_dict={self.state:states, self.actions:actions, self.targets:targets})

In [5]:
# Implement replay buffer
class ReplayBuffer(object):
    
    def __init__(self, maxlength):
        """
        maxlength: max number of tuples to store in the buffer
        if there are more tuples than maxlength, pop out the oldest tuples
        """
        self.buffer = deque()
        self.number = 0
        self.maxlength = maxlength
    
    def append(self, experience):
        """
        this function implements appending new experience tuple
        experience: a tuple of the form (s,a,r,s^\prime)
        """
        self.buffer.append(experience)
        self.number += 1
        
    def pop(self):
        """
        pop out the oldest tuples if self.number > self.maxlength
        """
        while self.number > self.maxlength:
            self.buffer.popleft()
            self.number -= 1
    
    def sample(self, batchsize):
        """
        this function samples 'batchsize' experience tuples
        batchsize: size of the minibatch to be sampled
        return: a list of tuples of form (s,a,r,s^\prime)
        """
        # YOUR CODE HERE
        if batchsize < self.number:
            minibatch = random.sample(self.buffer, batchsize) 
        else:
            minibatch = random.sample(self.buffer, self.number) 
        return minibatch  # need implementation

In [6]:
def build_target_update(from_scope, to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=to_scope)
    op = []
    for v1, v2 in zip(from_vars, to_vars):
        op.append(v2.assign(0.2*v1+0.8*v2))
    return op

In [7]:
# class state_matrix(object):
#     def __init__(self, obssize):
#         """
#         obssize: size of state
#         """
#         self.obssize = int(np.sqrt(obssize))
#         self.matrix = np.zeros((self.obssize, self.obssize))
#         self.matrix[0][0] = 1
    
#     def reward(self, state):
#         """
#         state: the state that is just visited
#         """
#         rat_row, rat_col, _ = state
#         if self.matrix[rat_row, rat_col] == 0:
#             self.matrix[rat_row, rat_col] = 1
#             return 1
#         else:
#             return 0
        

In [8]:
maze = np.array([
    [ 1.,  0.,  1.,  1.,  1.,  1.,  1.],
    [ 1.,  1.,  1.,  0.,  0.,  1.,  0.],
    [ 0.,  0.,  0.,  1.,  1.,  1.,  0.],
    [ 1.,  1.,  1.,  1.,  0.,  0.,  1.],
    [ 1.,  0.,  0.,  0.,  1.,  1.,  1.],
    [ 1.,  0.,  1.,  1.,  1.,  1.,  1.],
    [ 1.,  1.,  1.,  0.,  1.,  1.,  1.]
])
# maze = np.array([
#      [ 1.,  0.,  1.,  1.,  1.,  0.,  1.],
#      [ 1.,  1.,  1.,  1.,  0.,  1.,  1.],
#      [ 1.,  1.,  1.,  0.,  1.,  1.,  1.],
#      [ 0.,  1.,  1.,  1.,  1.,  1.,  0.],
#      [ 1.,  1.,  0.,  0.,  1.,  1.,  1.],
#      [ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
#      [ 1.,  1.,  1.,  1.,  0.,  1.,  1.]
# ])

# maze = np.array([
#     [ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
#     [ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
#     [ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
#     [ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
#     [ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
#     [ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
#     [ 1.,  1.,  1.,  1.,  1.,  1.,  1.]
# ])


visited_mark = 0.8  # Cells visited by the rat will be painted by gray 0.8
rat_mark = 0.5      # The current rat cell will be painteg by gray 0.5
LEFT = 0
UP = 1
RIGHT = 2
DOWN = 3

# Actions dictionary
actions_dict = {
    LEFT: 'left',
    UP: 'up',
    RIGHT: 'right',
    DOWN: 'down',
}

num_actions = len(actions_dict)

rat_cell = (0,0)

In [9]:
def show(qmaze, file_name = 'Maze'):
    plt.grid('on')
    nrows, ncols = qmaze.maze.shape
    ax = plt.gca()
    ax.set_xticks(np.arange(0.5, nrows, 1))
    ax.set_yticks(np.arange(0.5, ncols, 1))
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    canvas = np.copy(qmaze.maze)
    for row,col in qmaze.visited:
        canvas[row,col] = 0.6
    rat_row, rat_col, _ = qmaze.state
    canvas[rat_row, rat_col] = 0.3   # rat cell
    canvas[nrows-1, ncols-1] = 0.9 # cheese cell
    img = plt.imshow(canvas, interpolation='none', cmap='gray')
    plt.imshow(canvas, interpolation='none', cmap='gray')
    plt.savefig(file_name)
    return img

In [10]:
env = Qmaze(maze)
# show(env)

In [11]:
def process_obs(obs, maze_size = 7):
    obs = obs.reshape((maze_size, maze_size))
    for i in range(maze_size):
        for j in range(maze_size):
            if obs[i,j] == .5:
                return (i, j)

In [12]:
a = (2,2)
np.zeros((7,7))[a]

0.0

In [None]:
# parameter initializations
lr = 1e-3  # learning rate for gradient update
batchsize = 100  # batchsize for buffer sampling
maxlength = 10000  # max number of tuples held by buffer
maxstep = 100

tau = 100  # time steps for target update
episodes = 1000 # number of episodes to run
initialsize = 500  # initial time steps before start updating
epsilon_min = 0.01 # constant for exploration
epsilon_decay = 0.999
gamma = .99  # discount

reward_record = []
evaluation_record = []

# initialize environment
env = Qmaze(maze)
obssize = 49
actsize = num_actions

# initialize tensorflow session
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# optimizer
optimizer = tf.train.AdamOptimizer(lr)

# initialize networks
with tf.variable_scope("principal"):
    Qprincipal = Qfunction(obssize, actsize, sess, optimizer)
with tf.variable_scope("target"):
    Qtarget = Qfunction(obssize, actsize, sess, optimizer)

# build ops
update = build_target_update("principal", "target")  # call sess.run(update) to copy
                                                     # from principal to target

# initialization of graph and buffer
sess.run(tf.global_variables_initializer())
buffer = ReplayBuffer(maxlength)
sess.run(update)

# main iteration
# YOUR CODE HERE
counter = 0
epsilon =1.0
for e in range(episodes):
    state_mat = np.zeros((7,7))
    
    env.reset(rat_cell)
    done = 'not_over'
    rewardsum = 0
    epsilon = max(0.1,epsilon * 0.995)
    
    obs = env.observe()
    
    for _ in range(maxstep):
        valid_actions = env.valid_actions()
        if not valid_actions: break
        counter = counter + 1
        
        if np.random.rand() < epsilon:
            action = random.choice(valid_actions)
        else:
            values = Qprincipal.compute_Qvalues(obs)
            action = np.argmax(values)

        obs_, reward, done = env.act(action)
        
        # Store obs in state_matrix
        pro_obs = process_obs(obs)
        state_mat[pro_obs] = 1.0

        # Modefied the reward by state curiosity
        pro_obs_ = process_obs(obs_)
        if state_mat[pro_obs_] == 0.0:
            reward -= 0.04
        elif pro_obs == pro_obs_:
            reward -= 0.75
        else:
            reward -= 0.25
            
        rewardsum += reward
#         print('rewardsum: ', rewardsum)
        
        #Implement buffer replay to store memory
        experience = (obs, action, reward, obs_)
        buffer.append(experience)
        buffer.pop()
        
            
        if counter>initialsize and counter%5 == 0:
            #Sample from stored memory
            Samples = buffer.sample(batchsize)

            #Conpute target_i
            states = []
            actions = []
            targets = []
            for i in range(len(Samples)):
                s_current, action, r, s_next = Samples[i]
                Q_target = Qtarget.compute_Qvalues(s_next)
                states.extend(s_current)
                actions.append(action)
                target = r + gamma * np.max(Q_target)
                targets.append(target)

            #Compute empirical loss, update theta
            loss = Qprincipal.train(states, actions, targets)
            
        #Update target network
        if counter % tau == 0:
            sess.run(update)
        if done == 'win':
            break
            
        #Swap observation
        obs = obs_
    
    reward_record.append(rewardsum)
    
    print('Finished {} episode.reward={}.'.format(e+1,rewardsum))
    show(env, file_name = 'dive_maze/div_Maze'+'test_1_'+str(e+1))
    
    if e % 5 == 0:
        state_mat = np.zeros((7,7))
        
        env.reset(rat_cell)
        done = 'not_over'
        rewardsum = 0
        
        obs = env.observe()
        
        for _ in range(50):
            values = Qprincipal.compute_Qvalues(obs)
            action = np.argmax(values)
            obs_, reward, done = env.act(action)
            
            rewardsum += reward
            
            pro_obs = process_obs(obs)
            state_mat[pro_obs] = 1.0
            
            pro_obs_ = process_obs(obs_)
            if state_mat[pro_obs_] == 0.0:
                reward -= 0.04
            elif pro_obs == pro_obs_:
                reward -= 0.75
            else:
                reward -= 0.25

            
            if done == 'win':
                break
            obs = obs_
        evaluation_record.append(rewardsum)
        print('Evaluation reward={}.'.format(rewardsum))
        show(env, file_name = 'dive_maze_evaluation/div_Maze'+'test_1_'+str(e+1))
        
            

Finished 1 episode.reward=-25.37.
Evaluation reward=0.


Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.
  warn_deprecated("2.2", "Passing one of 'on', 'true', 'off', 'false' as a "


Finished 2 episode.reward=-22.27.
Finished 3 episode.reward=-23.819999999999997.
Finished 4 episode.reward=-22.77.
Finished 5 episode.reward=-24.45.
Finished 6 episode.reward=-23.9.
Evaluation reward=0.
Finished 7 episode.reward=-20.879999999999992.
Finished 8 episode.reward=-22.139999999999993.
Finished 9 episode.reward=-23.4.
Finished 10 episode.reward=-21.929999999999993.
Finished 11 episode.reward=-24.769999999999992.
Evaluation reward=0.
Finished 12 episode.reward=-22.689999999999998.
Finished 13 episode.reward=-22.139999999999993.
Finished 14 episode.reward=-24.45.
Finished 15 episode.reward=-24.399999999999995.
Finished 16 episode.reward=-23.979999999999997.
Evaluation reward=0.
Finished 17 episode.reward=-20.589999999999986.
Finished 18 episode.reward=-25.11.
Finished 19 episode.reward=-26.819999999999997.
Finished 20 episode.reward=-25.479999999999993.
Finished 21 episode.reward=-24.979999999999993.
Evaluation reward=0.
Finished 22 episode.reward=-23.189999999999998.
Finished 

Finished 168 episode.reward=-25.429999999999996.
Finished 169 episode.reward=-17.079999999999988.
Finished 170 episode.reward=-34.37999999999999.
Finished 171 episode.reward=-8.330000000000002.
Evaluation reward=0.
Finished 172 episode.reward=-27.769999999999996.
Finished 173 episode.reward=-32.22.
Finished 174 episode.reward=-29.429999999999993.
Finished 175 episode.reward=-27.899999999999995.
Finished 176 episode.reward=-28.479999999999997.
Evaluation reward=0.
Finished 177 episode.reward=-29.36999999999999.
Finished 178 episode.reward=-25.9.
Finished 179 episode.reward=-23.769999999999996.
Finished 180 episode.reward=-30.78999999999999.
Finished 181 episode.reward=-26.82.
Evaluation reward=0.
Finished 182 episode.reward=-26.479999999999997.
Finished 183 episode.reward=-24.4.
Finished 184 episode.reward=-26.139999999999993.
Finished 185 episode.reward=-29.14.
Finished 186 episode.reward=-29.219999999999995.
Evaluation reward=0.
Finished 187 episode.reward=-21.07999999999999.
Finished

Finished 333 episode.reward=-24.53.
Finished 334 episode.reward=-27.29999999999999.
Finished 335 episode.reward=-23.53.
Finished 336 episode.reward=-36.53999999999999.
Evaluation reward=0.
Finished 337 episode.reward=-23.319999999999997.
Finished 338 episode.reward=-23.319999999999997.
Finished 339 episode.reward=-22.29999999999999.
Finished 340 episode.reward=-21.639999999999993.
Finished 341 episode.reward=-31.29999999999999.
Evaluation reward=0.
Finished 342 episode.reward=-31.349999999999994.
Finished 343 episode.reward=-28.03.
Finished 344 episode.reward=-23.53.
Finished 345 episode.reward=-23.32.
Finished 346 episode.reward=-23.11.
Evaluation reward=0.
Finished 347 episode.reward=-25.429999999999993.
Finished 348 episode.reward=-23.32.
Finished 349 episode.reward=-23.53.
Finished 350 episode.reward=-29.29999999999999.
Finished 351 episode.reward=-23.53.
Evaluation reward=0.
Finished 352 episode.reward=-23.11.
Finished 353 episode.reward=-38.58999999999999.
Finished 354 episode.re

Finished 504 episode.reward=-21.429999999999993.
Finished 505 episode.reward=-11.539999999999996.
Finished 506 episode.reward=-21.849999999999994.
Evaluation reward=0.
Finished 507 episode.reward=-21.589999999999986.
Finished 508 episode.reward=-27.589999999999986.
Finished 509 episode.reward=-11.829999999999995.
Finished 510 episode.reward=-20.59.
Finished 511 episode.reward=-31.38.
Evaluation reward=0.
Finished 512 episode.reward=-29.039999999999992.
Finished 513 episode.reward=-20.58999999999999.
Finished 514 episode.reward=-22.269999999999996.
Finished 515 episode.reward=-20.58999999999999.
Finished 516 episode.reward=-34.90999999999999.
Evaluation reward=0.
Finished 517 episode.reward=-17.829999999999984.
Finished 518 episode.reward=-10.039999999999992.
Finished 519 episode.reward=-20.37999999999999.
Finished 520 episode.reward=-16.829999999999984.
Finished 521 episode.reward=-20.38.
Evaluation reward=0.
Finished 522 episode.reward=-23.95.
Finished 523 episode.reward=-20.589999999

In [None]:

for i in range(200):
    with open('Self_curiosity_rewardsEval_2.dat', 'a') as eval_reward_file:
        if i < 10:
            print(np.mean(evaluation_record[:i]), file=eval_reward_file)
        else:
            print(np.mean(evaluation_record[(i-10):i]), file=eval_reward_file)
    

In [None]:
for i in range(1000):
    with open('Self_curiosity_rewards_2.dat', 'a') as eval_reward_file:
        if i < 10:
            print(np.mean(reward_record[:i+1]), file=eval_reward_file)
        else:
            print(np.mean(reward_record[(i-10):i]), file=eval_reward_file)
    

wzg: create a S*A matrix at the initialization step.  In each sample process, only to replace 

I have to create an S matrix to store all the info about whether a state is visted or not. Use 0 or 1 to indicate the property. And reset the matrix each time when start a new epoch. But how to create S matrix?
so recall the state is indicated by the coordiantes, so I should use a matrix, sqrtS * sqrtS dimension. All zero. each time we got the state output from action(), we change relavent element by adding 1. Let's just treat it as a blackbox function and then think about how to update. just use the name.
after update S matrix, calculate a curiosity reward, and use a function to calculate it. then give the reward to this step.

