In [1]:
import gym
import numpy as np
import pickle # as pickle
import tensorflow as tf
import random
# Notebook-wide configuration. All names below are module-level globals read by later cells.
OP_NUMBER = 2             # number of discrete actions; output size of the policy network
STATE_INPUT_SIZE = 4      # length of one observation vector; input size of the network
HIDDEN_SIZE = 16          # width of each of the two tanh hidden layers
TOTAL_EPISODES = 30000    # copied into `total_episodes` in the environment-setup cell
BATCH_NUMBER = 10         # copied into `batch_number` in the environment-setup cell
gamma = 0.99              # discount factor read by discount_rewards()
SUCCESS_GOAL = 180        # NOTE(review): not referenced in the visible cells - presumably a target score
MAX_STEPS = 700           # upper bound on steps taken by the rollout loop

# Adam optimizer hyper-parameters (passed to tf.train.AdamOptimizer in Agent.__init__):
LEARNING_RATE = 0.01
B1 = 0.8
B2= 0.999
EPSILON=1e-6

OUT_MAX= "ws.p"           # NOTE(review): not referenced in the visible cells - presumably the path for Agent.save
DISPLAY_FREQ=250          # NOTE(review): not referenced in the visible cells
RENDER_FREQ=False         # NOTE(review): not referenced in the visible cells

# Start from an empty default graph so re-running the notebook does not collide
# with variables ('w1', 'b1', ...) left over from a previous Agent() construction.
tf.reset_default_graph()


In [2]:
class Agent():
    """Softmax policy network trained with accumulated policy gradients.

    The network works on column-major batches: states are fed as a
    [STATE_INPUT_SIZE, batch] matrix, pass through two tanh layers of
    HIDDEN_SIZE units and end in a softmax over OP_NUMBER actions, giving a
    [OP_NUMBER, batch] matrix of action probabilities.

    Typical use: construct, call `start(sess)` once, call `get_action` while
    playing, call `train` after each episode to accumulate gradients, and call
    `update` to apply the accumulated gradients with Adam.
    """

    def __init__(self):
        """Build the graph: parameters, forward pass, loss, gradient and update ops."""

        #Variables:

        self.W1 = tf.get_variable(shape=[HIDDEN_SIZE,STATE_INPUT_SIZE],name='w1',
                              initializer=tf.contrib.layers.xavier_initializer())
        self.W2 = tf.get_variable(shape=[HIDDEN_SIZE,HIDDEN_SIZE],name='w2',
                              initializer=tf.contrib.layers.xavier_initializer())
        self.W3 = tf.get_variable(shape=[OP_NUMBER,HIDDEN_SIZE],name='w3',
                              initializer=tf.contrib.layers.xavier_initializer())

        self.b1 = tf.get_variable(shape=[HIDDEN_SIZE,1],name='b1',
                              initializer=tf.contrib.layers.xavier_initializer())
        self.b2 = tf.get_variable(shape=[HIDDEN_SIZE,1],name='b2',
                              initializer=tf.contrib.layers.xavier_initializer())
        self.b3 = tf.get_variable(shape=[OP_NUMBER,1],name='b3',
                              initializer=tf.contrib.layers.xavier_initializer())

        # The parameters this agent differentiates and updates, in creation
        # order. Pinning the list here (instead of re-querying
        # tf.trainable_variables() at every use) keeps gradients, gradient
        # placeholders and the NumPy buffer aligned even if other trainable
        # variables exist in the same graph.
        self.params = [self.W1,self.W2,self.W3,self.b1,self.b2,self.b3]

        #Layers:
        self.x = tf.placeholder(tf.float32, shape=[STATE_INPUT_SIZE,None],name='x')
        self.h1 = tf.tanh(tf.matmul(self.W1,self.x) + self.b1)
        self.h2 = tf.tanh(tf.matmul(self.W2,self.h1) + self.b2)
        # Softmax over axis 0 so that every column (one state) sums to 1.
        self.y = tf.nn.softmax(tf.matmul(self.W3,self.h2) + self.b3,dim=0)

        #Loss function:

        # One weight per time step (the caller passes normalised discounted rewards).
        self.curr_reward = tf.placeholder(shape=[None],dtype=tf.float32)
        # Rows of (action, column index) used to pick y[action, column].
        self.actions_array = tf.placeholder(shape=[None,2],dtype=tf.int32)
        # Probability the policy assigned to each action that was actually taken.
        self.pai_array = tf.gather_nd(self.y,self.actions_array)
        self.L = -tf.reduce_mean(tf.log(self.pai_array)*self.curr_reward)
        self.gradients = tf.gradients(self.L,self.params)

        # One placeholder per parameter, used to feed the accumulated gradients
        # back into the graph when `update` runs.
        self.gradient_holders = [
            tf.placeholder(tf.float32,name=str(idx)+'_holder')
            for idx in range(len(self.params))
        ]

        #Update mechanism:
        adam = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE,beta1=B1,beta2=B2,epsilon=EPSILON)
        self.update_batch = adam.apply_gradients(list(zip(self.gradient_holders,self.params)))

    def start(self,sess):
        """
        Agent initialization:
        initialize all graph variables and create a zeroed gradient buffer
        with one array per parameter (same shape and dtype as the parameter).
        """
        sess.run(tf.global_variables_initializer())
        self.grad_buffer = [np.zeros_like(value) for value in sess.run(self.params)]

    def get_action(self,sess,observation):
        """
        Given a single observation, sample an action from the policy.

        The observation is reshaped to a column vector, the network is run
        once to obtain the action probabilities, and the returned action index
        is drawn at random according to those probabilities.
        """
        state = np.reshape(observation,(-1,1))
        probs = sess.run(self.y,feed_dict={self.x:state}).reshape(-1)
        return np.random.choice(probs.size,p=probs)

    def train(self,sess,states_array,actions_array,curr_reward):
        """
        NN training procedure: given arrays of states (observations), actions
        and rewards, evaluate the loss and its gradients and add the gradients
        to the buffer. The parameters are not changed here; see `update`.

        Returns the loss value computed for this batch.
        """
        feed_dict = {self.x:states_array,
                     self.actions_array:actions_array,
                     self.curr_reward:curr_reward}
        # Fetch loss and gradients in one run so the forward pass is evaluated once.
        loss, grads = sess.run([self.L,self.gradients],feed_dict=feed_dict)
        for idx,grad in enumerate(grads):
            self.grad_buffer[idx] += grad
        return loss

    def update(self,sess):
        """
        NN update procedure: apply the accumulated gradients to the NN
        variables with Adam, then reset the buffer to zeros.
        """
        feed_dict = dict(zip(self.gradient_holders, self.grad_buffer))
        sess.run(self.update_batch, feed_dict=feed_dict)
        # zeros_like (rather than grad * 0) also clears NaN/inf entries.
        for ix,grad in enumerate(self.grad_buffer):
            self.grad_buffer[ix] = np.zeros_like(grad)

    def save(self,sess,path):
        """
        Pickle the current parameter values to `path` as the list
        [W1, b1, W2, b2, W3, b3].
        """
        param = sess.run([self.W1,self.b1,self.W2,self.b2,self.W3,self.b3])
        # Context manager closes the file even if pickling raises.
        with open(path,"wb") as outfile:
            pickle.dump(param,outfile)
        




In [3]:
def discount_rewards(arr, discount=None):
    """
    Helper function for computing discounted rewards,
    then the delayed rewards are normalized by the mean and std.

    Parameters:
        arr: 1-D sequence of per-step rewards (any numeric or object dtype;
            it is converted to float64 so integer rewards are not truncated).
        discount: discount factor; when None the module-level `gamma` is used.

    Returns:
        float64 array of the same length holding the discounted returns,
        shifted to zero mean and scaled to unit standard deviation. If the
        standard deviation is zero (e.g. a single-step episode) the centred
        values (all zeros) are returned instead of dividing by zero. An empty
        input yields an empty array.
    """
    if discount is None:
        discount = gamma
    rewards = np.asarray(arr, dtype=np.float64)
    discounts = np.zeros_like(rewards)
    if rewards.size == 0:
        return discounts
    running = 0.0
    for i in reversed(range(rewards.size)):
        # The current reward is discounted as well, so every entry carries one
        # extra factor of `discount` compared with the textbook return; the
        # normalisation below cancels that constant factor.
        running = discount*(rewards[i]+running)
        discounts[i] = running
    discounts = discounts - np.mean(discounts)
    std = np.std(discounts)
    if std > 0:
        discounts = discounts/std
    return discounts

    

In [4]:

# Create the gym environment and build the agent's TensorFlow graph.
env_d = 'CartPole-v0'
env = gym.make(env_d)
# Lower-case working copies of the configuration constants.
total_episodes=TOTAL_EPISODES
batch_number = BATCH_NUMBER
# Agent() only constructs the graph; variables are initialised later by agent.start(sess).
agent = Agent()
    


[2017-08-15 23:57:22,032] Making new env: CartPole-v0


In [5]:
# Session used by every later cell to run the agent's ops.
sess = tf.InteractiveSession()

In [6]:
# Initialise the network variables and the agent's zeroed gradient buffer.
agent.start(sess)
episode_number = 0

# Bookkeeping for the episode that is about to be played.
ep_history = []   # filled by the rollout loop with (state, action, reward) tuples
step_num=0        # steps taken so far; compared against MAX_STEPS in the rollout loop
total_reward=0    # sum of rewards collected in the rollout loop
rewards = []
steps=[]
max_reward=0

obsrv = env.reset() # Start the 1st game, receiving the 1st state


In [7]:
# Play one episode: sample actions from the policy until the environment
# reports `done` or MAX_STEPS steps have been taken.
done = False

while not done and step_num<MAX_STEPS:
    step_num+=1
    action = agent.get_action(sess,obsrv)
    obsrv1, reward, done, info = env.step(action)

    total_reward+=reward
    # Record the state the action was taken in (not the resulting state),
    # together with the action and the reward it produced.
    ep_history.append((np.array(obsrv),action,reward))
    obsrv=obsrv1

In [8]:
# Inspect the recorded (state, action, reward) tuples of the episode just played.
ep_history

[(array([ 0.04099542,  0.01682624,  0.04236995, -0.00939053]), 0, 1.0),
 (array([ 0.04133195, -0.17887694,  0.04218214,  0.29635381]), 0, 1.0),
 (array([ 0.03775441, -0.37457405,  0.04810921,  0.60203623]), 1, 1.0),
 (array([ 0.03026293, -0.18015689,  0.06014994,  0.32488656]), 0, 1.0),
 (array([ 0.02665979, -0.3760814 ,  0.06664767,  0.63591527]), 0, 1.0),
 (array([ 0.01913816, -0.57206647,  0.07936597,  0.94882011]), 0, 1.0),
 (array([ 0.00769683, -0.76816205,  0.09834238,  1.26534667]), 0, 1.0),
 (array([-0.00766641, -0.96439351,  0.12364931,  1.58713775]), 0, 1.0),
 (array([-0.02695428, -1.1607491 ,  0.15539206,  1.91568275]), 0, 1.0),
 (array([-0.05016926, -1.35716553,  0.19370572,  2.25225907]), 0, 1.0)]

In [9]:
# One episode has just finished: count it exactly once.
episode_number += 1

# Pack the episode into a (T, 3) object array with columns
# (state vector, action, reward). dtype=object is required because the rows
# are ragged (an ndarray next to scalars); without it recent NumPy versions
# refuse to build the array.
ep_history = np.array(ep_history, dtype=object)
# Replace the raw rewards with normalised discounted rewards.
# NOTE: this rewrites column 2 in place, so re-running the cell without
# replaying the episode would discount the already-discounted values again.
ep_history[:,2] = discount_rewards(ep_history[:,2])
obsrv = env.reset()

# Perform the training step, feeding the network with the ep_history that
# contains the states, actions, and discounted rewards:
#   states  -> [STATE_INPUT_SIZE, T] matrix, one column per time step
#   actions -> T rows of (action, time-step index) for gather_nd on the policy output
#   rewards -> the T normalised discounted rewards
# The time index is derived from the episode length itself so it always
# matches the number of recorded steps.
L = agent.train(sess, np.vstack(ep_history[:,0]).T,
    np.dstack((ep_history[:,1].T, np.arange(len(ep_history))))[0],
    ep_history[:,2].T)


In [19]:
# Stack the per-step state vectors and transpose so each column is one time
# step, matching the [STATE_INPUT_SIZE, None] shape of the agent's input placeholder.
states_array = np.vstack(ep_history[:,0]).T
states_array

array([[ 0.04099542,  0.04133195,  0.03775441,  0.03026293,  0.02665979,
         0.01913816,  0.00769683, -0.00766641, -0.02695428, -0.05016926],
       [ 0.01682624, -0.17887694, -0.37457405, -0.18015689, -0.3760814 ,
        -0.57206647, -0.76816205, -0.96439351, -1.1607491 , -1.35716553],
       [ 0.04236995,  0.04218214,  0.04810921,  0.06014994,  0.06664767,
         0.07936597,  0.09834238,  0.12364931,  0.15539206,  0.19370572],
       [-0.00939053,  0.29635381,  0.60203623,  0.32488656,  0.63591527,
         0.94882011,  1.26534667,  1.58713775,  1.91568275,  2.25225907]])

In [21]:
# Pair every chosen action with its time-step index: row t is (action_t, t).
# These pairs index the [action, time-step] policy output via gather_nd.
actions_array = np.dstack((ep_history[:,1].T,np.array(range(step_num))))[0]
actions_array

array([[0, 0],
       [0, 1],
       [1, 2],
       [0, 3],
       [0, 4],
       [0, 5],
       [0, 6],
       [0, 7],
       [0, 8],
       [0, 9]], dtype=object)

In [22]:
# Column 2 of ep_history: the normalised discounted rewards written by the training cell.
curr_reward = ep_history[:,2].T
curr_reward

array([1.5457281502797957, 1.211399621726729, 0.8736940373296919,
       0.5325772854134928, 0.1880149097405646, -0.16002789396946407,
       -0.5115862815553514, -0.8666957639653387, -1.2253922108441135,
       -1.5877118541560076], dtype=object)

In [26]:
# Action probabilities for every state of the episode: one column per time step,
# one row per action. agent.y is computed from agent.x alone; the actions and
# rewards are fed as well but are not needed for this fetch.
sess.run(agent.y,feed_dict={agent.x:states_array,agent.actions_array:actions_array,agent.curr_reward:curr_reward})

array([[ 0.75403732,  0.79626626,  0.83186954,  0.7947346 ,  0.83072323,
         0.85913491,  0.88004708,  0.89437604,  0.90350097,  0.90882754],
       [ 0.24596262,  0.20373371,  0.16813044,  0.20526536,  0.16927676,
         0.14086504,  0.11995293,  0.10562398,  0.09649906,  0.09117244]], dtype=float32)

In [27]:
# Probability the policy assigned to the action actually taken at each time step,
# i.e. agent.y gathered at the (action, time-step) pairs in actions_array.
sess.run(agent.pai_array,feed_dict={agent.x:states_array,agent.actions_array:actions_array,agent.curr_reward:curr_reward})

array([ 0.75403732,  0.79626626,  0.16813044,  0.7947346 ,  0.83072323,
        0.85913491,  0.88004708,  0.89437604,  0.90350097,  0.90882754], dtype=float32)