Example environment for reinforcement learning

In [1]:
from rl.lib.timer import Timer

import numpy as np
import tensorflow as tf
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [2]:
EPSILON = 0.1
NUM_POSITIONS = 10
TARGET_POSITION = 5
MOVE_LEFT = 0
MOVE_RIGHT = 1
NUM_ACTIONS = 2

Demonstrate

- Creating a vector of zeros
- Updating one of those zeros to one.

This is useful for creating one-hot vector representations.

In [3]:
def get_position_vector(position):
    x = tf.Variable([0.] * NUM_POSITIONS, dtype=tf.float32)
    b = tf.scatter_update(x, [position], [1])
    return b


for p in range(NUM_POSITIONS):
    position = tf.Variable(p, dtype=tf.int32)
    position_vector = get_position_vector(position)

    sess.run(tf.global_variables_initializer())      
    print(sess.run([position, position_vector]))


[0, array([ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=float32)]
[1, array([ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=float32)]
[2, array([ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=float32)]
[3, array([ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=float32)]
[4, array([ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.], dtype=float32)]
[5, array([ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.], dtype=float32)]
[6, array([ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.], dtype=float32)]
[7, array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.], dtype=float32)]
[8, array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.], dtype=float32)]
[9, array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.], dtype=float32)]


Moving the position left or right, and converting to one-hot representation.

In [4]:
def move_left(position):
    return tf.maximum(position-1, tf.Variable(0, dtype=tf.int32))

def move_right(position):
    return tf.minimum(position+1, tf.Variable(NUM_POSITIONS-1, dtype=tf.int32))

position = tf.Variable(5)
left_position = move_left(position)
right_position = move_right(position)

sess.run(tf.global_variables_initializer())  
sess.run([left_position, position, right_position ])

[4, 5, 6]

The apply_action can move the position left or right, depending on what action it is given.

In [5]:
def apply_action(position, action):
    cond = tf.equal(action, MOVE_LEFT)
    return tf.cond(cond, lambda: move_left(position), lambda: move_right(position))

position = tf.Variable(5)
left_position = apply_action(position, MOVE_LEFT)
right_position = apply_action(position, MOVE_RIGHT)

sess.run(tf.global_variables_initializer())
sess.run([left_position, position, right_position ])

[4, 5, 6]

We don't have anything to intelligently move the agent yet. So let's just create a function to move it randomly.

In [6]:
def choose_action(action_values):
    cond = tf.less(EPSILON, tf.random_uniform([1]))
    
    action_if_false = tf.random_uniform([1], maxval=2, dtype=tf.int32)[0]
    action_if_true = tf.argmax(action_values, output_type=tf.int32)
    return tf.cond(cond[0], lambda: action_if_true, lambda: action_if_false)

action_values = tf.constant([1, 2], dtype=tf.int32)
action = choose_action(action_values)

sess.run(tf.global_variables_initializer())

for _ in range(10):
    print(sess.run(action))
    
actions = [sess.run(action) for _ in range(1000)]
# Expect 95% of trues
print(np.mean(actions))

1
1
1
1
1
0
1
1
1
0
0.958


Use tf.while_loop to move the agent at random, until it's position reaches TARGET_POSITION

In [7]:
action_values = tf.constant([1, 2], dtype=tf.int32)
action = choose_action(action_values)
position = tf.Variable(0, tf.int32)

cond = lambda position: tf.not_equal(position, TARGET_POSITION)
op = lambda position: apply_action(position, choose_action(action_values))

r=tf.while_loop(cond, op, [position])

sess.run(tf.global_variables_initializer())
print(r.eval())

5


Expand the prior example so that:

 - We record all positions inside a TensorArray.
 - We return the position using TensorArray.gather.

In [20]:
MAX_EPISODE_LENGTH = 10

def get_walk():
    ''' Generate positions '''
    position = tf.Variable(2, tf.int32)
    count = tf.Variable(0, tf.int32)
    positions = tf.TensorArray(tf.int32, size=MAX_EPISODE_LENGTH)
    positions = positions.write(count, position)

    cond_not_terminal = lambda c, p, ps: tf.not_equal(p, TARGET_POSITION)
    cond_not_max_length = lambda c, p, ps: tf.less(c, MAX_EPISODE_LENGTH-1)
    cond = lambda c, p, ps: cond_not_terminal(c, p, ps) & cond_not_max_length(c, p, ps)

    def body(c, p, positions):
        next_c = c+1
        action_values = tf.random_uniform([2], )
        next_p = apply_action(p, choose_action(action_values))
        positions = positions.write(next_c, next_p)
        return next_c, next_p, positions

    count, position, positions = tf.while_loop(cond, body, (count, position, positions))
    rng = tf.range(count-1)
    gathered_positions = positions.gather(rng)
    return gathered_positions

for _ in range(10):
    positions = get_walk()
    sess.run(tf.global_variables_initializer())
    ps = sess.run(positions)
    print(ps)

[2 1 0 0 1 0 1 2]
[2 3 2 3 2 3]
[2 3 2 3 4 3 2 1]
[2 1 2 1 0 1 0 1]
[2 1 0 1 2 3 4 3]
[2 3 2 3 4 3 4 3]
[2 3]
[2 1 2 3 2 3]
[2 1 0 0 0 0 1 0]
[2 1 0 1 0 0 0 1]


In [22]:
with Timer('Making walks'):
    walks = [ get_walk() for _ in range(10)]

with Timer('Initialising global variables'):
    sess.run(tf.global_variables_initializer())
    
with Timer('Generating walks'):
    ws = sess.run(walks)
print(ws)

Making walks Took 0.71s seconds
Initialising global variables Took 0.27s seconds
Generating walks Took 0.17s seconds
[array([2, 3, 4, 3, 4, 3, 2, 3], dtype=int32), array([2, 1, 2, 1, 0, 0, 1, 2], dtype=int32), array([2, 1, 2, 3, 2, 3], dtype=int32), array([2, 3, 2, 1, 0, 1, 0, 0], dtype=int32), array([2, 1, 0, 0, 1, 2, 1, 0], dtype=int32), array([2, 3], dtype=int32), array([2, 3, 2, 1, 2, 3, 4, 3], dtype=int32), array([2, 3], dtype=int32), array([2, 1, 0, 0, 1, 0, 0, 0], dtype=int32), array([2, 3, 4, 3, 4, 3, 4, 3], dtype=int32)]


In [26]:
NUM_WALKS = 1000

NUM_THREADS = 1

walks = tf.TensorArray(tf.int32, size=NUM_WALKS)
count2 = tf.Variable(0, tf.int32)

cond = lambda c, w: tf.less(c, NUM_WALKS)

def body(c, w):
    next_c = c+1
    www = get_walk()
    w = w.write(c, www)
    return next_c, w

with Timer('Making Walks'):
    count2, walks = tf.while_loop(cond, body, [count2, walks])

with tf.Session() as sess2:

    with Timer('Initialising global variables'):
        sess2.run(tf.global_variables_initializer())

    with Timer('Concatenating'):
        ws = sess2.run([walks.concat()])
        
    print(ws)
    print(ws[0].shape)
        
    with Timer('Building List'):
        l_walks = [walks.read(i) for i in range(NUM_WALKS)]
        
    with Timer('Evaluating List'):        
        ws = sess2.run(l_walks)
        
    print(ws)
    print(len(ws))

Making Walks Took 0.09s seconds
Initialising global variables Took 0.62s seconds
Concatenating Took 0.37s seconds
[array([2, 1, 2, ..., 3, 2, 1], dtype=int32)]
(6697,)
Building List Took 0.97s seconds
Evaluating List Took 0.60s seconds
[array([2, 3, 2, 3, 2, 3, 2, 1], dtype=int32), array([2, 1, 2, 1, 2, 1, 2, 1], dtype=int32), array([2, 3, 2, 3, 2, 1, 2, 1], dtype=int32), array([2, 1, 0, 1, 2, 1, 0, 0], dtype=int32), array([2, 3, 2, 1, 0, 1, 2, 3], dtype=int32), array([2, 1, 2, 1, 0, 1, 2, 1], dtype=int32), array([2, 1, 2, 1, 2, 3], dtype=int32), array([2, 1, 0, 1, 0, 1, 0, 1], dtype=int32), array([2, 1, 0, 1, 2, 3, 2, 1], dtype=int32), array([2, 1, 2, 3], dtype=int32), array([2, 1, 0, 1, 2, 3], dtype=int32), array([2, 3], dtype=int32), array([2, 1, 0, 0, 1, 0, 1, 0], dtype=int32), array([2, 1, 2, 3, 2, 1, 0, 0], dtype=int32), array([2, 1, 0, 1, 2, 1, 0, 0], dtype=int32), array([2, 1, 2, 3], dtype=int32), array([2, 1, 2, 3], dtype=int32), array([2, 1, 0, 0, 0, 0, 1, 2], dtype=int32), a

For reinforcement learning systems, we need to track the actions, as well as the states. Rewards can subsequently be derived from the state, action, and following state

In [54]:
def get_state_actions():

    position = tf.Variable(2, tf.int32)    
    count = tf.Variable(0, tf.int32)
    
    positions = tf.TensorArray(tf.int32, size=MAX_EPISODE_LENGTH)
    positions = positions.write(count, position)
    
    actions = tf.TensorArray(tf.int32, size=MAX_EPISODE_LENGTH-1)

    cond_not_terminal = lambda c, p, ps: tf.not_equal(p, TARGET_POSITION)
    cond_not_max_length = lambda c, p, ps: tf.less(c, MAX_EPISODE_LENGTH-1)
    cond = lambda c, p, a, ps: cond_not_terminal(c, p, ps) & cond_not_max_length(c, p, ps)

    def body(c, p, positions, actions):
        next_c = c+1
        action_values = tf.random_uniform([2], )
        action = choose_action(action_values)
        actions = actions.write(c, action)
        next_p = apply_action(p, action)
        positions = positions.write(next_c, next_p)
        return next_c, next_p, positions, actions

    count, position, positions, actions = tf.while_loop(cond, body, (count, position, positions, actions))
    rng = tf.range(count-1)
    gathered_positions = positions.gather(rng)
    gathered_actions = actions.gather(tf.range(count-2))
    return gathered_positions, gathered_actions


t_states, t_actions = get_state_actions()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    states, actions = sess.run([t_states, t_actions])
    
print(states)
print(actions)

[2 3 4 3 2 1 2 3]
[1 1 0 0 0 1 1]


In [63]:
REWARD = tf.constant(10)
PENALTY = tf.constant(-1)
def get_reward(state, action, next_state):
    at_target = lambda next_state: tf.equal(next_state, TARGET_POSITION)
    return tf.cond(tf.equal(next_state, TARGET_POSITION), lambda: REWARD, lambda: PENALTY)

reward = get_reward(t_states[0], t_actions[0], t_states[1])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    result = sess.run(reward)
    
print(result)

-1


In [76]:
rng = tf.range(states.shape[0])
t_current_states = t_states[:-1]
t_next_states = t_states[1:]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    current_states, next_states = sess.run([t_current_states, t_next_states])
    
print(current_states)
print(next_states)

[2 1 0 1 2 1 2]
[1 0 1 2 1 2 3]
