In [3]:
import numpy as np
import inspect
import functools
import itertools

import tensorflow as tf
import gym

import config as config
from utils import trajectory_dx, partition_rewards, discount_rewards, discount_check
from agent import PGAgent, RandomAgent
from layers import MLP
from train import run_trajectory, rollout
import utils as utils
import mcts
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
#Gym setup
env = gym.make("Pong-v0")

#TF Setup
g = tf.Graph()
sess = tf.Session(graph=g)

#RL Agent Setup
action_params = {
                "input_dim": config.state_dim,
                "hidden_dim": config.hidden_dim,
                "output_dim": config.output_dim
                }
optimizer_params = {
                    "learning_rate": .01
                   }
epsilon = 1 #Exploration parameter --> 0 corresponds to always explore (random policy), 1 corresponds to stochastic sampling according to policy neta
agent = PGAgent(g, sess, config.state_dim, MLP, action_params, optimizer_params, epsilon=epsilon)
test_agent = RandomAgent()


[2016-09-09 17:52:00,519] Making new env: Pong-v0


In [5]:
env.action_space

Discrete(6)

num_eps = 3
_ = agent.rollout(num_eps,env)

In [135]:
sampler = agent._in_order_sampler(100,50)

In [159]:
np.sum(len(eps) for eps in agent._reward_buffer)

4036

In [167]:
experiences = agent.sample_experiences(4036,False)

In [172]:
_, a, dr = experiences.next()

In [173]:
dr[:10]

array([-0.87752102, -0.79361428, -0.64910263, -0.95099005, -0.79361428,
       -0.66228204, -0.59895601, -0.80163059, -0.81790694, -0.9801    ])

In [29]:
batches = zip(*list(agent.batch_iter()))

In [30]:
s, a, dr = [np.concatenate(eps) for eps in batches]


In [22]:
print s.shape, a.shape, dr.shape

(5107, 6400) (5107,) (5107,)


In [80]:
import random
idx = np.random.permutation(len(s))
iter_idx = itertools.cycle(idx)
def sampler(iter_idx): 
    while True: 
        yield list(itertools.islice(iter_idx,5))
        

In [81]:
sample_iter = sampler(iter_idx)

In [92]:
sample_iter.next()

[2935, 1651, 497, 1704, 4266]

In [65]:
def random_idx(iter_idx):
    while True:
        yield list(itertools.islice(iter_idx,5))

In [66]:
sampler = random_idx(iter_idx)

In [75]:
sampler.next()

[1250, 3536, 522, 1031, 60]

In [20]:
s_cat = np.concatenate(s_batch)
s_cat.shape

(5107, 6400)

In [198]:
#Extract gradient op and variables
dW1, W1 = agent.gradients[0]
dW2, W2 = agent.gradients[1]

In [199]:
#Check gradient accumulation
#Sum over each batch manually, check with single call to agent.train
gradient_buffer = {"W1": np.zeros(W1.get_shape().as_list(),dtype='float64'), "W2": np.zeros(W2.get_shape().as_list(),dtype='float64')}
gradients_W1, gradients_W2 = [], []

def prep(batch):
    s, a, r = batch
    labels = a - 2
    one_hot_labels = utils.encode_one_hot(labels)
    discounted_r = agent.discount_rewards(r, config.gamma)
    feed_dict = {agent.states: s, agent.labels: one_hot_labels, agent.discounted_r: discounted_r}
    return feed_dict



In [204]:
%%time
with sess.as_default():
    #Accumulate over episodes
    for batch in batches:
        feed_dict = prep(batch)
        gradients_W1.append(dW1.eval(feed_dict))
        gradients_W2.append(dW2.eval(feed_dict))

CPU times: user 5.31 s, sys: 20 ms, total: 5.33 s
Wall time: 1.78 s


In [203]:
%%time
with sess.as_default():
    #Aggregate episodes into single batch
    unrolled = zip(*batches)
    batch = [np.concatenate(eps) for eps in unrolled]
    feed_dict = prep(batch)
    agg_grad_W2 = dW2.eval(feed_dict)
    agg_grad_W1 = dW1.eval(feed_dict)

CPU times: user 5.51 s, sys: 640 ms, total: 6.15 s
Wall time: 3.25 s


In [200]:
def norm(m1, m2):
    return np.sqrt(np.sum((m1 - m2)**2))

In [201]:
for grad in gradients_W2:
    gradient_buffer["W2"] += grad
print np.allclose(agg_grad_W2, gradient_buffer["W2"])
print norm(agg_grad_W2, gradient_buffer["W2"])

False
4.51842061644e-05


In [202]:
for grad in gradients_W1:
    gradient_buffer["W1"] += grad
print np.allclose(agg_grad_W1, gradient_buffer["W1"])
print norm(agg_grad_W1, gradient_buffer["W1"])

False
3.68180478889e-05


In [188]:
np.sqrt(np.sum((gradient_buffer["W1"] - agg_grad_W1)**2))

1.1928621432103223e-05

In [186]:
agg_grad_W1[-5:]

array([[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
        

In [156]:
#Concat episodes to create single batch
import itertools

batches = list(agent.batch_iter())
unrolled = zip(*batches)
batch = [np.concatenate(eps) for eps in unrolled]

In [158]:
for b in batch:
    print b.shape

(2287, 6400)
(2287,)
(2287,)


In [149]:
np.concatenate(unrolled[0][0],unrolled[0][1])

TypeError: only length-1 arrays can be converted to Python scalars

In [154]:
unrolled[0][0].shape
unrolled[0][1].shape
np.concatenate(unrolled[0])

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [106]:
s, a, r = batches.next()
print utils.trajectory_dx(s, a, r)

labels = a - 2
one_hot_labels = utils.encode_one_hot(labels)
discounted_r = agent.discount_rewards(r, config.gamma)

feed_dict = {agent.states: s, agent.labels: one_hot_labels, agent.discounted_r: discounted_r}

Total actions: 1011
Total reward: -21.0
# of Up moves: 184
# of Down moves: 827
# of positive rewards 0
# of negative rewards 21
None


In [86]:
action_probs = [.9,.1]
np.sum(np.random.choice(agent.VALID_ACTIONS, size=100, p=action_probs) == 2)

91