In [1]:
# Set up the OpenAI Gym CartPole environment
import gym
env = gym.make("CartPole-v0")

# reset() starts a new episode and returns its first observation; index 2 of
#  the observation is what basic_policy (further down) reads as the pole angle
obs = env.reset()
print(obs)
# env.render()

[-0.02961324 -0.03014861 -0.00320507  0.01430835]


In [2]:
# Ask the environment which actions are possible, i.e. what its action space is
env.action_space

Discrete(2)

In [3]:
# Discrete(2) means the possible actions are the integers 0 and 1 which, in this case, represent
#  accelerating left (0) or right (1)

# Assuming the pole is leaning toward the right, let's accelerate the cart toward the right
#  by using the step() method, which executes the given action
# NOTE(review): in the observation printed above, obs[2] (the value basic_policy below
#  treats as the pole angle) is slightly negative, so "leaning right" may not match
#  this particular run -- confirm
action = 1
# step() hands back the new observation, the reward for this step, a flag telling
#  whether the episode is over, and an info dict (empty in the output below)
obs, reward, done, info = env.step(action)
print(obs, reward, done, info)
# env.render()

[-0.03021621  0.16501916 -0.0029189  -0.27938409] 1.0 False {}


In [4]:
# Let's hardcode a simple policy that will
#  - Accelerate left when the pole is leaning toward the left and
#  - Accelerate right when the pole is leaning toward the right

def basic_policy(obs):
    """Pick an action purely from the sign of the pole angle.

    Args:
        obs: indexable observation whose element 2 is the pole angle.

    Returns:
        0 (accelerate left) when the angle is negative, otherwise
        1 (accelerate right); an angle of exactly zero maps to 1.
    """
    pole_angle = obs[2]
    if pole_angle < 0:
        return 0
    return 1

# Evaluate the hardcoded policy: play 500 episodes and record the total
#  reward collected in each one
totals = []
for episode in range(500):
    episode_rewards = 0
    obs = env.reset()
    # Cap every episode at 1000 steps; it ends earlier as soon as the
    #  environment reports done
    for step in range(1000):
        action = basic_policy(obs)
        obs, reward, done, info = env.step(action)
        episode_rewards += reward
        if done:
            break
    totals.append(episode_rewards)
    

# Summary statistics of the per-episode totals: mean, standard deviation, min, max
# NOTE(review): this import sits mid-notebook; imports are conventionally grouped in the first cell
import numpy as np
np.mean(totals), np.std(totals), np.min(totals), np.max(totals)

(41.982, 8.492448174702039, 25.0, 68.0)

In [5]:
# Tensorflow time!!!
# Build a small neural-network policy with the TensorFlow 1.x graph API
import tensorflow as tf

# 1. Specify the neural network architecture
n_inputs = 4 #  == env.observation_space.shape[0]
n_hidden = 4 #  it's a simple task, we don't need more hidden neurons
n_outputs = 1 #  only outputs the probability of accelerating left
initializer = tf.contrib.layers.variance_scaling_initializer()

# 2. Build the neural network
# X holds a batch of observations; the batch dimension is left unspecified (None)
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer)
# No activation on the last layer: the raw logits are kept as their own tensor
#  and the sigmoid is applied separately to turn them into a probability
logits = tf.layers.dense(hidden, n_outputs, kernel_initializer=initializer)
outputs = tf.nn.sigmoid(logits)

# 3. Select a random action based on the estimated probabilities
# Column 0 is p(left) and column 1 is p(right) = 1 - p(left), so the sampled
#  index matches the action encoding (0 = left, 1 = right)
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
# The probabilities are passed through tf.log before sampling one action per row
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)

init = tf.global_variables_initializer()

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use `tf.random.categorical` instead.


In [None]:
# Let's implement a common variant of the REINFORCE algorithms, which are a
#  popular class of PG (policy gradient) algorithms

# The network outputs the probability of going left and `action` is the sampled
#  action (0 = left, 1 = right), so the target probability of "left" is
#  1 when the sampled action is 0 and 0 when it is 1
y = 1 - tf.to_float(action)

# Now that we have a target probability, we can define the cost function (cross entropy)
#  and compute the gradients

learning_rate = 0.01

cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)

optimizer = tf.train.AdamOptimizer(learning_rate)

# compute_gradients() returns a list of (gradient, variable) pairs
#  (fixed: the original referenced the misspelled name `optimizier`)
grads_and_vars = optimizer.compute_gradients(cross_entropy)

# Keep only the gradient tensors
#  (fixed: the original iterated over the undefined name `grads_and_variables`)
gradients = [grad for grad, variable in grads_and_vars]