## First Look at Reinforcement Learning using OpenAI Gym

In [2]:
import gym
gym.envs.registry.all()
env = gym.make("CartPole-v1")
# We will Look at CartPole

In [3]:
## Always reset first!
obs = env.reset()
obs

array([-0.02453076,  0.00396951, -0.02342471,  0.03943438])

In [3]:
# Tells what kind of actions are allowed
env.action_space

Discrete(2)

In [None]:
# Helpful to see if rendering works
# import gym

# env = gym.make('CartPole-v0')
# env.reset()

# for _ in range(1000):
#     env.render()
#     env.step(env.action_space.sample())
    
# env.close()

In [4]:
action = 1
obs, reward, done, info = env.step(action)
obs 
#observation, reward, game over, helpful for debug

array([-0.02445137,  0.1994194 , -0.02263602, -0.26054626])

In [5]:
reward

1.0

In [6]:
# Try a simple go left if leaned left, go right if leaned right
def basic_policy(obs):
    angle = obs[2]
    return 0 if angle < 0 else 1

totals = []
for episode in range(500):
    episode_rewards = 0
    obs = env.reset()
    for step in range(200):
        action = basic_policy(obs)
        obs, reward, done, info = env.step(action)
        episode_rewards += reward
        if done:
            break
    totals.append(episode_rewards)

In [7]:
import numpy as np
np.mean(totals), np.std(totals), np.min(totals), np.max(totals)

(41.27, 8.646681444346148, 25.0, 68.0)

## Let's try Neural Network

In [8]:
import tensorflow as tf
from tensorflow import keras

In [10]:
n_inputs = 4 # this is env.observation_space.shape[0]
# Model with 1 hidden layer, 1 output layer with 1 output
model = keras.models.Sequential([
    keras.layers.Dense(5, activation="elu", input_shape=[n_inputs]),
    keras.layers.Dense(1, activation="sigmoid")
])

In [12]:
# This will play one episode
def render_policy_net(model, n_max_steps=200, seed=42):
    frames = []
    env = gym.make("CartPole-v1")
    env.seed(seed)
    np.random.seed(seed)
    obs = env.reset()
    for step in range(n_max_steps):
        frames.append(env.render(mode="rgb_array"))
        left_proba = model.predict(obs.reshape(1, -1))
        action = int(np.random.rand() > left_proba)
        obs, reward, done, info = env.step(action)
        if done:
            break
    env.close()
    return frames

In [11]:
def play_one_step(env, obs, model, loss_fn):
    with tf.GradientTape() as tape:
        left_proba = model(obs[np.newaxis])
        action = (tf.random.uniform([1,1]) > left_proba)
        y_target = tf.constant([[1.]]) - tf.cast(action, tf.float32)
        loss = tf.reduce_mean(loss_fn(y_target, left_proba))
    grads = tape.gradient(loss, model.trainable_variables)
    obs, reward, done, info = env.step(int(action[0,0].numpy()))
    return obs, reward, done, grads
                                            

In [13]:
def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    all_rewards = []
    all_grads = []
    for episode in range(n_episodes):
        current_rewards = []
        current_grads = []
        obs = env.reset()
        for step in range(n_max_steps):
            obs, reward, done, grads = play_one_step(env,obs,model, loss_fn)
            current_rewards.append(reward)
            current_grads.append(grads)
            if done:
                break
            all_rewards.append(current_rewards)
            all_grads.append(current_grads)
        return all_rewards, all_grads

In [14]:
def discount_rewards(rewards, discount_factor):
    discounted = np.array(rewards)
    for step in range(len(rewards) -2, -1, -1):
        discounted[step] += discounted[step+1]*discount_factor
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_factor):
    all_discounted_rewards = [discount_rewards(rewards, discount_factor) for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    return [(discounted_rewards - reward_mean)/reward_std for discoutned_rewards in all_discounted_rewards]

### Checking it works


In [18]:
discounted_rewards = discount_rewards([10,0,-50], discount_factor=0.8)

In [19]:
discount_and_normalize_rewards([[10,0,-50],[10,20]],discount_factor=0.8)

[array([-0.28435071, -0.86597718, -1.18910299]),
 array([-0.28435071, -0.86597718, -1.18910299])]

In [42]:
n_iterations = 151
n_episodes_per_update = 10
n_max_steps = 200
discount_factor = 0.95

optimizer = keras.optimizers.Adam(lr=0.01)
loss_fn = keras.losses.binary_crossentropy

In [43]:
for iteration in range(n_iterations):
    all_rewards, all_grads = play_multiple_episodes(env, n_episodes_per_update, n_max_steps, model, loss_fn)
    all_final_rewards = discount_and_normalize_rewards(all_rewards, discount_factor)
    all_mean_grads=[]
    for var_index in range(len(model.trainable_variables)):
        mean_grads = tf.reduce_mean([final_reward*all_grads[episode_index][step][var_index]
                                    for episode_index, final_rewards in enumerate(all_final_rewards)
                                        for step, final_reward in enumerate(final_rewards)], axis=0)
        all_mean_grads.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))


In [44]:
env = gym.make("CartPole-v1")
env.seed(42);

for iteration in range(n_iterations):
    all_rewards, all_grads = play_multiple_episodes(
        env, n_episodes_per_update, n_max_steps, model, loss_fn)
    total_rewards = sum(map(sum, all_rewards))                     # Not shown in the book
    print("\rIteration: {}, mean rewards: {:.1f}".format(          # Not shown
        iteration, total_rewards / n_episodes_per_update), end="") # Not shown
    all_final_rewards = discount_and_normalize_rewards(all_rewards,
                                                       discount_factor)
    all_mean_grads = []
    for var_index in range(len(model.trainable_variables)):
        mean_grads = tf.reduce_mean(
            [final_reward * all_grads[episode_index][step][var_index]
             for episode_index, final_rewards in enumerate(all_final_rewards)
                 for step, final_reward in enumerate(final_rewards)], axis=0)
        all_mean_grads.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))

env.close()

Iteration: 150, mean rewards: 140.6

In [39]:
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# To get smooth animations
import matplotlib.animation as animation
mpl.rc('animation', html='jshtml')

In [48]:
# To plot animation
import matplotlib.pyplot as plt
def update_scene(num, frames, patch):
    patch.set_data(frames[num])
    return patch,

def plot_animation(frames, repeat=False, interval=40):
    fig = plt.figure()
    patch = plt.imshow(frames[0])
    plt.axis('off')
    anim = animation.FuncAnimation(
        fig, update_scene, fargs=(frames, patch),
        frames=len(frames), repeat=repeat, interval=interval)
    plt.close()
    return anim

In [49]:
frames = render_policy_net(model)
plot_animation(frames)