In [1]:
import warnings
warnings.filterwarnings('ignore')

import gym
env = gym.make("CartPole-v0")
obs=env.reset() #lets call it init
obs #check the values
env.render() #render the image

True

In [2]:
obs=env.reset()

In [3]:
env.action_space #Display the number of actions possible

Discrete(2)

In [4]:
action = 0 
obs, reward, done, info = env.step(action)
print("Observation : ", obs)
print("Reward : ", reward)
print("Done ? ", done)
print("Info ",info)
env.render()

Observation :  [ 0.00626124 -0.21144937  0.01230998  0.32373791]
Reward :  1.0
Done ?  False
Info  {}


False

In [5]:
def basic_policy(obs):
    angle = obs[2]
    return 0 if angle < 0 else 1

totals=[]
for episode in range(5):
    episode_rewards=0
    obs=env.reset()
    for step in range(1000):
        action = basic_policy(obs)
        obs, rewards, done, info = env.step(action)
        episode_rewards += rewards
        if done:
            break
        totals.append(episode_rewards)
        env.render()

In [6]:
import numpy as np
print(np.mean(totals), np.std(totals), np.min(totals), np.max(totals))

23.073529411764707 14.653497383985444 1.0 52.0


In [7]:
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected

In [8]:
#1. Let's specify the neural network architecture
n_inputs = 4 #this will always be equal to the observation shape obs.shape[0]
n_hidden = 4 #it's a simple task, we dont need more hidden neurons
n_ouputs = 1 #only ouputs probablity of accelearting left
initializer = tf.contrib.layers.variance_scaling_initializer()

In [9]:
#2 Build the neural network
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden = fully_connected(X, n_hidden, activation_fn=tf.nn.elu, weights_initializer=initializer)
logits = fully_connected(hidden, n_ouputs, activation_fn=None, weights_initializer=initializer)
outputs = tf.nn.sigmoid(logits)

Instructions for updating:
Colocations handled automatically by placer.


In [10]:
#3 Select random action based on the estimated probablities
p_left_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_right), num_samples=1)

init = tf.global_variables_initializer()

Instructions for updating:
Use tf.random.categorical instead.


If we knew what the best action was at each step, then we could train the Neural Network as usual, by minimizing the cross entropy loss between the estimated probablity and the targeted probablity (Regular Supervised Learning). However, in Reinforcement Learning, the only guidance that is given to the agent is in the form of **rewards** and rewards are typically sparse and delayed.

## Credit Assignment Problem
When a agent gets reward, its hard for it to know which actions should get credit (or blamed) for it. Think of a dog that gets rewarded hours after if behaved well, will it understand what it is rewarded for ?

Actions are evaluated based on the rewards that are obtained post the action is taken. The

Sometimes we are concerned about the immediate rewards than future rewards or vice-versa (like short time investments or long time investments). Based on what you want the agent to optimize **discount rate r** is applied at each step, if the _r_ is close to 0, then the future reward won't count much compared to the immediate rewards. Alternatively, if the value of _r_ is close to 1, then rewards far into the future will count as much as the immediate reward.

## Policy Gradients
Policy Gradients algorithms optimize the parameters of a policy by following the gradient toward higher rewards. 

1. Let the Neural Network policy play the game several times and at each step compute the gradients that would make the chosen action even more likely, but don't apply these gradients yet.

2. Once we have run several episodes, compute each action value (using the discount factor)

3. If the action value is positive (action was good), we want to make the same action. If the value is negative (action was bad), we would like to make the less of this action. There are two ways of doing this; 
<br>   1. Discarding all the negative values
<br>   2. Making the opposite action to what was performed during the episode i.e multiply the gradient vector by the corresponding action's score.

4. Compute the mean of all the resulting gradient vectors and use it perform Gradient Descent step.

In [11]:
y = 1. - tf.to_float(action)

Instructions for updating:
Use tf.cast instead.


In [12]:
learning_rate = 0.01

cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits( labels = y, logits = logits)
optimizer = tf.train.AdamOptimizer(learning_rate)
grads_vars = optimizer.compute_gradients(cross_entropy)

In [13]:
gradients = [grad for grad, variable in grads_vars]

In [14]:
gradient_placeholders = []
grads_vars_feed = []
for grad,variable in grads_vars:
    gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
    gradient_placeholders.append(gradient_placeholder)
    grads_vars_feed.append((gradient_placeholder, variable))

training_op = optimizer.apply_gradients(grads_vars_feed)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [15]:
def discounted_rewards(rewards, discount_rate):
    discounted_reward = np.zeros(len(rewards))
    cummulative_reward = 0
    for step in reversed(range(len(rewards))):
        cummulative_reward = rewards[step] + cummulative_reward * discount_rate
        discounted_reward[step] = cummulative_reward
    return discounted_reward

def discount_and_normalize_rewards(all_rewards, discount_rate):
    all_discounted_rewards = [discounted_rewards(reward, discount_rate) for reward in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    rewards_mean = np.mean(flat_rewards)
    rewards_std = np.std(flat_rewards)
    return [(discounted_reward-rewards_mean)/rewards_std for discounted_reward in all_discounted_rewards]


# discounted_rewards([10,0,-50], discount_rate=0.8)
# discount_and_normalize_rewards([[10,0,-50], [10,20]],discount_rate=0.8)

In [None]:
n_iterations = 250
n_max_steps = 1000
n_games_per_update = 10
save_iterations = 10
discount_rate = 0.95

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        all_rewards = []
        all_gradients = []
        for games in range(n_games_per_update):
            current_rewards = []
            current_gradients = []
            obs = env.reset()
            for step in range(n_max_steps):
                action_val, gradients_val = sess.run(
                        [action, gradients], feed_dict = {X:obs.reshape(1, n_inputs)})
                print(action_val)
                exit(0)
                obs, reward, done, info = env.step(action_val[0][0])
                current_rewards.append(reward)
                current_gradients.append(gradients_val)
                if done:
                    break
                all_rewards.append(current_rewards)
                all_gradients.append(current_gradients)
                
                #At this point we have run the policy for 10 episodes
                
                all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate)
                feed_dict = {}
                for var_index, grad_placeholder in enumerate(gradient_placeholders):
                    mean_gradients = np.mean(
                        [reward * all_gradients[game_index][step][var_index]
                        for game_index,rewards in enumerate(all_rewards)
                        for step,reward in enumerate(rewards)],
                        axis = 0 )
                    feed_dict[grad_placeholder] = mean_gradients
                sess.run(training_op, feed_dict=feed_dict)
                if iteration % save_iterations == 0:
                    saver.save(sess, "./my_policy_net_pg.ckpt")
                    

[[0]]
[[2]]


AssertionError: 2 (<class 'numpy.int64'>) invalid