In [None]:
# !pip install gymnasium
import gymnasium as gym
print(gym.__version__)
import tensorflow as tf
import numpy as np

# Creating Environment Using [GYM](https://github.com/Farama-Foundation/Gymnasium) - CartPole Game


In [None]:
# The CartPole state is described by 4 variables:
# [cart position, cart velocity, pole angle, pole angular velocity]
env = gym.make('CartPole-v1')

# Reset the environment to its starting state.
# Gymnasium's reset() returns an (observation, info) pair; unpack it so that
# `observation` is the state array itself and can be indexed by later cells.
observation, info = env.reset()
print(observation, info)

(array([-0.02135036, -0.04951756, -0.03659555,  0.03179171], dtype=float32), {})


# Performing some random action

In [None]:
# Take one randomly sampled action and show how the environment responds.
action = env.action_space.sample()
step_result = env.step(action)

# step() yields (observation, reward, terminated, truncated, info)
observation, reward, terminated, truncated, info = step_result
print(*step_result)

[ 0.00472824  0.3598044  -0.06903811 -0.6166674 ] 1.0 False False {}


In [None]:
# Perform 1000 random actions, rendering the environment at each step.
# NOTE: rendering will not work in Google Colab; run this locally as a Python
# script (not a notebook) to see the game window.
for t in range(1000):
    env.render()

    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)

    # Once an episode has ended, step() must not be called again until the
    # environment is reset; otherwise Gymnasium logs a warning and the
    # returned observations are meaningless.
    if terminated or truncated:
        observation, info = env.reset()

# Performing actions chosen by a manually defined policy

In [None]:
# Hand-written policy: push the cart towards the side the pole is leaning to.
# Start from a fresh episode so this cell does not depend on whatever
# `observation` an earlier cell happened to leave behind.
observation, info = env.reset()

for t in range(1000):
    env.render()
    pole_ang = observation[2]  # index 2 of the state is the pole angle

    if pole_ang > 0: # pole is falling to the right
        action = 1 # move the cart right to get back under the pole
    else: # pole is falling to the left
        action = 0 # move the cart left to get back under the pole

    # Perform the action
    observation, reward, terminated, truncated, info = env.step(action)

    # Stepping a finished episode is undefined (Gymnasium warns about it),
    # so begin a new episode as soon as the current one ends.
    if terminated or truncated:
        observation, info = env.reset()

  logger.warn(


# Learning a policy with a feed-forward neural network (FFNN) that uses only the current state, then acting on it

In [None]:
# Hyper-parameters shared by the policy networks defined in the cells below.
num_inputs = 4 # size of one observation: the network input has 4 features
num_hidden = 4 # number of units in each hidden dense layer
num_outputs = 1 # single output unit: the probability of going left (action 0)
learning_rate = 0.01 # step size handed to the Adam optimizer
# Weight initializer passed as `kernel_initializer` to the dense layers.
initializer = tf.contrib.layers.variance_scaling_initializer()

In [None]:
# Define the feed-forward policy network:
# observation (4) -> dense ReLU -> dense ReLU -> sigmoid = P(go left)
X = tf.placeholder(tf.float32, shape=[None,num_inputs])
hidden_layer_one = tf.layers.dense(X,num_hidden,activation=tf.nn.relu,kernel_initializer=initializer) # first hidden layer, fed by the inputs
hidden_layer_two = tf.layers.dense(hidden_layer_one,num_hidden,activation=tf.nn.relu,kernel_initializer=initializer) # second hidden layer
# The output layer reads from the LAST hidden layer so that both hidden layers
# are actually part of the network.
output_layer = tf.layers.dense(hidden_layer_two,num_outputs,activation=tf.nn.sigmoid,kernel_initializer=initializer) # probability to go left
probabilties = tf.concat(axis=1, values=[output_layer, 1 - output_layer]) # [P(left), P(right)]
# tf.multinomial interprets its input as unnormalized LOG-probabilities, so the
# probabilities have to be passed through log() to sample from them faithfully.
action = tf.multinomial(tf.log(probabilties), num_samples=1) # sample one action index per row
init = tf.global_variables_initializer()

In [None]:
# Run the network defined above on the environment. No weights are updated
# here: the actions come from the randomly initialised network.
saver = tf.train.Saver()
env = gym.make("CartPole-v1")

with tf.Session() as sess:
    init.run()

    # Play 50 separate games
    for i_episode in range(50):
        # A new game starts by resetting the environment; Gymnasium returns
        # an (observation, info) pair.
        obs, info = env.reset()

        # Take at most 500 actions per game
        for step in range(500):
            # env.render()
            # Sample an action from the network for the current observation
            action_val = action.eval(feed_dict={X: obs.reshape(1, num_inputs)})
            # Apply it and keep the NEW observation for the next prediction
            obs, reward, terminated, truncated, info = env.step(action_val[0][0])

            # Stop as soon as the game is over; stepping further is undefined
            if terminated or truncated:
                break

env.close()

# Learning a policy with an FFNN (using all the previous states), then acting on it

## Training the network

In [None]:
# Build the policy-gradient network in an empty graph. Otherwise variables of
# any network built earlier in the same default graph would also be handed to
# the optimizer, the initializer op and the checkpoint saver.
tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape=[None, num_inputs])

hidden_layer = tf.layers.dense(X, num_hidden, activation=tf.nn.elu, kernel_initializer=initializer) # hidden layer fed by the inputs
logits = tf.layers.dense(hidden_layer, num_outputs) # output layer (pre-sigmoid)
outputs = tf.nn.sigmoid(logits)  # probability of action 0 (left)

probabilties = tf.concat(axis=1, values=[outputs, 1 - outputs]) # [P(left), P(right)]
# tf.multinomial expects unnormalized log-probabilities, hence the log()
action = tf.multinomial(tf.log(probabilties), num_samples=1)
# Treat the sampled action as if it were the correct one:
# target P(left) is 1.0 when action 0 was sampled and 0.0 when action 1 was.
y = 1. - tf.to_float(action)


cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(learning_rate)
gradients_and_variables = optimizer.compute_gradients(cross_entropy)

# The gradients are fetched per step while playing, then weighted by the
# rewards outside the graph and fed back in through one placeholder per
# variable.
gradients = []
gradient_placeholders = []
grads_and_vars_feed = []

for gradient, variable in gradients_and_variables:
    # compute_gradients reports None for variables the loss does not depend
    # on; there is nothing to fetch or feed for those.
    if gradient is None:
        continue
    gradients.append(gradient)
    gradient_placeholder = tf.placeholder(tf.float32, shape=gradient.get_shape())
    gradient_placeholders.append(gradient_placeholder)
    grads_and_vars_feed.append((gradient_placeholder, variable))


training_op = optimizer.apply_gradients(grads_and_vars_feed)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

######################################
#### REWARD FUNCTIONs ################
####################################
# CHECK OUT: https://medium.com/@awjuliani/super-simple-reinforcement-learning-tutorial-part-2-ded33892c724

def helper_discount_rewards(rewards, discount_rate):
    '''
    Compute the discounted return for every step of one episode.

    Walking backwards through `rewards`, each entry becomes its own reward
    plus `discount_rate` times the discounted return of the following step.
    Returns a float numpy array with the same length as `rewards`.
    '''
    n_steps = len(rewards)
    discounted = np.zeros(n_steps)
    running_total = 0
    # Go from the last step to the first so each return builds on the next one
    for idx in range(n_steps - 1, -1, -1):
        running_total = rewards[idx] + running_total * discount_rate
        discounted[idx] = running_total
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_rate):
    '''
    Discount the rewards of every episode and normalize them jointly.

    all_rewards:   list of per-episode reward sequences.
    discount_rate: discount factor forwarded to helper_discount_rewards.

    Returns a list with one array per episode holding the discounted rewards
    shifted by the mean and scaled by the standard deviation computed over
    ALL episodes together. An empty input yields an empty list. If every
    discounted reward is identical (zero standard deviation) the result is
    all zeros rather than NaN.
    '''
    all_discounted_rewards = [helper_discount_rewards(rewards, discount_rate)
                              for rewards in all_rewards]

    # np.concatenate refuses an empty list, and there is nothing to normalize
    if not all_discounted_rewards:
        return []

    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()

    # Avoid dividing by zero: with no spread every centred value is already 0
    if reward_std == 0:
        reward_std = 1.0

    return [(discounted_rewards - reward_mean)/reward_std for discounted_rewards in all_discounted_rewards]

########################################
#### TRAINING SESSION #################
######################################

env = gym.make("CartPole-v0")
with tf.Session() as sess:
    sess.run(init)

    # Each iteration plays a batch of games and then applies ONE update
    for iteration in range(250):
        all_rewards = []
        all_gradients = []

        # Play 10 game rounds
        for game in range(10):
            current_rewards = []
            current_gradients = []
            # Gymnasium's reset() returns an (observation, info) pair
            observations, info = env.reset()

            # Only allow 1000 steps per game
            for step in range(1000):
                # Sample an action and fetch the gradients that would make it more likely
                action_val, gradients_val = sess.run([action, gradients], feed_dict={X: observations.reshape(1, num_inputs)})
                # Perform the action; Gymnasium's step() returns 5 values
                observations, reward, terminated, truncated, info = env.step(action_val[0][0])
                done = terminated or truncated
                current_rewards.append(reward)
                current_gradients.append(gradients_val)
                if done: # game ended (pole fell, cart left the track, or time limit hit)
                    break

            # Keep this game's rewards and gradients
            all_rewards.append(current_rewards)
            all_gradients.append(current_gradients)

        all_rewards = discount_and_normalize_rewards(all_rewards,0.95)
        feed_dict = {}

        # For every trainable variable: average its per-step gradients over
        # all games and steps, each weighted by that step's normalized reward.
        for var_index, gradient_placeholder in enumerate(gradient_placeholders):
            mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index]
                                      for game_index, rewards in enumerate(all_rewards)
                                          for step, reward in enumerate(rewards)], axis=0)
            feed_dict[gradient_placeholder] = mean_gradients

        sess.run(training_op, feed_dict=feed_dict)

    print('SAVING GRAPH AND SESSION')
    meta_graph_def = tf.train.export_meta_graph(filename='/models/my-650-step-model.meta')
    saver.save(sess, '/models/my-650-step-model')

# Release the environment's resources once training is finished
env.close()

## Predicting using trained network

In [None]:
#############################################
### RUN TRAINED MODEL ON ENVIRONMENT #######
###########################################

# NOTE(review): Gymnasium only opens a window when the environment is created
# with a render mode (e.g. gym.make('CartPole-v0', render_mode='human'));
# confirm which mode is wanted for the machine this runs on.
env = gym.make('CartPole-v0')

# Gymnasium's reset() returns an (observation, info) pair
observations, info = env.reset()
with tf.Session() as sess:
    # Load the graph definition and the trained weights saved by the training cell
    new_saver = tf.train.import_meta_graph('/models/my-650-step-model.meta')
    new_saver.restore(sess,'/models/my-650-step-model')

    for x in range(500):
        env.render()
        # Only the sampled action is needed at inference time
        action_val = sess.run(action, feed_dict={X: observations.reshape(1, num_inputs)})
        observations, reward, terminated, truncated, info = env.step(action_val[0][0])
        done = terminated or truncated

        # Start a new game when the current one ends instead of stepping a
        # finished episode, which is undefined in Gymnasium.
        if done:
            observations, info = env.reset()

env.close()