# REINFORCE
Now that we know how to use neural networks in TensorFlow, let's implement our first RL algorithm to solve an RL environment. 

## LunarLander
The environment we will use is the OpenAI Lunar Lander environment. In this environment the agent must safely land a spaceship on the moon by carefully controlling the spaceship's thrusters. Below is a screenshot of the environment. Let's quickly evaluate a random agent on the environment to see how well it does.

![Lunar Lander](../images/lunarlander.png)

In [1]:
!pip install gym
!pip install box2d.py



In [2]:
import gym
import numpy as np

# Baseline: play 100 episodes with randomly sampled actions and record the
# total reward collected in each episode.
environment = gym.make("LunarLander-v2")
returns = []
for i in range(100):
    episode_return = 0
    observation = environment.reset()
    done = False
    while not done:
        # The random agent ignores the observation and samples from the action space.
        action = environment.action_space.sample()

        next_observation, reward, done, info = environment.step(action)

        # Accumulate the (undiscounted) reward for this episode.
        episode_return += reward

        observation = next_observation

    returns.append(episode_return)

print("Average Return for Random Agent:", np.average(returns))
 

Average Return for Random Agent: -170.51265698089665


So, a random agent on Lunar Lander gets a score of around -200. A winning score is +200. To create a winning agent we will implement REINFORCE, the simplest policy gradient RL algorithm.

## REINFORCE


In [1]:
!pip install tensorflow
!pip install tensorflow_probability

Collecting tensorflow_probability
  Downloading tensorflow_probability-0.13.0-py2.py3-none-any.whl (5.4 MB)
[K     |████████████████████████████████| 5.4 MB 8.2 MB/s 
[?25hCollecting dm-tree
  Using cached dm_tree-0.1.6-cp37-cp37m-manylinux_2_24_x86_64.whl (93 kB)
Installing collected packages: dm-tree, tensorflow-probability
Successfully installed dm-tree-0.1.6 tensorflow-probability-0.13.0


In [3]:
import numpy as np

def reward_to_go(rewards, gamma):
    """Compute the discounted reward-to-go for every step of one trajectory.

    Entry ``t`` of the result is ``rewards[t] + gamma * result[t + 1]``, with
    the final entry equal to the final reward.

    Args:
        rewards: Sequence of per-step rewards from a single trajectory.
        gamma: Discount factor applied to each subsequent step.

    Returns:
        A float32 NumPy array with the same shape as ``rewards``. An empty
        input yields an empty array instead of raising ``IndexError``.
    """
    rewards = np.array(rewards)
    rewtg = np.zeros_like(rewards, dtype='float32')
    if rewards.size == 0:
        # Nothing to accumulate; indexing the last element would fail.
        return rewtg

    last = len(rewards) - 1
    rewtg[last] = rewards[last]
    # Walk backwards so each step can reuse the already-computed tail.
    for t in reversed(range(last)):
        rewtg[t] = rewards[t] + gamma * rewtg[t + 1]
    return rewtg

In [4]:
import tensorflow as tf
import tensorflow_probability as tfp

class REINFORCEAgent():
    """Policy-gradient agent with a small dense network producing action logits.

    Experience is accumulated with ``store`` and consumed by ``learn``, which
    performs one gradient step and then empties the buffer.
    """

    def __init__(self, act_dim, hidden_size=100, learning_rate=1e-3, gamma=0.9):
        """Build the policy network and optimizer.

        Args:
            act_dim: Number of output logits (one per action).
            hidden_size: Width of the single hidden layer.
            learning_rate: Learning rate passed to the Adam optimizer.
            gamma: Discount factor used when computing reward-to-go.
        """
        self.policy_network = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(hidden_size),
                tf.keras.layers.ReLU(),
                tf.keras.layers.Dense(act_dim),
            ]
        )
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        self.gamma = gamma
        self.replay_buffer = []

    def choose_action(self, observation):
        """Sample an action from the categorical policy for one observation."""
        observation = tf.convert_to_tensor(observation)
        # The network is called on a batch, so add a leading axis of size 1.
        observation = tf.expand_dims(observation, axis=0)
        logits = self.policy_network(observation)
        policy = tfp.distributions.Categorical(logits=logits)
        action = policy.sample()
        # Drop the batch axis again before handing the action back.
        return action.numpy()[0]

    def store(self, observation, action, reward, next_observation):
        """Append one transition tuple to the replay buffer."""
        experience_tuple = (observation, action, reward, next_observation)
        self.replay_buffer.append(experience_tuple)

    def learn(self):
        """Apply one REINFORCE gradient step using the buffered transitions.

        The whole buffer is passed to ``reward_to_go`` as a single trajectory,
        and the loss is the negative sum of ``log_prob * reward_to_go``. The
        buffer is emptied afterwards. Calling this with an empty buffer is a
        no-op.
        """
        if not self.replay_buffer:
            # zip(*[]) yields nothing to unpack; there is nothing to learn from.
            return

        observations, actions, rewards, _ = zip(*self.replay_buffer)

        observations = tf.convert_to_tensor(observations)
        actions = tf.convert_to_tensor(actions)

        rewtg = reward_to_go(rewards, self.gamma)
        rewtg = tf.convert_to_tensor(rewtg)

        with tf.GradientTape() as tape:
            logits = self.policy_network(observations)
            policy = tfp.distributions.Categorical(logits=logits)
            log_probs = policy.log_prob(actions)

            # Negated because the optimizer minimizes the loss.
            loss = -tf.reduce_sum(log_probs * rewtg)

        variables = self.policy_network.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))

        self.replay_buffer = []


2021-08-02 15:43:42.023670: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-08-02 15:43:42.023702: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [5]:
def run_episode(environment, agent):
    """Play one episode, recording every transition with the agent.

    At each step the agent picks an action for the current observation, the
    environment is advanced, and the resulting transition is handed to
    ``agent.store``. The loop ends when the environment reports ``done``.

    Args:
        environment: Object exposing ``reset()`` and ``step(action)``.
        agent: Object exposing ``choose_action(observation)`` and ``store(...)``.

    Returns:
        The sum of the rewards received during the episode.
    """
    state = environment.reset()
    total_reward = 0
    finished = False

    while not finished:
        chosen = agent.choose_action(state)
        following_state, step_reward, finished, _ = environment.step(chosen)
        agent.store(state, chosen, step_reward, following_state)

        # Advance to the next observation before the next decision.
        state = following_state
        total_reward += step_reward

    return total_reward


In [6]:
import gym

# Build the environment and size the policy output from its action count.
environment = gym.make("LunarLander-v2")

act_dim = environment.action_space.n

agent = REINFORCEAgent(act_dim)

# Smoke test: roll out one episode with the untrained agent.
episode_return = run_episode(environment, agent)
print("Episode Return:", episode_return)

# run_episode stores every transition on the agent and only learn() clears
# them. Drop the smoke-test transitions so that the first learn() call during
# training does not treat two separate episodes as a single trajectory.
agent.replay_buffer = []

2021-08-02 15:43:52.937823: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-08-02 15:43:52.937855: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-08-02 15:43:52.937877: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (pop-os): /proc/driver/nvidia/version does not exist
2021-08-02 15:43:52.938185: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Episode Return: -507.0328812567266


In [7]:
import datetime

class Logger():
    """Writes scalar logs to a timestamped TensorBoard run directory."""

    def __init__(self, logdir="./logs/"):
        """Create a summary writer in a new timestamped sub-directory.

        Args:
            logdir (str): Parent directory for runs. A trailing path separator
                is optional.
        """
        self.summary_writer = tf.summary.create_file_writer(
            self._timestamped_logdir(logdir)
        )

    @staticmethod
    def _timestamped_logdir(logdir):
        """Return ``logdir`` joined with a ``YYYYMMDD-HHMMSS`` run name."""
        # Local import keeps this block self-contained within the notebook cell.
        import os

        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        # Plain string concatenation only forms a valid path when logdir ends
        # with a separator; join handles both cases.
        return os.path.join(logdir, current_time)

    def write(self, step, logs):
        """Write logs to tensorboard.

        Args:
            step (Int): Training step of the logs.
            logs (Dict[str, float]): Dictionary of logs to be written to tensorboard.
        """
        with self.summary_writer.as_default():
            for key, value in logs.items():
                tf.summary.scalar(key, value, step=step)




In [10]:
def train(environment, agent, logger, num_iter=5000):
    """Train the agent, performing one policy update per episode.

    Each iteration runs a single episode with ``run_episode``, calls
    ``agent.learn()``, and logs the episode return under the key ``"return"``.

    Args:
        environment: Environment passed through to ``run_episode``.
        agent: Agent passed through to ``run_episode``; must expose ``learn()``.
        logger: Object exposing ``write(step=..., logs=...)``.
        num_iter (int): Number of episodes (and updates) to run.
    """
    for i in range(num_iter):
        score = run_episode(environment, agent)
        agent.learn()

        logger.write(step=i, logs={"return": score})

In [13]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 30024), started 0:00:18 ago. (Use '!kill 30024' to kill it.)

In [None]:
%tensorboard --logdir logs/

In [14]:
# Start a new timestamped TensorBoard run under the default log directory,
# then train with the default number of iterations.
logger = Logger()

train(environment, agent, logger)