In [None]:
# System and Python packages needed by the notebook.
# -y answers apt-get's confirmation prompt so the install cannot stall or abort
# when the kernel has no interactive stdin.
!apt-get install -y -qq python-opengl
!apt-get install swig cmake libopenmpi-dev zlib1g-dev xvfb x11-utils ffmpeg -qq 
!pip install box2d box2d-kengz pyvirtualdisplay pyglet --quiet

In [None]:
# Fetch the RL helper package imported in the next cell; the Env-Branch branch
# is cloned instead of master.
# !git clone https://github.com/kp425/RL.git #clones master
!git clone --branch Env-Branch https://github.com/kp425/RL.git

In [None]:
from RL.utils import timer, normalize
from RL.env import SubprocVecEnv, VecFrameStack, make_env, make_atari_env
from RL.GraphOps import Datapoint, live_plot
from RL.policies.Policies import make_policy


import tensorflow as tf
from tensorflow.keras import layers, Model, Input, Sequential
import tensorflow_probability as tfp

import os
import gym
import numpy as np
from collections import deque, namedtuple
from IPython.display import clear_output
from pyvirtualdisplay import Display
import matplotlib.pyplot as plt
import PIL
%matplotlib inline

# Start a hidden 1400x900 virtual display; presumably needed so gym can render
# frames on a machine without a screen — confirm.
# NOTE(review): the name `display` shadows IPython's display() helper in this notebook.
display = Display(visible=0, size=(1400, 900)).start()

# Set up the environment (multiprocessing wrapper)

In [None]:
# Environment to train on; keep exactly one of these lines active.
# env_name = "MountainCarContinuous-v0"
env_name = "CartPole-v0"
# env_name = "Pendulum-v0"

# Number of environment factories handed to SubprocVecEnv.
num_envs = 8
# Separate single environment, used by test() for evaluation.
test_env = gym.make(env_name)
envs = [make_env(env_name) for i in range(num_envs)]
env = SubprocVecEnv(envs)
# Location passed to make_policy as save_path; the video cell also writes here.
# save_path = "drive/My Drive/Colab Notebooks/SeriousRL/Trained/{}".format(env_name)
save_path = "RL/Trained/{}".format(env_name)
# save_path = None


#Atari specific

# env_name = "Assault-v0"
# num_envs = 2
# env = gym.make(env_name)
# test_env = gym.make(env_name)

# envs = [make_atari_env(env_name) for i in range(num_envs)]
# env = SubprocVecEnv(envs)
# # env = VecFrameStack(env, 4)

# print(env.reset().shape)

# save_path = "drive/My Drive/Colab Notebooks/SeriousRL/Trained/{}".format(env_name)

# Set up the function approximator and policy

In [None]:
def mlp_net(input_shape, n_outputs):
    """Build an MLP with separate policy and value towers.

    Args:
        input_shape: Shape of a single observation, passed to keras ``Input``.
        n_outputs: Number of units in the policy head.

    Returns:
        A keras ``Model`` mapping observations to ``[policy_head, value_head]``:
        a softmax over ``n_outputs`` units and one unbounded value estimate.
    """
    inputs = Input(shape = input_shape)

    # The two towers share only the input tensor, not any hidden layer.
    policy_layers = [layers.Dense(128, activation = tf.nn.relu, name = "policy_layers")]
    p_inputs = inputs
    for p_layer in policy_layers:
        p_inputs = p_layer(p_inputs)

    value_layers = [layers.Dense(128, activation = tf.nn.relu, name = "value_layers")]
    v_inputs = inputs
    for v_layer in value_layers:
        v_inputs = v_layer(v_inputs)

    policy_head = layers.Dense(n_outputs, activation = tf.nn.softmax, name = "policy_head")(p_inputs)
    # Linear output: a tanh here would confine the critic to [-1, 1], while the
    # GAE return targets it is regressed on are discounted reward sums with no
    # such bound.
    value_head = layers.Dense(1, activation = None, name = "value_head")(v_inputs)
    model = Model(inputs = [inputs], outputs = [policy_head, value_head])

    return model

# Learning rate for the Adam optimizer created below.
lr = 0.0001
# Build the policy from the vectorized env's spaces, passing mlp_net as the `net` argument.
model = make_policy(env.observation_space, env.action_space, net = mlp_net, save_path=save_path)
model.get_architecture()
optimizer = tf.keras.optimizers.Adam(learning_rate = lr)

Using default net...
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 4)]          0                                            
__________________________________________________________________________________________________
policy_layers (Dense)           (None, 128)          640         input_3[0][0]                    
__________________________________________________________________________________________________
value_layers (Dense)            (None, 128)          640         input_3[0][0]                    
__________________________________________________________________________________________________
policy_head (Dense)             (None, 2)            258         policy_layers[0][0]              
_______________________________________________________________________

# Compute returns using Generalized Advantage Estimation (GAE)

In [None]:
def test(n_episodes = 5):
    """Play ``n_episodes`` full episodes on ``test_env`` with the current model.

    Prints the list of per-episode scores and returns their mean.
    """
    def run_episode():
        # One episode: step until the environment reports done, summing rewards.
        observation = test_env.reset()
        total_reward = 0
        finished = False
        while not finished:
            action, _, _ = model(observation)
            observation, reward, finished, _ = test_env.step(action.numpy()[0])
            total_reward += reward
        return total_reward

    scores = [run_episode() for _ in range(n_episodes)]
    print(scores)
    return sum(scores)/len(scores)

def compute_returns_with_gae(next_state, rewards, values, masks):
    """Compute per-step return targets with Generalized Advantage Estimation.

    Reads the module-level ``gamma`` and ``smoothing_factor``.

    Args:
        next_state: Observation following the last collected step; the model's
            value estimate for it bootstraps the final step.
        rewards: Sequence of per-step rewards.
        values: List of per-step value estimates aligned with ``rewards``.
            The caller's list is not modified.
        masks: Sequence of per-step continuation masks (``ppo_collect`` stores
            ``1.0 - done``); a zero cuts both the bootstrap term and the
            running advantage at that step.

    Returns:
        A list with one entry per step, in time order, holding
        ``advantage + value`` for that step.
    """
    # Extra trailing entry so that values[i + 1] exists for the last step.
    values = values + [np.squeeze(model(next_state)[2])]
    gae = 0
    reversed_returns = []
    for i in reversed(range(len(rewards))):
        delta = (rewards[i] + gamma * values[i + 1] * masks[i]) - values[i]
        gae = delta + gamma * smoothing_factor * gae * masks[i]
        # Append now and reverse once at the end: inserting at index 0 on
        # every iteration shifts the whole list each time.
        reversed_returns.append(gae + values[i])
    reversed_returns.reverse()
    return reversed_returns

# Training PPO

In [None]:
# Clipping range for the probability ratio in policy_loss_fn (applied only when > 0).
CLIP_RATIOS = 0.2
# Clipping range for value updates in value_loss_fn; 0.0 selects the plain squared error.
CLIP_VALUES = 0.0
# Discount factor read by compute_returns_with_gae.
gamma = 0.99
# Environment steps per rollout (train passes this as ppo_collect's sample_size).
PPO_STEPS = 128
# Default batch size in make_dataset, i.e. samples per gradient step.
MINI_BATCH_SIZE = 4
# Number of times make_dataset repeats each rollout.
PPO_EPOCHS = 4
# GAE coefficient that multiplies gamma in the running advantage.
smoothing_factor = 0.95


# Per-step loss traces, filled by train() when collect_data is true.
policy_dp = Datapoint("policy_loss")
value_dp = Datapoint("value_loss")
entropy_dp = Datapoint("entropy_loss")
loss_dp = Datapoint("loss_dp")
# NOTE(review): train() keeps its own local counter of the same name; this
# module-level value is not read by any code visible in this notebook.
train_step_counter = 0

# Averages over each reporting window, plus the evaluation score from test().
policy_dp_avg = Datapoint("policy_loss_avg")
value_dp_avg = Datapoint("value_loss_avg")
entropy_dp_avg = Datapoint("entropy_loss_avg")
loss_dp_avg = Datapoint("loss_dp_avg")
test_dp = Datapoint("test")

class DataPerEpoch:
    """Holder for the most recently computed loss values.

    The loss functions overwrite these class attributes on each call, and
    ``train`` reads them back for logging. All four start at 0.0.
    """
    policy_loss = value_loss = entropy_loss = loss = 0.0



def make_dataset(*args, n_samples = PPO_STEPS, batch_size = MINI_BATCH_SIZE):
    """Turn aligned rollout tensors into a shuffled, repeated, batched dataset.

    Args:
        *args: Tensors/arrays sharing the same leading dimension. They may be
            passed as separate positional arguments or as one tuple/list
            holding all of them; both forms give the same dataset.
        n_samples: Shuffle buffer size.
        batch_size: Number of samples per emitted batch.

    Returns:
        A ``tf.data.Dataset`` whose elements are tuples with one batched
        component per input, covering the data ``PPO_EPOCHS`` times.
    """
    # Accept make_dataset(a, b, c) as well as make_dataset((a, b, c)).
    if len(args) == 1 and isinstance(args[0], (tuple, list)):
        args = tuple(args[0])

    pack = tuple(tf.data.Dataset.from_tensor_slices(component) for component in args)
    # zip takes one (nested) structure of datasets; the former `(*pack)` spelling
    # was a SyntaxError because a starred expression needs a trailing comma.
    dataset = tf.data.Dataset.zip(pack)
    # NOTE(review): train() collects PPO_STEPS * num_envs samples per rollout,
    # so the default buffer is smaller than the data and the shuffle is only
    # approximate — pass a larger n_samples if a full shuffle is wanted.
    dataset = dataset.shuffle(n_samples, reshuffle_each_iteration = True).repeat(PPO_EPOCHS)
    dataset = dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    return dataset


def ppo_collect(sample_size = 50, gamma = gamma, smoothing_factor = smoothing_factor):
    """Roll the current model through ``env`` and package the experience.

    The vectorized environment is reset, stepped ``sample_size`` times with
    actions from ``model``, and the per-step records are concatenated along
    the first axis.

    Args:
        sample_size: Number of vectorized steps to collect.
        gamma: Accepted but not used here; ``compute_returns_with_gae`` reads
            the module-level value.
        smoothing_factor: Accepted but not used here, for the same reason.

    Returns:
        ``(states, actions, log_probs, values, returns, advantages)``, each
        with leading dimension ``sample_size * num_envs`` (checked below).
    """
    obs_buf, act_buf, logp_buf, val_buf, rew_buf, mask_buf = ([] for _ in range(6))

    obs = env.reset()  # Try resetting to random observation
    for _ in range(sample_size):
        action, dist, value = model(obs)
        logp = dist.log_prob(action)
        next_obs, reward, done, _ = env.step(action.numpy())

        obs_buf.append(obs)
        act_buf.append(action)
        logp_buf.append(logp)
        val_buf.append(np.squeeze(value))
        rew_buf.append(reward)
        mask_buf.append(1.0-done)  # 0.0 where the step ended an episode

        obs = next_obs

    gae_returns = compute_returns_with_gae(next_obs, rew_buf, val_buf, mask_buf)

    # Merge the per-step records into one tensor per quantity.
    states = tf.concat(obs_buf, axis = 0)
    actions = tf.squeeze(tf.concat(act_buf, axis = 0))
    log_probs = tf.squeeze(tf.concat(logp_buf, axis = 0))
    values = tf.concat(val_buf, axis = 0)
    returns = tf.cast(tf.concat(gae_returns, axis = 0), dtype = tf.float32)
    advantages = returns - values

    n = sample_size * num_envs
    flat = (n,)
    # To skip checking the observation shape, the first entry can be relaxed
    # to (n, None).
    tf.debugging.assert_shapes([(states, flat + tuple(env.observation_space.shape)),
                                (actions, flat),
                                (log_probs, flat),
                                (values, flat),
                                (returns, flat),
                                (advantages, flat)])

    return states, actions, log_probs, values, returns, advantages
    

def policy_loss_fn(dist, actions, old_log_probs, advantages, clip = CLIP_RATIOS):
    """PPO policy loss: negated mean of the (optionally clipped) surrogate.

    Args:
        dist: Action distribution produced by the current model.
        actions: Actions taken during collection.
        old_log_probs: Log-probabilities of ``actions`` recorded at collection.
        advantages: Advantage estimate per sample.
        clip: Half-width of the ratio clipping interval around 1.0; a value
            that is not positive disables clipping.

    Returns:
        The scalar loss. It is also stored on ``DataPerEpoch.policy_loss``.
    """
    use_clipping = clip > 0.0
    expected = (MINI_BATCH_SIZE,)

    with tf.name_scope(name = "policy_loss"):
        log_probs_now = tf.squeeze(dist.log_prob(actions))
        prob_ratio = tf.exp(log_probs_now - old_log_probs, name = "ratios")
        unclipped = tf.math.multiply(prob_ratio, advantages, name = "surr1")
        if use_clipping:
            bounded_ratio = tf.clip_by_value(prob_ratio, 1.0-clip, 1.0+clip, name = "clipped_ratios")
            clipped = tf.math.multiply(bounded_ratio, advantages, name = "surr2")
            # Taking the element-wise minimum keeps the more pessimistic surrogate.
            objective = tf.math.minimum(unclipped, clipped)
        else:
            objective = unclipped
        policy_loss = -tf.reduce_mean(objective)

        # Every per-sample tensor must be one mini-batch long.
        tf.debugging.assert_shapes([(log_probs_now, expected),
                                    (prob_ratio, expected),
                                    (unclipped, expected),
                                    (policy_loss, (1,))])
        if use_clipping:
            tf.debugging.assert_shapes([(bounded_ratio, expected),
                                        (clipped, expected)])

    DataPerEpoch.policy_loss = policy_loss
    return policy_loss


def approx_kl(old_log_probs, new_log_probs):
    """Return half the mean squared difference of the two log-probability sets.

    This is a cheap second-order estimate of the KL divergence between the
    policy that collected the data and the current one.

    Args:
        old_log_probs: Log-probabilities recorded when the data was collected.
        new_log_probs: Log-probabilities of the same actions under the current
            policy.
    """
    # The previous body referenced names that do not exist in this notebook
    # (neglogpac, self.old_neglog_pac_ph) and raised NameError when called.
    return .5 * tf.reduce_mean(tf.square(new_log_probs - old_log_probs))


def value_loss_fn(values, old_values, returns, clip = CLIP_VALUES):
    """Mean squared error between value predictions and return targets.

    Args:
        values: Value predictions from the current model; squeezed before use.
        old_values: Value predictions recorded at collection time. Only read
            when clipping is enabled.
        returns: Return targets.
        clip: When positive, the prediction may move at most ``clip`` away
            from ``old_values``, and the larger of the clipped and unclipped
            errors is used per sample.

    Returns:
        The scalar loss. It is also stored on ``DataPerEpoch.value_loss``.
    """
    clipping_enabled = clip > 0.0

    with tf.name_scope(name = "value_loss"):
        predicted = tf.squeeze(values, name = "values")
        raw_error = tf.math.squared_difference(returns, predicted)
        per_sample_error = raw_error
        if clipping_enabled:
            bounded = old_values + tf.clip_by_value(predicted - old_values, -clip, clip)
            bounded_error = tf.math.squared_difference(returns, bounded)
            per_sample_error = tf.math.maximum(raw_error, bounded_error)
        value_loss = tf.reduce_mean(per_sample_error)

    # Every per-sample tensor must be one mini-batch long.
    expected = (MINI_BATCH_SIZE,)
    tf.debugging.assert_shapes([(predicted, expected),
                                (raw_error, expected),
                                (value_loss, (1,))])
    if clipping_enabled:
        tf.debugging.assert_shapes([(bounded, expected),
                                    (bounded_error, expected)])

    DataPerEpoch.value_loss = value_loss
    return value_loss

def entropy_loss_fn(dist):
    """Return the mean entropy of ``dist``.

    The value is also stored on ``DataPerEpoch.entropy_loss``.
    """
    with tf.name_scope(name = "entropy_loss"):
        DataPerEpoch.entropy_loss = tf.reduce_mean(dist.entropy())
    return DataPerEpoch.entropy_loss


def loss_fn(states, actions, old_log_probs, old_values, returns, advantages, 
            clip_ratios = CLIP_RATIOS, clip_values = CLIP_VALUES):
    """Combined PPO loss for one mini-batch.

    Args:
        states: Observations of the mini-batch.
        actions: Actions taken at collection time.
        old_log_probs: Log-probabilities of ``actions`` at collection time.
        old_values: Value predictions at collection time.
        returns: Return targets.
        advantages: Advantage estimates.
        clip_ratios: Ratio clipping range forwarded to ``policy_loss_fn``.
        clip_values: Value clipping range forwarded to ``value_loss_fn``.

    Returns:
        ``policy_loss + 0.5 * value_loss - 0.001 * entropy``. The result is
        also stored on ``DataPerEpoch.loss``.
    """
    _, dist, values = model(states)
    # Give the distribution a one-dimensional batch shape of size batch_shape[0]
    # (presumably dropping a trailing singleton axis — confirm against make_policy).
    dist = tfp.distributions.BatchReshape(dist, [dist.batch_shape[0]])

    with tf.name_scope(name = "loss"):
        # Forward the caller's clip settings; the module-level constants used to
        # be passed here, so the two keyword arguments had no effect.
        policy_loss = policy_loss_fn(dist, actions, old_log_probs, advantages, clip = clip_ratios)
        value_loss = value_loss_fn(values, old_values, returns, clip = clip_values)
        entropy_loss = entropy_loss_fn(dist)
        # Entropy is subtracted, so higher entropy lowers the loss.
        loss = policy_loss + 0.5*value_loss - 0.001 * entropy_loss
    DataPerEpoch.loss = loss
    return loss


def train_step(batch):
    """Apply one optimizer update computed from a single mini-batch.

    ``batch`` is unpacked positionally into ``loss_fn``.
    """
    with tf.GradientTape() as tape:
        batch_loss = loss_fn(*batch)
    with tf.name_scope(name = "calculating_grads"):
        variables = model.trainable_variables
        gradients = tape.gradient(batch_loss, variables)
    with tf.name_scope(name = "applying_optimizer"):
        optimizer.apply_gradients(zip(gradients, variables))



def train(max_steps = 10000, collect_data= True, report_every = 250, save_every = 5000,
          threshold_reward = 195.0):
    """Alternate rollout collection and PPO updates until a stop condition is met.

    Args:
        max_steps: Gradient-step budget. It is checked only between rollouts,
            so a rollout that has been started is always consumed in full.
        collect_data: When true, record the per-step losses in the module-level
            Datapoint objects.
        report_every: Every this many gradient steps, average the recorded
            losses, evaluate with ``test()`` and refresh the live plot.
        save_every: Every this many gradient steps, call ``model.save()``.
        threshold_reward: Training saves the model and returns as soon as the
            evaluation score exceeds this value. The default, 195.0, is the
            value that used to be hard-coded.
    """
    train_step_counter = 1

    while train_step_counter < max_steps:

        data = ppo_collect(sample_size = PPO_STEPS)
        # make_dataset is declared with *args, so the rollout tensors are
        # passed as separate positional arguments.
        dataset = make_dataset(*data)
        for batch in dataset:
            train_step(batch)
            if collect_data:
                policy_dp.collect(train_step_counter, DataPerEpoch.policy_loss.numpy())
                value_dp.collect(train_step_counter, DataPerEpoch.value_loss.numpy())
                entropy_dp.collect(train_step_counter, DataPerEpoch.entropy_loss.numpy())
                loss_dp.collect(train_step_counter, DataPerEpoch.loss.numpy())

            if train_step_counter % report_every == 0:
                # Start of the window that ends at the current step.
                # NOTE(review): the averages read the per-step Datapoints, which
                # are only filled when collect_data is true.
                from_index = ((train_step_counter // report_every) - 1)*report_every
                policy_dp_avg.collect(train_step_counter, policy_dp.avg_y(from_index = from_index))
                value_dp_avg.collect(train_step_counter, value_dp.avg_y(from_index = from_index))
                entropy_dp_avg.collect(train_step_counter, entropy_dp.avg_y(from_index = from_index))
                loss_dp_avg.collect(train_step_counter, loss_dp.avg_y(from_index = from_index))
                test_reward = test()
                test_dp.collect(train_step_counter, test_reward)
                live_plot([policy_dp_avg, value_dp_avg, entropy_dp_avg, loss_dp_avg, test_dp])
                if test_reward > threshold_reward:
                    print("Saving....")
                    model.save()
                    return

            if train_step_counter % save_every == 0:
                print("Saving....")
                model.save()
            train_step_counter += 1

        

# Start training; the step budget given here replaces train()'s default of 10000.
train(max_steps = 25000)

# Make a video using the trained policy

In [None]:
# import os
# os.system("Xvfb :1 -screen 0 1024x768x24 &")
# os.environ['DISPLAY'] = ':1'

import imageio
import base64
import IPython

def embed_mp4(filename):
  """Embeds an mp4 file in the notebook.

  The file is read in full, base64-encoded and placed inline in a <video> tag.

  Args:
    filename: Path of the mp4 file to embed.

  Returns:
    An ``IPython.display.HTML`` object wrapping the tag.
  """
  # The context manager closes the file handle even if reading fails.
  with open(filename,'rb') as video_file:
    video = video_file.read()
  b64 = base64.b64encode(video)
  tag = '''
  <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
  Your browser does not support the video tag.
  </video>'''.format(b64.decode())

  return IPython.display.HTML(tag)


def create_policy_eval_video(agent, env_name, directory, filename, num_episodes=1, fps=30):
    """Record ``agent`` acting in ``env_name`` and return the embedded video.

    Args:
        agent: Callable policy; ``agent(state)`` must return a triple whose
            first item is the action tensor.
        env_name: Gym environment id to instantiate for the recording.
        directory: Folder the video is written to; created if missing.
        filename: File name without extension; ``.mp4`` is appended.
        num_episodes: Number of episodes to record back to back.
        fps: Frame rate handed to the imageio writer.

    Returns:
        The result of ``embed_mp4`` for the written file.
    """
    # Use the arguments the caller supplied; the globals `model` and
    # `save_path` used to be read here, which made both parameters inert.
    os.makedirs(directory, exist_ok=True)
    filename = os.path.join(directory, filename + ".mp4")
    video_env = gym.make(env_name)
    try:
        with imageio.get_writer(filename, fps=fps) as video:
            for _ in range(num_episodes):
                state = video_env.reset()
                video.append_data(video_env.render(mode='rgb_array'))
                done = False
                while not done:
                    action,_,_ = agent(state)
                    state,_,done,_ = video_env.step(action.numpy()[0])
                    video.append_data(video_env.render(mode='rgb_array'))
    finally:
        # Release the environment even if recording fails part-way.
        video_env.close()
    return embed_mp4(filename)

# Record one episode (the default) and show the video as this cell's output.
# env_name = "CartPole-v0"
create_policy_eval_video(model, env_name, save_path, "trained-agent")