[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kinalmehta/Reinforcement-Learning-Notebooks/blob/master/Policy%20Gradients/Synchronous_A2C_torch.ipynb)

### Basic Setup step in **Colab**

In [None]:
# Remove " > /dev/null 2>&1" from a command to see its output.
# Python and system packages for gym plus an X virtual framebuffer and ffmpeg.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

# System packages (cmake, OpenMPI headers, zlib) — presumably build
# prerequisites for the pip installs below; TODO confirm.
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!apt-get install libopenmpi-dev > /dev/null 2>&1
!apt-get install zlib1g-dev > /dev/null 2>&1

# NOTE: the setuptools upgrade is the only command whose output is not discarded.
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari,box2d,classic_control] > /dev/null 2>&1 # gym with the atari, box2d and classic_control extras
!pip install stable-baselines[mpi] > /dev/null 2>&1

In [None]:
# %tensorflow_version 2.x
# Colab magic selecting the TensorFlow major version. 1.x is presumably chosen
# because the stable-baselines package installed in the setup cell targets
# TensorFlow 1.x — TODO confirm.
%tensorflow_version 1.x


#### Adding a virtual display for rendering

In [None]:
from pyvirtualdisplay import Display
# Start a 1400x900 virtual display; visible=0 presumably keeps it off-screen
# so that env rendering works on a headless Colab VM — TODO confirm.
display = Display(visible=0, size=(1400, 900))
display.start()

#### Uncomment the cell below to mount Google Drive and save model and video outputs there

In [None]:

# from google.colab import drive
# drive.mount('/content/gdrive')

# root_path = 'gdrive/My Drive/Colab Notebooks/RL/'
# import os
# os.chdir(root_path)

### Standard imports and notebook setup

In [None]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import tensorflow as tf
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay

In [None]:

from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv, VecVideoRecorder


In [None]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)"
"""

def show_video():
    """Embed the most recently modified ``video/*.mp4`` file in the notebook.

    The file is base64-encoded into an inline HTML ``<video>`` element and
    shown through ``ipythondisplay.display``. If no mp4 file exists under
    ``video/``, the message "Could not find video" is printed instead.
    """
    # Local import: the notebook's import cell does not bring in ``os``.
    import os

    mp4list = glob.glob('video/*.mp4')
    if not mp4list:
        print("Could not find video")
        return

    # glob returns paths in arbitrary order, so select the newest recording
    # explicitly instead of relying on the last list element.
    latest = max(mp4list, key=os.path.getmtime)

    # Read-only binary mode is sufficient, and the context manager guarantees
    # the file handle is closed.
    with open(latest, 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())

    # you can add "loop" after autoplay to keep the video looping after it ends
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                     controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                 </video>'''.format(encoded.decode('ascii'))))


def wrap_env(env):
    """Wrap *env* so that rollouts are recorded as mp4 files under ``./video``.

    :param env: a vectorized environment (detected by its ``num_envs``
        attribute) or a plain gym environment
    :return: the recording wrapper around *env*
    """
    if hasattr(env, "num_envs"):
        # VecVideoRecorder needs a record_video_trigger callable; without it
        # the constructor raises TypeError. Start recording at the first step.
        return VecVideoRecorder(
            env, './video', record_video_trigger=lambda step: step == 0
        )
    # A plain gym env (what the "watch the trained agent" cell passes) cannot
    # be wrapped by VecVideoRecorder; use gym's Monitor wrapper instead.
    # force=True lets Monitor reuse a ./video directory left by an earlier run.
    return Monitor(env, './video', force=True)

## ADD Your Algorithm Below

In [None]:

def make_env(env_id, rank, seed=0):
    """
    Build a zero-argument factory that creates and seeds one environment.

    Nothing is created until the returned callable is invoked.

    :param env_id: (str) the environment ID handed to ``gym.make``
    :param rank: (int) index of the sub-environment; added to ``seed``
    :param seed: (int) base seed for the RNG
    :return: (callable) factory returning the seeded environment
    """
    def _build():
        created = gym.make(env_id)
        # Offset the base seed by rank so each sub-environment gets its own seed.
        created.seed(seed + rank)
        return created

    return _build


In [None]:
# Number of environment worker processes to launch.
num_cpu = 6

# Environment IDs available for experiments.
env_list = [
    "CartPole-v0",
    "LunarLander-v2",
    "MsPacman-ram-v0",
    "CartPole-v0",
    "MountainCar-v0",
    "Breakout-ram-v4",
    "Acrobot-v1",
]

# Environment used by the cells below (index into env_list).
env_to_use = env_list[1]


In [None]:

# Create the vectorized environment and smoke-test it with random actions.
env = SubprocVecEnv([make_env(env_to_use, i) for i in range(num_cpu)])

try:
    s0 = env.reset()
    print(s0.shape)
    actions = [env.action_space.sample() for _ in range(num_cpu)]
    print(actions)
    env.step(actions)

    for i in range(100):
        actions = [env.action_space.sample() for _ in range(num_cpu)]
        obs, ret, done, info = env.step(actions)
        print(obs, done, info)
finally:
    # Shut the worker subprocesses down: the training cell builds its own
    # SubprocVecEnv, so leaving this one open would orphan num_cpu processes.
    env.close()


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ACNet(nn.Module):
    """Actor-critic network with a shared two-layer trunk and two linear heads."""

    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):
        """Seed torch's RNG and create the layers.

        Params
        ======
            state_size (int): Number of input features per state
            action_size (int): Number of outputs of the actor head
            seed (int): Value passed to ``torch.manual_seed``
            fc1_units (int): Width of the first hidden layer
            fc2_units (int): Width of the second hidden layer
        """
        super().__init__()
        # Seeds the global torch RNG before the layers are created.
        torch.manual_seed(seed)

        # Shared trunk.
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        # Heads: one output per action, and a single scalar output.
        self.actor = nn.Linear(fc2_units, action_size)
        self.critic = nn.Linear(fc2_units, 1)

    def forward(self, state):
        """Return ``(actor_output, critic_output)`` for ``state``.

        The critic output has its trailing size-1 dimension squeezed away.
        """
        hidden = F.relu(self.fc2(F.relu(self.fc1(state))))
        actor_out = self.actor(hidden)
        critic_out = self.critic(hidden).squeeze(-1)
        return actor_out, critic_out


class PGLoss(nn.Module):
    """Policy-gradient loss: negative mean of weighted action log-probabilities."""

    def forward(self, policy, obs, act, rew_wt):
        """Return ``-mean(policy.log_prob(act) * rew_wt)``.

        ``obs`` is accepted but not used by this loss.
        """
        weighted_logp = policy.log_prob(act) * rew_wt
        return weighted_logp.mean().neg()


In [None]:
import torch
import torch.optim as optim

from torch.distributions.categorical import Categorical

from collections import defaultdict

class Agent:
    """Synchronous advantage actor-critic agent trained on a vectorized env.

    Every training step takes one action in each sub-environment, forms a
    one-step bootstrapped target ``r + gamma * V(s') * (1 - done)`` and applies
    a single gradient update to the shared actor-critic network.
    """

    def __init__(self, env, batch_size, gamma=0.9, seed=4):
        """Build the network and loss modules for ``env``.

        :param env: environment exposing ``observation_space.shape`` and a
            discrete ``action_space.n``; ``train`` additionally expects the
            batched ``reset``/``step`` interface of a vectorized env
        :param batch_size: stored on the instance; no method of this class
            reads it
        :param gamma: discount factor used in the bootstrapped target
        :param seed: seed forwarded to ``ACNet``
        """
        self.env = env
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n

        self.policy_network = ACNet(self.state_size, self.action_size, seed)
        self.actor_loss = PGLoss()
        self.critic_loss = nn.MSELoss()
        self.batch_size = batch_size

        # metric name -> list of values, appended to every ``log_every`` steps
        self.train_stats = defaultdict(list)
        self.gamma = gamma

    def get_net_op(self, obs):
        """Run the network on ``obs``; return ``(Categorical policy, critic output)``."""
        actor, critic = self.policy_network(torch.as_tensor(obs, dtype=torch.float32))
        return Categorical(logits=actor), critic

    def get_policy(self, obs):
        """Return only the ``Categorical`` action distribution for ``obs``."""
        policy, _ = self.get_net_op(obs)
        return policy

    def get_action(self, policy):
        """Sample one action from ``policy`` and return it as a Python number.

        Relies on ``.item()``, so the sample must contain exactly one element.
        """
        return policy.sample().item()

    def train(self, epochs, log_every=5000):
        """Run ``epochs`` environment steps with one gradient update per step.

        :param epochs: number of vectorized environment steps / updates
        :param log_every: append to ``train_stats`` and print a summary every
            this many steps; a falsy value disables logging
        """
        optimizer = optim.Adam(self.policy_network.parameters(), lr=1e-2)
        value_coef = 0.05
        entropy_coef = 0.01

        cur_obs = self.env.reset()
        # Use the agent's own env rather than notebook globals for the width.
        n_envs = getattr(self.env, "num_envs", len(cur_obs))

        total_rewards = 0.0
        total_episodes = 0
        episode_rewards = np.zeros(n_envs)

        for step in range(epochs):
            # Forward pass with the *current* weights. Reusing outputs computed
            # before the previous optimizer.step() would backpropagate through
            # a graph whose parameters have since been modified in place.
            cur_policy, cur_ret = self.get_net_op(cur_obs)
            cur_action = cur_policy.sample()

            next_obs, cur_reward, done, _ = self.env.step(list(cur_action.numpy()))

            # The bootstrap value is a fixed target, so no graph is needed.
            with torch.no_grad():
                _, next_ret = self.policy_network(
                    torch.as_tensor(next_obs, dtype=torch.float32)
                )
            not_done = 1 - torch.as_tensor(done, dtype=torch.float32)
            target_ret = (
                torch.as_tensor(cur_reward, dtype=torch.float32)
                + self.gamma * next_ret * not_done
            )
            advantage = (target_ret - cur_ret).detach()

            optimizer.zero_grad()

            actor_loss_val = self.actor_loss(cur_policy, cur_obs, cur_action, advantage)
            critic_loss_val = self.critic_loss(cur_ret, target_ret)
            entropy_loss_val = cur_policy.entropy().mean()
            # Entropy is a bonus: subtract it so that minimizing the loss
            # favours higher-entropy (more exploratory) policies.
            loss = (
                actor_loss_val
                + value_coef * critic_loss_val
                - entropy_coef * entropy_loss_val
            )

            loss.backward()
            optimizer.step()

            cur_obs = next_obs

            # Credit this step's reward to every sub-environment first, so the
            # reward of a terminal transition is not lost, then flush the
            # episodes that just finished.
            episode_rewards += cur_reward
            if np.any(done):
                finished = np.flatnonzero(done)
                total_episodes += len(finished)
                total_rewards += np.sum(episode_rewards[finished])
                episode_rewards[finished] = 0

            if log_every and (step + 1) % log_every == 0:
                self.train_stats["actor_loss"] += [actor_loss_val.item()]
                self.train_stats["critic_loss"] += [critic_loss_val.item()]
                self.train_stats["total_loss"] += [loss.item()]
                # Mean return over all episodes finished so far; the epsilon
                # avoids division by zero before the first episode ends.
                self.train_stats["returns"] += [total_rewards/(total_episodes+1e-8)]
                print("Epoch:", step, actor_loss_val.item(), critic_loss_val.item(), entropy_loss_val.item(), loss.item(), self.train_stats["returns"][-1])

    def plot_train_stats(self):
        """Plot every recorded training metric in its own matplotlib figure."""
        if not self.train_stats:
            print("first train to print train stats")
            return
        for name, values in self.train_stats.items():
            plt.plot(values)
            plt.xlabel("Epoch")
            plt.ylabel(name)
            plt.show()


In [None]:

# Report whether CUDA is visible to torch.
print("GPU available:", torch.cuda.is_available())

# One environment factory per worker process, each with its own rank.
env = SubprocVecEnv([make_env(env_to_use, rank) for rank in range(num_cpu)])

# Build the agent and run 20 * 5000 training steps.
agent = Agent(env=env, batch_size=5000)
agent.train(epochs=20 * 5000)



In [None]:
# Plot each metric recorded during training, one figure per metric.
agent.plot_train_stats()

In [None]:
# watch the trained agent
env = wrap_env(gym.make(env_to_use))
try:
    state = env.reset()
    done = False
    while not done:
        # Inference only, so skip building an autograd graph.
        with torch.no_grad():
            policy, _ = agent.get_net_op(state)
        action = agent.get_action(policy)
        env.render()
        state, reward, done, _ = env.step(action)
finally:
    # Close the env even if the rollout raises, so the wrapper can release
    # its resources.
    env.close()
show_video()