<a href="https://colab.research.google.com/github/aditya-shriwastava/DeepRL_Experimentation/blob/master/PolicyGradient.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up the project in Google Colab

In [1]:
#Mount the project in Google Colab

GUEST = False
PROJECT = 'DeepRL_Experimentation'

import os
if GUEST:
  !git clone https://github.com/aditya-shriwastava/DeepRL_Experimentation.git
else:
  # Mount Google Drive
  from google.colab import drive
  drive.mount('/content/gdrive')

  # Setup Symlink to project folder
  DRIVE_PATH = '/content/gdrive/My\ Drive/' + PROJECT
  SYM_PATH = '/content/' + PROJECT
  if not os.path.exists(SYM_PATH):
    !ln -s $DRIVE_PATH $SYM_PATH

os.chdir( '/content/' + PROJECT)

Mounted at /content/gdrive


In [2]:
#@title apt install requirements

# Refresh the apt package index; stdout and stderr are both discarded.
!apt update > /dev/null 2>&1
# Install xvfb and ffmpeg without their recommended extra packages.
# All output is discarded, so a failed install is silent here.
# NOTE(review): xvfb is presumably the X server behind the virtual display
# started below and ffmpeg the encoder for the rendered videos - confirm.
!apt install -y --no-install-recommends \
        xvfb \
        ffmpeg > /dev/null 2>&1

In [3]:
#@title pip install requirements

!pip install pyvirtualdisplay > /dev/null 2>&1
!pip3 install box2d-py > /dev/null 2>&1
!pip install Box2D > /dev/null 2>&1

In [4]:
#@title Setup Virtual Display

from pyvirtualdisplay import Display

# Start an off-screen display (visible=0) of 1400x900.
# NOTE(review): presumably needed so env.render() works on the headless
# Colab VM - confirm.
# The object is named `virtual_display` rather than `display` so that
# IPython's built-in `display()` function is not shadowed for the rest of
# the notebook.
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7f0d6d2f8f10>

# Test that the setup is complete and working properly

In [None]:
import gym
from colab_utils import (
    wrap_env,
    show_video
)

# Smoke test: run one short random-action CartPole episode and show the
# resulting video.
# NOTE(review): wrap_env presumably adds video recording to the environment
# and show_video displays the recorded file - confirm in colab_utils.
env = wrap_env(gym.make("CartPole-v1"))

try:
    env.reset()
    # At most 100 random steps; stop early when the episode terminates.
    for _step in range(100):
        env.render(mode='rgb_array')
        _obs, _rew, term, _info = env.step(env.action_space.sample())
        if term:
            break
finally:
    # Close the environment even if a step raised.
    env.close()
show_video()

# Policy Gradient Algorithm Experimentation

In [13]:
#@title Imports
from importlib import reload
import gym

import pg_agent
import MLP_policy
# Reload the project modules so that edits to the .py files are picked up
# without restarting the kernel.
# NOTE(review): MLP_policy is reloaded before pg_agent, presumably because
# pg_agent imports it - confirm.
reload(MLP_policy)
reload(pg_agent)
# Import the agent classes explicitly (after the reload, so the fresh
# definitions are bound) instead of `import *`, which hides where names
# come from.
from pg_agent import REINFORCEAgent, PGAgent

import utils
# Trailing semicolon suppresses the module repr that would otherwise be
# echoed as this cell's output.
reload(utils);

<module 'utils' from '/content/gdrive/My Drive/DeepRL_Experimentation/utils.py'>

## REINFORCE Algorithm Experimentation

In [26]:
# Train a REINFORCE agent on CartPole-v1 and report the expected sum of
# rewards before and after training.
env = gym.make("CartPole-v1")

# Policy-network settings; the action and observation sizes are read from
# the environment's spaces.
actor_params = dict(
    ac_dim=env.action_space.n,
    ob_dim=env.observation_space.shape[0],
    n_layers=5,
    hidden_layer_size=100,
    learning_rate=1e-4,
)

agent_params = dict(actor_params=actor_params)

reinforce_agent = REINFORCEAgent(agent_params)

# NOTE(review): the trailing 200 is presumably the number of evaluation
# rollouts - confirm in utils.expected_rewards_sum.
print(f"E[rewards_sum] before training: {utils.expected_rewards_sum(env, reinforce_agent.actor, env._max_episode_steps, 200)}")

# Training Loop: 150 iterations of "sample trajectories, then update".
# NOTE(review): 5000 is presumably the amount of experience sampled per
# iteration - confirm its unit in utils.sample_trajectories.
for iteration in range(150):
  print(iteration, end=",")
  trajectories = utils.sample_trajectories(
      env,
      reinforce_agent.actor,
      env._max_episode_steps,
      5000,
  )
  reinforce_agent.train(trajectories)

print(f"\nE[rewards_sum] after training: {utils.expected_rewards_sum(env, reinforce_agent.actor, env._max_episode_steps, 200)}")

E[rewards_sum] before training: 20.25
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,
E[rewards_sum] after training: 49.015


## General Policy Gradient Algorithm Experimentation

In [22]:
# Train the general policy-gradient agent on CartPole-v1 and report the
# expected sum of rewards before and after training.
env = gym.make("CartPole-v1")

# Policy-network settings; the action and observation sizes are read from
# the environment's spaces.
actor_params = dict(
    ac_dim=env.action_space.n,
    ob_dim=env.observation_space.shape[0],
    n_layers=5,
    hidden_layer_size=100,
    learning_rate=1e-4,
)

agent_params = dict(actor_params=actor_params)

# NOTE(review): from here on the name `pg_agent` refers to this agent
# instance, no longer to the imported `pg_agent` module.
pg_agent = PGAgent(agent_params)

# NOTE(review): the trailing 200 is presumably the number of evaluation
# rollouts - confirm in utils.expected_rewards_sum.
print(f"E[rewards_sum] before training: {utils.expected_rewards_sum(env, pg_agent.actor, env._max_episode_steps, 200)}")

# Training Loop: 150 iterations of "sample trajectories, then update".
# NOTE(review): 5000 is presumably the amount of experience sampled per
# iteration - confirm its unit in utils.sample_trajectories.
for iteration in range(150):
  print(iteration, end=",")
  trajectories = utils.sample_trajectories(
      env,
      pg_agent.actor,
      env._max_episode_steps,
      5000,
  )
  pg_agent.train(trajectories)

print(f"\nE[rewards_sum] after training: {utils.expected_rewards_sum(env, pg_agent.actor, env._max_episode_steps, 200)}")

E[rewards_sum] before training: 21.67
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,
E[rewards_sum] after training: 205.29


## Show Results

In [28]:
from colab_utils import (
    wrap_env,
    show_video
)

# Roll out one episode with the trained policy-gradient actor and show the
# resulting video.
# NOTE(review): wrap_env presumably adds video recording to the environment
# and show_video displays the recorded file - confirm in colab_utils.
# `env` and `pg_agent` come from the training cell that was run last.
envw = wrap_env(env)

try:
  ob = envw.reset()
  # Bound the rollout by the same step limit the training cells use instead
  # of a hard-coded number.
  for _step in range(env._max_episode_steps):
    envw.render(mode='rgb_array')
    ob, _reward, done, _info = envw.step(pg_agent.actor.get_action(ob))
    if done:
      break
finally:
  # Close the wrapped environment even if a step raised.
  envw.close()
show_video()