<a href="https://colab.research.google.com/github/ligerre/firsttime/blob/main/PPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gym
# Create the Pendulum-v1 environment; this module-level `env` is reused by later cells.
env = gym.make("Pendulum-v1")

  deprecation(
  deprecation(


In [2]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
from torch.optim import Adam
from torch.distributions import MultivariateNormal, Categorical
import time

  and should_run_async(code)


In [3]:
class FeedForwardNN(nn.Module):
  """Fully connected network: two 64-unit ReLU hidden layers and a linear output layer."""

  def __init__(self, in_dim, out_dim):
    """Create the three linear layers mapping `in_dim` features to `out_dim` outputs."""
    super().__init__()
    hidden_units = 64
    self.layer1 = nn.Linear(in_dim, hidden_units)
    self.layer2 = nn.Linear(hidden_units, hidden_units)
    self.layer3 = nn.Linear(hidden_units, out_dim)

  def forward(self, obs):
    """Run `obs` through the network and return the raw (un-activated) output.

    NumPy arrays are converted to float tensors first; any other input is
    handed to the first layer unchanged.
    """
    features = torch.tensor(obs, dtype=torch.float) if isinstance(obs, np.ndarray) else obs
    # Both hidden layers share the same pattern: linear map followed by ReLU.
    for hidden_layer in (self.layer1, self.layer2):
      features = torch.relu(hidden_layer(features))
    return self.layer3(features)

  and should_run_async(code)


In [4]:
class PPO:
  """Minimal clipped-surrogate PPO agent for an environment with vector observations and actions.

  The actor network outputs the mean of a multivariate normal policy whose
  covariance is a fixed diagonal matrix (0.5 on the diagonal); the critic
  network outputs one value estimate per observation.
  """

  def __init__(self, env):
    """Set up networks, optimizers, the policy covariance and the logger.

    Args:
      env: environment exposing `observation_space.shape`, `action_space.shape`,
        `reset()` returning an observation, and `step(action)` returning the
        4-tuple `(obs, reward, done, info)`.
    """
    self._init_hyperparameters()

    self.env = env
    self.obs_dim = env.observation_space.shape[0]
    self.act_dim = env.action_space.shape[0]

    self.actor = FeedForwardNN(self.obs_dim, self.act_dim)
    self.critic = FeedForwardNN(self.obs_dim, 1)

    # Initialize optimizers for actor and critic
    self.actor_optim = Adam(self.actor.parameters(), lr=self.lr)
    self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

    # Initialize the covariance matrix used to query the actor for actions
    self.cov_var = torch.full(size=(self.act_dim,), fill_value=0.5)
    self.cov_mat = torch.diag(self.cov_var)

    # This logger will help us with printing out summaries of each iteration
    self.logger = {
      'delta_t': time.time_ns(),
      't_so_far': 0,          # timesteps so far
      'i_so_far': 0,          # iterations so far
      'batch_lens': [],       # episodic lengths in batch
      'batch_rews': [],       # episodic returns in batch
      'actor_losses': [],     # losses of actor network in current iteration
    }

  def _init_hyperparameters(self):
    """Assign the default training hyperparameters as instance attributes."""
    self.timesteps_per_batch = 4800            # timesteps per batch
    self.max_timesteps_per_episode = 1600      # timesteps per episode
    self.gamma = 0.95                          # discount factor used for rewards-to-go
    self.n_updates_per_iteration = 5           # gradient steps per collected batch
    self.clip = 0.2                            # ratio clipping range is [1 - clip, 1 + clip]
    self.lr = 0.005                            # learning rate for both optimizers

  def get_action(self, obs):
    """Sample an action for a single observation.

    Args:
      obs: observation accepted by the actor network.

    Returns:
      Tuple `(action, log_prob)`: the sampled action as a NumPy array and the
      detached log-probability of that action under the current policy.
    """
    # Query the actor network for a mean action.
    # Same thing as calling self.actor.forward(obs)
    mean = self.actor(obs)
    # Create our Multivariate Normal Distribution
    dist = MultivariateNormal(mean, self.cov_mat)
    action = dist.sample()
    log_prob = dist.log_prob(action)
    # Return the sampled action and the log prob of that action
    return action.detach().numpy(), log_prob.detach()

  def learn(self, total_timestep):
    """Alternate between collecting a batch and updating actor and critic.

    A full batch is always collected before the stopping condition is
    re-checked, so the number of simulated timesteps can exceed
    `total_timestep`.

    Args:
      total_timestep: stop once at least this many timesteps were simulated.
    """
    t_so_far = 0  # Timesteps simulated so far

    while t_so_far < total_timestep:
      batch_obs, batch_acts, batch_log_probs, batch_rews, batch_rtgs, batch_lens = self.rollout()

      # Advantage estimate: rewards-to-go minus the critic's value prediction,
      # computed once per batch and held fixed during the update epochs.
      V, _ = self.evaluate(batch_obs, batch_acts)
      A_k = batch_rtgs - V.detach()
      # Normalize advantages; the small epsilon guards against a zero std.
      A_k = (A_k - A_k.mean()) / (A_k.std() + 1e-10)

      for _ in range(self.n_updates_per_iteration):
        # Calculate pi_theta(a_t | s_t)
        V, curr_log_probs = self.evaluate(batch_obs, batch_acts)
        # Probability ratio between the current policy and the one that collected the batch.
        ratios = torch.exp(curr_log_probs - batch_log_probs)
        surr1 = ratios * A_k
        surr2 = torch.clamp(ratios, 1 - self.clip, 1 + self.clip) * A_k

        actor_loss = (-torch.min(surr1, surr2)).mean()
        critic_loss = nn.MSELoss()(V, batch_rtgs)

        # Calculate gradients and perform backward propagation for actor
        # network
        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()

        # Same for the critic network
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

      t_so_far += int(np.sum(batch_lens))

  def rollout(self):
    """Run episodes with the current policy until the batch holds enough timesteps.

    Episodes are run to completion (or to `max_timesteps_per_episode`), so the
    batch may contain more than `timesteps_per_batch` timesteps.

    Returns:
      Tuple `(batch_obs, batch_acts, batch_log_probs, batch_rews, batch_rtgs,
      batch_lens)`. The first three and `batch_rtgs` are float tensors with one
      entry per timestep; `batch_rews` is a list of per-episode reward lists
      and `batch_lens` a list of episode lengths.
    """
    # batch data
    batch_obs = []
    batch_acts = []
    batch_log_probs = []       # log probs of each action
    batch_rews = []            # batch rewards
    batch_lens = []            # episodic lengths in batch

    t = 0
    while t < self.timesteps_per_batch:
      ep_reward = []

      # Use the environment this agent was constructed with, not a notebook global.
      obs = self.env.reset()
      done = False

      for ep_t in range(self.max_timesteps_per_episode):
        t += 1
        batch_obs.append(obs)
        action, log_prob = self.get_action(obs)

        obs, rew, done, _ = self.env.step(action)

        batch_acts.append(action)
        batch_log_probs.append(log_prob)

        ep_reward.append(rew)
        if done:
          break

      # Collect episodic length and rewards
      batch_lens.append(ep_t + 1)  # plus 1 because timestep starts at 0
      batch_rews.append(ep_reward)

    # Reshape data as tensors before returning. Stacking into one ndarray
    # first avoids the slow list-of-ndarrays path of torch.tensor.
    batch_obs = torch.tensor(np.array(batch_obs), dtype=torch.float)
    batch_acts = torch.tensor(np.array(batch_acts), dtype=torch.float)
    batch_log_probs = torch.tensor(batch_log_probs, dtype=torch.float)
    # ALG STEP #4
    batch_rtgs = self.compute_rtgs(batch_rews)
    # Return the batch data
    return batch_obs, batch_acts, batch_log_probs, batch_rews, batch_rtgs, batch_lens

  def compute_rtgs(self, batch_rews):
    """Compute discounted rewards-to-go for every timestep in the batch.

    Args:
      batch_rews: list of episodes, each a list of per-step rewards.

    Returns:
      1-D float tensor with one entry per timestep, in the same
      episode/timestep order as `batch_rews`.
    """
    batch_rtgs = []
    for ep_rews in batch_rews:
      discounted_reward = 0  # The discounted reward so far
      ep_rtgs = []
      # Walk the episode backwards so each step reuses the return of its successor.
      for rew in reversed(ep_rews):
        discounted_reward = rew + discounted_reward * self.gamma
        ep_rtgs.append(discounted_reward)
      # Appending and reversing once keeps this linear, unlike insert(0, ...).
      ep_rtgs.reverse()
      batch_rtgs.extend(ep_rtgs)
    # Convert the rewards-to-go into a tensor
    return torch.tensor(batch_rtgs, dtype=torch.float)

  def evaluate(self, batch_obs, batch_acts):
    """Evaluate the critic and the current policy on a batch.

    Args:
      batch_obs: batch of observations.
      batch_acts: actions taken for those observations.

    Returns:
      Tuple `(V, log_probs)`: critic value per observation and the
      log-probability of each action under the current actor.
    """
    mean = self.actor(batch_obs)
    dist = MultivariateNormal(mean, self.cov_mat)
    log_probs = dist.log_prob(batch_acts)

    # Query critic network for a value V for each obs in batch_obs.
    # Drop only the trailing size-1 output axis so a one-row batch keeps its batch axis.
    V = self.critic(batch_obs).squeeze(-1)
    return V, log_probs
      


 


In [5]:
# Build the agent around the environment created in the first cell.
model = PPO(env)
# Show the fixed covariance matrix used by the policy distribution.
print(model.cov_mat)
# Request 200 timesteps; rollout() always gathers a full batch
# (timesteps_per_batch = 4800), so this performs a single training iteration.
model.learn(200)

tensor([[0.5000]])


  batch_obs = torch.tensor(batch_obs, dtype=torch.float)


In [6]:
# Inspect the environment's action space (bounds, shape and dtype).
env.action_space

Box(-2.0, 2.0, (1,), float32)

In [7]:
# Install the Xvfb virtual X server plus X11 utilities, and the pyvirtualdisplay
# wrapper (pinned to the 0.2 series) that the next cell imports.
!apt-get install -y xvfb x11-utils
!pip install pyvirtualdisplay==0.2.*

Reading package lists... Done
Building dependency tree       
Reading state information... Done
x11-utils is already the newest version (7.7+5).
xvfb is already the newest version (2:1.20.13-1ubuntu1~20.04.8).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
from pyvirtualdisplay import Display
# Start an invisible 1400x900 virtual display.
# NOTE(review): presumably required so env.render() works on a headless machine — confirm.
display = Display(visible=False, size=(1400, 900))
# Binding the result to `_` keeps it from being echoed as the cell's output.
_ = display.start()

In [9]:
from gym.wrappers.monitoring.video_recorder import VideoRecorder
before_training = "before_training.mp4"

# Record one episode driven by the current policy into `before_training`.
video = VideoRecorder(env, before_training)
try:
  # returns an initial observation
  observation = env.reset()
  done = False
  while not done:
    # NOTE(review): capture_frame() may already render the frame itself, which
    # would make this call redundant — confirm before removing it.
    env.render()
    video.capture_frame()
    # Only the sampled action is needed here; the log-probability is discarded.
    action, _ = model.get_action(observation)
    observation, reward, done, _ = env.step(action)
finally:
  # Release the recorder and the environment even if a step raises,
  # so neither is left open after a failed run.
  video.close()
  env.close()

  and should_run_async(code)
  logger.deprecation(
  logger.deprecation(
If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


In [10]:
from base64 import b64encode
def render_mp4(videopath: str) -> str:
  """
  Build an HTML <video> element embedding the MP4 file at the given path
  as a base64-encoded data URI.

  Args:
    videopath: path of the MP4 file to read.

  Returns:
    HTML markup for a 400-pixel-wide video player with controls whose
    source is the file's bytes encoded in base64.

  Raises:
    OSError: if the file cannot be opened or read.
  """
  # The context manager closes the file handle as soon as the bytes are read,
  # instead of leaving it open until garbage collection.
  with open(videopath, 'rb') as video_file:
    mp4 = video_file.read()
  base64_encoded_mp4 = b64encode(mp4).decode()
  return f'<video width=400 controls><source src="data:video/mp4;' \
         f'base64,{base64_encoded_mp4}" type="video/mp4"></video>'

  and should_run_async(code)


In [11]:
from IPython.display import HTML
# Embed the recorded pre-training episode as an inline HTML video player.
html = render_mp4(before_training)
HTML(html)
