# Enable GPU

In [None]:
import torch
# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
# is_available must be *called*: the bare function object is always truthy,
# which would select 'cuda:0' even on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Policy Network


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Policy(nn.Module):
  """Two-layer MLP that maps a state to a probability distribution over actions.

  Architecture: Linear(in_dim, 128) -> ReLU -> Linear(128, out_dim) -> Softmax.

  Args:
    in_dim: size of the state vector.
    out_dim: number of discrete actions.
    alpha: accepted for backward compatibility only; it is not used by this
      module (no optimizer is created here).
  """

  def __init__(self, in_dim, out_dim, alpha = 0.0001):
    super(Policy, self).__init__()
    self.fc1 = nn.Linear(in_dim, 128)
    self.hidden_act = nn.ReLU()
    self.fc2 = nn.Linear(128, out_dim)
    # Normalise over the last (action) axis. For batched (N, in_dim) input this
    # is identical to dim = 1, and it additionally accepts an unbatched
    # (in_dim,) state, which dim = 1 rejects with an IndexError.
    self.output_act = nn.Softmax(dim = -1)

  def forward(self, state):
    """Return action probabilities for `state`.

    Args:
      state: float tensor whose last dimension has size `in_dim`.

    Returns:
      Tensor with the same leading dimensions as `state` and a last dimension
      of size `out_dim`; entries along the last dimension sum to 1.
    """
    x = self.fc1(state)
    x = self.hidden_act(x)
    x = self.fc2(x)
    x = self.output_act(x)

    return x

# REINFORCE with forward and backward update

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import numpy as np

class REINFORCE(nn.Module):
  """REINFORCE (Monte-Carlo policy gradient) agent with three update schedules.

  One episode is stored with `save_episode` and then consumed by exactly one of
  the `learn_*` methods, each of which clears the stored episode afterwards:

  * `learn_sum_loss`  - one optimizer step on the summed loss of all time steps.
  * `learn_backward`  - one optimizer step per time step, last step first.
  * `learn_forward`   - one optimizer step per time step, first step first.

  Args:
    in_dim: size of the state vector.
    out_dim: number of discrete actions.
    alpha: learning rate of the Adam optimizer.
    gamma: discount factor used when computing returns.
  """

  def __init__(self, in_dim, out_dim, alpha = 0.0001, gamma = 0.99):
    super(REINFORCE, self).__init__()
    self.gamma = gamma
    self.policy = Policy(in_dim, out_dim)
    self.optimizer = torch.optim.Adam(self.policy.parameters(), lr = alpha)
    self.states = []
    self.rewards = []
    self.actions = []

  def _device(self):
    """Return the device the policy parameters live on.

    Using the parameters' device (instead of a notebook-global `device`) keeps
    the stored episode on the same device as the network after `.to(...)`.
    """
    return next(self.policy.parameters()).device

  def save_episode(self, states, actions, rewards):
    """Store one finished episode for the next `learn_*` call.

    Args:
      states: list of tensors that are concatenated along dim 0 (the training
        loops in this notebook pass one (1, in_dim) tensor per time step).
      actions: list of action indices, one per time step.
      rewards: list of scalar rewards, one per time step.
    """
    target = self._device()
    self.states = torch.cat(states, dim = 0).to(target)
    self.actions = torch.tensor(actions).to(target)
    self.rewards = torch.tensor(rewards).to(target)

  def get_action(self, state):
    """Sample an action from the current policy without tracking gradients.

    Args:
      state: tensor accepted by the policy network.

    Returns:
      Tensor holding the sampled action index (one per row of `state`).
    """
    with torch.no_grad():
      prob = self.policy(state)
      distribution = Categorical(probs = prob)
      action = distribution.sample()

    return action

  def _discounted_returns(self, norm_return):
    """Compute the discounted return G_t for every stored time step.

    Returns a float32 tensor in time order (index 0 is the first step) on the
    policy's device. When `norm_return` is true the returns are standardised;
    this is skipped for episodes with fewer than two steps because the sample
    standard deviation of a single value is NaN and would poison the weights.
    """
    rewards = self.rewards.tolist()
    returns = [0.0] * len(rewards)
    running = 0.0
    # Single backward pass: G_t = r_t + gamma * G_{t+1}.
    for t in range(len(rewards) - 1, -1, -1):
      running = rewards[t] + self.gamma * running
      returns[t] = running
    returns = torch.tensor(returns, dtype = torch.float32, device = self._device())

    if norm_return and returns.numel() > 1:
      eps = np.finfo(np.float32).eps.item()
      returns = (returns - returns.mean()) / (returns.std() + eps)
    return returns

  def _clear_memory(self):
    """Drop the stored episode so it cannot be trained on twice."""
    self.states = []
    self.actions = []
    self.rewards = []

  def _stepwise_update(self, returns, order):
    """Run one optimizer step per time step, visiting steps in `order`."""
    for t in order:
      probs = self.policy(self.states[t].unsqueeze(0))
      log_prob = Categorical(probs = probs).log_prob(self.actions[t])
      # returns carries no gradient, so it acts as a constant weight.
      loss = (-log_prob * returns[t]).sum()

      self.optimizer.zero_grad()
      loss.backward()
      self.optimizer.step()

  def learn_sum_loss(self, norm_return = False): # Episodic update
    """Take a single optimizer step on the loss summed over the whole episode.

    Args:
      norm_return: standardise the returns before weighting the log-probs.
    """
    returns = self._discounted_returns(norm_return)

    # One batched forward pass; the sum is independent of time-step order.
    probs = self.policy(self.states)
    log_probs = Categorical(probs = probs).log_prob(self.actions)
    loss = (-log_probs * returns).sum()

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    self._clear_memory()

  def learn_backward(self, norm_return = False):
    """Update once per time step, from the last step of the episode to the first.

    Args:
      norm_return: standardise the returns before weighting the log-probs.
    """
    returns = self._discounted_returns(norm_return)
    self._stepwise_update(returns, reversed(range(returns.shape[0])))
    self._clear_memory()

  def learn_forward(self, norm_return = False):
    """Update once per time step, from the first step of the episode to the last.

    Args:
      norm_return: standardise the returns before weighting the log-probs.
    """
    returns = self._discounted_returns(norm_return)
    self._stepwise_update(returns, range(returns.shape[0]))
    self._clear_memory()
    



# Wandb is a great tool for recording machine learning experiments; you can explore it in more detail via the link below
[Wandb](https://wandb.ai/site?gclid=CjwKCAjwlrqHBhByEiwAnLmYUGy29ZdG460eDefcxyto5hte2XmbYPmr59UQdINKtP18J8w2YbbdFxoCS6UQAvD_BwE)

In [None]:
!pip install wandb
!wandb login

In [None]:
import wandb
# Grid sweep: try every listed learning rate with the backward update and
# normalised returns, maximising the logged 'running_score' metric.
sweep_config = {
  'method': 'grid',
  'metric': {'name': 'running_score', 'goal': 'maximize'},
  'parameters': {
    'direction': {'values': ['backward']},
    'learning_rate': {'values': [0.01, 0.001, 0.0001, 0.0003, 0.00001]},
    'norm_return': {'value': True},
  },
}

# Register the sweep with wandb; the returned id is what the sweep agent runs.
sweep_id = wandb.sweep(sweep_config, project = 'REINFORCE_CartPole-v1_trajectory_direction')

# Environment requirement for LunarLander


In [None]:
!pip install box2d

Collecting box2d
  Downloading Box2D-2.3.10-cp37-cp37m-manylinux1_x86_64.whl (1.3 MB)
[?25l[K     |▎                               | 10 kB 32.6 MB/s eta 0:00:01[K     |▌                               | 20 kB 32.4 MB/s eta 0:00:01[K     |▊                               | 30 kB 20.4 MB/s eta 0:00:01[K     |█                               | 40 kB 17.8 MB/s eta 0:00:01[K     |█▎                              | 51 kB 7.7 MB/s eta 0:00:01[K     |█▌                              | 61 kB 8.4 MB/s eta 0:00:01[K     |█▊                              | 71 kB 7.2 MB/s eta 0:00:01[K     |██                              | 81 kB 8.1 MB/s eta 0:00:01[K     |██▎                             | 92 kB 6.6 MB/s eta 0:00:01[K     |██▌                             | 102 kB 7.2 MB/s eta 0:00:01[K     |██▊                             | 112 kB 7.2 MB/s eta 0:00:01[K     |███                             | 122 kB 7.2 MB/s eta 0:00:01[K     |███▏                            | 133 kB 7.2 MB/s e

# Without Wandb


In [None]:
import gym 
import torch
import time

def train():
  """Train a REINFORCE agent on LunarLander-v2 and save its weights.

  Runs up to 3000 episodes, updating the agent with `learn_backward` after each
  one, and stops early once the exponentially smoothed score reaches 200. The
  final weights are written to 'agent_backward_with_norm_1e-4.pt'.
  """
  start = time.time()

  env = gym.make('LunarLander-v2')
  env.seed(543)
  torch.manual_seed(543)

  state_dim = env.observation_space.shape[0]
  action_dim = env.action_space.n

  # is_available must be called; the bare function object is always truthy.
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
  agent = REINFORCE(in_dim= state_dim,  out_dim = action_dim).to(device)
  num_ep = 3000
  print_interval = 100
  running_score = -200
  solved_score = 200
  save_name = 'agent_backward_with_norm_1e-4' + '.pt'

  for ep in range(1, num_ep+1):
    state = env.reset()
    score = 0
    done = False
    states = []
    actions = []
    rewards = []

    while not done:
      # Shape (1, state_dim), float32, on the same device as the agent.
      state = torch.as_tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
      action = agent.get_action(state)
      next_state, reward, done, _ = env.step(action.item())

      # saving episode
      states.append(state)
      actions.append(action.item())
      rewards.append(reward)

      # update score and state
      score += reward
      state = next_state

    # save rollout sets
    agent.save_episode(states, actions, rewards)

    # exponentially smoothed score (smoothing factor 0.05)
    running_score = 0.05 * score + (1 - 0.05) * running_score

    # train the agent
    # NOTE(review): the checkpoint name says "with_norm" but norm_return is left
    # at its default (False) here - confirm which one is intended.
    agent.learn_backward()

    if ep % print_interval == 0:
      print('episode {} average reward {}, ended at {:.01f}'.format(ep, running_score, time.time() - start))

    if running_score >= solved_score:
      print('Environmnet solved at {} episode with running score of {}' .format(ep, running_score))
      break

  torch.save(agent.state_dict(),save_name)
  # Upload the checkpoint only when a wandb run is active; this cell is meant to
  # work without wandb.init() having been called.
  if wandb.run is not None:
    wandb.save(save_name)

In [None]:
# Run the training function defined in the cell above.
train()

# Learning rate hyperparameter tuning with the Wandb sweep function


In [None]:
import gym 
import torch
import time

def train():
  """Train one REINFORCE agent on LunarLander-v2 for a single wandb sweep run.

  Reads `learning_rate`, `direction` and `norm_return` from `wandb.config`,
  trains for 3000 episodes, logs the smoothed score every episode and uploads
  a checkpoint every 1000 episodes.

  NOTE(review): this redefines the `train` from the earlier "Without Wandb"
  cell; whichever cell ran last is the one in effect.
  """
  # NOTE(review): the algorithm/group labels mention "sum_loss", but the loop
  # below calls learn_backward / learn_forward - confirm the intended label.
  wandb.init(config = {'env':'LunarLander-v2','algorithm:': 'REINFORCE_with_norm_backward_sum_loss' },group = 'LunarLander-v2_learning_rate_tune_with_norm_backward_sum_loss')
  config = wandb.config

  start = time.time()

  env = gym.make('LunarLander-v2')
  env.seed(543)
  torch.manual_seed(543)

  state_dim = env.observation_space.shape[0]
  action_dim = env.action_space.n

  # is_available must be called; the bare function object is always truthy.
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
  agent = REINFORCE(in_dim= state_dim,  out_dim = action_dim, alpha= config.learning_rate).to(device)
  num_ep = 3000
  print_interval = 100
  save_interval = 1000
  running_score = 10

  wandb.watch(agent)
  for ep in range(1,num_ep+1):
    state = env.reset()
    score = 0
    done = False
    states = []
    actions = []
    rewards = []

    while not done:
      # Shape (1, state_dim), float32, on the same device as the agent.
      state = torch.as_tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
      action = agent.get_action(state)
      next_state, reward, done, _ = env.step(action.item())

      # saving episode
      states.append(state)
      actions.append(action.item())
      rewards.append(reward)

      # update score and state
      score += reward
      state = next_state

    # save rollout sets
    agent.save_episode(states, actions, rewards)

    # exponentially smoothed score (smoothing factor 0.05)
    running_score = 0.05 * score + (1 - 0.05) * running_score
    wandb.log({'episode': ep, 'running_score': running_score})

    # train the agent in the direction selected by the sweep
    if config.direction == 'backward':
      agent.learn_backward(norm_return = config.norm_return)
    else:
      agent.learn_forward(norm_return = config.norm_return)

    if ep % print_interval == 0:
      print('episode {} average reward {}, ended at {:.01f}'.format(ep, running_score, time.time() - start))

    if ep % save_interval == 0:
      save_name = 'agent_' + str(ep) + '.pt'
      torch.save(agent.state_dict(),save_name)
      wandb.save(save_name)
    

In [None]:
# Start a wandb sweep agent for `sweep_id`, passing `train` as the function it
# runs; `train` here is whichever definition was executed most recently.
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: 4quf8jsh with config:
[34m[1mwandb[0m: 	direction: backward
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	norm_return: True


episode 100 average reward -303.75036757366325, ended at 21.3
episode 200 average reward -124.87854769462783, ended at 55.3
episode 300 average reward -63.90707444798333, ended at 132.9
episode 400 average reward -14.555466310927741, ended at 234.0
episode 500 average reward 19.31030641495198, ended at 300.9
episode 600 average reward 27.141260990588385, ended at 539.5
episode 700 average reward -108.93207940566649, ended at 582.8
episode 800 average reward -21.1449879772891, ended at 606.4
episode 900 average reward 4.8439554627304116, ended at 641.6
episode 1000 average reward 20.96273056617709, ended at 919.5
episode 1100 average reward -73.3584114764823, ended at 987.8
episode 1200 average reward -4.223173410563373, ended at 1063.4
episode 1300 average reward -120.92889113085734, ended at 1096.5
episode 1400 average reward -151.86830836553668, ended at 1122.9
episode 1500 average reward -124.20857550517574, ended at 1146.3
episode 1600 average reward -260.07342730049254, ended at 1

VBox(children=(Label(value=' 0.60MB of 0.60MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,126.6632
_runtime,2339.0
_timestamp,1626313895.0
_step,3000.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▂▄▆▆▆▆▆▆▅▆▆▇▆▆▇▆▄▄▅▃▃▂▁▃▄▇█▆█▇▆▄▅▅▆▇▇▆█
_runtime,▁▁▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: vsewpev5 with config:
[34m[1mwandb[0m: 	direction: backward
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	norm_return: True


episode 100 average reward -134.29514254604538, ended at 29.0
episode 200 average reward -137.02887655916405, ended at 58.4
episode 300 average reward -126.90849370197168, ended at 94.9
episode 400 average reward -92.75275789936575, ended at 142.5
episode 500 average reward -71.39938548417969, ended at 198.3
episode 600 average reward -106.96756377970522, ended at 297.0
episode 700 average reward -49.426055465640104, ended at 409.9
episode 800 average reward -38.34184171026469, ended at 515.6
episode 900 average reward 5.904744088961824, ended at 642.7
episode 1000 average reward 6.466768783909518, ended at 856.8
episode 1100 average reward 46.335942682088344, ended at 1063.3
episode 1200 average reward 25.47871866745521, ended at 1239.9
episode 1300 average reward 53.35665812747767, ended at 1446.0
episode 1400 average reward 72.31874859627632, ended at 1703.2
episode 1500 average reward 82.82738566620671, ended at 2014.4
episode 1600 average reward 108.5181212950028, ended at 2324.0


VBox(children=(Label(value=' 0.61MB of 0.61MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,219.37514
_runtime,5459.0
_timestamp,1626319361.0
_step,3000.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▂▁▂▂▂▃▂▂▃▃▃▄▄▅▄▄▅▅▅▆▆▆▅▅▆▆▆▆▆▆▆▆▆▇▇██▇▇
_runtime,▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▇▇▇██████
_timestamp,▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▇▇▇██████
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: zj5wgsi4 with config:
[34m[1mwandb[0m: 	direction: backward
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	norm_return: True


episode 100 average reward -164.72856198270438, ended at 24.6
episode 200 average reward -183.54893007070658, ended at 49.5
episode 300 average reward -175.15720948592775, ended at 74.5
episode 400 average reward -136.80611869461896, ended at 103.8
episode 500 average reward -167.567720902479, ended at 129.4
episode 600 average reward -164.28390855442757, ended at 155.6
episode 700 average reward -141.1740908481412, ended at 180.5
episode 800 average reward -144.50140495339457, ended at 206.7
episode 900 average reward -152.3876621861411, ended at 233.6
episode 1000 average reward -124.46374071405623, ended at 259.5
episode 1100 average reward -181.43671132591865, ended at 286.0
episode 1200 average reward -155.91614710299675, ended at 317.0
episode 1300 average reward -140.86584929193776, ended at 345.8
episode 1400 average reward -124.3608725391204, ended at 373.7
episode 1500 average reward -147.14850101016836, ended at 402.6
episode 1600 average reward -135.3975456639629, ended at 

VBox(children=(Label(value=' 0.62MB of 0.62MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,-109.25307
_runtime,907.0
_timestamp,1626320275.0
_step,3000.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▃▃▃▃▅▄▁▁▃▆▅▅▃▆▁▅▄▅▆▅▅▆▄▅▅▃▅▆▅▆▅▇▇▅▇█▇▇▆█
_runtime,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: zi5j9uie with config:
[34m[1mwandb[0m: 	direction: backward
[34m[1mwandb[0m: 	learning_rate: 0.0003
[34m[1mwandb[0m: 	norm_return: True


episode 100 average reward -159.06387039071151, ended at 28.9
episode 200 average reward -181.3431972361139, ended at 55.6
episode 300 average reward -160.16067359041085, ended at 82.0
episode 400 average reward -147.67231233776636, ended at 109.7
episode 500 average reward -149.8437460257715, ended at 139.0
episode 600 average reward -130.42763445143748, ended at 168.1
episode 700 average reward -139.74931243893477, ended at 201.9
episode 800 average reward -137.7861057961989, ended at 235.3
episode 900 average reward -133.2849392767632, ended at 268.0
episode 1000 average reward -82.07091493428936, ended at 304.2
episode 1100 average reward -92.44248167633138, ended at 340.0
episode 1200 average reward -76.87109183523879, ended at 375.6
episode 1300 average reward -74.14915720542636, ended at 417.0
episode 1400 average reward -81.39843444679369, ended at 464.0
episode 1500 average reward -54.20164263117405, ended at 518.8
episode 1600 average reward -62.76601566528076, ended at 576.7

VBox(children=(Label(value=' 0.64MB of 0.64MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,0.49729
_runtime,2508.0
_timestamp,1626322790.0
_step,3000.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▂▂▂▂▂▂▁▂▂▂▂▄▃▄▃▄▅▅▄▅▅▅▅▃▅▆▆▆▆▆▆▆▇▇▇▇▇█▇█
_runtime,▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇█
_timestamp,▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇█
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: guknz0ao with config:
[34m[1mwandb[0m: 	direction: backward
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	norm_return: True


episode 100 average reward -208.60064287858745, ended at 22.5
episode 200 average reward -185.7003518889347, ended at 46.5
episode 300 average reward -215.8511851725121, ended at 69.6
episode 400 average reward -162.87546545804145, ended at 93.3
episode 500 average reward -169.4592676827227, ended at 117.2
episode 600 average reward -205.85435721002295, ended at 140.8
episode 700 average reward -140.94634505187648, ended at 164.0
episode 800 average reward -187.3608647027201, ended at 189.8
episode 900 average reward -174.61129462061092, ended at 213.3
episode 1000 average reward -201.37417945768257, ended at 237.2
episode 1100 average reward -181.42273713352273, ended at 260.7
episode 1200 average reward -173.6218164789308, ended at 284.3
episode 1300 average reward -187.8597980755604, ended at 307.6
episode 1400 average reward -175.05629952547199, ended at 332.2
episode 1500 average reward -159.457675868082, ended at 356.1
episode 1600 average reward -197.1852464176528, ended at 379.

VBox(children=(Label(value=' 0.65MB of 0.65MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,-179.74673
_runtime,734.0
_timestamp,1626323532.0
_step,3000.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,█▃▄▇▃▅▃▃▂▇▄▃▄▃▁▂▂▅▅▆▃▃▄▄▄▃▅▅▅▄▄▅▄▅▄▇▆▅▇▃
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


# You can check out the report comparing the forward and backward update algorithms for REINFORCE with hyperparameter tuning

[Link to the report](https://wandb.ai/ko120/REINFORCE_CartPole-v1_trajectory_direction/reports/REINFORCE-Updating-direction-variation--Vmlldzo4NDMxMDQ)

# Visualize the agent before training

In [None]:
# For visualization (rendering OpenAI Gym)
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get install ffmpeg freeglut3-dev xvfb  
!pip install -U colabgymrender

In [None]:
# A virtual display is required for rendering on Colab; without it rendering fails.
import os
# Start an Xvfb server in the background as display :1 with a 1024x768, 24-bit screen.
os.system("Xvfb :1 -screen 0 1024x768x24 &")
# Point this process (and anything it spawns) at that virtual display.
os.environ['DISPLAY'] = ':1'

In [None]:
import gym
from colabgymrender.recorder import Recorder

env = gym.make("LunarLander-v2")
directory = '/content/videos'

torch.manual_seed(543)

# Wrap the environment with the recorder, which is given `directory` for its output.
env = Recorder(env, directory)


state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# is_available must be called; the bare function object is always truthy and
# would select 'cuda:0' even without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Freshly initialised (untrained) agent.
agent = REINFORCE(in_dim= state_dim,  out_dim = action_dim).to(device)

# Roll out one episode, sampling actions from the policy.
state = env.reset()
terminal = False
while not terminal:
  # Shape (1, state_dim), float32, on the same device as the agent.
  state = torch.as_tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
  action = agent.get_action(state)
  next_state, reward, terminal, info = env.step(action.item())
  state = next_state

env.play()

# Visualize the best-performing forward-trained agent (lr = 1e-4) without return normalization


In [None]:
import gym
from colabgymrender.recorder import Recorder

env = gym.make("LunarLander-v2")
directory = '/content/videos'

torch.manual_seed(543)

# Wrap the environment with the recorder, which is given `directory` for its output.
env = Recorder(env, directory)


state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# is_available must be called; the bare function object is always truthy and
# would select 'cuda:0' even without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
agent = REINFORCE(in_dim= state_dim,  out_dim = action_dim).to(device)

# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host.
agent.load_state_dict(torch.load('agent_forward_withoutnorm_1e-4.pt', map_location = device))
agent.eval()


# Roll out one episode, sampling actions from the loaded policy.
state = env.reset()
terminal = False
while not terminal:
  # Shape (1, state_dim), float32, on the same device as the agent.
  state = torch.as_tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
  action = agent.get_action(state)
  next_state, reward, terminal, info = env.step(action.item())
  state = next_state

env.play()

Imageio: 'ffmpeg-linux64-v3.3.1' was not found on your computer; downloading it now.
Try 1. Download from https://github.com/imageio/imageio-binaries/raw/master/ffmpeg/ffmpeg-linux64-v3.3.1 (43.8 MB)
Downloading: 8192/45929032 bytes (0.0%)3268608/45929032 bytes (7.1%)7217152/45929032 bytes (15.7%)11198464/45929032 bytes (24.4%)15138816/45929032 bytes (33.0%)18882560/45929032 bytes (41.1%)22904832/45929032 bytes (49.9%)26886144/45929032 bytes (58.5%)30490624/45929032 bytes (66.4%)34627584/45929032 bytes (75.4%)38584320/45929032 bytes (84.0%)42614784/45929032 bytes (92.8%)45929032/45929032 bytes (100.0%)
  Done
File saved as /root

# Visualize the best-performing backward-trained agent (lr = 1e-4) with return normalization


In [None]:
import gym
from colabgymrender.recorder import Recorder

env = gym.make("LunarLander-v2")
directory = '/content/videos'

# Wrap the environment with the recorder, which is given `directory` for its output.
env = Recorder(env, directory)


state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# is_available must be called; the bare function object is always truthy and
# would select 'cuda:0' even without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
agent = REINFORCE(in_dim= state_dim,  out_dim = action_dim).to(device)

# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host.
agent.load_state_dict(torch.load('agent_backward_with_norm_1e-4.pt', map_location = device))
agent.eval()

# Roll out one episode, sampling actions from the loaded policy.
state = env.reset()
terminal = False
while not terminal:
  # Shape (1, state_dim), float32, on the same device as the agent.
  state = torch.as_tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
  action = agent.get_action(state)
  next_state, reward, terminal, info = env.step(action.item())
  state = next_state

env.play()