# Enable GPU

In [None]:
import torch
# Use the first CUDA GPU only when one is really present, otherwise the CPU.
# `is_available` has to be *called*: the bare function object is always truthy,
# which would select 'cuda:0' even on a CPU-only machine.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Policy Network


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Policy(nn.Module):
  """Two-layer MLP mapping a state to a probability distribution over actions.

  Args:
    in_dim: number of features in a state vector.
    out_dim: number of discrete actions.
    alpha: never read by this class; kept only so existing call sites that
      pass it keep working.
  """

  def __init__(self, in_dim, out_dim, alpha = 0.0001):
    super(Policy, self).__init__()
    self.fc1 = nn.Linear(in_dim, 128)
    self.hidden_act = nn.ReLU()
    self.fc2 = nn.Linear(128, out_dim)
    # Normalise over the last (action) axis so that both a single state of
    # shape (in_dim,) and a batch of shape (N, in_dim) are accepted.
    self.output_act = nn.Softmax(dim = -1)

  def forward(self, state):
    """Return action probabilities for `state`; the last axis sums to 1."""
    hidden = self.hidden_act(self.fc1(state))
    return self.output_act(self.fc2(hidden))

# REINFORCE with forward and backward update

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import numpy as np

class REINFORCE(nn.Module):
  """REINFORCE (Monte-Carlo policy gradient) agent for a discrete action space.

  One complete episode is stored with `save_episode` and then consumed by one
  of the `learn_*` methods, which differ only in how the per-step losses are
  applied:

  * `learn_sum_loss` -- a single optimizer step on the summed episode loss.
  * `learn_backward` -- one optimizer step per time step, last step first.
  * `learn_forward`  -- one optimizer step per time step, first step first.

  Args:
    in_dim: size of a state vector (forwarded to `Policy`).
    out_dim: number of discrete actions (forwarded to `Policy`).
    alpha: learning rate of the Adam optimizer.
    gamma: discount factor used when accumulating returns.
  """

  def __init__(self, in_dim, out_dim, alpha = 0.0001, gamma = 0.99):
    super(REINFORCE, self).__init__()
    self.gamma = gamma
    self.policy = Policy(in_dim, out_dim)
    self.optimizer = torch.optim.Adam(self.policy.parameters(), lr = alpha)
    self._clear_memory()

  def _policy_device(self):
    """Return the device the policy weights live on.

    Reading it from the parameters keeps the episode tensors on the same
    device as the network without relying on a notebook-level global.
    """
    return next(self.policy.parameters()).device

  def _clear_memory(self):
    """Forget the stored episode."""
    self.states = []
    self.rewards = []
    self.actions = []

  def save_episode(self, states, actions, rewards):
    """Store one finished episode on the policy's device.

    Args:
      states: list of state tensors, concatenated along dim 0.
      actions: list of action indices, one per step.
      rewards: list of scalar rewards, one per step.
    """
    target = self._policy_device()
    self.states = torch.cat(states, dim = 0).to(target)
    self.actions = torch.tensor(actions).to(target)
    self.rewards = torch.tensor(rewards).to(target)

  def get_action(self, state):
    """Sample an action for `state` from the current policy (no gradient)."""
    with torch.no_grad():
      prob = self.policy(state)
      distribution = Categorical(probs = prob)
      action = distribution.sample()

    return action

  def _discounted_returns(self, norm_return):
    """Return the discounted return G_t of every stored step, in time order.

    The recursion G_t = r_t + gamma * G_{t+1} is evaluated from the last step
    backwards on plain Python floats, then packed into one float32 tensor on
    the policy's device.
    """
    running = 0.0
    collected = []
    for reward in reversed(self.rewards.tolist()):
      running = reward + self.gamma * running
      collected.append(running)
    collected.reverse()
    returns = torch.tensor(collected, dtype = torch.float32, device = self._policy_device())

    # The sample standard deviation of a single value is NaN, which would
    # turn the loss (and then every weight) into NaN, so one-step episodes
    # are left un-normalised.
    if norm_return and returns.numel() > 1:
      eps = np.finfo(np.float32).eps.item()
      returns = (returns - returns.mean()) / (returns.std() + eps)
    return returns

  def _stepwise_update(self, states, actions, returns):
    """Take one optimizer step per (state, action, return) triple, in order."""
    for state, action, G in zip(states, actions, returns):
      probs = self.policy(state.unsqueeze(0))
      log_prob = Categorical(probs = probs).log_prob(action)
      loss = -(log_prob * G).sum()

      self.optimizer.zero_grad()
      loss.backward()
      self.optimizer.step()

  def learn_sum_loss(self, norm_return = False): # Episodic update
    """Apply a single update using the loss summed over the whole episode.

    Args:
      norm_return: standardise the returns (zero mean, unit std) first.

    Does nothing when no episode is stored; clears the episode afterwards.
    """
    if len(self.states) == 0:
      return

    returns = self._discounted_returns(norm_return)
    # The sum does not depend on step order, so the whole episode is pushed
    # through the policy in one batched forward pass.
    probs = self.policy(self.states)
    log_probs = Categorical(probs = probs).log_prob(self.actions)
    loss = -(log_probs * returns).sum()

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    self._clear_memory()

  def learn_backward(self, norm_return = False):
    """Update once per step, walking the episode from its last step to its first.

    Args:
      norm_return: standardise the returns (zero mean, unit std) first.

    Does nothing when no episode is stored; clears the episode afterwards.
    """
    if len(self.states) == 0:
      return

    returns = self._discounted_returns(norm_return)
    # Reverse all three sequences together so each state stays paired with
    # its own action and return.
    self._stepwise_update(self.states.flip(dims = [0]),
                          self.actions.flip(dims = [0]),
                          returns.flip(dims = [0]))

    self._clear_memory()

  def learn_forward(self, norm_return = False):
    """Update once per step, walking the episode from its first step to its last.

    Args:
      norm_return: standardise the returns (zero mean, unit std) first.

    Does nothing when no episode is stored; clears the episode afterwards.
    """
    if len(self.states) == 0:
      return

    returns = self._discounted_returns(norm_return)
    self._stepwise_update(self.states, self.actions, returns)

    self._clear_memory()
    



# Wandb is a great tool for recording machine learning experiments; you can explore it in more detail via the link below
[Wandb](https://wandb.ai/site?gclid=CjwKCAjwlrqHBhByEiwAnLmYUGy29ZdG460eDefcxyto5hte2XmbYPmr59UQdINKtP18J8w2YbbdFxoCS6UQAvD_BwE)

In [None]:
# Notebook shell escapes: install the wandb package, then run its login command.
!pip install wandb
!wandb login

In [None]:
import wandb
# Grid sweep over the learning rate, maximising the logged `running_score`.
# `direction` and `norm_return` are each pinned to a single setting.
sweep_config = {
    'method': 'grid',
    'metric': {'name': 'running_score', 'goal': 'maximize'},
    'parameters': {
        'direction': {'values': ['backward']},
        'learning_rate': {'values': [0.01, 0.001, 0.0001, 0.0003, 0.00001]},
        'norm_return': {'value': True},
    },
}

# Register the sweep; the returned id is what the sweep agent is started with.
sweep_id = wandb.sweep(sweep_config, project = 'REINFORCE_CartPole-v1_trajectory_direction')

# Environment requirement for LunarLander


In [None]:
# Notebook shell escape: install the box2d package (listed above as the LunarLander requirement).
!pip install box2d

# Without Wandb


In [None]:
import gym 
import torch
import time

def train():
  """Train a REINFORCE agent on LunarLander-v2 with fixed hyper-parameters.

  Runs up to 3000 episodes, performing one `learn_backward` update after each
  episode, and stops early once the exponentially smoothed score reaches
  `solved_score`. The final weights are written to
  'agent_backward_with_norm_1e-4.pt' and uploaded to W&B when a run is active.
  """
  start = time.time()

  env = gym.make('LunarLander-v2')
  env.seed(543)
  torch.manual_seed(543)

  state_dim = env.observation_space.shape[0]
  action_dim = env.action_space.n

  # `is_available` must be called; the bare function object is always truthy.
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
  agent = REINFORCE(in_dim= state_dim,  out_dim = action_dim).to(device)
  num_ep = 3000
  print_interval = 100
  running_score = -200
  solved_score = 200
  # NOTE(review): the file name says "with_norm", but learn_backward() below is
  # called with its default norm_return=False -- confirm which one is intended.
  save_name = 'agent_backward_with_norm_1e-4' + '.pt'

  for ep in range(1, num_ep+1):
    state = env.reset()
    score = 0
    done = False
    states = []
    actions = []
    rewards = []

    while not done:
      # Shape (1, state_dim): the policy expects a leading batch axis.
      state = torch.tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
      action = agent.get_action(state)
      next_state, reward, done, _ = env.step(action.item())

      # saving episode
      states.append(state)
      actions.append(action.item())
      rewards.append(reward)

      # update score and state
      score += reward
      state = next_state

    # save rollout sets
    agent.save_episode(states, actions, rewards)

    # exponential moving average of the episode score (5% weight on the newest)
    running_score = 0.05 * score + (1 - 0.05) * running_score

    # train the agent
    agent.learn_backward()

    if ep % print_interval == 0:
      print('episode {} average reward {}, ended at {:.01f}'.format(ep, running_score, time.time() - start))

    if running_score >= solved_score:
      print('Environmnet solved at {} episode with running score of {}' .format(ep, running_score))
      break

  torch.save(agent.state_dict(),save_name)
  # This cell is meant to work without W&B, so upload only when a run is active.
  if wandb.run is not None:
    wandb.save(save_name)

In [None]:
# Run the stand-alone training function defined in the previous cell.
train()

# Learning rate hyperparameter tuning with the Wandb sweep function


In [None]:
import gym 
import torch
import time

def train():
  """Train a REINFORCE agent on LunarLander-v2 as one run of a W&B sweep.

  Reads `learning_rate`, `direction` and `norm_return` from `wandb.config`,
  logs the smoothed score after every episode and uploads a checkpoint every
  `save_interval` episodes. Takes no arguments so it can be handed directly
  to `wandb.agent`.
  """
  # NOTE(review): the config key 'algorithm:' carries a trailing colon, and its
  # value mentions "sum_loss" although learn_backward/learn_forward are used
  # below -- left untouched so logged run metadata stays comparable; confirm.
  wandb.init(config = {'env':'LunarLander-v2','algorithm:': 'REINFORCE_with_norm_backward_sum_loss' },group = 'LunarLander-v2_learning_rate_tune_with_norm_backward_sum_loss')
  config = wandb.config

  start = time.time()

  env = gym.make('LunarLander-v2')
  env.seed(543)
  torch.manual_seed(543)

  state_dim = env.observation_space.shape[0]
  action_dim = env.action_space.n

  # `is_available` must be called; the bare function object is always truthy.
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
  agent = REINFORCE(in_dim= state_dim,  out_dim = action_dim, alpha= config.learning_rate).to(device)
  num_ep = 3000
  print_interval = 100
  save_interval = 1000
  running_score = 10

  wandb.watch(agent)
  for ep in range(1,num_ep+1):
    state = env.reset()
    score = 0
    done = False
    states = []
    actions = []
    rewards = []

    while not done:
      # Shape (1, state_dim): the policy expects a leading batch axis.
      state = torch.tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
      action = agent.get_action(state)
      next_state, reward, done, _ = env.step(action.item())

      # saving episode
      states.append(state)
      actions.append(action.item())
      rewards.append(reward)

      # update score and state
      score += reward
      state = next_state

    # save rollout sets
    agent.save_episode(states, actions, rewards)

    # exponential moving average of the episode score (5% weight on the newest)
    running_score = 0.05 * score + (1 - 0.05) * running_score
    wandb.log({'episode': ep, 'running_score': running_score})

    # train the agent; any direction other than 'backward' means forward
    if config.direction == 'backward':
      agent.learn_backward(norm_return = config.norm_return)
    else:
      agent.learn_forward(norm_return = config.norm_return)

    if ep % print_interval == 0:
      print('episode {} average reward {}, ended at {:.01f}'.format(ep, running_score, time.time() - start))

    if ep % save_interval == 0:
      save_name = 'agent_' + str(ep) + '.pt'
      torch.save(agent.state_dict(),save_name)
      wandb.save(save_name)
    

In [None]:
# Hand `train` to the W&B agent for the sweep registered as `sweep_id`.
wandb.agent(sweep_id, train)

# You can check out the results comparing the forward and backward REINFORCE update directions, with hyperparameter tuning, in the report below

[Link for Report](https://wandb.ai/ko120/REINFORCE_CartPole-v1_trajectory_direction/reports/REINFORCE-Updating-direction-variation--Vmlldzo4NDMxMDQ)

# Visualize agent before train

In [None]:
# For visualization (rendering OpenAI Gym)
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get install ffmpeg freeglut3-dev xvfb  
!pip install -U colabgymrender

In [None]:
# For visualization: a virtual display screen must be set up on Colab first, otherwise rendering fails.
import os
# Start an Xvfb server as display :1 (screen 0 at 1024x768, 24-bit); the trailing `&` backgrounds it.
os.system("Xvfb :1 -screen 0 1024x768x24 &")
# Point this process, and anything it launches afterwards, at that display.
os.environ['DISPLAY'] = ':1'

In [None]:
import gym
from colabgymrender.recorder import Recorder

# Roll out one episode with a freshly initialised (untrained) agent and play it back.
env = gym.make("LunarLander-v2")
directory = '/content/videos'

torch.manual_seed(543)

# Wrap the environment in the colabgymrender Recorder, pointed at `directory`.
env = Recorder(env, directory)


state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# `is_available` must be called; the bare function object is always truthy.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
agent = REINFORCE(in_dim= state_dim,  out_dim = action_dim).to(device)

state = env.reset()
terminal = False
while not terminal:
  # Force float32 to match the policy weights and add the batch axis.
  state = torch.tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
  action = agent.get_action(state)
  next_state, reward, terminal, info = env.step(action.item())
  state = next_state

env.play()

# Visualize best performing forward trained agent (lr= 1e-4) without normalization return


In [None]:
import gym
from colabgymrender.recorder import Recorder

# Roll out one episode with the saved forward-trained agent and play it back.
env = gym.make("LunarLander-v2")
directory = '/content/videos'

torch.manual_seed(543)

# Wrap the environment in the colabgymrender Recorder, pointed at `directory`.
env = Recorder(env, directory)


state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# `is_available` must be called; the bare function object is always truthy.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
agent = REINFORCE(in_dim= state_dim,  out_dim = action_dim).to(device)

# map_location lets a checkpoint written on a GPU load on a CPU-only runtime.
agent.load_state_dict(torch.load('agent_forward_withoutnorm_1e-4.pt', map_location = device))
agent.eval()


state = env.reset()
terminal = False
while not terminal:
  # Force float32 to match the policy weights and add the batch axis.
  state = torch.tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
  action = agent.get_action(state)
  next_state, reward, terminal, info = env.step(action.item())
  state = next_state

env.play()

# Visualize best performing backward trained agent (lr= 1e-4) with normalization return


In [None]:
import gym
from colabgymrender.recorder import Recorder

# Roll out one episode with the saved backward-trained agent and play it back.
env = gym.make("LunarLander-v2")
directory = '/content/videos'



# Wrap the environment in the colabgymrender Recorder, pointed at `directory`.
env = Recorder(env, directory)


state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# `is_available` must be called; the bare function object is always truthy.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
agent = REINFORCE(in_dim= state_dim,  out_dim = action_dim).to(device)

# map_location lets a checkpoint written on a GPU load on a CPU-only runtime.
agent.load_state_dict(torch.load('agent_backward_with_norm_1e-4.pt', map_location = device))
agent.eval()

state = env.reset()
terminal = False
while not terminal:
  # Force float32 to match the policy weights and add the batch axis.
  state = torch.tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
  action = agent.get_action(state)
  next_state, reward, terminal, info = env.step(action.item())
  state = next_state

env.play()