# Enable GPU

In [None]:
import torch
# Pick the first CUDA device when one is usable, otherwise fall back to the CPU.
# `is_available` must be *called*: the bare function object is always truthy,
# which would select 'cuda:0' even on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Actor-Critic Shared Network

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class ActorCritic_Net(nn.Module):
  """Two-headed network: one shared hidden layer feeding a policy head and a value head.

  Args:
    input_dims: size of each input state vector.
    output_dims: number of discrete actions (width of the policy head).
    fc1_dims: width of the shared hidden layer.
  """

  def __init__(self, input_dims, output_dims, fc1_dims = 128):
    super().__init__()
    # Shared trunk first, then the two heads (creation order fixes the seeded init).
    self.fc1 = nn.Linear(in_features = input_dims, out_features = fc1_dims)
    self.actor = nn.Linear(in_features = fc1_dims, out_features = output_dims)
    self.critic = nn.Linear(in_features = fc1_dims, out_features = 1)

  def forward(self, state):
    """Return ``(action_probs, state_value)`` for a 2-D batch of states.

    The softmax is taken over dim 1, so ``state`` must carry a batch dimension.
    """
    hidden = torch.relu(self.fc1(state))
    action_probs = self.actor(hidden).softmax(dim = 1)
    state_value = self.critic(hidden)
    return action_probs, state_value



# Actor-Critic Separate Networks

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class Actor_Net(nn.Module):
  """Policy network: one hidden ReLU layer followed by a softmax over actions.

  Args:
    input_dims: size of each input state vector.
    output_dims: number of discrete actions.
    fc1_dims: width of the hidden layer.
  """

  def __init__(self, input_dims, output_dims, fc1_dims = 128):
    super().__init__()
    self.fc1 = nn.Linear(input_dims, fc1_dims)
    self.out = nn.Linear(fc1_dims, output_dims)
    # Re-initialize both weight matrices with Xavier-uniform (biases keep the default init).
    for layer in (self.fc1, self.out):
      nn.init.xavier_uniform_(layer.weight)

  def forward(self, state):
    """Return action probabilities for a 2-D batch of states (softmax over dim 1)."""
    hidden = torch.relu(self.fc1(state))
    logits = self.out(hidden)
    return logits.softmax(dim = 1)

class Critic_Net(nn.Module):
  """State-value network: one hidden ReLU layer followed by a single linear output.

  Args:
    input_dims: size of each input state vector.
    output_dims: accepted for signature parity with ``Actor_Net`` but not used;
      the output width is always 1.
    fc1_dims: width of the hidden layer.
  """

  def __init__(self, input_dims, output_dims, fc1_dims = 128):
    super().__init__()
    self.fc1 = nn.Linear(input_dims, fc1_dims)
    self.out = nn.Linear(fc1_dims, 1)
    # Re-initialize both weight matrices with Xavier-uniform (biases keep the default init).
    for layer in (self.fc1, self.out):
      nn.init.xavier_uniform_(layer.weight)

  def forward(self, state):
    """Return the estimated value of each state, one scalar per input row."""
    hidden = torch.relu(self.fc1(state))
    return self.out(hidden)

# REINFORCE with Baseline Agent

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical 
import numpy as np


class ActorCritic(nn.Module):
  """REINFORCE-with-baseline agent built on the shared-trunk ``ActorCritic_Net``.

  The agent samples actions with ``get_action`` during an episode and is then
  trained once the episode is over with one of three update schemes:

  * ``learn_mean``     - a single optimizer step on the loss summed over the episode.
  * ``learn_forward``  - one optimizer step per time step, first step first.
  * ``learn_backward`` - one optimizer step per time step, last step first.

  Args:
    input_dims: size of each state vector.
    output_dims: number of discrete actions.
    fc1_dims: hidden width passed to ``ActorCritic_Net``.
    gamma: discount factor used when accumulating returns.
    lr: learning rate of the Adam optimizer.
  """

  def __init__(self, input_dims, output_dims, fc1_dims = 128, gamma = 0.99, lr = 1e-4):
    super(ActorCritic, self).__init__()
    self.ac_net = ActorCritic_Net(input_dims= input_dims, output_dims= output_dims, fc1_dims = fc1_dims)
    self.optimizer = optim.Adam(params= self.ac_net.parameters(), lr = lr)
    self.gamma = gamma

    # Per-episode buffers filled by get_action and consumed by learn_mean.
    self.log_probs = []
    self.values = []

  def _device(self):
    """Return the device the agent's parameters live on (CPU if it has none)."""
    # Reading the device from the parameters keeps the class independent of any
    # module-level `device` variable and always consistent with `.to(...)`.
    param = next(self.parameters(), None)
    return param.device if param is not None else torch.device('cpu')

  def _clear_memory(self):
    """Drop the per-episode buffers filled by ``get_action``."""
    self.values = []
    self.log_probs = []

  def _discounted_returns(self, rewards, return_norm):
    """Return the discounted return of every step as a float32 tensor.

    With ``return_norm`` the returns are standardized to zero mean and unit
    standard deviation. Standardization is skipped for fewer than two steps,
    because the standard deviation of a single value is NaN.
    """
    returns = []
    G = 0.0
    # Accumulate from the last step backwards, then restore chronological order
    # (append + reverse is linear, unlike repeated insert(0, ...)).
    for reward in reversed(rewards):
      G = float(reward) + self.gamma * G
      returns.append(G)
    returns.reverse()
    returns = torch.tensor(returns, dtype = torch.float32, device = self._device())

    if return_norm and returns.numel() > 1:
      eps = np.finfo(np.float32).eps.item()
      returns = (returns - returns.mean()) / (returns.std() + eps)
    return returns

  def get_action(self, state):
    """Sample an action for ``state`` and remember its log-probability and value.

    Args:
      state: tensor fed to ``ac_net``; the training loops in this file pass a
        ``(1, input_dims)`` batch.

    Returns:
      The sampled action as a tensor (call ``.item()`` to get the integer).
    """
    pi, v = self.ac_net(state)
    distribution = Categorical(probs = pi)
    action = distribution.sample()
    self.log_probs.append(distribution.log_prob(action))
    self.values.append(v)

    return action

  def learn_mean(self, rewards ,states, actions, return_norm = True):
    """Do one optimizer step on the losses summed over the whole episode.

    Uses the log-probabilities and values recorded by ``get_action``; ``states``
    and ``actions`` are accepted for signature parity with the other update
    methods but are not read. Despite the name, the per-step losses are summed,
    not averaged.

    Args:
      rewards: per-step rewards of the episode, in chronological order.
      states: unused.
      actions: unused.
      return_norm: standardize the returns before computing the losses.
    """
    if len(rewards) == 0:
      # Nothing to learn from; just make sure stale buffers do not leak over.
      self._clear_memory()
      return

    returns = self._discounted_returns(rewards, return_norm)

    # Flatten to 1-D with reshape(-1): unlike squeeze(), this keeps a length-1
    # vector for one-step episodes.
    values = torch.cat(self.values).reshape(-1)
    log_probs = torch.cat(self.log_probs).reshape(-1)

    # The baseline is detached so the actor loss does not push gradients into the critic head.
    advantages = returns - values.detach()
    actor_loss = -(log_probs * advantages).sum()
    critic_loss = F.smooth_l1_loss(values, returns, reduction = 'sum')

    self.optimizer.zero_grad()
    loss = actor_loss + critic_loss
    loss.backward()
    self.optimizer.step()

    # clear out the memory
    self._clear_memory()

  def _learn_stepwise(self, rewards, states, actions, return_norm, reverse):
    """Do one optimizer step per time step, re-evaluating the network each time.

    Shared implementation of ``learn_forward`` (``reverse=False``) and
    ``learn_backward`` (``reverse=True``).
    """
    if len(rewards) == 0:
      self._clear_memory()
      return

    device = self._device()
    returns = self._discounted_returns(rewards, return_norm)
    states = torch.cat(states, dim = 0).to(device)
    actions = torch.tensor(actions).to(device)

    if reverse:
      # Standardization is order-independent, so flipping after it is equivalent.
      returns = returns.flip(dims = [0])
      states = states.flip(dims = [0])
      actions = actions.flip(dims = [0])

    for G, state, action in zip(returns, states, actions):
      # The policy is re-evaluated because earlier steps in this loop already changed the weights.
      pi, v = self.ac_net(state.unsqueeze(0))
      log_prob = Categorical(probs = pi).log_prob(action)
      advantage = G - v.item() # .item() detaches the baseline from the graph
      actor_loss = -log_prob * advantage
      critic_loss = F.smooth_l1_loss(v, G.reshape(1, 1))
      self.optimizer.zero_grad()
      # Reduce to a scalar so backward() does not depend on the loss shapes.
      loss = (actor_loss + critic_loss).sum()
      loss.backward()
      self.optimizer.step()

    # clear out the memory
    self._clear_memory()

  def learn_forward(self, rewards, states, actions, return_norm = True):
    """Update once per time step, walking the episode from first step to last.

    Args:
      rewards: per-step rewards, in chronological order.
      states: list of state tensors, concatenated along dim 0 (one row per step).
      actions: list of integer actions taken at each step.
      return_norm: standardize the returns before computing the losses.
    """
    self._learn_stepwise(rewards, states, actions, return_norm, reverse = False)

  def learn_backward(self, rewards, states, actions, return_norm = True):
    """Update once per time step, walking the episode from last step to first.

    Takes the same arguments as ``learn_forward``.
    """
    self._learn_stepwise(rewards, states, actions, return_norm, reverse = True)


# Environment requirements for LunarLander-v2

In [None]:
# Shell commands (notebook `!` magics): install swig, then gym with its box2d extra.
# swig is installed first, presumably because the box2d build needs it -- TODO confirm.
!pip install swig
!pip install gym[box2d]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Without a Wandb sweep (single run, metrics are still logged to Wandb)

In [None]:
import gym
import torch
import time
import pdb
import wandb

def train():
  """Train the agent on LunarLander-v2 for 3000 episodes with per-step forward updates.

  Each episode is played to completion while rewards, states and actions are
  recorded; the agent is then updated with ``learn_forward``. An exponential
  moving average of the episode score is logged to wandb after every episode
  and printed every ``print_interval`` episodes.
  """
  wandb.init(config = {'env':'LunarLander-v2','algorithm:': 'REINFORCE_Baseline_forward','architecture': 'seperate','num_laeyrs':'2'}, project = 'REINFORCE_Baseline_seperate_net_LunarLander-v2',group = 'REINFORCE_Baseline_with_128_seperate_LunarLander-v2')
  start = time.time()
  env = gym.make('LunarLander-v2')
  env.seed(543)
  torch.manual_seed(543)

  state_dim = env.observation_space.shape[0]
  action_dim = env.action_space.n

  # `is_available` must be called; the bare function object is always truthy.
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
  agent = ActorCritic(input_dims = state_dim,  output_dims = action_dim, fc1_dims= 128, lr = 0.0001).to(device)
  num_ep = 3000
  print_interval = 100
  running_score = 10

  for ep in range(num_ep):
    state = env.reset()
    score = 0
    done = False
    rewards = []
    states = []
    actions = []

    while not done:
      # Copy the observation into a (1, state_dim) float tensor; converting the
      # array directly avoids the slow list-of-ndarray path of torch.tensor([state]).
      state = torch.tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
      action = agent.get_action(state)
      next_state, reward, done, _ = env.step(action.item())

      # saving episode
      rewards.append(reward)
      states.append(state)
      actions.append(action.item())
      # update score and state
      score += reward
      state = next_state

    # calculating score and running score (exponential moving average, weight 0.05)
    running_score = 0.05 * score + (1 - 0.05) * running_score
    wandb.log({'episode': ep, 'running_score': running_score})
    # train the agent
    agent.learn_forward(rewards, states, actions, return_norm = True)

    if ep % print_interval == 0:
      print('episode {} average reward {}, ended at {:.01f}'.format(ep, running_score, time.time() - start))
  

In [None]:
# Launch the single training run defined by train().
train() 

VBox(children=(Label(value='0.160 MB of 0.160 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▂▃▂▃▃▅▅▅▇▆▇▇▆▇▆▇▇█▇█▇█▇██▇▇█▇▇▇█▇███

0,1
episode,2999.0
running_score,188.31003


  deprecation(
  deprecation(
  deprecation(


episode 0 average reward 4.90757931316575, ended at 0.3
episode 100 average reward -154.7003953461933, ended at 40.3
episode 200 average reward -130.8306625561131, ended at 80.5
episode 300 average reward -120.865426746906, ended at 131.9
episode 400 average reward -127.00624485925636, ended at 205.0
episode 500 average reward -69.92539738236269, ended at 293.7
episode 600 average reward -32.74572212994992, ended at 414.4
episode 700 average reward -15.474055639967391, ended at 600.2
episode 800 average reward 1.039568641435228, ended at 831.5
episode 900 average reward 14.917366459064779, ended at 1095.8
episode 1000 average reward 41.37915629090298, ended at 1267.7
episode 1100 average reward 18.8907318705555, ended at 1435.8
episode 1200 average reward 27.92161920217364, ended at 1603.2
episode 1300 average reward 43.00665248672294, ended at 1802.4
episode 1400 average reward 13.01791170609769, ended at 1954.5
episode 1500 average reward 53.89668157802253, ended at 2201.5
episode 16

# With a Wandb hyperparameter sweep

In [None]:
# Shell commands (notebook `!` magics): install the wandb client and log in.
!pip install wandb
!wandb login

In [None]:
import wandb
# Grid sweep over the update scheme and the actor/critic learning rates,
# ranked by the highest `running_score`.
sweep_config = {
  'method': 'grid',
  'metric': {'name': 'running_score', 'goal': 'maximize'},
  'parameters': {
    'learning': {'values': ['learn_forward', 'learn_backward']},
    'actor_learning_rate': {'values': [0.01, 0.001, 0.0001, 0.0003, 0.00001]},
    'critic_learning_rate': {'values': [0.01, 0.001, 0.0001, 0.0003, 0.00001]},
    'num_neurons': {'value': 128},
    'optimizer': {'values': ['Adam']},
  },
}

# Register the sweep with wandb; the returned id is what the sweep agent runs.
sweep_id = wandb.sweep(sweep_config, project = 'REINFORCE_Baseline_seperate_net_LunarLander-v2')

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Create sweep with ID: y4vaeztt
Sweep URL: https://wandb.ai/ko120/REINFORCE_Baseline_seperate_net_LunarLander-v2/sweeps/y4vaeztt


In [None]:
import gym 
import torch
import time
import wandb

def train():
  """Run one sweep trial: train on LunarLander-v2 for 3000 episodes and export the networks.

  Hyperparameters (``learning``, ``actor_learning_rate``, ``critic_learning_rate``,
  ``optimizer``) are read from ``wandb.config``. After every episode the running
  score is logged and the agent is updated with the method named by
  ``config.learning``. After the last episode the actor and critic networks are
  saved as ONNX and as PyTorch state dicts and uploaded to wandb.

  Raises:
    ValueError: if ``config.learning`` does not name a supported update method.
  """
  wandb.init(config = {'env':'LunarLander-v2','algorithm:': 'REINFORCE_Baseline','architecture': 'seperate','num_laeyrs':'2'}, project = 'REINFORCE_Baseline_seperate_net_LunarLander-v2',group = 'REINFORCE_Baseline_with_128_seperate_LunarLander-v2')
  config = wandb.config
  start = time.time()

  # Fail fast: an unknown method name would otherwise silently skip every update.
  learn_methods = ('learn_mean', 'learn_forward', 'learn_backward')
  if config.learning not in learn_methods:
    raise ValueError('unknown learning method {!r}; expected one of {}'.format(config.learning, learn_methods))

  env = gym.make('LunarLander-v2')
  env.seed(543)
  torch.manual_seed(543)

  state_dim = env.observation_space.shape[0]
  action_dim = env.action_space.n

  # `is_available` must be called; the bare function object is always truthy.
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
  # NOTE(review): this call passes ac_lr / cr_lr / optimizer and the export below
  # reads agent.actor_net / agent.critic_net, so it expects a separate-network
  # ActorCritic variant -- confirm that class is the one defined when this runs.
  agent = ActorCritic(input_dims = state_dim,  output_dims = action_dim, ac_lr = config.actor_learning_rate, cr_lr = config.critic_learning_rate, optimizer = config.optimizer).to(device)
  learn = getattr(agent, config.learning)
  num_ep = 3000
  print_interval = 100
  running_score = 10

  wandb.watch(agent)
  for ep in range(1,num_ep+1):
    state = env.reset()
    score = 0
    done = False
    rewards = []
    states = []
    actions = []
    while not done:
      # Copy the observation into a (1, state_dim) float tensor; converting the
      # array directly avoids the slow list-of-ndarray path of torch.tensor([state]).
      state = torch.tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
      action = agent.get_action(state)
      next_state, reward, done, _ = env.step(action.item())

      # saving episode
      rewards.append(reward)
      states.append(state)
      actions.append(action.item())

      # update score and state
      score += reward
      state = next_state

    # calculating score and running score (exponential moving average, weight 0.05)
    running_score = 0.05 * score + (1 - 0.05) * running_score
    wandb.log({'episode': ep, 'running_score': running_score})

    # train the agent
    learn(rewards, states, actions, return_norm = True)

    if ep % print_interval == 0:
      print('episode {} average reward {}, ended at {:.01f}'.format(ep, running_score, time.time() - start))

  # Export the final networks. The dummy input must match the environment's
  # state size: a hard-coded width of 4 fails against the 8-dimensional
  # LunarLander state ("mat1 and mat2 shapes cannot be multiplied (1x4 and 8x128)").
  dummy_input = torch.rand(1, state_dim).to(device)
  torch.onnx.export(agent.actor_net,dummy_input,'final_actor.onnx')
  torch.onnx.export(agent.critic_net,dummy_input, 'final_critic.onnx')
  wandb.save('final_actor.onnx')
  wandb.save('final_critic.onnx')
  torch.save(agent.actor_net.state_dict(),'final_actor.pt')
  wandb.save('final_actor.pt')
  torch.save(agent.critic_net.state_dict(),'final_critic.pt')
  wandb.save('final_critic.pt')
    

In [None]:
# Start a wandb sweep agent that calls train() for the sweep identified by sweep_id.
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: vqaap045 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mko120[0m. Use [1m`wandb login --relogin`[0m to force relogin


  deprecation(
  deprecation(
  deprecation(


episode 100 average reward -611.4769293973748, ended at 34.5
episode 200 average reward -925.7033412410781, ended at 90.4
episode 300 average reward -755.9739274822316, ended at 141.7
episode 400 average reward -1223.870050008948, ended at 192.4
episode 500 average reward -954.0698186220156, ended at 242.1
episode 600 average reward -792.4857167928919, ended at 292.5
episode 700 average reward -845.8460096218193, ended at 343.3
episode 800 average reward -913.9222422183469, ended at 394.9
episode 900 average reward -793.9745881372224, ended at 444.3
episode 1000 average reward -817.3260974856521, ended at 493.0
episode 1100 average reward -781.9977044996244, ended at 544.6
episode 1200 average reward -781.1797906379461, ended at 595.3
episode 1300 average reward -838.621673967382, ended at 643.6
episode 1400 average reward -936.5563570756482, ended at 698.8
episode 1500 average reward -877.8846770013736, ended at 750.3
episode 1600 average reward -803.0224726619253, ended at 800.2
epis

VBox(children=(Label(value='0.001 MB of 0.038 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.019265…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,█▅▂▂▃▄▃▃▃▂▃▃▃▃▃▂▃▃▂▂▄▃▄▂▃▁▂▃▃▃▂▃▃▃▃▃▂▄▂▄

0,1
episode,3000.0
running_score,-751.61846


Run vqaap045 errored: RuntimeError('mat1 and mat2 shapes cannot be multiplied (1x4 and 8x128)')
[34m[1mwandb[0m: [32m[41mERROR[0m Run vqaap045 errored: RuntimeError('mat1 and mat2 shapes cannot be multiplied (1x4 and 8x128)')
[34m[1mwandb[0m: Agent Starting Run: mi3crc08 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward -537.5039225203337, ended at 26.6
episode 200 average reward -511.5372740474345, ended at 53.0
episode 300 average reward -594.300630174898, ended at 80.2
episode 400 average reward -608.6068338859022, ended at 107.8
episode 500 average reward -538.8779530088104, ended at 135.3
episode 600 average reward -568.8609763943414, ended at 162.8
episode 700 average reward -560.6892442156437, ended at 189.8
episode 800 average reward -560.9704819369373, ended at 216.4
episode 900 average reward -586.293070690808, ended at 243.4
episode 1000 average reward -510.9530816105201, ended at 270.3
episode 1100 average reward -601.5276531726154, ended at 297.8
