# Enable GPU

In [None]:
import torch
# Use the first CUDA GPU when one is actually usable, otherwise fall back to the CPU.
# `is_available` has to be *called*: the bare function object is always truthy,
# which would select 'cuda:0' even on a machine without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Actor Critic Shared Network

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class ActorCritic_Net(nn.Module):
  """Actor and critic heads on top of one shared hidden layer.

  Args:
    input_dims: size of the state feature vector.
    output_dims: number of discrete actions (size of the policy output).
    fc1_dims: width of the shared hidden layer.
  """

  def __init__(self, input_dims, output_dims, fc1_dims = 128):
    super(ActorCritic_Net , self).__init__()
    self.fc1 = nn.Linear(input_dims,fc1_dims)
    self.actor = nn.Linear(fc1_dims, output_dims)
    self.critic = nn.Linear(fc1_dims,1)

  def forward(self, state):
    """Return ``(pi, value)`` for ``state``.

    ``pi`` holds action probabilities that sum to 1 along the last axis and
    ``value`` is the critic output with a trailing axis of size 1.
    """
    x = F.relu(self.fc1(state))
    # Normalise over the last (action) axis: identical to dim=1 for the usual
    # (N, input_dims) batch, and also valid for an unbatched (input_dims,) state.
    pi = F.softmax(self.actor(x), dim = -1)
    value = self.critic(x)
    return (pi, value)



# Actor Critic Separate Net

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class Actor_Net(nn.Module):
  """Policy network: one hidden ReLU layer followed by a softmax over actions.

  Both weight matrices are initialised with Xavier-uniform; biases keep the
  default ``nn.Linear`` initialisation.

  Args:
    input_dims: size of the state feature vector.
    output_dims: number of discrete actions.
    fc1_dims: width of the hidden layer.
  """

  def __init__(self, input_dims, output_dims, fc1_dims = 128):
    super(Actor_Net,self).__init__()
    self.fc1 = nn.Linear(input_dims, fc1_dims)
    self.out = nn.Linear(fc1_dims, output_dims)
    torch.nn.init.xavier_uniform_(self.fc1.weight)
    torch.nn.init.xavier_uniform_(self.out.weight)

  def forward(self, state):
    """Return action probabilities that sum to 1 along the last axis."""
    x = F.relu(self.fc1(state))
    # Normalise over the last (action) axis: identical to dim=1 for the usual
    # (N, input_dims) batch, and also valid for an unbatched (input_dims,) state.
    x = F.softmax(self.out(x), dim = -1)

    return x

class Critic_Net(nn.Module):
  """State-value network: one hidden ReLU layer and a single linear output.

  Args:
    input_dims: size of the state feature vector.
    output_dims: accepted so the signature mirrors the actor's; it is not
      used, the output layer always has exactly one unit.
    fc1_dims: width of the hidden layer.
  """

  def __init__(self, input_dims, output_dims, fc1_dims = 128):
    super().__init__()
    self.fc1 = nn.Linear(input_dims, fc1_dims)
    self.out = nn.Linear(fc1_dims, 1)
    # Xavier-uniform on the weights only; biases keep the nn.Linear default.
    for layer in (self.fc1, self.out):
      nn.init.xavier_uniform_(layer.weight)

  def forward(self, state):
    """Return the value estimate for ``state`` (last axis has size 1)."""
    hidden = F.relu(self.fc1(state))
    return self.out(hidden)

# REINFORCE with Baseline Agent

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical 
import numpy as np

class ActorCritic(nn.Module):
  """REINFORCE-with-baseline agent using a separate actor and critic network.

  The actor produces action probabilities, the critic a state value that is
  used as the baseline. Three update rules are offered, all fed with one
  complete episode:

  * ``learn_mean``     - one update from the episode-averaged losses.
  * ``learn_forward``  - one update per time step, first step first.
  * ``learn_backward`` - one update per time step, last step first.

  Args:
    input_dims: size of the state feature vector.
    output_dims: number of discrete actions.
    fc1_dims: hidden width of both networks.
    gamma: discount factor for the returns.
    ac_lr: actor learning rate.
    cr_lr: critic learning rate.
    optimizer: ``'RMSprop'`` selects RMSprop; any other value selects Adam.
  """

  def __init__(self, input_dims, output_dims, fc1_dims = 128, gamma = 0.99, ac_lr = 1e-3, cr_lr = 1e-2, optimizer = 'RMSprop'):
    super(ActorCritic, self).__init__()
    self.actor_net = Actor_Net(input_dims= input_dims, output_dims= output_dims, fc1_dims= fc1_dims)
    self.critic_net = Critic_Net(input_dims= input_dims, output_dims = output_dims, fc1_dims= fc1_dims)

    # Anything other than 'RMSprop' falls back to Adam.
    optim_cls = optim.RMSprop if optimizer == 'RMSprop' else optim.Adam
    self.actor_optim = optim_cls(params = self.actor_net.parameters(), lr = ac_lr)
    self.critic_optim = optim_cls(params = self.critic_net.parameters(), lr = cr_lr)

    self.gamma = gamma

    # Per-episode buffers, emptied at the end of every learn_* call.
    # `log_probs` only holds gradient-free records written by get_action;
    # `values` is never filled by this class and is kept for compatibility.
    self.log_probs = []
    self.values = []

  def get_action(self, state):
    """Sample an action for ``state`` from the current policy.

    Runs without gradient tracking. The log-probability of the sampled action
    is appended to ``self.log_probs`` as a record only; the learn_* methods
    recompute log-probabilities with gradients enabled.

    Returns:
      The sampled action as a tensor (one entry per row of ``state``).
    """
    with torch.no_grad():
      pi = self.actor_net(state)
      distribution = Categorical(probs = pi)
      action = distribution.sample()
      self.log_probs.append(distribution.log_prob(action))

    return action

  def _module_device(self):
    """Return the device this module's parameters live on."""
    return next(self.parameters()).device

  def _clear_memory(self):
    """Empty the per-episode buffers."""
    self.values = []
    self.log_probs = []

  def _discounted_returns(self, rewards, return_norm, target_device):
    """Return the discounted returns G_t of one episode in chronological order.

    With ``return_norm`` the returns are standardised to zero mean and unit
    standard deviation. That step is skipped for fewer than two rewards,
    because the sample standard deviation of a single value is NaN.
    """
    returns = []
    G = 0.0
    for reward in reversed(list(rewards)):
      G = float(reward) + self.gamma * G
      returns.append(G)
    returns.reverse()
    returns = torch.tensor(returns, dtype = torch.float32, device = target_device)

    if return_norm and returns.numel() > 1:
      eps = np.finfo(np.float32).eps.item()
      returns = (returns - returns.mean()) / (returns.std() + eps)
    return returns

  def _update_single_step(self, G, state, action):
    """Apply one actor update and one critic update for a single time step.

    ``state`` must carry a leading batch axis of size 1. The advantage is
    detached, so the actor loss sends no gradient into the critic; the two
    networks and their optimizers are therefore independent of each other.
    """
    pi = self.actor_net(state)
    v = self.critic_net(state)
    log_prob = Categorical(probs = pi).log_prob(action)
    advantage = G - v.detach()
    actor_loss = (-log_prob * advantage).sum()
    critic_loss = F.smooth_l1_loss(v.squeeze(0), G.reshape(1))

    self.critic_optim.zero_grad()
    self.actor_optim.zero_grad()
    critic_loss.backward()
    actor_loss.backward()
    self.critic_optim.step()
    self.actor_optim.step()

  def learn_mean(self, rewards ,states, actions, return_norm = True):
    """Update both networks once from the episode-averaged losses.

    Args:
      rewards: per-step rewards of one episode, in chronological order.
      states: list of state tensors that are concatenated along dim 0
        (assumed to be one ``(1, input_dims)`` tensor per step - confirm
        against the caller).
      actions: per-step action indices.
      return_norm: standardise the returns before computing the losses.
    """
    if len(rewards) == 0:
      self._clear_memory()
      return

    target_device = self._module_device()
    states = torch.cat(states, dim = 0).to(target_device)
    actions = torch.tensor(actions).to(target_device)
    returns = self._discounted_returns(rewards, return_norm, target_device)

    # The log-probs stored by get_action were produced under no_grad and
    # cannot be back-propagated, so recompute them here for the whole episode.
    pi = self.actor_net(states)
    log_probs = Categorical(probs = pi).log_prob(actions)
    values = self.critic_net(states).squeeze(-1)

    # Detach the baseline so the actor loss does not train the critic.
    advantages = returns - values.detach()
    actor_loss = (-log_probs * advantages).mean()
    critic_loss = F.smooth_l1_loss(values, returns)

    self.critic_optim.zero_grad()
    self.actor_optim.zero_grad()
    critic_loss.backward()
    actor_loss.backward()
    self.critic_optim.step()
    self.actor_optim.step()

    self._clear_memory()

  def learn_forward(self, rewards, states, actions, return_norm = True):
    """Update both networks once per time step, from the first step to the last.

    Takes the same arguments as ``learn_mean``.
    """
    if len(rewards) == 0:
      self._clear_memory()
      return

    target_device = self._module_device()
    states = torch.cat(states, dim = 0).to(target_device)
    actions = torch.tensor(actions).to(target_device)
    returns = self._discounted_returns(rewards, return_norm, target_device)

    for G, state, action in zip(returns, states, actions):
      self._update_single_step(G, state.unsqueeze(0), action)

    self._clear_memory()

  def learn_backward(self, rewards, states, actions, return_norm = True):
    """Update both networks once per time step, from the last step to the first.

    Takes the same arguments as ``learn_mean``.
    """
    if len(rewards) == 0:
      self._clear_memory()
      return

    target_device = self._module_device()
    # Reverse states, actions and returns together so they stay aligned.
    states = torch.cat(states, dim = 0).to(target_device).flip(dims = [0])
    actions = torch.tensor(actions).to(target_device).flip(dims = [0])
    returns = self._discounted_returns(rewards, return_norm, target_device).flip(dims = [0])

    for G, state, action in zip(returns, states, actions):
      self._update_single_step(G, state.unsqueeze(0), action)

    self._clear_memory()


# Without Wandb

In [None]:
import gym
import torch
import time
import pdb

def train():
  """Train the REINFORCE-with-baseline agent on CartPole-v1 and print progress.

  Runs 1000 episodes. After each episode the agent is updated with
  ``learn_backward`` on the collected rewards, states and actions, and an
  exponential moving average of the episode score is printed every 100
  episodes.

  NOTE(review): uses the classic gym API (``env.seed``, ``reset()`` returning
  only the observation, ``step()`` returning four values) - confirm the
  installed gym version matches.
  """

  start = time.time()

  env = gym.make('CartPole-v1')
  env.seed(543)
  torch.manual_seed(543)

  state_dim = env.observation_space.shape[0]
  action_dim = env.action_space.n

  # `is_available` must be called; the bare function object is always truthy.
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
  agent = ActorCritic(input_dims = state_dim,  output_dims = action_dim, ac_lr = 1e-3, cr_lr = 0.01, optimizer= 'Adam').to(device)
  num_ep = 1000
  print_interval = 100
  running_score = 10

  for ep in range(num_ep):
    state = env.reset()
    score = 0
    done = False
    rewards = []
    states = []
    actions = []

    while not done:
      # Shape (1, state_dim): the networks expect a leading batch axis.
      state = torch.as_tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
      action = agent.get_action(state)
      next_state, reward, done, _ = env.step(action.item())

      # saving episode
      rewards.append(reward)
      states.append(state)
      actions.append(action.item())
      # update score and state
      score += reward
      state = next_state

    # calculating score and running score (exponential moving average)
    running_score = 0.05 * score + (1 - 0.05) * running_score

    # train the agent
    agent.learn_backward(rewards, states, actions, return_norm = True)

    if ep % print_interval == 0:
      print('episode {} average reward {}, ended at {:.01f}'.format(ep, running_score, time.time() - start))
  

In [None]:
# Run the training loop defined in the previous cell (no Weights & Biases logging).
train()

# With Wandb

In [None]:
# Notebook shell commands: install the wandb package, then log in to it.
!pip install wandb
!wandb login

In [None]:
import wandb
# Grid sweep: every combination of update rule, actor learning rate and critic
# learning rate is tried, with the hidden width and the optimizer held fixed.
# The metric to maximise is the 'running_score' value logged during training.
sweep_config = {
  'method': 'grid',
  'metric': {'name': 'running_score', 'goal': 'maximize'},
  'parameters': {
    'learning': {'values': ['learn_mean', 'learn_forward', 'learn_backward']},
    'actor_learning_rate': {'values': [0.01, 0.001, 0.0001, 0.0003, 0.00001]},
    'critic_learning_rate': {'values': [0.01, 0.001, 0.0001, 0.0003, 0.00001]},
    'num_neurons': {'value': 128},
    'optimizer': {'values': ['Adam']},
  },
}

sweep_id = wandb.sweep(sweep_config, project = 'REINFORCE_Baseline_seperate_net')

In [None]:
import gym 
import torch
import time
import wandb

def train():
  """Train one sweep configuration on CartPole-v1 and log it to Weights & Biases.

  Reads ``learning``, ``actor_learning_rate``, ``critic_learning_rate``,
  ``num_neurons`` and ``optimizer`` from ``wandb.config``. Runs 1000 episodes,
  logs the running score after each one, and after the last episode exports
  both networks as ONNX files and state dicts and uploads them.

  Raises:
    ValueError: if ``config.learning`` does not name one of the agent's
      update rules (previously such a run silently skipped all training).

  NOTE(review): uses the classic gym API (``env.seed``, ``reset()`` returning
  only the observation, ``step()`` returning four values) - confirm the
  installed gym version matches.
  """
  # NOTE(review): the keys 'algorithm:' (trailing colon) and 'num_laeyrs' look
  # like typos; they are left unchanged so new runs stay comparable with runs
  # already logged under these keys - confirm before renaming.
  wandb.init(config = {'env':'CartPole-v1','algorithm:': 'REINFORCE_Baseline','architecture': 'seperate','num_laeyrs':'2'}, project = 'REINFORCE_Baseline_seperate_net',group = 'Cart_REINFORCE_Baseline_with_128_seperate')
  config = wandb.config
  start = time.time()

  env = gym.make('CartPole-v1')
  env.seed(543)
  torch.manual_seed(543)

  state_dim = env.observation_space.shape[0]
  action_dim = env.action_space.n

  # `is_available` must be called; the bare function object is always truthy.
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
  # Pass the swept hidden width through so 'num_neurons' actually takes effect.
  agent = ActorCritic(input_dims = state_dim,  output_dims = action_dim, fc1_dims = config.num_neurons, ac_lr = config.actor_learning_rate, cr_lr = config.critic_learning_rate, optimizer = config.optimizer).to(device)
  num_ep = 1000
  print_interval = 100
  running_score = 10

  # Resolve the update rule once, and fail loudly on an unknown name.
  learners = {
    'learn_mean': agent.learn_mean,
    'learn_forward': agent.learn_forward,
    'learn_backward': agent.learn_backward,
  }
  if config.learning not in learners:
    raise ValueError('unknown learning method: {}'.format(config.learning))
  learn = learners[config.learning]

  wandb.watch(agent)
  for ep in range(1,num_ep+1):
    state = env.reset()
    score = 0
    done = False
    rewards = []
    states = []
    actions = []
    while not done:
      # Shape (1, state_dim): the networks expect a leading batch axis.
      state = torch.as_tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
      action = agent.get_action(state)
      next_state, reward, done, _ = env.step(action.item())

      # saving episode
      rewards.append(reward)
      states.append(state)
      actions.append(action.item())

      # update score and state
      score += reward
      state = next_state

    # calculating score and running score (exponential moving average)
    running_score = 0.05 * score + (1 - 0.05) * running_score
    wandb.log({'episode': ep, 'running_score': running_score})

    # train the agent
    learn(rewards, states, actions, return_norm = True)

    if ep % print_interval == 0:
      print('episode {} average reward {}, ended at {:.01f}'.format(ep, running_score, time.time() - start))

    # After the final episode, export and upload both networks.
    if ep == num_ep:
      # Size the tracing input from the environment instead of hard-coding 4.
      dummy_input = torch.rand(1, state_dim).to(device)
      torch.onnx.export(agent.actor_net,dummy_input,'final_actor.onnx')
      torch.onnx.export(agent.critic_net,dummy_input, 'final_critic.onnx')
      wandb.save('final_actor.onnx')
      wandb.save('final_critic.onnx')
      torch.save(agent.actor_net.state_dict(),'final_actor.pt')
      wandb.save('final_actor.pt')
      torch.save(agent.critic_net.state_dict(),'final_critic.pt')
      wandb.save('final_critic.pt')
    

In [None]:
# Start a W&B sweep agent for the sweep created above, using train() as the run function.
wandb.agent(sweep_id, train)

# You can see the result here!
[Report Link](https://wandb.ai/ko120/REINFORCE_Baseline/reports/REINFORCE-with-Baseline-forward-and-backward--Vmlldzo4NzM4ODE)