In [17]:
#Packages needed
import gymnasium as gym
import numpy as np
import random
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim

In [18]:
#Define value and policy network

#Value network
class ValueNetwork(nn.Module):
    """State-value estimator used as the baseline for the policy gradient.

    A fully connected network (state_dim -> 128 -> 64 -> 1) with ReLU
    activations between the linear layers. The single output is the
    predicted return for the given state.
    """

    def __init__(self, state_dim):
        super().__init__()
        hidden_widths = (128, 64)
        modules = []
        fan_in = state_dim
        # Layers are created in input-to-output order so parameter names
        # (net.0, net.2, net.4) and initialisation order stay the same.
        for width in hidden_widths:
            modules.append(nn.Linear(fan_in, width))
            modules.append(nn.ReLU())
            fan_in = width
        modules.append(nn.Linear(fan_in, 1))  # 1 is the expected reward
        self.net = nn.Sequential(*modules)

    def forward(self, state):
        """Return the value prediction with the trailing size-1 dimension removed."""
        prediction = self.net(state)
        return prediction.squeeze(-1)

#Policy network
class VPG(nn.Module):
    """Categorical policy network for vanilla policy gradient.

    A fully connected network (state_dim -> 64 -> 64 -> action_dim) with ReLU
    activations; the outputs are unnormalised action logits.
    """

    def __init__(self,state_dim, action_dim):
        super().__init__()
        hidden = 64
        stack = [
            nn.Linear(state_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, action_dim),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the action logits the network produces for ``x``."""
        return self.net(x)

    def get_action(self, state):
        """Sample an action for one NumPy observation.

        The observation is converted to a float tensor and given a leading
        batch dimension before being passed through the network.

        Returns:
            A tuple ``(action, log_prob)``: the sampled action as a Python
            number and the log-probability tensor of that action (it keeps
            the leading batch dimension added here).
        """
        observation = torch.from_numpy(state).float()
        batched = observation.unsqueeze(0)
        action_dist = torch.distributions.Categorical(logits = self.forward(batched))
        sampled = action_dist.sample()
        sampled_log_prob = action_dist.log_prob(sampled)
        return sampled.item(), sampled_log_prob
        
def get_rewards_tg(rewards, gamma = 0.99):
    """Compute the discounted reward-to-go for every step of one episode.

    For step ``t`` the result is ``r_t + gamma * r_{t+1} + gamma**2 * r_{t+2} + ...``.

    Args:
        rewards: Sequence of per-step rewards, in the order they were received.
        gamma: Discount factor applied once per step into the future.

    Returns:
        A 1-D float32 tensor with one entry per reward (empty if ``rewards``
        is empty).
    """
    discounted_return = 0.0
    rewards_to_go = []
    # Walk the episode backwards so each step reuses the return already
    # accumulated for the step after it.
    for r in reversed(rewards):
        discounted_return = r + gamma * discounted_return
        rewards_to_go.append(discounted_return)
    # Appending and reversing once keeps this linear; inserting at index 0
    # on every step would make the loop quadratic in episode length.
    rewards_to_go.reverse()
    # Pin the dtype so the result does not depend on the scalar type of the
    # incoming rewards.
    return torch.tensor(rewards_to_go, dtype=torch.float32)

In [16]:
#Training loop
import torch.nn.functional as F

# Seed every RNG involved so that Restart & Run All reproduces the same run.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

epochs = 100
batch_size = 5  # episodes collected before each policy/value update

env = gym.make("CartPole-v1")
env.reset(seed=SEED)  # seeds the environment RNG; later reset() calls continue that stream
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
policy = VPG(state_size,action_size)
value = ValueNetwork(state_size)

policy_optimizer = optim.Adam(policy.parameters(), lr = 0.003)
value_optimizer = optim.Adam(value.parameters(), lr = 0.005)

for epoch in tqdm(range(epochs), desc = "Training Epochs: ", unit = "ep"):
    batch_log_probs = []
    batch_rewards = []          # per-episode reward-to-go tensors
    batch_episode_rewards = []  # undiscounted episode totals, for reporting only
    batch_values = []           # per-episode value predictions (graph attached)

    # Collect a batch of complete episodes with the current policy.
    for _ in range(batch_size):
        state, _info = env.reset()
        done = False
        log_probs = []
        rewards = []
        states = []

        while not done:
            action, log_prob = policy.get_action(state)
            next_state, reward, termination, truncation, _info = env.step(action)
            done = termination or truncation

            rewards.append(reward)
            log_probs.append(log_prob)
            states.append(state)

            state = next_state

        states_tensor = torch.as_tensor(np.array(states), dtype=torch.float32)

        batch_log_probs.extend(log_probs)
        # Keep each episode's returns as one tensor; they are concatenated once
        # per epoch instead of being rebuilt element by element.
        batch_rewards.append(get_rewards_tg(rewards).float())
        # ValueNetwork.forward already drops the trailing size-1 dimension, so
        # this is one prediction per visited state. No further squeeze: a bare
        # squeeze() would turn a length-1 episode into a 0-d tensor, which
        # torch.cat cannot concatenate.
        batch_values.append(value(states_tensor))
        batch_episode_rewards.append(sum(rewards))

    # Flatten everything to 1-D tensors with one entry per collected step.
    batch_log_probs = torch.stack(batch_log_probs).reshape(-1)
    batch_rewards = torch.cat(batch_rewards)
    batch_values = torch.cat(batch_values)

    # Normalise the returns so the value network regresses onto targets with
    # mean 0 and unit standard deviation.
    batch_rewards = (batch_rewards - batch_rewards.mean()) / (batch_rewards.std() + 1e-9)
    # The baseline is detached: the policy loss must not update the value network.
    advantages = batch_rewards - batch_values.detach()

    # Normalise the advantages as well, as was done for the returns.
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-9)

    # Update the value network by regressing its predictions onto the normalised returns.
    value_loss = F.mse_loss(batch_values, batch_rewards)
    value_optimizer.zero_grad()
    value_loss.backward()
    torch.nn.utils.clip_grad_norm_(value.parameters(), max_norm=1.0)
    value_optimizer.step()

    #Update policy
    policy_loss = -(batch_log_probs * advantages).mean()
    policy_optimizer.zero_grad()
    policy_loss.backward()
    torch.nn.utils.clip_grad_norm_(policy.parameters(), max_norm=1.0)
    policy_optimizer.step()

    if epoch % 20 == 0:
        avg_reward = sum(batch_episode_rewards) / batch_size
        print(f"\nEpoch {epoch}")
        print(f"  Avg Reward: {avg_reward:.2f}")
        print(f"  Policy Loss: {policy_loss.item():.4f}")
        print(f"  Value Loss: {value_loss.item():.4f}")  # Should decrease over time!
        print(f"  Mean Value Prediction: {batch_values.mean().item():.2f}")

# Release the environment's resources once training is finished.
env.close()

Training Epochs:   2%|█▎                                                               | 2/100 [00:00<00:38,  2.58ep/s]


Epoch 0
  Avg Reward: 18.80
  Policy Loss: 0.0094
  Value Loss: 1.0190
  Mean Value Prediction: 0.14


Training Epochs:  21%|█████████████▍                                                  | 21/100 [00:06<00:30,  2.58ep/s]


Epoch 20
  Avg Reward: 32.40
  Policy Loss: -0.0239
  Value Loss: 0.8734
  Mean Value Prediction: -0.02


Training Epochs:  41%|██████████████████████████▏                                     | 41/100 [00:23<01:07,  1.15s/ep]


Epoch 40
  Avg Reward: 137.00
  Policy Loss: -0.0191
  Value Loss: 0.4853
  Mean Value Prediction: 0.07


Training Epochs:  61%|███████████████████████████████████████                         | 61/100 [00:59<01:40,  2.58s/ep]


Epoch 60
  Avg Reward: 377.20
  Policy Loss: 0.0036
  Value Loss: 0.6839
  Mean Value Prediction: -0.30


Training Epochs:  81%|███████████████████████████████████████████████████▊            | 81/100 [02:02<00:58,  3.08s/ep]


Epoch 80
  Avg Reward: 428.00
  Policy Loss: -0.0259
  Value Loss: 0.6697
  Mean Value Prediction: 0.50


Training Epochs: 100%|███████████████████████████████████████████████████████████████| 100/100 [03:11<00:00,  1.91s/ep]


Insights/mistakes:

To implement vanilla policy gradient (VPG) with baseline, I expanded on the code from the base VPG implementation. I tried to code this without looking at any existing implementations, and I decided to look at Spinning Up RL and online resources to understand the high-level process of adding a value function. 

After reading, I first added a value network, and then modified my training loop to compute the advantage function as Q(s,a) - V(s). I originally used an arbitrary, lighter neural network for my value network, but this turned out to be a poor decision. Since the baseline only helps if the value function evaluates states accurately, it's better to give the value network a denser architecture that can predict values well. This value function is used together with the existing rewards-to-go to form the advantage function (Q(s,a) - V(s), where Q(s,a) is the rewards-to-go and V(s) is the estimated value). This is an improvement on base VPG, which represents the advantage function using just Q(s,a), i.e. the rewards-to-go, without a value estimator. Subtracting the baseline reduces variance.


When running the updated training loop, I ran into a problem where my value loss was absurdly high. After looking more closely at the code, I noticed that I hadn't normalized the rewards. The problem is that a neural network is initialized to predict values near 0, which makes it hard for it to predict large values such as those in [0,500]. Normalizing the rewards to mean = 0 and std = 1 scales these large targets down to a much smaller range, so the value network can learn them more effectively.

In addition to normalizing rewards, it's also important to normalize advantages: this keeps learning stable across batches, since rewards can differ widely even between the episodes within a single batch.

TLDR:
- The advantage function can be represented by the rewards-to-go minus the value function, instead of using just the rewards-to-go as in the base VPG implementation. Subtracting the baseline (the output of the value function) from the rewards-to-go reduces variance because the model now knows what is better or worse than expected, rather than just seeing a reward number with nothing to compare it to.
- Have a value function that is of equal or higher complexity than the policy network
- Normalize rewards to lower value loss by making larger values more predictable for neural networks