In [None]:
#Policy-based reinforcement learning optimizes the policy itself (the mapping from states to action probabilities) directly, instead of first learning action values and deriving a policy from them

In [5]:
#Packages needed
import gymnasium as gym
import numpy as np
import random
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim

In [34]:
#Define policy
class VPG(nn.Module):
    """Vanilla Policy Gradient policy network for a discrete action space.

    A small MLP (two hidden layers of 64 units with ReLU) that maps a state
    vector to one unnormalized logit per action.

    Args:
        state_dim: Length of the state (observation) vector.
        action_dim: Number of discrete actions.
    """

    def __init__(self,state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64,64),
            nn.ReLU(),
            nn.Linear(64,action_dim)
        )

    def forward(self, x):
        """Return the action logits for ``x`` (float tensor whose last dimension is ``state_dim``)."""
        return self.net(x)

    def get_action(self, state):
        """Sample one action from the current policy for a single state.

        Args:
            state: One un-batched state, given as a numpy array, a torch
                tensor, or a plain sequence of numbers.

        Returns:
            tuple: ``(action, log_prob)`` where ``action`` is the sampled action
            index as a Python int and ``log_prob`` is a tensor of shape ``(1,)``
            holding its log-probability, still attached to the autograd graph
            so it can be used in the policy-gradient loss.
        """
        # as_tensor accepts numpy arrays, tensors and sequences alike, so callers
        # no longer have to hand in a numpy array specifically.
        # unsqueeze(0) adds the batch dimension the network input is given here.
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        logits = self.forward(state)
        dist = torch.distributions.Categorical(logits = logits)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        return action.item(), log_prob

#Define training loop
def get_rewards_tg(rewards, gamma = 0.99):
    """Compute the discounted rewards-to-go for every step of ONE episode.

    Entry ``t`` of the result is
    ``rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...``.

    Args:
        rewards: Per-step rewards of a single episode, in time order.
        gamma: Discount factor applied once per step into the future.

    Returns:
        torch.Tensor: 1-D float32 tensor with one entry per reward
        (empty when ``rewards`` is empty).
    """
    discounted_return = 0.0
    rewards_to_go = []
    #Walk the episode backwards: each step's return is its own reward plus the
    #discounted return already accumulated for the step that follows it
    for r in reversed(rewards):
        discounted_return = r + gamma * discounted_return
        rewards_to_go.append(discounted_return)
    #Append + one reverse is O(n); inserting at index 0 on every step is O(n^2)
    rewards_to_go.reverse()
    #Pin the dtype so integer or float64 rewards cannot change the result type
    return torch.tensor(rewards_to_go, dtype=torch.float32)
    
    
    
#Seed every RNG involved so that Restart Kernel -> Run All reproduces the same training run
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

#Hyperparameters
epochs = 100     #Number of policy updates; each update uses one batch of episodes
batch_size = 5   #Episodes collected per policy update

env = gym.make("CartPole-v1")
env.reset(seed = SEED)   #Seed the environment's RNG once; the reset() calls below continue from it
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
policy = VPG(state_size,action_size)
optimizer = optim.Adam(policy.parameters(), lr = 0.003)


#NOTE: `episode` here counts epochs (policy updates); every iteration plays batch_size full episodes
for episode in tqdm(range(epochs), desc = "Training Epochs: ", unit = "ep"):

    batch_log_probs = []         #One log-prob tensor per step, across all episodes in this batch
    batch_rewards = []           #One rewards-to-go tensor per episode
    batch_episode_rewards = []   #Undiscounted total reward per episode (used for logging only)

    #We want to get a batch of episodes before updating the model due to high variability of episodes.
    #This reduces noise of very lucky and unlucky episodes that can mess up agent weights.
    for _ in range(batch_size):
        state, _ = env.reset()
        done = False
        log_probs = []
        rewards = []

        #Gather results from one entire episode (different than DQN where each action makes the agent reevaluate)
        while not done:
            action, log_prob = policy.get_action(state)

            next_state, reward, termination, truncation, _ = env.step(action)
            rewards.append(reward)
            log_probs.append(log_prob)
            done = termination or truncation

            #NOTE: Keep next_state as the numpy array returned by env.step(); get_action() does the tensor conversion itself
            state = next_state

        #Rewards-to-go are computed per episode, so returns never leak across episode boundaries
        rtg = get_rewards_tg(rewards)

        #Update our batch with this episode's log_probs and rewards
        batch_log_probs.extend(log_probs)
        batch_rewards.append(rtg)   #Keep the tensor whole; everything is concatenated once below
        batch_episode_rewards.append(sum(rewards))

    #Flatten both to 1-D tensors of length (total steps in the batch) so they line up element-wise
    batch_log_probs = torch.stack(batch_log_probs).reshape(-1)
    batch_rewards = torch.cat(batch_rewards).float()

    #Currently, in our batch of rewards, the rewards are just a scalar value, and don't actually hold any representation of how good an action was given the associated reward
    #By standardizing the reward values, we can actually distinguish between good and bad actions:
    #above-average returns become positive (reinforced), below-average returns become negative (discouraged)
    batch_rewards = (batch_rewards - batch_rewards.mean()) / (batch_rewards.std() + 1e-9)  #The offset of 1e-9 is a failsafe in case standard deviation is 0

    #Update weights:
    #After gathering results and rewards, we want to update the agent with the policy gradient theorem:
    #   gradient = E[ sum over t = 0..T of grad log pi(a_t | s_t) * Advantage_t ]
    #Whole equation can be found here: https://spinningup.openai.com/en/latest/algorithms/vpg.html#background
    #The advantage function needs something beyond this plain VPG setup (no value function is learned here),
    #so the standardized rewards-to-go (rtg) are substituted for it.
    #Taking the mean of rtg * log_probs gives a loss whose gradient is the sample estimate of that expression.
    #Why the minus sign? The optimizer minimizes the loss (gradient descent), but we want to maximize reward
    #(gradient ascent), so the sign of the objective is flipped.
    #No .detach() is needed on batch_rewards: it is built from plain reward numbers, so it carries no autograd graph.
    loss = -(batch_log_probs * batch_rewards).mean()

    optimizer.zero_grad()
    loss.backward()
    #Clip the global gradient norm so one batch with extreme log-probs cannot cause a huge update
    torch.nn.utils.clip_grad_norm_(policy.parameters(), max_norm=1.0)

    optimizer.step()
    if episode % 20 == 0:
        avg_reward = sum(batch_episode_rewards) / batch_size
        print(f"Episode {episode} | Avg Reward: {avg_reward:.2f} | Episodes: {batch_episode_rewards}")

Training Epochs:   1%|▊                                                                                      | 1/100 [00:00<00:29,  3.41ep/s]

Episode 0 | Avg Reward: 28.00 | Episodes: [19.0, 32.0, 12.0, 45.0, 32.0]


Training Epochs:  21%|██████████████████                                                                    | 21/100 [00:05<00:24,  3.19ep/s]

Episode 20 | Avg Reward: 48.20 | Episodes: [37.0, 39.0, 41.0, 28.0, 96.0]


Training Epochs:  41%|███████████████████████████████████▎                                                  | 41/100 [00:18<00:47,  1.25ep/s]

Episode 40 | Avg Reward: 87.20 | Episodes: [106.0, 125.0, 90.0, 62.0, 53.0]


Training Epochs:  61%|████████████████████████████████████████████████████▍                                 | 61/100 [00:52<01:35,  2.44s/ep]

Episode 60 | Avg Reward: 289.60 | Episodes: [401.0, 332.0, 255.0, 199.0, 261.0]


Training Epochs:  81%|█████████████████████████████████████████████████████████████████████▋                | 81/100 [01:43<00:40,  2.15s/ep]

Episode 80 | Avg Reward: 224.40 | Episodes: [257.0, 249.0, 217.0, 191.0, 208.0]


Training Epochs: 100%|█████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:24<00:00,  1.44s/ep]


In [31]:
from gymnasium.wrappers import RecordVideo

# After training, evaluate the policy in a fresh environment wrapped with RecordVideo
test_env = gym.make("CartPole-v1", render_mode="rgb_array")  # Note: rgb_array, not human
test_env = RecordVideo(
    test_env,
    video_folder="./videos",  # Where to save videos
    episode_trigger=lambda x: True,  # Record every episode
    name_prefix="vpg-cartpole-vpg"
)

try:
    for episode in range(5):
        state, _ = test_env.reset()
        done = False
        total_reward = 0

        while not done:
            # Evaluation only: no gradients are needed, so skip building the autograd graph
            with torch.no_grad():
                action, _ = policy.get_action(state)
            state, reward, termination, truncation, _ = test_env.step(action)
            total_reward += reward
            done = termination or truncation

        # CartPole-v1 pays +1 reward per step, so the total reward equals the steps survived
        print(f"Episode {episode + 1}: {total_reward} steps")
finally:
    # Close even if a rollout raises, so the wrapper can finalize any in-progress recording
    test_env.close()
print("Videos saved to ./videos/")

  logger.warn(


Episode 1: 331.0 steps
Episode 2: 118.0 steps
Episode 3: 500.0 steps
Episode 4: 405.0 steps
Episode 5: 413.0 steps
Videos saved to ./videos/


Errors I ran into when coding implementation:
- Converting next_state into a tensor, when get_action expects a numpy array. Make sure to keep nparray and tensor data types consistent.
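- When batching episodes, you can't just concatenate 10 episodes' worth of rewards and log_probs into one long list and compute the rewards-to-go and loss on that.
- Doing so treats all the rewards and log_probs as if they came from one episode, so discounted returns leak across episode boundaries. To actually implement episode batching, compute the rewards-to-go for each episode separately, then store each episode's results in a shared batch bucket.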

Dealing with low reward outcomes (from most effective to least):
- Batching episodes increased performance drastically. The reasoning here is that episodes are highly variable, so updating the model weights per episode can lead to inefficient performance. By batching, we can have better training runs where the impact of outlier episodes is relatively mitigated.
- Gradient clipping means that the agent policy won't be subject to large updates, which can happen when gradients become very large due to extreme log probabilities. For instance, if a low-probability action is sampled from the categorical distribution of logits, it will generate high gradient values, which can cause the model to update drastically due to log(small #) being of large magnitude. This would cause unlikely actions to disproportionately impact our model learning.
- Learning rates (LR) are dependent on episode batch size. For this case, with no batching, a lower LR performed slightly better than a high LR, but with batching, a higher LR performed better. (For 100 batches of 5 episodes, the agent went from averaging 0.4 seconds/batch with an LR of 0.003 to 3.5 seconds/batch with an LR of 0.01. Longer batches here mean the episodes lasted more steps, i.e. earned more reward; this is with no human rendering time overhead.)
