Notebook covers:
- Implementation of DQN using Gymnasium's Cart Pole v1 environment
- Includes personal notes at each step to increase intuition and understanding of how reinforcement learning works
- Timeline and reasoning of DQN Improvements (basic DQN -> implementing replay Buffer -> target network)

In [4]:
import gymnasium as gym
import numpy as np
import random
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim

In [5]:
#Neural Network as model

class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim,64), #map # of states to 64 dimensions
            nn.ReLU(), #introduce non-linearity by turning all negative values into 0
            nn.Linear(64,action_size), #map 64 dimensions to # of actions
        )
    def forward(self,x):
        return self.net(x)


In [None]:
env = gym.make("CartPole-v1")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
model = DQN(state_size,action_size)
optimizer = optim.Adam(model.parameters(),lr = 0.001)
gamma = 0.99

results = []
for episode in tqdm(range(1000), desc="Training Episodes", unit="ep"):
    state, _ = env.reset()
    state = torch.FloatTensor(state)
    done = False
    episode_reward = 0
    
    while not done:
        
        q_values = model(state)
        #Greedy epsilon
        if random.uniform(0,1) > 0.1: #Set at 10% chance for exploration
            action = torch.argmax(q_values).item()
        else:
            action = env.action_space.sample()

        next_state, reward, termination, truncation, _ = env.step(action)
        done = termination or truncation
        next_state = torch.FloatTensor(next_state)
        episode_reward += reward
        
        current_q = model(state)[action]
        
        #Make sure that we don't update weights of the actual model
        with torch.no_grad():
            max_next_q = torch.max(model(next_state)) #Given the current choice, we look at the next state to determine if we made the most informed choice
            target_q = reward + (gamma * max_next_q * (1 - done)) #Since there's some discrepancy between current state and reality of next state, we update loss to make current closer to truth 
        loss = nn.MSELoss()(current_q, target_q)
        #Note: Since loss is more about minimizing diff between reality and expected rewards, loss isn't a good rep of how good an agent performs

        #Update loss given the information we have
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        state = next_state
    results.append(episode_reward)

In [17]:
#Printing results
for i in range(0,1000,100):
    avg = sum(results[i:i+100]) / len(results[i:i+100])
    print(f"Average Time from episode {i}-{i+100}: {avg}\n")

Average Time from episode 0-100: 9.87

Average Time from episode 100-200: 10.96

Average Time from episode 200-300: 12.93

Average Time from episode 300-400: 14.98

Average Time from episode 400-500: 17.28

Average Time from episode 500-600: 25.43

Average Time from episode 600-700: 27.14

Average Time from episode 700-800: 55.27

Average Time from episode 800-900: 105.91

Average Time from episode 900-1000: 130.5



In [18]:
#Code to save a trained agent run into video folder
from gymnasium.wrappers import RecordVideo

video_env = gym.make("CartPole-v1", render_mode="rgb_array")
video_env = RecordVideo(video_env, video_folder="./videos", episode_trigger=lambda x: True,name_prefix="basic_dqn_cartpole")

state, _ = video_env.reset()
done = False

while not done:
    state_tensor = torch.FloatTensor(state)
    action = torch.argmax(model(state_tensor)).item()
    state, reward, terminated, truncated, _ = video_env.step(action)
    done = terminated or truncated

video_env.close()

#Notes:
#Watching the agent, the agent just leans to the right or left
#Due to sequential correlation, the agent leans to the right instead of trying to balance
#One way to address this issue is to use replay buffer

  logger.warn(


In [19]:
from collections import deque
#Implementation of Replay Buffer
class ReplayBuffer():
    def __init__(self,capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self,size):
        batch = random.sample(self.buffer,size)
        state, action, reward, next_state, done = zip(*batch)
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)

In [9]:
memory = ReplayBuffer(10000)
batch_size = 32
rb_env = gym.make("CartPole-v1")
rb_state_size = rb_env.observation_space.shape[0]
rb_action_size = rb_env.action_space.n
rb_model = DQN(rb_state_size,rb_action_size)
rb_optimizer = optim.Adam(rb_model.parameters(),lr = 0.001)
gamma = 0.99

rb_results = []
for episode in tqdm(range(1000), desc="Training Episodes", unit="ep"):
    state, _ = rb_env.reset()
    state = torch.FloatTensor(state)
    done = False
    episode_reward = 0
    
    while not done:
        q_values = rb_model(state)
        if random.uniform(0,1) > 0.1: #Set at 10% chance for exploration
            action = torch.argmax(q_values).item()
        else:
            action = rb_env.action_space.sample()
 
        next_state, reward, termination, truncation, _ = rb_env.step(action)
        episode_reward += reward

        
        done = termination or truncation
        next_state = torch.FloatTensor(next_state)

        memory.push(state, action, reward, next_state, done)

        #Check if buffer is greater than batch size
        if len(memory) >= batch_size:
            #Retrieve 32 (from batch_size) entries from replay buffer
            batch_state, batch_action, batch_reward, batch_next_state, batch_done = memory.sample(batch_size)        

            #Stack states so that the neural network can process all states in the batch at one time
            batch_state = torch.stack(batch_state)
            batch_next_state = torch.stack(batch_next_state)
    
            #Convert reward and done into tensors for neural network processing
            batch_reward = torch.FloatTensor(batch_reward)
            batch_action = torch.LongTensor(batch_action)
            batch_done = torch.FloatTensor(batch_done)
    
            current_q = rb_model(batch_state).gather(1, batch_action.unsqueeze(1)).squeeze(1)
            with torch.no_grad():
                #Same thing as without relay buffer, but max_next_q has slightly different dimensions which we need to adjust for
                max_next_q = rb_model(batch_next_state).max(1)[0]
                target_q = batch_reward + (gamma * max_next_q * (1 - batch_done))
            
            loss = nn.MSELoss()(current_q, target_q)        
            rb_optimizer.zero_grad()
            loss.backward()
            rb_optimizer.step()
        
        state = next_state
    rb_results.append(episode_reward)

Training Episodes: 100%|███████████████████████████████████████████████████████████| 1000/1000 [07:37<00:00,  2.19ep/s]


In [20]:
#Testing replay buffer results

for i in range(0,1000,100):
    avg = sum(rb_results[i:i+100]) / len(rb_results[i:i+100])
    print(f"Average Time from episode {i}-{i+100}: {avg}\n")
#Results can become worse over time, as bad performance can affect agent internal weights

Average Time from episode 0-100: 15.18

Average Time from episode 100-200: 113.82

Average Time from episode 200-300: 223.99

Average Time from episode 300-400: 453.87

Average Time from episode 400-500: 240.8

Average Time from episode 500-600: 341.47

Average Time from episode 600-700: 281.6

Average Time from episode 700-800: 185.08

Average Time from episode 800-900: 359.94

Average Time from episode 900-1000: 183.61



In [59]:
#Saving replay buffer into a video file

from gymnasium.wrappers import RecordVideo

rb_video_env = gym.make("CartPole-v1", render_mode="rgb_array")
rb_video_env = RecordVideo(rb_video_env, video_folder="./videos", episode_trigger=lambda x: True, name_prefix="replay_buffer_dqn_cartpole")

state, _ = rb_video_env.reset()
done = False

while not done:
    state_tensor = torch.FloatTensor(state)
    action = torch.argmax(rb_model(state_tensor)).item()
    state, reward, terminated, truncated, _ = rb_video_env.step(action)
    done = terminated or truncated

rb_video_env.close()

In [10]:
#Implementing Target Network
#By having two DQN nets, we can mitigate the moving goalpost problem where the goal and weights are updated in a singular network
#This can help solve our problem where our agent was optimizing going only right
#Using same code, but adding comments to new lines where we implement target network
tn_env = gym.make("CartPole-v1")
tn_state_size = tn_env.observation_space.shape[0]
tn_action_size = tn_env.action_space.n
policy = DQN(tn_state_size,tn_action_size)
target = DQN(tn_state_size,tn_action_size)
target.load_state_dict(policy.state_dict())
optimizer = optim.Adam(policy.parameters(),lr = 0.001)
gamma = 0.99

memory = ReplayBuffer(10000)
batch_size = 32
total_steps = 0
tn_results = []

epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995


for episode in tqdm(range(1000), desc="Training Episodes", unit="ep"):
    state, _ = tn_env.reset()
    state = torch.FloatTensor(state)
    done = False
    episode_reward = 0
    
    while not done:
        total_steps += 1
        
        q_values = policy(state)
        if random.uniform(0, 1) > epsilon:  #Use decaying epsilon to discourage exploration as time increases, hoping that the agent knows more for exploitation
            action = torch.argmax(q_values).item()
        else:
            action = tn_env.action_space.sample()
 
        next_state, reward, termination, truncation, _ = tn_env.step(action)
        done = termination or truncation
        next_state = torch.FloatTensor(next_state)

        episode_reward += reward
        
        memory.push(state, action, reward, next_state, done)

        #Check if buffer is greater than batch size
        if len(memory) >= batch_size:
            #Retrieve 32 (from batch_size) entries from replay buffer
            batch_state, batch_action, batch_reward, batch_next_state, batch_done = memory.sample(batch_size)        

            #Stack states so that the neural network can process all states in the batch at one time
            batch_state = torch.stack(batch_state)
            batch_next_state = torch.stack(batch_next_state)
    
            #Convert reward and done into tensors for neural network processing
            batch_reward = torch.FloatTensor(batch_reward)
            batch_action = torch.LongTensor(batch_action)
            batch_done = torch.FloatTensor(batch_done)
    
            current_q = policy(batch_state).gather(1, batch_action.unsqueeze(1)).squeeze(1)
            with torch.no_grad():
                #Same thing as without relay buffer, but max_next_q has slightly different dimensions which we need to adjust for
                max_next_q = target(batch_next_state).max(1)[0]
                target_q = batch_reward + (gamma * max_next_q * (1 - batch_done))
            
            loss = nn.MSELoss()(current_q, target_q)        
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            #We still need to update our target network so that it doesn't become outdated and drag down the policy network
            #However, we just update it at a slower pace. In this case every 100 steps
            if total_steps % 50 == 0:
                target.load_state_dict(policy.state_dict())
        
        state = next_state
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        
    tn_results.append(episode_reward)

Training Episodes: 100%|███████████████████████████████████████████████████████████| 1000/1000 [10:11<00:00,  1.63ep/s]


In [23]:
#Testing replay buffer results

for i in range(0,1000,100):
    avg = sum(tn_results[i:i+100]) / len(tn_results[i:i+100])
    print(f"Average Time from episode {i}-{i+100}: {avg}\n")

Average Time from episode 0-100: 19.65

Average Time from episode 100-200: 260.97

Average Time from episode 200-300: 414.13

Average Time from episode 300-400: 170.06

Average Time from episode 400-500: 482.59

Average Time from episode 500-600: 346.47

Average Time from episode 600-700: 374.54

Average Time from episode 700-800: 276.14

Average Time from episode 800-900: 435.08

Average Time from episode 900-1000: 458.76



In [14]:
#Saving replay buffer + target network into a video file
from gymnasium.wrappers import RecordVideo

tn_video_env = gym.make("CartPole-v1", render_mode="rgb_array")
tn_video_env = RecordVideo(tn_video_env, video_folder="./videos", episode_trigger=lambda x: True, name_prefix="target_network_dqn_cartpole")

state, _ = tn_video_env.reset()
done = False

while not done:
    state_tensor = torch.FloatTensor(state)
    action = torch.argmax(policy(state_tensor)).item()
    state, reward, terminated, truncated, _ = tn_video_env.step(action)
    done = terminated or truncated

tn_video_env.close()

  logger.warn(


Notes and insights:
1. More episodes doesn't necessarily mean higher performing agent
2. At certain episode #s, simpler approaches seem to work better i.e. basic DQN vs relay buffer powered DQN

Possible paths to explore:
1. Different episode numbers for each method to determine peak effectiveness (episodes = n)
2. Reducing or increasing batch size for replay buffer (batch_size = n)
3. Reducing or increasing how frequent target network updates (1:n steps)