# Q-Learning with Q*bert

CS 258

Hao Feng

861090340

## Setup

In [None]:
# for compatibility
# !apt update && apt install cuda-11-8

In [None]:
import torch
import os
# Use the first CUDA device when PyTorch reports one; otherwise run on the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    # Expose only GPU 0 to CUDA-aware code.
    os.environ["CUDA_VISIBLE_DEVICES"]="0"
device = torch.device('cuda:0' if use_cuda else 'cpu')
print('Found device at: {}'.format(device))

Found device at: cuda:0


In [None]:
!pip install swig
!pip install gymnasium[atari]
!pip install gymnasium[accept-rom-license]



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym
import numpy as np
import random
from collections import deque
from gymnasium.wrappers.record_video import RecordVideo
from torch.utils.tensorboard import SummaryWriter

In [None]:
%load_ext tensorboard

  and should_run_async(code)


In [None]:
# Shared TensorBoard writer; both training runs below log their episodic returns through it.
train_writer = SummaryWriter(log_dir='tensorboard/qbert')

## Q Network

In [None]:
class QNetwork(nn.Module):
    """Convolutional network mapping an image observation to one value per action.

    The model is a stack of ``Conv2d`` + ``ReLU`` pairs (one pair per entry of
    ``hidden_dims``) followed by ``Flatten`` and a single ``Linear`` head.

    Args:
        state_dim: Observation shape; the first two entries are read as
            height and width, the last entry as the channel count.
        action_dim: Number of outputs of the final linear layer.
        hidden_dims: Output channel count of each convolution, in order.
        kernel_size: Square kernel size shared by every convolution.
        stride: Stride shared by every convolution.
    """

    def __init__(self,state_dim,action_dim,hidden_dims,kernel_size,stride):
        super().__init__()
        height, width = state_dim[0], state_dim[1]
        channels = state_dim[-1]

        def conv_out(size):
            # Output length of an unpadded convolution along one axis.
            return (size-kernel_size)//stride+1

        modules = []
        for out_channels in hidden_dims:
            modules.append(nn.Conv2d(channels,out_channels,kernel_size=kernel_size,stride=stride))
            modules.append(nn.ReLU())
            channels = out_channels
            height, width = conv_out(height), conv_out(width)

        # The linear head consumes every feature left after the last convolution.
        modules.append(nn.Flatten())
        modules.append(nn.Linear(channels*height*width, action_dim))
        self.model = nn.Sequential(*modules)

    def forward(self, state):
        """Return the action values for a batch of channels-first observations."""
        return self.model(state)

## Replay Buffer

In [None]:
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random sampling."""

    def __init__(self,capacity):
        # A deque with maxlen discards its oldest entry once `capacity` is reached.
        self.buffer = deque(maxlen=capacity)

    def push(self,state,action,reward,next_state,done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        transition = (state,action,reward,next_state,done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and return them column-wise.

        Returns:
            A 5-tuple of NumPy arrays ``(states, actions, rewards,
            next_states, dones)``, each with ``batch_size`` as leading axis.
        """
        picked = random.sample(self.buffer,batch_size)
        # Transpose the list of transitions into one sequence per field.
        states,actions,rewards,next_states,dones = zip(*picked)
        columns = (states,actions,rewards,next_states,dones)
        return tuple(np.array(column) for column in columns)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)


## Agents

In [None]:
class DQNAgent:
    """Deep Q-learning agent with a replay buffer and a soft-updated target network.

    Args:
        config: Mapping with the keys ``device``, ``state_dim``, ``action_dim``,
            ``hidden_dims``, ``kernel_size``, ``stride``, ``lr``, ``gamma``,
            ``tau``, ``buffer_capacity``, ``batch_size``, ``eps_max``,
            ``eps_min`` and ``eps_decay``.
    """

    def __init__(self,config):
        self.device = config['device']
        self.state_dim = config['state_dim']
        self.action_dim = config['action_dim']
        self.hidden_dims = config['hidden_dims']
        self.kernel_size = config['kernel_size']
        self.stride = config['stride']

        self.lr = config['lr']
        self.gamma = config['gamma']
        self.tau = config['tau']
        self.replay_buffer = ReplayBuffer(config['buffer_capacity'])
        self.batch_size = config['batch_size']

        self.eps_max = config['eps_max']
        self.eps_min = config['eps_min']
        self.eps_decay = config['eps_decay']
        self.steps_done = 0
        # Defined up front so `epsilon` can be read (e.g. for logging) before the first action.
        self.epsilon = self.eps_max

        self.q_network = QNetwork(self.state_dim,self.action_dim,self.hidden_dims,self.kernel_size,self.stride).to(self.device)
        self.q_target = QNetwork(self.state_dim,self.action_dim,self.hidden_dims,self.kernel_size,self.stride).to(self.device)
        # Both networks start from identical weights.
        self.q_target.load_state_dict(self.q_network.state_dict())

        self.optimizer = optim.Adam(self.q_network.parameters(),lr=self.lr)

    def _to_tensor(self, array, dtype=torch.float32):
        """Convert array-like data to a tensor of ``dtype`` on the agent's device."""
        return torch.as_tensor(array, dtype=dtype, device=self.device)

    def select_action(self, state):
        """Pick an action epsilon-greedily and advance the exploration schedule.

        Epsilon decays linearly from ``eps_max`` with the number of calls and
        is floored at ``eps_min``.

        Args:
            state: A single observation, without a batch axis.

        Returns:
            The chosen action index as an int.
        """
        self.steps_done += 1
        decayed = self.eps_max*(self.eps_decay-self.steps_done)/self.eps_decay
        self.epsilon = max(decayed,self.eps_min)

        if random.random() > self.epsilon:
            # Exploit: greedy action of the online network.
            batch = self._to_tensor(state).unsqueeze(0)
            with torch.no_grad():
                return self.q_network(batch).argmax(dim=1).item()
        # Explore: uniformly random action.
        return random.randrange(self.action_dim)

    def update(self):
        """Run one gradient step on a sampled batch, then soft-update the target.

        Does nothing until the replay buffer holds at least ``batch_size``
        transitions.
        """
        if len(self.replay_buffer) < self.batch_size:
            return

        state,action,reward,next_state,done = self.replay_buffer.sample(self.batch_size)

        state = self._to_tensor(state)
        action = self._to_tensor(action, dtype=torch.int64).unsqueeze(-1)
        reward = self._to_tensor(reward)
        next_state = self._to_tensor(next_state)
        done = self._to_tensor(done)

        # Value the online network assigns to the action that was actually taken.
        current_q_values = self.q_network(state).gather(1,action).squeeze(-1)

        # Bootstrap from the target network's best next-state value;
        # the (1-done) factor removes the bootstrap term on terminal transitions.
        with torch.no_grad():
            max_next_q_values = self.q_target(next_state).max(1)[0]
            target_q_values = reward+(1-done)*self.gamma*max_next_q_values

        loss = nn.functional.mse_loss(current_q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Soft update: move each target parameter a fraction tau toward the online one.
        with torch.no_grad():
            for target_param,param in zip(self.q_target.parameters(),self.q_network.parameters()):
                target_param.copy_(self.tau*param+(1-self.tau)*target_param)

    def store_transition(self,state,action,reward,next_state,done):
        """Add one transition to the replay buffer."""
        self.replay_buffer.push(state,action,reward,next_state,done)

class DoubleDQNAgent:
    """Double DQN agent with a replay buffer and a soft-updated target network.

    The online network chooses the next action and the target network
    evaluates it when building the learning target.

    Args:
        config: Mapping with the keys ``device``, ``state_dim``, ``action_dim``,
            ``hidden_dims``, ``kernel_size``, ``stride``, ``lr``, ``gamma``,
            ``tau``, ``buffer_capacity``, ``batch_size``, ``eps_max``,
            ``eps_min`` and ``eps_decay``.
    """

    def __init__(self,config):
        self.device = config['device']
        self.state_dim = config['state_dim']
        self.action_dim = config['action_dim']
        self.hidden_dims = config['hidden_dims']
        self.kernel_size = config['kernel_size']
        self.stride = config['stride']

        self.lr = config['lr']
        self.gamma = config['gamma']
        self.tau = config['tau']
        self.replay_buffer = ReplayBuffer(config['buffer_capacity'])
        self.batch_size = config['batch_size']

        self.eps_max = config['eps_max']
        self.eps_min = config['eps_min']
        self.eps_decay = config['eps_decay']
        self.steps_done = 0
        # Defined up front so `epsilon` can be read (e.g. for logging) before the first action.
        self.epsilon = self.eps_max

        self.q_network = QNetwork(self.state_dim,self.action_dim,self.hidden_dims,self.kernel_size,self.stride).to(self.device)
        self.q_target = QNetwork(self.state_dim,self.action_dim,self.hidden_dims,self.kernel_size,self.stride).to(self.device)
        # Both networks start from identical weights.
        self.q_target.load_state_dict(self.q_network.state_dict())

        self.optimizer = optim.Adam(self.q_network.parameters(),lr=self.lr)

    def _to_tensor(self, array, dtype=torch.float32):
        """Convert array-like data to a tensor of ``dtype`` on the agent's device."""
        return torch.as_tensor(array, dtype=dtype, device=self.device)

    def select_action(self, state):
        """Pick an action epsilon-greedily and advance the exploration schedule.

        Epsilon decays linearly from ``eps_max`` with the number of calls and
        is floored at ``eps_min``.

        Args:
            state: A single observation, without a batch axis.

        Returns:
            The chosen action index as an int.
        """
        self.steps_done += 1
        decayed = self.eps_max*(self.eps_decay-self.steps_done)/self.eps_decay
        self.epsilon = max(decayed,self.eps_min)

        if random.random() > self.epsilon:
            # Exploit: greedy action of the online network.
            batch = self._to_tensor(state).unsqueeze(0)
            with torch.no_grad():
                return self.q_network(batch).argmax(dim=1).item()
        # Explore: uniformly random action.
        return random.randrange(self.action_dim)

    def update(self):
        """Run one gradient step on a sampled batch, then soft-update the target.

        Does nothing until the replay buffer holds at least ``batch_size``
        transitions.
        """
        if len(self.replay_buffer) < self.batch_size:
            return

        state,action,reward,next_state,done = self.replay_buffer.sample(self.batch_size)

        state = self._to_tensor(state)
        action = self._to_tensor(action, dtype=torch.int64).unsqueeze(-1)
        reward = self._to_tensor(reward)
        next_state = self._to_tensor(next_state)
        done = self._to_tensor(done)

        # Value the online network assigns to the action that was actually taken.
        current_q_values = self.q_network(state).gather(1,action).squeeze(-1)

        # The online network selects the next action, the target network scores it;
        # the (1-done) factor removes the bootstrap term on terminal transitions.
        with torch.no_grad():
            next_actions = self.q_network(next_state).argmax(1,keepdim=True)
            max_next_q_values = self.q_target(next_state).gather(1,next_actions).squeeze(-1)
            target_q_values = reward+(1-done)*self.gamma*max_next_q_values

        loss = nn.functional.mse_loss(current_q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Soft update: move each target parameter a fraction tau toward the online one.
        with torch.no_grad():
            for target_param,param in zip(self.q_target.parameters(),self.q_network.parameters()):
                target_param.copy_(self.tau*param+(1-self.tau)*target_param)

    def store_transition(self,state,action,reward,next_state,done):
        """Add one transition to the replay buffer."""
        self.replay_buffer.push(state,action,reward,next_state,done)

## DQN Run

In [None]:
# for compatibility
# torch.backends.cudnn.enabled = False

In [None]:
# Train a DQN agent on Q*bert, recording a video of every 20th episode.
env = gym.make('ALE/Qbert-v5',render_mode='rgb_array')
env = RecordVideo(env,'./videos_dqn',episode_trigger = lambda x: x%20==0)
state_dim = env.observation_space.shape
action_dim = env.action_space.n

config = {
    'state_dim': state_dim,
    'action_dim': action_dim,
    'hidden_dims': [16, 32],
    'lr': 1e-4,
    'gamma': 0.99,
    'tau': 0.005,
    'buffer_capacity': 40000,
    'batch_size': 32,
    'eps_max': 1.0,
    'eps_min': 0.01,
    'eps_decay': 10000,
    'kernel_size': 8,
    'stride': 4,
    'device': device,
}

agent = DQNAgent(config)

num_episodes = 1000
max_timesteps = 1000

try:
    for episode in range(num_episodes):
        state,_ = env.reset()
        # Move the channel axis first, the layout the convolutional network consumes.
        state = state.transpose(2,0,1)
        episode_reward = 0

        for _ in range(max_timesteps):
            action = agent.select_action(state)
            next_state,reward,terminated,truncated,_ = env.step(action)
            next_state = next_state.transpose(2,0,1)
            # Store only true termination as `done`: a truncated episode has not
            # reached a terminal state, so its target should still bootstrap.
            agent.store_transition(state,action,reward,next_state,terminated)

            state = next_state
            episode_reward += reward
            agent.update()

            # Either flag means the environment must be reset before stepping again.
            if terminated or truncated:
                break

        print(f"Episode {episode}, Reward: {episode_reward}")
        train_writer.add_scalar('Performance/episodic_return_dqn', episode_reward, episode)
finally:
    # Release the environment and flush logs even when training is interrupted.
    env.close()
    train_writer.close()

## Double DQN Run

In [None]:
# for compatibility
# torch.backends.cudnn.enabled = False

In [None]:
# Train a Double DQN agent on Q*bert, recording a video of every 20th episode.
env = gym.make('ALE/Qbert-v5',render_mode='rgb_array')
env = RecordVideo(env,'./videos_doubledqn',episode_trigger = lambda x: x%20==0)
state_dim = env.observation_space.shape
action_dim = env.action_space.n

config = {
    'state_dim': state_dim,
    'action_dim': action_dim,
    'hidden_dims': [16, 32],
    'lr': 1e-4,
    'gamma': 0.99,
    'tau': 0.005,
    'buffer_capacity': 40000,
    'batch_size': 32,
    'eps_max': 1.0,
    'eps_min': 0.01,
    'eps_decay': 10000,
    'kernel_size': 8,
    'stride': 4,
    'device': device,
}

agent = DoubleDQNAgent(config)

num_episodes = 1000
max_timesteps = 1000

try:
    for episode in range(num_episodes):
        state,_ = env.reset()
        # Move the channel axis first, the layout the convolutional network consumes.
        state = state.transpose(2,0,1)
        episode_reward = 0

        for _ in range(max_timesteps):
            action = agent.select_action(state)
            next_state,reward,terminated,truncated,_ = env.step(action)
            next_state = next_state.transpose(2,0,1)
            # Store only true termination as `done`: a truncated episode has not
            # reached a terminal state, so its target should still bootstrap.
            agent.store_transition(state,action,reward,next_state,terminated)

            state = next_state
            episode_reward += reward
            agent.update()

            # Either flag means the environment must be reset before stepping again.
            if terminated or truncated:
                break

        print(f"Episode {episode}, Reward: {episode_reward}")
        train_writer.add_scalar('Performance/episodic_return_doubledqn', episode_reward, episode)
finally:
    # Release the environment and flush logs even when training is interrupted.
    env.close()
    train_writer.close()

## Save/Load Model

In [None]:
import pickle

# Every artifact is tagged with the index of the last episode that ran.
tag = str(episode)

# Online and target weights go to separate state-dict files.
weight_files = {
    'doubledqn_q_network_'+tag+'.pt': agent.q_network,
    'doubledqn_q_target_'+tag+'.pt': agent.q_target,
}
for path, network in weight_files.items():
    torch.save(network.state_dict(),path)

# The replay buffer is pickled whole so training can resume with its contents.
with open('doubledqn_'+tag+'.pkl','wb') as buffer_file:
    pickle.dump(agent.replay_buffer,buffer_file)

In [None]:
import pickle

# Episode index embedded in the names of the checkpoint files to restore.
checkpoint_episode = 394

# map_location remaps the stored tensors onto the agent's device, so a
# checkpoint written on a GPU still loads on a CPU-only machine.
agent.q_network.load_state_dict(
    torch.load(f'doubledqn_q_network_{checkpoint_episode}.pt', map_location=agent.device))
agent.q_target.load_state_dict(
    torch.load(f'doubledqn_q_target_{checkpoint_episode}.pt', map_location=agent.device))

# NOTE: unpickling can execute arbitrary code; only load buffer files
# that this notebook produced itself.
with open(f'doubledqn_{checkpoint_episode}.pkl','rb') as f:
    agent.replay_buffer = pickle.load(f)

## Results

In [None]:
%tensorboard --logdir='tensorboard/qbert'