## Implementing Breakout bot using Reinforcement Learning

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import gym
from stable_baselines3 import DQN
import numpy as np
import pandas as pd
import time

Defining the neural network for Q-value approximation for each state-action pair:

In [9]:
class DQNCNN(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQNCNN, self).__init__()

        self.conv1 = nn.Conv2d(in_channels=state_size, out_channels=32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)

        self.fc_in = 64 * 7 * 7

        self.fc1 = nn.Linear(self.fc_in, 512)
        self.fc2 = nn.Linear(512, action_size)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))

        x = x.view(-1, self.fc_in)

        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x

Defining the replay buffer for storing experiences and batch sampling:

In [10]:
class ReplayBuffer:
    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def add(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        experiences = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*experiences)
        return (np.vstack(states), np.array(actions), np.array(rewards),
                np.vstack(next_states), np.array(dones))
    
    def __len__(self):
        return len(self.buffer)

In [11]:
class DQNAgent:
    def __init__(self, state_size, action_size, buffer_size, batch_size, gamma=0.99, lr=0.001):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.batch_size = batch_size
        self.lr = lr

        self.q_network_loc = DQNCNN(state_size, action_size)
        self.q_network_tar = DQNCNN(state_size, action_size)
        self.opt = optim.Adam(self.q_network_loc.parameters(), lr=self.lr)

        self.memory = ReplayBuffer(buffer_size)

        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.q_network_loc.to(self.device)
        self.q_network_tar.to(self.device)

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.q_network_loc.eval()

        with torch.no_grad():
            action_vals = self.q_network_loc(state)
        self.q_network_loc.train()

        if random.random() > self.epsilon:
            return np.argmax(action_vals.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))
        
    def step(self, state, action, reward, next_state, done):
        self.memory.add((state, action, reward, next_state, done))

        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        states = torch.from_numpy(states).float().to(self.device)
        actions = torch.from_numpy(actions).long().to(self.device)
        rewards = torch.from_numpy(rewards).float().to(self.device)
        next_states = torch.from_numpy(next_states).float().to(self.device)
        dones = torch.from_numpy(dones).float().to(self.device)

        Q_targets_next = self.q_network_tar(next_states).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        Q_exp = self.q_network_loc(states).gather(1, actions.unsqueeze(1))

        loss = nn.MSELoss()(Q_exp, Q_targets)

        self.opt.zero_grad()
        loss.backward()
        self.opt.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def update_target_model(self):
        self.q_network_tar.load_state_dict(self.q_network_loc.state_dict())

In [12]:
env = gym.make("Breakout-v0")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
batch_size = 32
buffer_size = 1000
agent = DQNAgent(state_size, action_size, batch_size=batch_size, buffer_size=buffer_size)

  logger.warn(


In [None]:
n_episodes = 1000
max_t = 1000
target_update_freq = 10

for episode in range(n_episodes):
    state = env.reset()
    total_reward = 0
    for t in range(max_t):
        action = agent.act(state) 
        next_state, reward, done, _ = env.step(action) 
        agent.step(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        if done:
            break

    if episode % target_update_freq == 0:
        agent.update_target_model()

    print(f"Episode {episode}, Total Reward: {total_reward}")

env.close()

In [13]:
model_imp = DQN('CnnPolicy', env, verbose=1, batch_size=32, buffer_size=1000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
