[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/khetansarvesh/CV/blob/main/reinforcement_learning/ping_pong_dqn.ipynb)

In [1]:
#!pip install "gym[accept-rom-license, atari]"

import math, random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd 
import torch.nn.functional as F
from collections import deque

from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Select the compute device. `torch.cuda.is_available` has to be *called*:
# the bare function object is always truthy, which would select "cuda" even
# on machines without a usable GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Atari Environment

In [None]:
# Create the Pong environment, then print the shape of its observations and
# the number of discrete actions it accepts.
env = gym.make("PongNoFrameskip-v4")
print(env.observation_space.shape)
print(env.action_space.n)

# Modelling (Deep Q Network)

In [None]:
def Variable(*args, **kwargs):
    """Build an ``autograd.Variable`` from the given arguments and move it to ``device``."""
    wrapped = autograd.Variable(*args, **kwargs)
    return wrapped.to(device)

In [None]:
class DQN(nn.Module):
    """Fully connected Q-network: flattened observation in, one Q-value per action out.

    Args:
        input_dim: Number of features in one flattened observation. Defaults to
            210 * 160 * 3 (height = 210, width = 160, rgb_channels = 3).
        num_actions: Size of the output layer. Defaults to 6, for the actions
            NOOP (do nothing) || FIRE || RIGHT || LEFT || RIGHTFIRE || LEFTFIRE.
        hidden_dim: Width of the two hidden layers. Defaults to 128.
    """

    def __init__(self, input_dim=210 * 160 * 3, num_actions=6, hidden_dim=128):
        super().__init__()

        # Built once here instead of on every forward pass. Kept outside
        # `self.layers` so the parameter names in the state dict stay
        # `layers.0`, `layers.2`, `layers.4`.
        self.flatten = nn.Flatten()

        self.layers = nn.Sequential(
            nn.Linear(input_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, num_actions),
        )

    def forward(self, x):
        """Return Q-values of shape (batch, num_actions) for a batch of observations."""
        x = self.flatten(x)
        return self.layers(x)

    def act(self, state):
        """Return the greedy action for a single observation.

        Args:
            state: One observation (array-like or tensor) without a batch axis.

        Returns:
            A 0-dim integer tensor holding the index of the highest Q-value.
        """
        # Run on whatever device the network itself lives on, so this method
        # does not depend on any module-level device setting.
        own_device = next(self.parameters()).device

        # Inference only: `volatile=True` no longer disables graph building,
        # torch.no_grad() is the supported way to do that.
        with torch.no_grad():
            state = torch.as_tensor(state, dtype=torch.float32, device=own_device).unsqueeze(0)
            q_value = self(state)
            action = q_value.argmax(dim=1)[0]
        return action

'''
class CnnDQN(nn.Module):
    def __init__(self, input_shape, num_actions):
        super(CnnDQN, self).__init__()
        
        self.input_shape = input_shape
        self.num_actions = num_actions
        
        self.features = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )
        
        self.fc = nn.Sequential(
            nn.Linear(self.feature_size(), 512),
            nn.ReLU(),
            nn.Linear(512, self.num_actions)
        )
        
    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x
    
    def feature_size(self):
        return self.features(autograd.Variable(torch.zeros(1, *self.input_shape))).view(1, -1).size(1)
    
    def act(self, state):
        state   = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0), volatile=True)
        q_value = self.forward(state)
        action  = q_value.max(1)[1].data[0]
        return action
'''

In [18]:
# Build the Q-network on the selected device and give it an Adam optimizer.
# Alternative network: CnnDQN(env.observation_space.shape, env.action_space.n).to(device)
model = DQN()
model = model.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=1e-5)

# Training

In [None]:
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity):
        # A deque with `maxlen` drops its oldest entry once `capacity` is reached.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition; both states gain a leading axis of size 1."""
        transition = (
            np.expand_dims(state, 0),
            action,
            reward,
            np.expand_dims(next_state, 0),
            done,
        )
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions uniformly at random.

        Returns:
            (states, actions, rewards, next_states, dones): the states are
            concatenated along their leading axis into arrays, the other three
            are tuples in the sampled order.
        """
        minibatch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)
        stacked_states = np.concatenate(states)
        stacked_next_states = np.concatenate(next_states)
        return stacked_states, actions, rewards, stacked_next_states, dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)
    
# Shared replay buffer used by the training cells below
replay_buffer = ReplayBuffer(capacity=100000) #replay_buffer = ReplayBuffer(1000)

In [None]:
def compute_td_loss(batch_size=32, gamma=0.99):
    """Run one Q-learning optimisation step on a minibatch from the replay buffer.

    Samples `batch_size` transitions from the module-level `replay_buffer`,
    builds the one-step TD target `reward + gamma * max_a Q(next_state, a)`
    (zeroing the bootstrap term for terminal transitions) and takes one
    optimizer step on the mean squared TD error. The same module-level `model`
    produces both the predictions and the targets; no separate target network
    is used.

    Args:
        batch_size: Number of transitions to sample.
        gamma: Discount factor applied to the bootstrapped next-state value.

    Returns:
        The scalar loss tensor of this step.
    """
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state = torch.as_tensor(state, dtype=torch.float32, device=device)
    next_state = torch.as_tensor(next_state, dtype=torch.float32, device=device)
    # Stored actions may be plain ints or 0-dim tensors, depending on what the
    # caller pushed; int() normalises both.
    action = torch.as_tensor(
        np.asarray([int(a) for a in action], dtype=np.int64), device=device
    )
    reward = torch.as_tensor(np.asarray(reward, dtype=np.float32), device=device)
    done = torch.as_tensor(np.asarray(done, dtype=np.float32), device=device)

    # Q-value of the action that was actually taken in each sampled state.
    q_values = model(state)
    q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

    # The target is a constant w.r.t. the parameters, so build it without
    # autograd (the `volatile` flag formerly used for this has no effect now).
    with torch.no_grad():
        next_q_value = model(next_state).max(1)[0]
        expected_q_value = reward + gamma * next_q_value * (1 - done)

    loss = (q_value - expected_q_value).pow(2).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss

In [None]:
num_frames = 1400000 # 10000

all_rewards = []  # total reward of every finished episode
episode_reward = 0

# This loop uses the classic Gym call shapes: reset() yields the observation
# and step() yields a 4-tuple (next_state, reward, done, info).
state = env.reset()
for frame_idx in range(1, num_frames + 1):
    # NOTE(review): the action is always the network's greedy choice; there is
    # no epsilon-greedy exploration in this loop -- confirm that is intended.
    # act() returns a 0-dim tensor. Convert it to a Python int once so that
    # env.step() and the replay buffer both receive a plain integer rather
    # than a (possibly GPU-resident) tensor.
    action = model.act(state).cpu().item()

    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    if done:
        # Episode finished: start a new one and record the finished return.
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0

    # Learn only once the buffer holds more than 10000 transitions.
    if len(replay_buffer) > 10000:
        loss = compute_td_loss()
        print(loss.cpu().item())