[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/khetansarvesh/CV/blob/main/reinforcement_learning/ping_pong_dqn.ipynb)

In [13]:
# !pip install "gym[accept-rom-license, atari]"

In [14]:
import math, random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F
from collections import deque

from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [15]:
device = "cuda" if torch.cuda.is_available else "cpu"
print(device)

cuda


# **Atari Environment**

In [16]:
env = gym.make("PongNoFrameskip-v4")
print(env.observation_space.shape)
print(env.action_space.n)

(210, 160, 3)
6


  deprecation(
  deprecation(


In [None]:
observation = env.reset() # Reset the environment to default beginning
print(observation) # environment is defined by 4 variables here [horizontal pos, horizontal velocity, angle of pole, angular velocity]

In [None]:
# Performing 1 Random Action and seeing the change in the environment via observation value change
action = env.action_space.sample()
observation, reward, terminated, truncated, info = env.step(action)
print(observation, reward, terminated, truncated , info)

In [None]:
# Performing 1000 random actions and visulizing each action using rendering in gym
for t in range(1000):
    env.render() # this wont work in google colab, you will have to run this on local as a python file and not a notebook file

    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    # print(observation, reward, terminated, truncated , info)

# **Modelling (Deep Q Network)**

In [17]:
Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).to(device)

In [32]:
class DQN(nn.Module):
    def __init__(self):
        super(DQN, self).__init__()

        self.layers = nn.Sequential(
            nn.Linear(210*160*3, 128), nn.ReLU(), # cause width = 210, height = 160, rgb_channels = 3
            nn.Linear(128, 128), nn.ReLU(),
            nn.Linear(128, 6) # 6 actions are as follows :  NOOP (do nothing) || FIRE || RIGHT || LEFT || RIGHTFIRE || LEFTFIRE
        )

    def forward(self, x):
        x = nn.Flatten()(x)
        return self.layers(x)

    def act(self, state):
        state   = Variable(torch.FloatTensor(state).unsqueeze(0), volatile=True)
        q_value = self.forward(state)
        action  = q_value.max(1)[1].data[0]
        return action

In [33]:
model = DQN().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.00001)

# **Training**

In [34]:
class ReplayBuffer(object):
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        state      = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)

        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(next_state), done

    def __len__(self):
        return len(self.buffer)

# Instantiating Replay Buffer
replay_buffer = ReplayBuffer(100000)

In [35]:
def compute_td_loss(batch_size = 32):
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state      = Variable(torch.FloatTensor(np.float32(state)))
    next_state = Variable(torch.FloatTensor(np.float32(next_state)), volatile=True)
    action     = Variable(torch.LongTensor(action))
    reward     = Variable(torch.FloatTensor(reward))
    done       = Variable(torch.FloatTensor(done))

    q_values      = model(state)
    next_q_values = model(next_state)

    q_value          = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
    next_q_value     = next_q_values.max(1)[0]
    expected_q_value = reward + 0.99 * next_q_value * (1 - done)

    loss = (q_value - Variable(expected_q_value.data)).pow(2).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss

In [39]:
num_frames = 10000  # you can choose any no here
all_rewards = []
episode_reward = 0

state = env.reset()
for frame_idx in range(0, num_frames):
    action = model.act(state)

    next_state, reward, done, _ = env.step(action.cpu().item()) # converting action from tensor to integer
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    if done:
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0

    if len(replay_buffer) > 10000:
        loss = compute_td_loss()
        print(f'Loss for Frame #{frame_idx} : {loss.cpu().item()}')

  and should_run_async(code)
  Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).to(device)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Loss for Frame #5000 : 0.15449216961860657
Loss for Frame #5001 : 0.16151349246501923
Loss for Frame #5002 : 0.08367182314395905
Loss for Frame #5003 : 0.2366870790719986
Loss for Frame #5004 : 0.2849997580051422
Loss for Frame #5005 : 0.12309281527996063
Loss for Frame #5006 : 0.048773400485515594
Loss for Frame #5007 : 0.058799050748348236
Loss for Frame #5008 : 0.11449562758207321
Loss for Frame #5009 : 0.08911733329296112
Loss for Frame #5010 : 0.024899153038859367
Loss for Frame #5011 : 0.04027814790606499
Loss for Frame #5012 : 0.14425109326839447
Loss for Frame #5013 : 0.06868695467710495
Loss for Frame #5014 : 0.4897560477256775
Loss for Frame #5015 : 0.02951447293162346
Loss for Frame #5016 : 0.07405610382556915
Loss for Frame #5017 : 0.07548925280570984
Loss for Frame #5018 : 0.10805506259202957
Loss for Frame #5019 : 0.02855478599667549
Loss for Frame #5020 : 0.037823982536792755
Loss for Frame #5021 : 0.051920