[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/khetansarvesh/CV/blob/main/reinforcement_learning/ping_pong_proximal_gradient.ipynb)

In [33]:
# !pip install "gym[accept-rom-license, atari]"

In [36]:
import pickle

import numpy as np
import torch
import gym

import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim

import warnings
warnings.filterwarnings('ignore')

# Pick the compute device once: use the GPU when PyTorch can see one,
# otherwise fall back to the CPU. The chosen name is echoed for the log.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


# **Atari Environment**

In [37]:
# Create the Pong environment by its gym id.
env = gym.make("PongNoFrameskip-v4")
# Start the first episode and keep the initial frame for the training loop.
# NOTE(review): assumes the older gym API where reset() returns only the
# observation (the training loop also unpacks 4 values from env.step) -- confirm
# against the installed gym version.
observation = env.reset()

# **Pre-Processing**

In [38]:
def preprocessing(I):
    """Turn a raw 210x160x3 uint8 Pong frame into a binary feature vector.

    The frame is cropped to rows 35..194, downsampled by 2 in both spatial
    directions, reduced to its first colour channel (giving 80x80), and then
    binarised: the two background values (144 and 109) become 0 and every
    other non-zero pixel (paddles, ball) becomes 1.

    Args:
        I: NumPy array holding one frame, indexed as (row, column, channel).
            It is NOT modified.

    Returns:
        A float64 torch tensor of shape (1, 6400) containing only 0.0 / 1.0.
    """
    # Basic slicing returns a view onto the caller's array, so take a copy
    # before the in-place masking below; otherwise the environment's
    # observation would be silently overwritten.
    frame = I[35:195:2, ::2, 0].copy()  # crop + downsample + first channel -> (80, 80)
    frame[(frame == 144) | (frame == 109)] = 0  # erase both background shades
    frame[frame != 0] = 1  # everything left (paddles, ball) is foreground
    return torch.tensor(frame.astype(float).ravel()).unsqueeze(0)

# **Modelling**

In [39]:
class MLP(nn.Module):
    """Two-layer policy network for flattened 80x80 frames.

    A 6400 -> 200 linear layer with ReLU feeds a 200 -> 1 linear layer whose
    output is squashed by a sigmoid, so each input row yields one value in
    (0, 1). The training loop treats that value as the probability of
    choosing action 2.
    """

    def __init__(self):
        super().__init__()
        n_inputs = 80 * 80 * 1
        n_hidden = 200
        self.fc1 = nn.Linear(n_inputs, n_hidden)
        self.fc2 = nn.Linear(n_hidden, 1)

    def forward(self, x):
        """Return the sigmoid output for a batch `x` of flattened frames."""
        # Inputs may arrive as float64 (or integer) tensors; the layers hold
        # float32 weights, so cast first.
        hidden = self.fc1(x.float()).relu()
        return self.fc2(hidden).sigmoid()

# **Training**

In [40]:
# Per-element binary cross-entropy (no reduction) so that each step's loss can
# be weighted individually before averaging.
bce_loss = nn.BCELoss(reduction='none')

# Policy network, moved to the selected device, and its Adam optimizer.
model = MLP()
model = model.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=1e-2)

In [41]:
def discount_rewards_v1(r, gamma=0.99):
    """Compute discounted returns over a sequence of per-step rewards.

    Walks the sequence backwards, accumulating ``r[t] + gamma * running_sum``.
    Whenever a step carries a non-zero reward the running sum is reset first,
    because in Pong a non-zero reward marks the end of a rally and returns
    must not leak across that boundary.

    Args:
        r: torch tensor of rewards indexed by time along dim 0, e.g. shape
            (steps, 1). Each ``r[t]`` must hold a single element.
        gamma: discount factor applied per step (defaults to 0.99).

    Returns:
        A tensor with the same shape, dtype and device as ``r`` holding the
        discounted return for every step. An empty input yields an empty
        output.
    """
    # zeros_like already matches r's device and dtype, so the function does
    # not need to know about any module-level device setting.
    discounted_r = torch.zeros_like(r)
    running_add = 0.0

    for t in reversed(range(r.size(0))):
        if r[t] != 0.0:
            running_add = 0.0  # rally boundary (Pong specific): restart the sum
        running_add = r[t] + gamma * running_add
        discounted_r[t] = running_add

    return discounted_r

In [42]:
# Policy-gradient training loop: play full Pong episodes, then update the
# policy once per episode using every step of that episode.
cnt_matches = 0
reward_sum = 0
episode_number = 0

# Per-episode trajectory buffers. One entry is appended per environment step so
# that states, sampled-action labels and rewards stay aligned index by index.
episode_xs = []       # preprocessed frames, each of shape (1, 6400)
episode_ys = []       # "fake labels": 1.0 if action 2 was sampled, else 0.0
episode_rewards = []  # raw reward returned by env.step for that action

while True:

    # Store the state as float32: the model casts to float32 anyway and this
    # halves the memory held until the end of the episode.
    x = preprocessing(observation).to(device).float()

    # Acting needs no gradient; probabilities are recomputed in one batched
    # forward pass at update time.
    with torch.no_grad():
        aprob = model(x)

    # sampling: flip a biased coin, action 2 with probability aprob, else action 3
    action = 2 if np.random.uniform() < aprob.item() else 3

    # y is the "fake label" for this observation -- y = 1 means action 2 was taken
    episode_xs.append(x)
    episode_ys.append(1.0 if action == 2 else 0.0)

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action) # done=True when the episode (a full game to 21 points) is over
    reward_sum += reward
    episode_rewards.append(reward)

    # a non-zero reward means one match (rally) ended: -1 if lost, +1 if won
    if reward != 0:
      cnt_matches += 1
      print(f'ep {episode_number}: (match {cnt_matches}) game finished, reward: {reward}')

    if done: # one episode / game finished
        episode_number += 1

        # Assemble the whole episode as one batch: (steps, 6400), (steps, 1), (steps, 1)
        batch_x = torch.cat(episode_xs, dim=0)
        batch_y = torch.tensor(episode_ys, dtype=torch.float, device=device).unsqueeze(1)
        rs = torch.tensor(episode_rewards, dtype=torch.float, device=device).unsqueeze(1)

        discounted_rs = discount_rewards_v1(rs)
        # Standardise the returns; they act as per-step weights on the loss.
        # The epsilon guards against division by zero when all returns are equal.
        discounted_rs = (discounted_rs - torch.mean(discounted_rs)) / (torch.std(discounted_rs) + 1e-8)

        # Per-step loss over the full episode, weighted by the standardised return
        batch_probs = model(batch_x)                      # (steps, 1)
        loss_per_element = bce_loss(batch_probs, batch_y) # (steps, 1)
        weighted_loss = loss_per_element * discounted_rs  # (steps, 1)
        loss = weighted_loss.mean()
        print("loss :", loss.item())

        # updating model parameters after every game/episode, instead of every game you can update after every 2 game or 3 game or ...
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # book-keeping
        print(f'resetting env. episode reward total was {reward_sum}')
        print(f'total matches played in this episode: {cnt_matches}, Note: To change episode, one player should win 21 matches.')

        if episode_number  == 30:
            break

        reward_sum = 0
        cnt_matches = 0
        episode_xs = []
        episode_ys = []
        episode_rewards = []
        observation = env.reset() # reset env

ep 0: (match 1) game finished, reward: -1.0
ep 0: (match 2) game finished, reward: -1.0
ep 0: (match 3) game finished, reward: -1.0
ep 0: (match 4) game finished, reward: -1.0
ep 0: (match 5) game finished, reward: -1.0
ep 0: (match 6) game finished, reward: -1.0
ep 0: (match 7) game finished, reward: -1.0
ep 0: (match 8) game finished, reward: -1.0
ep 0: (match 9) game finished, reward: -1.0
ep 0: (match 10) game finished, reward: 1.0
ep 0: (match 11) game finished, reward: -1.0
ep 0: (match 12) game finished, reward: 1.0
ep 0: (match 13) game finished, reward: -1.0
ep 0: (match 14) game finished, reward: -1.0
ep 0: (match 15) game finished, reward: -1.0
ep 0: (match 16) game finished, reward: -1.0
ep 0: (match 17) game finished, reward: -1.0
ep 0: (match 18) game finished, reward: -1.0
ep 0: (match 19) game finished, reward: -1.0
ep 0: (match 20) game finished, reward: -1.0
ep 0: (match 21) game finished, reward: -1.0
ep 0: (match 22) game finished, reward: -1.0
ep 0: (match 23) game