In [None]:
#!pip install "gym[accept-rom-license, atari]"
import pickle

import numpy as np
import torch
import gym

import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim

In [None]:
class MLP(nn.Module):
    """Two-layer perceptron policy that outputs a single probability.

    Architecture: Linear(input_dim -> hidden_dim) -> ReLU -> Linear(hidden_dim -> 1) -> sigmoid.

    Args:
        input_dim: Number of features in each flattened input vector.
            Defaults to 80 * 80 * 1 = 6400.
        hidden_dim: Width of the hidden layer. Defaults to 200.
    """

    def __init__(self, input_dim=80 * 80 * 1, hidden_dim=200):
        super().__init__()

        # Attribute names are kept as fc1/fc2 so existing state_dicts still load.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Map a (batch, input_dim) tensor to a (batch, 1) tensor of values in (0, 1).

        The input is cast to float32 first, so integer or float64 tensors are accepted.
        """
        x = x.float()
        hidden = F.relu(self.fc1(x))  # hidden layer activation
        return torch.sigmoid(self.fc2(hidden))  # single output squashed by sigmoid

class SmallCNN(nn.Module):
    """Small convolutional policy: two 1-channel 3x3 convolutions followed by a 2-layer MLP head.

    The first convolution uses stride 2 (halving the spatial size), the second
    keeps the size. The flattened feature map must contain 80 * 80 * 1 values
    to match ``fc1``, which is the case for a (batch, 1, 160, 160) input.
    """

    def __init__(self):
        super().__init__()

        # Convolutional feature extractor (single channel in, single channel out).
        self.conv1 = nn.Conv2d(1, 1, kernel_size=3, stride=2, padding=1)  # stride 2: 160 -> 80
        self.conv2 = nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=1)  # stride 1: 80 -> 80
        # Fully connected head.
        self.fc1 = nn.Linear(80 * 80 * 1, 200)
        self.fc2 = nn.Linear(200, 1)

    def forward(self, x):
        """Return a (batch, 1) tensor of sigmoid outputs for a (batch, 1, H, W) input."""
        features = x.float()  # accept integer / float64 inputs
        for conv in (self.conv1, self.conv2):
            features = F.relu(conv(features))
        flat = features.view(features.size(0), -1)  # (batch, 80*80*1) for 160x160 inputs
        hidden = F.relu(self.fc1(flat))  # hidden layer (200 units)
        return torch.sigmoid(self.fc2(hidden))  # single probability-like output

In [None]:
def prepro_mlp(I):
    """Preprocess a 210x160x3 uint8 frame into a (1, 6400) float64 tensor of 0/1 values.

    Steps: crop rows 35..194, keep every second row and column of channel 0
    (giving 80x80), then mark every pixel that is neither 0 nor one of the two
    background values (144, 109) as 1 and everything else as 0.

    The input frame is NOT modified. The previous implementation assigned into
    NumPy slice views of ``I`` and therefore overwrote the caller's frame.

    Args:
        I: NumPy array indexed as [row, column, channel].

    Returns:
        torch.Tensor of shape (1, 6400), dtype float64.
    """
    frame = I[35:195:2, ::2, 0]  # crop + downsample by 2 in one slice -> (80, 80) view
    # Foreground = anything that is not zero and not a background colour.
    foreground = (frame != 0) & (frame != 144) & (frame != 109)
    return torch.tensor(foreground.astype(float).ravel()).unsqueeze(0)

def preprocess_v1(I):
    """Preprocess a 210x160x3 uint8 frame into a (1, 1, 160, 160) float64 tensor of 0/1 values.

    Steps: crop rows 35..194, keep only channel 0, then mark every pixel that
    is neither 0 nor one of the two background values (144, 109) as 1 and
    everything else as 0. No spatial downsampling is done here.

    The input frame is NOT modified. The previous implementation assigned into
    NumPy slice views of ``I`` and therefore overwrote the caller's frame.

    Args:
        I: NumPy array indexed as [row, column, channel].

    Returns:
        torch.Tensor of shape (1, 1, 160, 160), dtype float64
        (leading dims are batch and channel).
    """
    frame = I[35:195, :, 0]  # crop, first channel only -> (160, 160) view
    # Foreground = anything that is not zero and not a background colour.
    foreground = (frame != 0) & (frame != 144) & (frame != 109)
    return torch.tensor(foreground.astype(float)).unsqueeze(0).unsqueeze(0)

def discount_rewards_v1(r, gamma=0.99):
    """Compute discounted returns for a sequence of per-step rewards.

    Walks the sequence backwards, accumulating ``running = running * gamma + r[t]``.
    Whenever ``r[t]`` is non-zero the accumulator is reset first, so returns do
    not leak across a point boundary (Pong-specific: a non-zero reward marks the
    end of a rally).

    Args:
        r: torch tensor whose first dimension indexes time steps, with one
            reward per step (e.g. shape (T, 1) or (T,)).
        gamma: Discount factor. Defaults to 0.99.

    Returns:
        Tensor with the same shape, dtype and device as ``r``. The result
        follows ``r``'s device instead of relying on a module-level ``device``
        variable, so the function works standalone.
    """
    discounted_r = torch.zeros_like(r)
    running_add = 0.0
    for t in reversed(range(r.size(0))):
        if r[t] != 0.0:
            running_add = 0.0  # reset the sum, since this was a game boundary (pong specific!)
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

In [None]:
# --- Compute device, policy network, loss and optimizer ---
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SmallCNN().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-2)
# reduction='none' keeps one loss value per element so each can be weighted individually.
bce_loss = nn.BCELoss(reduction='none')

# --- Environment ---
env = gym.make("ALE/Pong-v5")
# NOTE(review): assumes the gym API where reset() returns only the observation -- confirm installed version.
observation = env.reset()

# --- Book-keeping counters ---
episode_number = 0
reward_sum = 0
prev_x = None # placeholder for a previous frame (difference-frame input)

In [None]:
# REINFORCE training loop.
#
# For every step of an episode we record the policy output (probability of
# action 2 / "UP"), the sampled action encoded as a 0/1 "fake label", and the
# reward. When the episode ends, the per-step BCE loss (= -log prob of the
# action actually taken) is weighted by the normalized discounted return of
# that step and back-propagated. Gradients accumulate across episodes and the
# optimizer steps every BATCH_EPISODES episodes.
#
# Fixes relative to the previous version:
#   * The loss used only the LAST step's probability/label, broadcast against
#     all returns; since the returns are normalized to zero mean, the resulting
#     loss and gradient were ~0. All steps are now stacked into (T, 1) batches.
#   * `rs` was read before assignment on a fresh kernel, was overwritten on
#     every step while cnt_matches == 1, and was never cleared between
#     episodes. Per-episode Python lists replace it.
#   * A small epsilon guards the return normalization against a zero std.

BATCH_EPISODES = 2   # number of episodes whose gradients are accumulated per optimizer step
MAX_EPISODES = 30    # stop after this many finished episodes
STD_EPS = 1e-8       # avoids division by zero when normalizing returns

cnt_matches = 0
# Per-step history of the current episode; cleared after every episode.
step_probs, step_labels, step_rewards = [], [], []

while True:

    x = preprocess_v1(observation)
    aprob = model(x.to(device))  # (1, 1): probability of choosing action 2
    action = 2 if np.random.uniform() < aprob.item() else 3 # flipping a random coin

    # y is "fake label" -- considering this as a label for this observation --- to move "UP" y = 1
    y = 1.0 if action == 2 else 0.0
    step_probs.append(aprob)  # keeps the autograd graph so the loss can reach the weights
    step_labels.append(y)

    # step the environment and get new measurements
    # done=True when the episode ends; an episode is one full game, played until one side wins 21 matches.
    observation, reward, done, info = env.step(action)
    reward_sum += reward
    step_rewards.append(float(reward))

    # a non-zero reward means a match (rally) just ended
    if reward != 0:
        cnt_matches += 1
        print(f'ep {episode_number}: (match {cnt_matches}) game finished, reward: {reward}')

    if done: # one episode i.e. one game is finished
        episode_number += 1

        # Stack the episode history into (T, 1) tensors on the training device.
        batch_probs = torch.cat(step_probs, dim=0)
        ys = torch.tensor(step_labels, dtype=torch.float, device=device).unsqueeze(1)
        rs = torch.tensor(step_rewards, dtype=torch.float, device=device).unsqueeze(1)

        # Normalized discounted returns act as per-step weights on the loss.
        discounted_rs = discount_rewards_v1(rs)
        discounted_rs = (discounted_rs - torch.mean(discounted_rs)) / (torch.std(discounted_rs) + STD_EPS)

        # Compute the loss for each step: all three tensors have shape (T, 1).
        loss_per_element = bce_loss(batch_probs, ys)
        weighted_loss = loss_per_element * discounted_rs.to(device)
        loss = weighted_loss.mean()

        # Backward pass: gradients accumulate in .grad until the next optimizer step.
        loss.backward()

        if episode_number % BATCH_EPISODES == 0: # update parameters once per batch of episodes
            optimizer.step()  # Update weights
            optimizer.zero_grad()  # Reset the gradients accumulated over the batch of episodes
            print("loss :", loss.item())

        # boring book-keeping
        print(f'resetting env. episode reward total was {reward_sum}')
        print(f'total matches played in this episode: {cnt_matches}, Note: To change episode, one player should win 21 matches.')

        if episode_number == MAX_EPISODES:
            break

        reward_sum = 0
        cnt_matches = 0
        step_probs, step_labels, step_rewards = [], [], []
        observation = env.reset() # reset env

ep 0: (match 1) game finished, reward: -1.0
ep 0: (match 2) game finished, reward: -1.0
ep 0: (match 3) game finished, reward: -1.0
ep 0: (match 4) game finished, reward: -1.0
ep 0: (match 5) game finished, reward: -1.0
ep 0: (match 6) game finished, reward: -1.0
ep 0: (match 7) game finished, reward: -1.0
ep 0: (match 8) game finished, reward: -1.0
ep 0: (match 9) game finished, reward: -1.0
ep 0: (match 10) game finished, reward: -1.0
ep 0: (match 11) game finished, reward: -1.0
ep 0: (match 12) game finished, reward: -1.0
ep 0: (match 13) game finished, reward: -1.0
ep 0: (match 14) game finished, reward: -1.0
ep 0: (match 15) game finished, reward: -1.0
ep 0: (match 16) game finished, reward: -1.0
ep 0: (match 17) game finished, reward: -1.0
ep 0: (match 18) game finished, reward: -1.0
ep 0: (match 19) game finished, reward: -1.0
ep 0: (match 20) game finished, reward: -1.0
ep 0: (match 21) game finished, reward: -1.0
resetting env. episode reward total was -21.0
total matches play