In [2]:
import os
import datetime

import numpy as np 
import gymnasium as gym
import ale_py

try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    SummaryWriter = None  # type: ignore[misc, assignment]

# Register the environments provided by ale_py with gymnasium before creating one.
gym.register_envs(ale_py)

# Training environment. No render_mode is requested here; the playback cell
# later re-creates the environment with render_mode='rgb_array'.
env = gym.make('PongDeterministic-v4')
# Reset once and print the raw observation shape as a sanity check
# (preprocess() below is written for 210x160x3 frames).
observation, info = env.reset()
print(observation.shape)

(210, 160, 3)


In [12]:
# Hyperparameters and run configuration shared by the cells below.
BATCH_SIZE = 4 # number of episodes whose losses are averaged into one parameter update
GAMMA = 0.99 # discount factor for reward
DECAY_RATE = 0.99  # decay factor for RMSProp leaky sum of grad^2 (passed as `alpha`)
HIDDEN_LAYER_NEURONS = 200 # NOTE(review): not read by the visible conv Model, which hard-codes 200
INPUT_DIMENSIONS = 80 * 80 # pixels in one preprocessed frame; NOTE(review): not read by the visible cells
LR = 1e-3 # learning rate; only the RMSprop branch reads it (the Adam branch hard-codes 7e-4)
EPSILON = 1e-7 # epsilon in RMSProp formula (passed as `eps`)
ENTROPY = 0.01 # initial entropy-bonus coefficient; train() anneals it linearly down to 0.001

ALGO="adam" # "rmsprop" selects RMSprop; any other value selects Adam
SAVE_PATH = f"./models/checkpoint-pong_conv_entropy_reducing_{ALGO}.pth" # checkpoint file used by training
RESUME = True # resume from the checkpoint at SAVE_PATH?

## Define Model, Optimizer, Loss with Torch


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Pick the compute device in priority order: CUDA > MPS > CPU.
# DEVICE is read by preprocess(), discount_rewards() and run_episode() below.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    print("CUDA is available. Using CUDA.")
elif torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
    print("MPS backend is available. Using MPS.")
else:
    DEVICE = torch.device("cpu")
    print("Neither CUDA nor MPS is available. Using CPU.")

# Author's note: training was observed to work better on CPU than on MPS.
# The override below is currently disabled; uncomment it to force CPU.
# DEVICE = torch.device("cpu")

class Model(nn.Module):
    """Convolutional policy network mapping a frame-difference image to P(action = UP).

    Layer sizes for an 80x80 single-channel input, with m = ``filters_multiplier``:
        conv1: 8x8 kernel, stride 4 -> (16*m, 19, 19)   because (80 - 8) / 4 + 1 = 19
        conv2: 5x5 kernel, stride 2 -> (32*m, 8, 8)     because (19 - 5) / 2 + 1 = 8
        fc1:   8*8*32*m -> 200
        fc2:   200 -> 1, squashed by a sigmoid
    """

    def __init__(self, filters_multiplier = 1):
        """Build the layers.

        Args:
            filters_multiplier: integer factor applied to the number of filters
                in both convolutional layers (1 -> 16/32 filters, 2 -> 32/64).
        """
        super().__init__()

        conv1_filters = 16 * filters_multiplier
        conv2_filters = 32 * filters_multiplier

        # Layers are created in the order conv1, conv2, fc1, fc2.
        self.conv1 = nn.Conv2d(1, conv1_filters, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(conv1_filters, conv2_filters, kernel_size=5, stride=2)

        # Number of features per sample after conv2 (8 x 8 spatial positions).
        self.size = 8 * 8 * conv2_filters

        # Two fully connected layers on top of the flattened conv features.
        self.fc1 = nn.Linear(self.size, 200)
        self.fc2 = nn.Linear(200, 1)

    def forward(self, state):
        """Return the probability of moving UP, shaped (batch, 1)."""
        features = F.relu(self.conv1(state))
        features = F.relu(self.conv2(features))

        # Flatten to (batch, self.size) before the dense layers.
        flat = features.view(-1, self.size)
        hidden = F.relu(self.fc1(flat))

        return F.sigmoid(self.fc2(hidden))

# Improvement suggestion, applied through filters_multiplier=2:
#   1st conv layer: 32 filters -> 8*8*1*32  = 2048 weights
#   2nd conv layer: 64 filters -> 5*5*32*64 = 51200 weights
#   fc layers:      8*8*64*200 = 819200 weights, plus 200 for the output layer
model = Model(filters_multiplier=2).to(DEVICE)

if ALGO != "rmsprop":
    # Any value other than "rmsprop" selects Adam, with its own fixed learning
    # rate (LR from the hyperparameter cell is not used on this branch).
    optimizer = optim.Adam(model.parameters(), lr=7e-4)
else:
    # Author's tuning history: on the second round LR was raised from 0.0001 to
    # 0.001 and EPSILON lowered from 1e-5 (the note records 1e-8; the value
    # actually used is the EPSILON constant), alpha unchanged. No momentum term
    # is configured here, unlike Adam.
    optimizer = optim.RMSprop(model.parameters(), lr=LR, eps=EPSILON, alpha=DECAY_RATE)

MPS backend is available. Using MPS.


## Image PreProcessing 

In [5]:
def preprocess(image):
    """Convert a raw 210x160x3 uint8 frame into a binary (1, 80, 80) float tensor on DEVICE.

    Steps: crop rows 35..194, keep every second row and column, keep only the
    red channel, then map the two background intensities (144 and 109) to 0 and
    every remaining non-zero pixel (paddles, ball) to 1.
    """
    frame = torch.Tensor(image).to(DEVICE)  # torch.Tensor(...) converts to float

    # Crop + 2x downsample + red channel in a single slicing step -> (80, 80)
    frame = frame[35:195:2, ::2, 0]

    # Erase both background types, then binarise whatever is left
    is_background = (frame == 144) | (frame == 109)
    frame[is_background] = 0
    frame[frame != 0] = 1

    # Leading axis of size 1 gives the (1, 80, 80) layout fed to the network
    return frame.unsqueeze(0).float()


## Action selection and Discount Rewards

In [6]:
def choose_action(probability):
    """Sample an action id: 2 (up) with the given probability, otherwise 3 (down).

    A single uniform draw from numpy's global RNG decides the outcome, so the
    result is reproducible after ``np.random.seed(...)``.
    """
    # 2 signifies up and 3 signifies down in the gym Pong action set
    go_up = np.random.uniform() < probability
    return 2 if go_up else 3

In [7]:
def discount_rewards(rewards, gamma, device=None):
    r"""Compute the discounted return for every timestep of an episode.

    Actions taken many steps before a point was scored matter less for that
    point than actions taken just before it, so each reward is propagated
    backwards with an exponentially decaying weight:

        discounted[t] = \sum_{k >= 0} gamma^k * rewards[t + k]

    The running sum is reset whenever ``rewards[t] != 0``. In Pong a non-zero
    reward marks the end of a rally, so returns never leak across rallies
    (this reset is Pong specific).

    Args:
        rewards: per-step rewards, either a sequence of numbers or a tensor.
            A tensor of any shape is flattened, so (n,) and (n, 1) both work.
        gamma: discount factor.
        device: device for the returned tensor; defaults to the module-level
            ``DEVICE`` when omitted.

    Returns:
        1-D float32 tensor with one discounted return per timestep.
    """
    if device is None:
        device = DEVICE

    # Pull the rewards into plain Python floats once. Indexing a device tensor
    # element by element would force a host/device round trip on every step.
    if isinstance(rewards, torch.Tensor):
        reward_values = rewards.detach().flatten().tolist()
    else:
        reward_values = [float(reward) for reward in rewards]

    discounted = [0.0] * len(reward_values)
    running_add = 0.0
    # Walk backwards from the end of the episode to the beginning
    for t in reversed(range(len(reward_values))):
        reward = reward_values[t]
        if reward != 0:
            # Game boundary (win or loss): restart the running sum
            running_add = 0.0
        running_add = running_add * gamma + reward
        discounted[t] = running_add

    return torch.tensor(discounted, dtype=torch.float32, device=device)

## Train

In [13]:
def run_episode(model, env, entropy_coefficient = 0.0):
    """Play one full episode with the current policy and build its policy-gradient loss.

    Each step feeds the difference between the current and the previous
    preprocessed frame to ``model``, samples up/down from the returned
    probability, and records the log-probability of the sampled action.
    After the episode the rewards are discounted with ``GAMMA`` and
    standardised, and the REINFORCE loss plus an entropy bonus is returned.

    Args:
        model: policy network; called on the frame-difference tensor and
            expected to return the probability of moving up.
        env: environment exposing ``reset() -> (obs, info)`` and
            ``step(action) -> (obs, reward, terminated, truncated, info)``.
        entropy_coefficient: weight of the entropy bonus (0.0 disables it).

    Returns:
        Tuple ``(loss, reward_sum)``: the scalar loss tensor (still attached to
        the autograd graph) and the undiscounted sum of episode rewards.
    """
    observation, _ = env.reset()
    action_chosen_log_probs, action_chosen_probs, episode_rewards = [], [], []

    done = False
    prev_processed_obs = preprocess(observation)
    reward_sum = 0

    while not done:
        # The network input is the difference image between consecutive frames
        # (all zeros on the very first step).
        processed_obs = preprocess(observation)
        state = processed_obs - prev_processed_obs
        prev_processed_obs = processed_obs

        # Probability of taking action up
        up_probability = model(state).squeeze(0)

        action = choose_action(up_probability) # toss the coin, up or down
        # Keep the probability the policy assigned to the action actually taken;
        # its log is what the policy gradient is weighted on.
        action_chosen_prob = up_probability if action == 2 else (1 - up_probability)
        action_chosen_log_probs.append(torch.log(action_chosen_prob))
        action_chosen_probs.append(action_chosen_prob)

        # Carry out the chosen action
        observation, reward, terminated, truncated, _ = env.step(action)
        reward_sum += reward
        episode_rewards.append(reward)

        done = truncated or terminated

    action_chosen_log_probs = torch.cat(action_chosen_log_probs)
    episode_rewards = torch.tensor(episode_rewards, dtype=torch.float32, device=DEVICE)
    action_chosen_probs = torch.cat(action_chosen_probs)
    # Entropy of the two-way action distribution. It is symmetric in p and
    # 1 - p, so using the chosen action's probability gives the same value.
    entropy = -(action_chosen_probs * torch.log(action_chosen_probs + 1e-10) +
        (1.0-action_chosen_probs)*torch.log(1.0-action_chosen_probs+1e-10))

    discounted_rewards = discount_rewards(episode_rewards, GAMMA)
    # Standardize the returns to mean 0 / std 1 (helps control the variance of
    # the gradient estimator). Roughly half of the actions end up encouraged and
    # half discouraged, which helps early on when positive rewards are rare.
    centered_rewards = discounted_rewards - discounted_rewards.mean()
    if discounted_rewards.numel() > 1:
        discounted_rewards = centered_rewards / (discounted_rewards.std() + 1.0e-10)
    else:
        # std() of a single sample is NaN; a one-step episode is only centred.
        discounted_rewards = centered_rewards

    # REINFORCE loss: summed (not averaged) over the steps of the episode.
    loss = -(discounted_rewards * action_chosen_log_probs).sum()
    # Entropy bonus: subtracting it from the loss rewards keeping the policy stochastic.
    loss += -(entropy_coefficient * entropy).sum()

    return loss, reward_sum
    

def _save_checkpoint(model, start_time, batch):
    """Write model weights, optimizer state and resume bookkeeping to SAVE_PATH."""
    checkpoint_dir = os.path.dirname(SAVE_PATH)
    if checkpoint_dir:
        # torch.save does not create missing directories
        os.makedirs(checkpoint_dir, exist_ok=True)
    save_dict = {
        'model_weights': model.state_dict(),
        # Stored so a resumed run can restore the optimizer's running
        # statistics; loaders that only read 'model_weights' are unaffected.
        'optimizer_state': optimizer.state_dict(),
        'start_time': start_time,
        'last_batch': batch
    }
    torch.save(save_dict, SAVE_PATH)


def train(model, env, start_time, last_batch, tf_writer, n_episodes=100000):
    """Train ``model`` with batched policy-gradient updates.

    Episodes are run in groups of ``BATCH_SIZE``; their losses are averaged and
    a single step of the module-level ``optimizer`` is applied per group.
    Training stops once the episode counter reaches ``n_episodes`` (checked
    after each batch) or as soon as the batch loss becomes NaN.

    When ``ENTROPY`` is non-zero the entropy coefficient is annealed linearly
    from ``ENTROPY`` to 0.001 over the first ``min(n_episodes, 250)`` episodes.
    NOTE(review): the episode counter starts at 1 on every call, so this
    schedule restarts when training is resumed - confirm that is intended.

    Args:
        model: policy network to optimise.
        env: environment passed through to ``run_episode``.
        start_time: run identifier stored in every checkpoint.
        last_batch: index of the last completed batch (-1 for a fresh run);
            batch numbering continues from ``last_batch + 1``.
        tf_writer: object with ``add_scalar(tag, value, step)``, or ``None`` to
            skip scalar logging.
        n_episodes: episode budget for this call.

    A checkpoint is written to ``SAVE_PATH`` every 25 batches and once more
    after the final batch, so the end of a run is never lost.
    """
    i_episode = 1
    batch = last_batch + 1
    entropy_coefficient = ENTROPY
    if ENTROPY:
        initial_entropy_coeff = ENTROPY
        final_entropy_coeff = 0.001
        diff_coeff = initial_entropy_coeff - final_entropy_coeff
        # Episodes before the coefficient reaches 0.001; max(1, ...) avoids a
        # division by zero when n_episodes is 0.
        total_episodes = max(1, min(n_episodes, 250))

    done = False
    while not done:

        mean_batch_loss = 0
        mean_batch_reward = 0
        for _ in range(BATCH_SIZE):
            if ENTROPY:
                progress = min(i_episode / total_episodes, 1.0)  # capped at 1.0
                entropy_coefficient = initial_entropy_coeff - progress * diff_coeff

            # Run one episode and fold it into the batch averages
            loss, episode_reward = run_episode(model, env, entropy_coefficient)
            mean_batch_loss += loss / BATCH_SIZE
            mean_batch_reward += episode_reward / BATCH_SIZE

            print(f'Episode {i_episode} reward total was {episode_reward}')
            i_episode += 1

        # Stop the whole training run before a NaN loss can reach the weights
        if torch.isnan(mean_batch_loss):
            print(f"NaN detected in loss at batch {batch}. Terminating training.")
            break

        # Backprop once per batch of episodes
        optimizer.zero_grad()
        mean_batch_loss.backward()
        optimizer.step()

        # Batch metrics and tensorboard logging
        batch_loss_value = mean_batch_loss.item()
        print(f'Batch: {batch}, mean loss: {batch_loss_value:.2f}, '
              f'mean reward: {mean_batch_reward:.2f}')
        if tf_writer is not None:
            tf_writer.add_scalar('mean loss', batch_loss_value, batch)
            tf_writer.add_scalar('mean reward', mean_batch_reward, batch)

        done = i_episode >= n_episodes

        if batch % 25 == 0 or done:
            print('Saving checkpoint...')
            _save_checkpoint(model, start_time, batch)

        batch += 1

In [None]:
# Restore the previous run if requested and available, otherwise start a new one.
if RESUME and os.path.exists(SAVE_PATH):
    print('Loading from checkpoint...')

    # map_location lets a checkpoint written on one device (e.g. CUDA) be
    # loaded on whichever device this session selected.
    save_dict = torch.load(SAVE_PATH, map_location=DEVICE)

    model.load_state_dict(save_dict['model_weights'])
    # Checkpoints without optimizer state are still accepted; restore it only
    # when the key is present.
    if 'optimizer_state' in save_dict:
        optimizer.load_state_dict(save_dict['optimizer_state'])
    start_time = save_dict['start_time']
    last_batch = save_dict['last_batch']
else:
    if RESUME:
        print(f'No checkpoint found at {SAVE_PATH}; starting a new run.')
    start_time = datetime.datetime.now().strftime("%H.%M.%S-%m.%d.%Y")
    last_batch = -1

# Set up tensorboard logging. The import cell falls back to SummaryWriter = None
# when tensorboard is missing, so fail with a clear message instead of
# "'NoneType' object is not callable".
if SummaryWriter is None:
    raise ImportError(
        "torch.utils.tensorboard could not be imported; install tensorboard to log training."
    )

tf_writer = SummaryWriter(
    os.path.join('runs', start_time + f'-CONV-ENTROPY-REDUCING-{ALGO}'))

# Note: the main error seen earlier was in the discount calculation, which was
# given rewards shaped (n, 1) instead of (n,).
try:
    train(model, env, start_time, last_batch, tf_writer, n_episodes=500)
finally:
    # Flush pending events to disk even if training is interrupted
    tf_writer.close()

In [18]:
import os
import imageio
import numpy as np
from IPython.display import Video, display, HTML

def record_video(env, policy, out_directory, out_name, fps=30):
    """
    Generate a replay video of the agent and display it in the notebook.

    One episode is played by sampling actions from ``policy``; every rendered
    frame is collected and written to ``out_directory/out_name``.

    :param env: Environment to record; ``env.render()`` must return image frames
        (e.g. an environment created with ``render_mode='rgb_array'``).
    :param policy: Callable mapping the frame-difference tensor to the
        probability of moving up.
    :param out_directory: Directory to save the video in (created if missing).
    :param out_name: File name of the video inside ``out_directory``.
    :param fps: Frames per second.
    :raises ValueError: if ``env.render()`` returns ``None``.
    """
    def _render_frame():
        """Render the current frame, failing early if the env produces none."""
        frame = env.render()
        if frame is None:
            raise ValueError(
                "env.render() returned None; create the environment with render_mode='rgb_array'."
            )
        return frame

    if out_directory:
        os.makedirs(out_directory, exist_ok=True)

    images = []
    done = False
    observation, _ = env.reset()
    images.append(_render_frame())
    frame1 = preprocess(observation)

    # Inference only: no autograd graph is needed while recording
    with torch.no_grad():
        while not done:
            # Network input is the difference image between consecutive frames
            frame2 = preprocess(observation)
            state = frame2 - frame1
            frame1 = frame2

            # The action is sampled from the policy's up-probability (not argmax)
            up_probability = policy(state)
            action = choose_action(up_probability) # toss the coin, up or down

            observation, _, terminated, truncated, _ = env.step(action)
            images.append(_render_frame())
            done = terminated or truncated

    # Save the video
    video_path = os.path.join(out_directory, out_name)
    imageio.mimsave(video_path, [np.array(img) for img in images], fps=fps)

    # Display the video in Jupyter notebook
    display(Video(video_path, embed=True, width=640, height=480))

In [19]:
# Playback needs an environment whose render() returns RGB frames.
env = gym.make('PongDeterministic-v4', render_mode='rgb_array')
# map_location lets a checkpoint written on another device load on this one.
# Note: this replaces the weights currently held by `model`.
save_dict = torch.load('./models/checkpoint-pong_conv_rmsprop.pth', map_location=DEVICE)

model.load_state_dict(save_dict['model_weights'])
# Make sure the output directory exists before the video is written
os.makedirs('./videos', exist_ok=True)
record_video(env, model, './videos', 'output_pong_conv_rmsprop.mp4')

