# Ms. Pacman Agent
## Policy Gradient with Neural Network
References:
- _Hugging Face policy gradient_ https://huggingface.co/learn/deep-rl-course/unit4/hands-on

In [None]:
# Install the runtime dependencies for this notebook (run once per environment).
# NOTE(review): tensorflow is not imported by any cell visible in this notebook —
# confirm it is still needed before keeping this install.
!pip install gymnasium[atari]
!pip install gymnasium[accept-rom-license]
!pip install tensorflow
!pip install numpy
!pip install torch
!pip install imageio[ffmpeg]

In [None]:
# # Virtual display
# # Only needed for Google Colab
# # Code from CS 175 HW 2
# %%capture
# !apt install python-opengl
# !apt install ffmpeg
# !apt install xvfb
# !pip install pyvirtualdisplay
# !pip install pyglet==1.5.1

# from pyvirtualdisplay import Display

# virtual_display = Display(visible=0, size=(1400, 900))
# virtual_display.start()

In [None]:
import os
import gymnasium as gym
from collections import deque

import numpy as np
import matplotlib.pyplot as plt
from gymnasium import envs
from tqdm import tqdm

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import imageio

# Shared training/evaluation environment (RAM-observation variant of Ms. Pac-Man).
env = gym.make("ALE/MsPacman-ram-v5")

# Run the network on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# print(torch.cuda.is_available())

In [None]:
# print("_____OBSERVATION SPACE_____ \n")
# print("Observation Space", env.observation_space)
# print("Sample observation", env.observation_space.sample()) # Get a random observation

In [None]:
# print("\n _____ACTION SPACE_____ \n")
# print("Action Space Shape", env.action_space.n)
# print("Action Space Sample", env.action_space.sample()) # Take a random action

In [None]:
# Based off Hugging Face policy gradient code
# Changes by CS 175 project group:
#   - class inherits nn.Sequential rather than nn.Module
#   - change to constructor method and deletion of explicitly defined forward method
class Policy(nn.Sequential):
  """Fully connected policy network that outputs log-probabilities over actions.

  The network is ``n_layers`` blocks of Linear -> ReLU -> Dropout followed by a
  final Linear layer and a ``LogSoftmax`` over the action dimension.

  :param n_layers: Number of hidden blocks
  :param h_size: Width of every hidden layer
  :param dropout: Dropout rate (accepted for interface compatibility, see NOTE below)
  :param s_size: Size of the observation vector
  :param a_size: Number of discrete actions
  """

  def __init__(self, n_layers, h_size, dropout, s_size, a_size):
    layers = []

    in_features = s_size
    for _ in range(n_layers):
      layers.append(nn.Linear(in_features, h_size))
      layers.append(nn.ReLU())
      # NOTE(review): the rate is hard-coded to 0, so the `dropout` argument
      # currently has no effect — confirm whether nn.Dropout(dropout) was intended.
      layers.append(nn.Dropout(0))
      in_features = h_size
    layers.append(nn.Linear(in_features, a_size))
    layers.append(nn.LogSoftmax(dim=1))

    super().__init__(*layers)

  def act(self, state):
    """Sample an action for a single observation.

    :param state: 1-D numpy array holding one observation
    :return: ``(action, log_prob)`` where ``action`` is a Python int and
             ``log_prob`` is a tensor of shape ``(1,)`` that stays attached to
             the autograd graph
    """
    # Use the device the weights live on instead of a module-level global, so
    # the policy keeps working after being moved or loaded onto another device.
    model_device = next(self.parameters()).device
    state = torch.from_numpy(state).float().unsqueeze(0).to(model_device)
    log_probs = self.forward(state).cpu()
    # The last layer is LogSoftmax, so its output must be passed as `logits`;
    # passing negative log-probabilities as `probs` fails the simplex check.
    m = Categorical(logits=log_probs)
    action = m.sample()
    return action.item(), m.log_prob(action)

In [None]:
# Based off Hugging Face policy gradient code
# Changes by CS 175 project group:
#   - reward shaping applied to the rewards used for training
#   - the shaping is undone when computing the reported score, so scores stay true game scores
def reinforce(policy, optimizer, print_every, n_training_episodes, max_t,
              gamma, ghost_reward, policy_file_name, step_penalty_multiplier,
              longevity_exponential=0, dot_extra_reward=0, energy_pill_extra_reward=0):
    """Train ``policy`` with REINFORCE on the module-level ``env`` and save it.

    The rewards used for learning are shaped; every shaping term is mirrored in
    ``score_adjustments`` so that the reported score of an episode is the
    unshaped sum of environment rewards.

    Shaping rules applied to each step:
      * Life lost: ``longevity_exponential ** (environment reward collected
        during that life)`` is added; no other shaping is applied on that step.
      * Reward with ``reward // 100`` in {2, 4, 8, 16}: the 200/400/800/1600
        bonus is replaced by ``ghost_reward``.
      * ``reward % 100 == 10``: ``dot_extra_reward`` is added and the step
        penalty is reset.
      * ``reward % 100 == 50``: ``energy_pill_extra_reward`` is added and the
        step penalty is reset.
      * Otherwise: the current step penalty is multiplied by
        ``step_penalty_multiplier`` and subtracted from the reward.

    :param policy: Network exposing ``act(state) -> (action, log_prob)``
    :param optimizer: Optimizer built over ``policy.parameters()``
    :param print_every: Log the running average every this many episodes
    :param n_training_episodes: Number of episodes to train for
    :param max_t: Maximum number of steps per episode
    :param gamma: Discount factor
    :param ghost_reward: Training reward that replaces a ghost bonus
    :param policy_file_name: Path the trained policy is saved to
    :param step_penalty_multiplier: Growth factor of the no-dot step penalty
    :param longevity_exponential: Base of the reward granted when a life ends
    :param dot_extra_reward: Extra training reward for a 10-point pickup
    :param energy_pill_extra_reward: Extra training reward for a 50-point pickup
    :return: List with the unshaped score of every training episode
    """
    scores_deque = deque(maxlen=print_every)
    scores = []

    # Maps reward // 100 to the bonus that is swapped for `ghost_reward`.
    ghost_bonuses = {2: 200, 4: 400, 8: 800, 16: 1600}
    base_step_penalty = 1
    # eps is added to the standard deviation to avoid division by zero.
    eps = np.finfo(np.float32).eps.item()

    for i_episode in tqdm(range(1, n_training_episodes + 1)):
        saved_log_probs = []
        rewards = []
        state, game_env = env.reset()

        # Variables for reward changes
        score_adjustments = 0
        rewards_this_life = 0
        cur_step_penalty = base_step_penalty

        for _ in range(max_t):
            old_game_env = game_env

            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, terminated, truncated, game_env = env.step(action)
            # A truncated episode must stop as well; stepping past it is invalid.
            done = terminated or truncated

            if old_game_env["lives"] > game_env["lives"]:
                # Longevity reward: the more reward gathered in a life, the larger it is.
                longevity_reward = longevity_exponential ** rewards_this_life
                rewards_this_life = 0
                score_adjustments -= longevity_reward
                rewards.append(reward + longevity_reward)
            else:
                rewards_this_life += reward
                reward_change = 0

                # Equal reward for every eaten ghost: swap the bonus for ghost_reward.
                ghost_bonus = ghost_bonuses.get(reward // 100)
                if ghost_bonus is not None:
                    reward_change += ghost_reward - ghost_bonus
                    score_adjustments += ghost_bonus - ghost_reward

                # Growing penalty for going many steps without eating a dot.
                remainder = reward % 100
                if remainder == 10:
                    cur_step_penalty = base_step_penalty
                    reward_change += dot_extra_reward
                    score_adjustments -= dot_extra_reward
                elif remainder == 50:
                    cur_step_penalty = base_step_penalty
                    reward_change += energy_pill_extra_reward
                    score_adjustments -= energy_pill_extra_reward
                else:
                    cur_step_penalty *= step_penalty_multiplier
                    reward_change -= cur_step_penalty
                    score_adjustments += cur_step_penalty

                rewards.append(reward + reward_change)

            # Checked on every step, including the one on which a life is lost
            # (the game usually ends exactly on such a step).
            if done:
                break

        # Undo the shaping so the logged score is the real game score.
        reward_sum = sum(rewards) + score_adjustments
        scores_deque.append(reward_sum)
        scores.append(reward_sum)

        if saved_log_probs:
            # Discounted returns, computed from the last step backwards:
            # G_t = r_t + gamma * G_{t+1}
            discounted = []
            running_return = 0.0
            for step_reward in reversed(rewards):
                running_return = step_reward + gamma * running_return
                discounted.append(running_return)
            discounted.reverse()

            # Standardize the returns for training stability. A single-step
            # episode has no defined standard deviation, so it is only centered.
            returns = torch.tensor(discounted, dtype=torch.float32)
            returns = returns - returns.mean()
            if returns.numel() > 1:
                returns = returns / (returns.std() + eps)

            policy_loss = -(torch.cat(saved_log_probs) * returns).sum()

            optimizer.zero_grad()
            policy_loss.backward()
            optimizer.step()

        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}\tStandard Deviation: {:.2f}'
                  .format(i_episode, np.mean(scores_deque), np.std(scores_deque)))

    # Make sure the target directory exists so a long run is not lost at save time.
    save_dir = os.path.dirname(policy_file_name)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    torch.save(policy, policy_file_name)
    return scores

In [None]:
hyperparameters = {
    # --- tuned values -------------------------------------------------------
    "gamma": 0.9999819193245816,                     # discount factor
    "n_layers": 1,                                   # hidden blocks in the policy
    "h_size": 175,                                   # hidden layer width
    "dropout": 0.44984866197635065,                  # passed to Policy
    "lr": 6.166629462708628e-05,                     # optimizer learning rate
    "longevity_exponential": 1.006491852944776,
    "step_penalty_multiplier": 1.0386448544834312,
    "dot_extra_reward": 13,
    "energy_pill_extra_reward": 12,

    # --- fixed settings -----------------------------------------------------
    "ghost_reward": 0,
    "optimizer": "SGD",                              # attribute name looked up on torch.optim
    "policy_file_name": "Policies/MsPacManPG_optimized.pt",
    "n_training_episodes": 10000,
    # "n_evaluation_episodes": 10,
    "max_t": 50000,                                  # step cap per episode
    "env_id": "ALE/MsPacman-ram-v5",
    "s_size": 128,                                   # observation size fed to the policy
    "a_size": 5,                                     # number of policy outputs
}

In [None]:
# Build a fresh policy network on the selected device, then an optimizer over
# its parameters. The optimizer class is looked up by name on torch.optim.
policy = Policy(
    n_layers=hyperparameters["n_layers"],
    h_size=hyperparameters["h_size"],
    dropout=hyperparameters["dropout"],
    s_size=hyperparameters["s_size"],
    a_size=hyperparameters["a_size"],
).to(device)
optimizer = getattr(optim, hyperparameters["optimizer"])(
    policy.parameters(),
    lr=hyperparameters["lr"],
)

In [None]:
# Load saved policy (optional: skip this cell to train from scratch)
# SECURITY NOTE: the checkpoint is a fully pickled nn.Module, so loading it
# executes arbitrary pickle code — only load files you created or trust.
# `weights_only=False` is required for full-module checkpoints on recent PyTorch;
# `map_location` lets a GPU-trained checkpoint load on a CPU-only machine.
policy = torch.load(hyperparameters["policy_file_name"],
                    map_location=device, weights_only=False)
# Rebuild the optimizer over the loaded network's parameters; the previous
# optimizer still points at the discarded network and would not update this one.
optimizer = getattr(optim, hyperparameters["optimizer"])(policy.parameters(), lr=hyperparameters["lr"])
policy.eval();

In [None]:
# Train the network and save it; every training setting except the logging
# interval is forwarded by name from the hyperparameters dict.
scores = reinforce(
    policy,
    optimizer,
    print_every=1000,
    **{
        name: hyperparameters[name]
        for name in (
            "n_training_episodes",
            "max_t",
            "gamma",
            "ghost_reward",
            "policy_file_name",
            "step_penalty_multiplier",
            "longevity_exponential",
            "dot_extra_reward",
            "energy_pill_extra_reward",
        )
    },
)

In [None]:
# # Save scores to csv
# import csv
# with open("MsPacMan_training_scores.csv", 'w') as file:
#     writer = csv.writer(file)
#     writer.writerow(scores)

In [None]:
# # Plot training progress
# plt.plot(scores)
# plt.show()

In [None]:
def evaluate_agent(env, max_steps, n_eval_episodes, policy):
    """
    Evaluate the agent for ``n_eval_episodes`` episodes and returns average reward and std of reward.
    :param env: The evaluation environment
    :param max_steps: Maximum number of steps played in each episode
    :param n_eval_episodes: Number of episode to evaluate the agent
    :param policy: The Reinforce agent
    :return: Tuple ``(mean_reward, std_reward)`` over the evaluated episodes
    """
    episode_rewards = []
    for _episode in tqdm(range(1, n_eval_episodes + 1)):
        state, _info = env.reset()
        total_rewards_ep = 0

        # No gradients are needed for evaluation, so skip building the graph.
        with torch.no_grad():
            for _step in range(max_steps):
                action, _ = policy.act(state)
                state, reward, terminated, truncated, _info = env.step(action)
                total_rewards_ep += reward

                # Stop on truncation too; stepping a finished episode is invalid.
                if terminated or truncated:
                    break
        episode_rewards.append(total_rewards_ep)
    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    return mean_reward, std_reward

In [None]:
# Score the current policy over 50 episodes on the shared environment.
mean, std = evaluate_agent(
    env=env,
    max_steps=hyperparameters["max_t"],
    n_eval_episodes=50,
    policy=policy,
)
print("Mean: {}; Std: {}".format(mean, std))

In [None]:
def record_video(env, policy, out_directory_best, out_directory_worst, episodes, fps=30):
    """Save rendering of best and worst performing episode out of desired number of episodes

    :param env: Environment whose ``render()`` returns the frames of the episode
                just played (the notebook creates it with
                ``render_mode="rgb_array_list"``)
    :param policy: Agent exposing ``act(state) -> (action, log_prob)``
    :param out_directory_best: Output file for the best-scoring episode
    :param out_directory_worst: Output file for the worst-scoring episode
    :param episodes: Number of episodes to play (at least 1)
    :param fps: Frames per second of the saved videos
    :raises ValueError: If ``episodes`` is smaller than 1
    """
    if episodes < 1:
        raise ValueError("episodes must be at least 1 to record a video")

    # Infinite sentinels guarantee the first episode sets both records.
    worst_score = float("inf")
    best_score = float("-inf")
    worst_game = None
    best_game = None

    for _episode in tqdm(range(1, episodes + 1)):
        state, _info = env.reset()
        score = 0
        done = False

        # Inference only: no autograd graph is needed while recording.
        with torch.no_grad():
            while not done:
                # Sample an action from the policy for the current observation
                action, _ = policy.act(state)
                # Feed the new observation back in on the next iteration
                state, reward, terminated, truncated, _info = env.step(action)
                done = terminated or truncated
                score += reward

        is_worst = score < worst_score
        is_best = score > best_score
        if is_worst or is_best:
            # Render once and share the result: an episode can be both records,
            # and a second render() call may not return the same frames again
            # (presumably the frame buffer is drained — verify against Gymnasium docs).
            frames = env.render()
            if is_worst:
                worst_game = frames
                worst_score = score
            if is_best:
                best_game = frames
                best_score = score

    print("Best score is {}".format(best_score))
    print("Worst score is {}".format(worst_score))
    imageio.mimsave(out_directory_best, best_game, fps=fps)
    imageio.mimsave(out_directory_worst, worst_game, fps=fps)

In [None]:
# Separate environment that collects RGB frames so whole episodes can be saved.
replay_env = gym.make("ALE/MsPacman-ram-v5", render_mode="rgb_array_list")
record_video(
    env=replay_env,
    policy=policy,
    out_directory_best="./MsPacMan_replay_best.mp4",
    out_directory_worst="./MsPacMan_replay_worst.mp4",
    episodes=50,
)