# Hyperparameter Optimizer
References
- _Optuna simple example_ https://github.com/optuna/optuna-examples/blob/main/pytorch/pytorch_simple.py#L71  
- _Optuna RL example_ https://github.com/optuna/optuna-examples/blob/main/rl/sb3_simple.py#L114
- _Hugging Face policy gradient_ https://huggingface.co/learn/deep-rl-course/unit4/hands-on

In [None]:
!pip install gymnasium[atari]
!pip install gymnasium[accept-rom-license]
!pip install tensorflow
!pip install numpy
!pip install torch
!pip install -U scikit-learn
!pip install optuna
!pip install stable-baselines3[extra]
!pip install cmaes

In [None]:
import os
import gymnasium as gym
from collections import deque
from typing import Any
from typing import Dict
from tqdm import tqdm

import numpy as np

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import optuna
from optuna.trial import TrialState

# Load environment
env = gym.make("ALE/MsPacman-ram-v5")

In [None]:
# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(50);

In [None]:
default_hyperparams = {
    "epoch": 20,  # max number of episodes per optimization trial
#     "n_training_episodes": 5000,  TODO: Delete
    "max_t": 50000,  # max number of steps per trial
#     "env_id": "ALE/MsPacman-ram-v5",  TODO: Delete
    "state_space": 128,  # RAM data for Atari console during game
    "action_space": 5,  # No-op, up, right, left, down
}

In [None]:
# Based off Optuna RL example code
# Changes by CS 175 project group: hyperparameters being sampled
def sample_hyperparams(trial: optuna.Trial) -> Dict[str, Any]:
    """Sampler for hyperparameters."""
    gamma = trial.suggest_float("gamma", 0.99995, 1, log=True)
    n_layers = trial.suggest_int("n_layers", 1, 5)
    h_size = trial.suggest_int("h_size", 4, 1024)
    dropout = trial.suggest_float("dropout", 0.0, 0.7, log=False)
    lr = trial.suggest_float("lr", 1e-6, 1e-2, log=True)
    longevity_exponential = trial.suggest_float("longevity_exponential", 1.001, 1.01, log=True)
    step_penalty_multiplier = trial.suggest_float("step_penalty_multiplier", 1, 1.1, log=True)
#     ghost_reward = trial.suggest_int("ghost_reward", -1000, 1000)
    ghost_reward = 0
    dot_extra_reward = trial.suggest_int("dot_extra_reward", 0, 20)
    energy_pill_extra_reward = trial.suggest_int("energy_pill_extra_reward", 0, 100)
    # optimizer = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    optimizer = "SGD"

    # Display true values.
    trial.set_user_attr("gamma_", gamma)
    trial.set_user_attr("n_layers_", n_layers)
    trial.set_user_attr("h_size_", h_size)
    trial.set_user_attr("dropout_", dropout)
    trial.set_user_attr("lr_", lr)
    trial.set_user_attr("longevity_exponential_", longevity_exponential)
    trial.set_user_attr("step_penalty_multiplier_", step_penalty_multiplier)
    trial.set_user_attr("ghost_reward_", ghost_reward)
    trial.set_user_attr("dot_extra_reward_", dot_extra_reward)
    trial.set_user_attr("energy_pill_extra_reward_", energy_pill_extra_reward)
    trial.set_user_attr("optimizer_", optimizer)

    return {
        "gamma": gamma,
        "n_layers": n_layers,
        "h_size": h_size,
        "dropout": dropout,
        "lr": lr,
        "longevity_exponential": longevity_exponential,
        "step_penalty_multiplier": step_penalty_multiplier,
        "ghost_reward": ghost_reward,
        "dot_extra_reward": dot_extra_reward,
        "energy_pill_extra_reward": energy_pill_extra_reward,
        "optimizer": optimizer
    }

In [None]:
# Based off Hugging Face policy gradient code
# Changes by CS 175 project group: 
#   - class inherits nn.Sequential rather than nn.Module
#   - change to constructor method and deletion of explicitly defined forward method
class Policy(nn.Sequential):
  def __init__(self, n_layers, h_size, dropout, s_size, a_size):
    layers = []

    in_features = s_size
    for i in range(n_layers):
      layers.append(nn.Linear(in_features, h_size))
      layers.append(nn.ReLU())
      layers.append(nn.Dropout(dropout))

      in_features = h_size
    layers.append(nn.Linear(in_features, a_size))
    layers.append(nn.LogSoftmax(dim=1))

    super().__init__(*layers)

  def act(self, state):
    state = torch.from_numpy(state).float().unsqueeze(0).to(device)
    probs = self.forward(state).cpu()
    m = Categorical(probs)
    action = m.sample()
    return action.item(), m.log_prob(action)

In [None]:
# Contains policy trainer from Hugging Face policy gradient code
# Changes by CS 175 project group: 
#   - changes to reward for training
#   - ensure changes to reward doesn't affect score output
#   - added Optuna methods to evaluate episodes and prune trials if needed
#   - cut out portions from original code not needed by trainer
def train(trial, policy, optimizer, epoch, max_t, gamma, ghost_reward, step_penalty_multiplier, 
          longevity_exponential=0, dot_extra_reward=0, energy_pill_extra_reward=0):
    for i_epoch in range(epoch + 1):
        saved_log_probs = []
        rewards = []
        state,game_env = env.reset()
        
        # Variables for reward changes
        step_num = 0
        score_adjustments = 0
        rewards_this_life = 0
        step_penalty = 1
        cur_step_penalty = step_penalty

        for t in range(max_t):
            old_game_env = game_env

            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _, game_env = env.step(action)
            
            # Longevity reward. More reward gathered for each life, larger reward
            if old_game_env["lives"] > game_env["lives"]:
                longevity_reward = longevity_exponential ** rewards_this_life
                rewards_this_life = 0
                reward += longevity_reward
                score_adjustments -= longevity_reward
                rewards.append(reward)
                continue
                
            reward_change = 0
            
            # Equal penalty for eating ghost
            if reward // 100 == 2:
              reward_change = reward - 200 + ghost_reward
              score_adjustments += 200 - ghost_reward
            elif reward // 100 == 4:
              reward_change = reward - 400 + ghost_reward
              score_adjustments += 400 - ghost_reward
            elif reward // 100 == 8:
              reward_change = reward - 800 + ghost_reward
              score_adjustments += 800 - ghost_reward
            elif reward // 100 == 16:
              reward_change = reward - 1600 + ghost_reward
              score_adjustments += 1600 - ghost_reward
                
            # Penalty for going many steps without eating dot
            if reward % 100 == 10:
                cur_step_penalty = step_penalty
                reward_change += dot_extra_reward
                score_adjustments -= dot_extra_reward
            elif reward % 100 == 50:
                cur_step_penalty = step_penalty
                reward_change += energy_pill_extra_reward
                score_adjustments -= energy_pill_extra_reward
            else:
                cur_step_penalty *= step_penalty_multiplier
                reward_change -= step_penalty
                score_adjustments += step_penalty
            
            rewards.append(reward + reward_change)

            if done:
                break

        final_score = sum(rewards) + score_adjustments

        returns = deque(maxlen=max_t)
        n_steps = len(rewards)

        # Compute the discounted returns at each timestep,
        # as the sum of the gamma-discounted return at time t (G_t) + the reward at time t

        ## We compute this starting from the last timestep to the first, to avoid redundant computations

        ## appendleft() function of queues appends to the position 0
        ## We use deque instead of lists to reduce the time complexity

        for t in range(n_steps)[::-1]:
          disc_return_t = rewards[t] + gamma * (disc_return_t if t + 1 < n_steps else 0)
          returns.appendleft(disc_return_t)

        ## standardization for training stability
        eps = np.finfo(np.float32).eps.item()

        ## eps is added to the standard deviation of the returns to avoid numerical instabilities
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + eps)

        policy_loss = []
        for log_prob, disc_return in zip(saved_log_probs, returns):
            policy_loss.append(-log_prob * disc_return)
        policy_loss = torch.cat(policy_loss).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        trial.report(final_score, i_epoch)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return final_score

In [None]:
# Based off Optuna simple example code
# Changes by CS 175 project group: 
#   - replaced original policy with policy for Ms Pacman
#   - consolidated training code into separate function (previous code box)
def objective(trial):
    hyperparameters = {**default_hyperparams, **sample_hyperparams(trial)}

    # Generate the model.
    policy = Policy(hyperparameters["n_layers"], hyperparameters["h_size"],
                    hyperparameters["dropout"], hyperparameters["state_space"],
                    hyperparameters["action_space"]).to(device)

    # Generate the optimizers.
    optimizer_name = hyperparameters["optimizer"]
    optimizer = getattr(optim, optimizer_name)(policy.parameters(), lr=hyperparameters["lr"])

    score = train(trial, policy, optimizer, epoch=hyperparameters["epoch"],
                  max_t=hyperparameters["max_t"], gamma=hyperparameters["gamma"],
                  ghost_reward=hyperparameters["ghost_reward"],
                  step_penalty_multiplier=hyperparameters["step_penalty_multiplier"],
                  longevity_exponential=hyperparameters["longevity_exponential"],
                  dot_extra_reward=hyperparameters["dot_extra_reward"],
                  energy_pill_extra_reward=hyperparameters["energy_pill_extra_reward"],
                 )

    return score

In [None]:
# Create an Optuna study
# Study info will be saved at path given to "storage" parameter as .db file
study = optuna.create_study(study_name="MsPacMan_study", storage="sqlite:///MsPacMan_study.db", 
                            direction="maximize", 
                            # Recommend default sampler and pruner for <1000 trials
                            # Comment out following two lines to use default sampler and pruner
                            sampler=optuna.samplers.CmaEsSampler(consider_pruned_trials=False), 
#                             pruner=optuna.pruners.HyperbandPruner()
                           )

In [None]:
# Load saved study
study = optuna.load_study(study_name="MsPacMan_study", storage="sqlite:///MsPacMan_study.db")

In [None]:
# Start Optuna study
# show_progress_bar=True wouldn't work on Jupyter Notebook without installing Google Colab package
# n_jobs: number of parallel jobs
study.optimize(objective, n_trials=None, timeout=None, n_jobs=5, gc_after_trial=True, show_progress_bar=False)

In [21]:
# Recommended hyperparameters from Optuna study
# Exact code from Optuna simple example
pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value:  ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Study statistics: 
  Number of finished trials:  4129
  Number of pruned trials:  3812
  Number of complete trials:  317
Best trial:
  Value:   1760.0
  Params: 
    gamma: 0.9999687436213692
    n_layers: 2
    h_size: 380
    dropout: 0.41350749106933754
    lr: 0.00019517929500375344
    longevity_exponential: 1.007696584564763
    step_penalty_multiplier: 1.0701682456616506
    dot_extra_reward: 14
    energy_pill_extra_reward: 24


In [22]:
# Importance evaluation for each hyperparameter from Optuna study
optuna.visualization.plot_param_importances(study)

# print("Importances:")
# for key, value in optuna.importance.get_param_importances(study).items():
#   print(key, ":", value)