# Hyperparameter Optimizer
References
- _Optuna simple example_ https://github.com/optuna/optuna-examples/blob/main/pytorch/pytorch_simple.py#L71  
- _Optuna RL example_ https://github.com/optuna/optuna-examples/blob/main/rl/sb3_simple.py#L114
- _Hugging Face policy gradient_ https://huggingface.co/learn/deep-rl-course/unit4/hands-on

In [None]:
!pip install gymnasium[atari]
!pip install gymnasium[accept-rom-license]
!pip install tensorflow
!pip install numpy
!pip install torch
!pip install -U scikit-learn
!pip install optuna
!pip install stable-baselines3[extra]
!pip install cmaes

In [1]:
import os
import gymnasium as gym
from collections import deque
from typing import Any
from typing import Dict

import numpy as np

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import optuna
from optuna.trial import TrialState

# Load environment
env = gym.make("ALE/MsPacman-ram-v5")

In [2]:
# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(50);

In [3]:
default_hyperparams = {
    "epoch": 100,  # max number of episodes per optimization trial
#     "n_training_episodes": 5000,  TODO: Delete
    "max_t": 5000,  # max number of steps per trial
#     "env_id": "ALE/MsPacman-ram-v5",  TODO: Delete
    "state_space": 128,  # RAM data for Atari console during game
    "action_space": 5,  # No-op, up, right, left, down
}

In [4]:
# Based off Optuna RL example code
# Changes by CS 175 project group: hyperparameters being sampled
def sample_hyperparams(trial: optuna.Trial) -> Dict[str, Any]:
    """Sampler for hyperparameters."""
    gamma = trial.suggest_float("gamma", 0.99, 1, log=True)
    n_layers = trial.suggest_int("n_layers", 1, 10)
    h_size = trial.suggest_int("h_size", 4, 1024)
    dropout = trial.suggest_float("dropout", 0.0, 0.7, log=False)
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    step_reward = trial.suggest_float("step_reward", -10, 0, log=False)
    step_reward_multiplier = trial.suggest_float("step_reward_multiplier", 1, 1.1, log=True)
    ghost_reward = trial.suggest_int("ghost_reward", -1000, 1000)
    # optimizer = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    optimizer = "SGD"

    # Display true values.
    trial.set_user_attr("gamma_", gamma)
    trial.set_user_attr("n_layers_", n_layers)
    trial.set_user_attr("h_size_", h_size)
    trial.set_user_attr("dropout_", dropout)
    trial.set_user_attr("lr_", lr)
    trial.set_user_attr("optimizer_", optimizer)
    trial.set_user_attr("step_reward_", step_reward)
    trial.set_user_attr("step_reward_multiplier", step_reward_multiplier)
    trial.set_user_attr("ghost_reward_", ghost_reward)

    return {
        "n_layers": n_layers,
        "h_size": h_size,
        "dropout": dropout,
        "gamma": gamma,
        "lr": lr,
        "step_reward": step_reward,
        "step_reward_multiplier": step_reward_multiplier,
        "ghost_reward": ghost_reward,
        "optimizer": optimizer
    }

In [5]:
# Based off Hugging Face policy gradient code
# Changes by CS 175 project group: 
#   - class inherits nn.Sequential rather than nn.Module
#   - change to constructor method and deletion of explicitly defined forward method
class Policy(nn.Sequential):
  def __init__(self, n_layers, h_size, dropout, s_size, a_size):
    layers = []

    in_features = s_size
    for i in range(n_layers):
      layers.append(nn.Linear(in_features, h_size))
      layers.append(nn.ReLU())
      layers.append(nn.Dropout(dropout))

      in_features = h_size
    layers.append(nn.Linear(in_features, a_size))
    layers.append(nn.LogSoftmax(dim=1))

    super().__init__(*layers)

  def act(self, state):
    state = torch.from_numpy(state).float().unsqueeze(0).to(device)
    probs = self.forward(state).cpu()
    m = Categorical(probs)
    action = m.sample()
    return action.item(), m.log_prob(action)

In [6]:
# Contains policy trainer from Hugging Face policy gradient code
# Changes by CS 175 project group: 
#   - changes to reward for training
#   - ensure changes to reward doesn't affect score output
#   - added Optuna methods to evaluate episodes and prune trials if needed
#   - cut out portions from original code not needed by trainer
def train(trial, policy, optimizer, epoch, max_t, gamma, step_reward, ghost_reward, step_reward_multiplier):
    for i_epoch in range(epoch):
        saved_log_probs = []
        rewards = []
        state,game_env = env.reset()
        
        step_num = 0
        score_adjustments = 0
        cur_step_reward = step_reward

        for t in range(max_t):
            old_game_env = game_env

            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _, game_env = env.step(action)
            
            # Equal reward for eating ghost
            if reward // 100 == 2:
              reward = reward - 200 + ghost_reward
              score_adjustments += 200 - ghost_reward
            elif reward // 100 == 4:
              reward = reward - 400 + ghost_reward
              score_adjustments += 400 - ghost_reward
            elif reward // 100 == 8:
              reward = reward - 800 + ghost_reward
              score_adjustments += 800 - ghost_reward
            elif reward // 100 == 16:
              reward = reward - 1600 + ghost_reward
              score_adjustments += 1600 - ghost_reward
                
            if reward % 100 == 10:
                cur_step_reward = step_reward
            else:
                cur_step_reward *= step_reward_multiplier
                
            reward += step_reward
            score_adjustments -= step_reward
            
            rewards.append(reward)

            if done:
                break

        final_score = sum(rewards) + score_adjustments

        returns = deque(maxlen=max_t)
        n_steps = len(rewards)

        # Compute the discounted returns at each timestep,
        # as the sum of the gamma-discounted return at time t (G_t) + the reward at time t

        ## We compute this starting from the last timestep to the first, to avoid redundant computations

        ## appendleft() function of queues appends to the position 0
        ## We use deque instead of lists to reduce the time complexity

        for t in range(n_steps)[::-1]:
          disc_return_t = rewards[t] + gamma * (disc_return_t if t + 1 < n_steps else 0)
          returns.appendleft(disc_return_t)

        ## standardization for training stability
        eps = np.finfo(np.float32).eps.item()

        ## eps is added to the standard deviation of the returns to avoid numerical instabilities
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + eps)

        policy_loss = []
        for log_prob, disc_return in zip(saved_log_probs, returns):
            policy_loss.append(-log_prob * disc_return)
        policy_loss = torch.cat(policy_loss).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        trial.report(final_score, i_epoch)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return final_score

In [7]:
# Based off Optuna simple example code
# Changes by CS 175 project group: 
#   - replaced original policy with policy for Ms Pacman
#   - consolidated training code into separate function (previous code box)
def objective(trial):
    hyperparameters = {**default_hyperparams, **sample_hyperparams(trial)}

    # Generate the model.
    policy = Policy(hyperparameters["n_layers"], hyperparameters["h_size"],
                    hyperparameters["dropout"], hyperparameters["state_space"],
                    hyperparameters["action_space"]).to(device)

    # Generate the optimizers.
    optimizer_name = hyperparameters["optimizer"]
    optimizer = getattr(optim, optimizer_name)(policy.parameters(), lr=hyperparameters["lr"])

    score = train(trial, policy, optimizer, hyperparameters["epoch"],
                  hyperparameters["max_t"], hyperparameters["gamma"],
                  hyperparameters["step_reward"], hyperparameters["ghost_reward"],
                  hyperparameters["step_reward_multiplier"])

    return score

In [8]:
# Create an Optuna study
# Study info will be saved at path given to "storage" parameter as .db file
study = optuna.create_study(study_name="MsPacMan_study_1201_1643", storage="sqlite:///MsPacMan_study_1201_1643.db", 
                            direction="maximize", 
                            # Recommend default sampler and pruner for <1000 trials
                            # Comment out following two lines to use default sampler and pruner
                            sampler=optuna.samplers.CmaEsSampler(consider_pruned_trials=False), 
#                             pruner=optuna.pruners.HyperbandPruner()
                           )

[I 2023-12-01 19:07:54,633] A new study created in RDB with name: MsPacMan_study_1201_1643


In [None]:
# # Load saved study
# study = optuna.load_study(MsPacMan_study="test", storage="sqlite:///MsPacMan_study.db")

In [None]:
# Start Optuna study
# show_progress_bar=True wouldn't work on Jupyter Notebook without installing Google Colab package
# n_jobs: number of parallel jobs
study.optimize(objective, n_trials=None, timeout=3600, n_jobs=5, gc_after_trial=True, show_progress_bar=False)

[I 2023-12-01 19:13:23,122] Trial 0 finished with value: 190.00000000000261 and parameters: {'gamma': 0.9963613921555072, 'n_layers': 10, 'h_size': 798, 'dropout': 0.23246628577821934, 'lr': 2.1691046430021075e-05, 'step_reward': -3.3678081050894146, 'step_reward_multiplier': 1.040410122340053, 'ghost_reward': -38}. Best is trial 0 with value: 190.00000000000261.
[I 2023-12-01 19:13:23,749] Trial 1 finished with value: 49.99999999999986 and parameters: {'gamma': 0.993667637112442, 'n_layers': 10, 'h_size': 1020, 'dropout': 0.6944159830209767, 'lr': 0.00033806255918187233, 'step_reward': -0.1146722468481709, 'step_reward_multiplier': 1.0125097420611682, 'ghost_reward': -10}. Best is trial 0 with value: 190.00000000000261.
[I 2023-12-01 19:13:25,115] Trial 3 finished with value: 59.99999999999966 and parameters: {'gamma': 0.9910252416242034, 'n_layers': 4, 'h_size': 518, 'dropout': 0.029878411801336446, 'lr': 0.03980724305105555, 'step_reward': -7.17671484634767, 'step_reward_multiplier'

[I 2023-12-01 19:27:55,937] Trial 94 pruned. 
[I 2023-12-01 19:28:00,331] Trial 95 pruned. 
[I 2023-12-01 19:28:04,772] Trial 96 pruned. 
[I 2023-12-01 19:28:09,413] Trial 97 pruned. 
[I 2023-12-01 19:28:17,605] Trial 98 pruned. 
[I 2023-12-01 19:28:21,471] Trial 99 pruned. 
[I 2023-12-01 19:28:29,683] Trial 100 pruned. 
[I 2023-12-01 19:28:44,542] Trial 101 pruned. 
[I 2023-12-01 19:28:48,089] Trial 102 pruned. 
[I 2023-12-01 19:28:52,147] Trial 103 pruned. 
[I 2023-12-01 19:28:56,760] Trial 104 pruned. 
[I 2023-12-01 19:29:01,126] Trial 105 pruned. 
[I 2023-12-01 19:29:05,267] Trial 106 pruned. 
[I 2023-12-01 19:29:09,431] Trial 107 pruned. 
[I 2023-12-01 19:29:13,504] Trial 108 pruned. 
[I 2023-12-01 19:29:18,302] Trial 109 pruned. 
[I 2023-12-01 19:29:26,876] Trial 110 pruned. 
[I 2023-12-01 19:29:31,125] Trial 111 pruned. 
[I 2023-12-01 19:29:35,214] Trial 46 finished with value: 39.999999999999545 and parameters: {'gamma': 0.9949977093602713, 'n_layers': 4, 'h_size': 466, 'dropou

[I 2023-12-01 19:41:50,005] Trial 196 finished with value: 50.00000000000034 and parameters: {'gamma': 0.9973897694026614, 'n_layers': 5, 'h_size': 829, 'dropout': 0.3687570309245692, 'lr': 0.004454036644410375, 'step_reward': -6.019554824854563, 'step_reward_multiplier': 1.0556393912948214, 'ghost_reward': -519}. Best is trial 5 with value: 709.99999999997.
[I 2023-12-01 19:41:50,664] Trial 202 pruned. 
[I 2023-12-01 19:41:53,942] Trial 203 pruned. 
[I 2023-12-01 19:41:54,702] Trial 204 pruned. 
[I 2023-12-01 19:41:58,587] Trial 205 pruned. 
[I 2023-12-01 19:41:59,253] Trial 206 pruned. 
[I 2023-12-01 19:42:02,976] Trial 207 pruned. 
[I 2023-12-01 19:42:03,722] Trial 208 pruned. 
[I 2023-12-01 19:42:07,244] Trial 209 pruned. 
[I 2023-12-01 19:42:08,042] Trial 210 pruned. 
[I 2023-12-01 19:42:12,421] Trial 212 pruned. 
[I 2023-12-01 19:42:14,617] Trial 197 finished with value: 230.0 and parameters: {'gamma': 0.996578809080259, 'n_layers': 5, 'h_size': 435, 'dropout': 0.4474248690076819

In [None]:
# Recommended hyperparameters from Optuna study
# Exact code from Optuna simple example
pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value:  ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
# Importance evaluation for each hyperparameter from Optuna study
print("Importances:")
for key, value in optuna.importance.get_param_importances(study).items():
  print(key, ":", value)