In [8]:
from gymnasium import Env
from gymnasium.spaces import MultiBinary, Box
import numpy as np
import cv2
from matplotlib import pyplot as plt
import retro
import time
import torch

# Report how many CUDA devices PyTorch can see (the training cells below target 'cuda:0').
torch.cuda.device_count()

1

In [9]:
def run(env, n_games=1):
    """Play episodes with random actions, rendering each frame.

    Every game starts from a fresh ``env.reset()`` and ends as soon as the
    environment reports either ``terminated`` or ``truncated``.  Non-zero
    rewards are printed as they occur.

    Args:
        env: Environment exposing the Gymnasium-style API used here:
            ``reset() -> (obs, info)``, ``render()``,
            ``step(action) -> (obs, reward, terminated, truncated, info)``
            and ``action_space.sample()``.
        n_games: Number of episodes to play (defaults to a single game).
    """
    for _ in range(n_games):
        # Reset per game so each episode starts with cleared done-flags.
        obs, info = env.reset()
        terminated = truncated = False

        # Stop on truncation too; otherwise a time-limit ending never exits.
        while not (terminated or truncated):
            env.render()
            obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
            if reward != 0:
                print(reward)

In [10]:
class StreetFighter(Env):
    """Gymnasium wrapper around the stable-retro Street Fighter II game.

    Observations are 84x84 single-channel ``uint8`` images.  ``reset`` returns
    the preprocessed first frame; ``step`` returns the difference between the
    current and the previous preprocessed frame.  The reward is the per-step
    change of a shaped score built from the emulator's ``info`` dictionary
    (score, health difference and matches-won difference).
    """

    def __init__(self):
        super().__init__()
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)

        self.game = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis',
                               use_restricted_actions=retro.Actions.FILTERED)

        # Per-episode state, defined up front so the attributes always exist;
        # both are overwritten by reset().
        self.previous_frame = np.zeros((84, 84, 1), dtype=np.uint8)
        self.score = 0

    def reset(self, seed=None, options=None):
        """Restart the game and return ``(first_frame, info)``.

        Args:
            seed: Forwarded to the base ``Env.reset`` and to the emulator.
            options: Accepted for Gymnasium API compatibility; not used.
        """
        # Gymnasium convention: let the base class seed ``self.np_random``.
        super().reset(seed=seed)
        obs, info = self.game.reset(seed=seed)

        obs = self.preprocess(obs)
        self.previous_frame = obs

        # The shaped score is tracked from zero at the start of every episode.
        self.score = 0

        return obs, info

    def step(self, action):
        """Advance one emulator step.

        Returns:
            ``(frame_delta, reward, terminated, truncated, info)`` where
            ``frame_delta`` is the current preprocessed frame minus the
            previous one and ``reward`` is the change in shaped score.
        """
        obs, reward, terminated, truncated, info = self.game.step(action)

        obs = self.preprocess(obs)

        # NOTE(review): both frames are uint8, so this subtraction wraps
        # modulo 256 (a pixel that gets darker by 1 becomes 255).  Left as-is
        # because changing it alters the observations fed to trained models.
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs

        # The emulator's own reward is replaced by a shaped score:
        # game score + health advantage + 1000 per match of lead.
        score = (
            info["score"]
            + (info["health"] - info["enemy_health"])
            + (info["matches_won"] - info["enemy_matches_won"]) * 1000
        )
        reward = score - self.score
        self.score = score

        return frame_delta, reward, terminated, truncated, info

    def render(self, *args, **kwargs):
        """Delegate to the emulator's ``render`` and return whatever it returns."""
        return self.game.render(*args, **kwargs)

    def preprocess(self, img):
        """Convert a colour frame to an 84x84x1 grayscale array."""
        # NOTE(review): the emulator presumably yields RGB frames, in which
        # case COLOR_RGB2GRAY would be the matching flag -- confirm before
        # changing, since it alters the grayscale values.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_CUBIC)
        channels = np.reshape(resize, (84, 84, 1))
        return channels

    def seed(self, seed):
        """No-op kept for callers that still invoke ``env.seed``."""
        pass

    def close(self):
        """Shut down the underlying emulator."""
        self.game.close()
    

In [11]:
import os
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack, SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env

In [12]:
def optimize_agent(trial):
    """Optuna objective: train PPO with the trial's hyperparameters and score it.

    Builds ``n_cpus`` StreetFighter environments in subprocesses, stacks 4
    frames, trains a ``CnnPolicy`` PPO model for 30000 timesteps, evaluates it
    over 5 episodes and saves the model under ``OPT_DIR``.

    Args:
        trial: Optuna trial; passed to ``optimize_ppo`` to obtain the PPO
            keyword arguments and used (``trial.number``) to name outputs.

    Returns:
        The mean evaluation reward, or ``-1000`` if anything raised.
    """
    env = None
    try:
        n_cpus = 4
        model_params = optimize_ppo(trial, n_cpus)

        os.makedirs(LOG_DIR, exist_ok=True)

        def make_env(rank):
            # Each worker gets its own Monitor file; passing the bare LOG_DIR
            # to every worker makes them all write to the same monitor file.
            monitor_path = os.path.join(LOG_DIR, 'trial_{}_env_{}'.format(trial.number, rank))
            return lambda: Monitor(StreetFighter(), monitor_path)

        # https://github.com/Farama-Foundation/stable-retro/blob/master/retro/examples/ppo.py
        env = SubprocVecEnv([make_env(rank) for rank in range(n_cpus)])
        env = VecFrameStack(env, 4, channels_order='last')

        # https://stable-baselines3.readthedocs.io/en/master/guide/custom_policy.html
        # Fall back to CPU so a machine without CUDA does not turn every
        # trial into a swallowed exception.
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, device=device, **model_params)
        model.learn(total_timesteps=30000)

        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward
    except Exception as e:
        # Best-effort: a failed trial gets a very low score instead of
        # aborting the study.  repr() keeps the exception type visible.
        print('Trial {} failed: {!r}'.format(trial.number, e))
        return -1000
    finally:
        # Single close point; runs on success and on failure alike.
        if env is not None:
            env.close()

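[Parameter Policies](https://stable-baselines.readthedocs.io/en/master/modules/policies.html) (this list comes from the original Stable Baselines documentation; Stable-Baselines3's `PPO`, which this notebook uses, provides only `MlpPolicy`, `CnnPolicy`, and `MultiInputPolicy`):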

* CnnPolicy: Policy object that implements actor-critic, using a CNN (the Nature CNN)
* CnnLstmPolicy: Policy object that implements actor-critic, using LSTMs with CNN feature extraction
* CnnLnLstmPolicy: Policy object that implements actor-critic, using layer-normalized LSTMs with CNN feature extraction
* MlpPolicy: Policy object that implements actor-critic, using an MLP (2 layers of 64)
* MlpLstmPolicy: Policy object that implements actor-critic, using LSTMs with MLP feature extraction
* MlpLnLstmPolicy: Policy object that implements actor-critic, using layer-normalized LSTMs with MLP feature extraction
    

In [13]:
# Output folders: Monitor / TensorBoard logs and saved models respectively.
LOG_DIR, OPT_DIR = './logs/', './opt/'

# Grid-search axes consumed by the hyperparameter loop below.
n_policy = ['CnnPolicy', 'MlpPolicy']
n_learning_rate = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
n_batches = [2 ** exponent for exponent in range(4, 9)]  # 16 ... 256
n_epochs = [50, 100, 500, 1000, 5000, 10000]

# Fixed settings: PPO minibatch size (n_steps = batch_size * batches) and
# the number of environment subprocesses.
batch_size = 64
n_procs = 4

## Hyperparameter Testing: Policy, Learning Rate, Batches (Steps), and Epochs

In [None]:
# Exhaustive grid search over policy, learning rate, rollout length and epochs.
# Each row appended to `params` is
# [policy, learning_rate, batches, epochs, mean_reward].
params = []

# Loop-invariant setup, hoisted out of the grid.
os.makedirs(LOG_DIR, exist_ok=True)
# https://stable-baselines3.readthedocs.io/en/master/guide/custom_policy.html
# Fall back to CPU when no CUDA device is available.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


def _monitored_env(rank):
    """Return a factory for one Monitor-wrapped StreetFighter env with its own log file."""
    monitor_path = os.path.join(LOG_DIR, 'grid_env_{}'.format(rank))
    return lambda: Monitor(StreetFighter(), monitor_path)


for policy in n_policy:
    for learning_rate in n_learning_rate:
        for batches in n_batches:
            for epochs in n_epochs:
                start_time = time.time()
                # One Monitor file per worker so the subprocesses do not
                # write to the same file.
                env = SubprocVecEnv([_monitored_env(proc) for proc in range(n_procs)])
                try:
                    env = VecFrameStack(env, 4, channels_order='last')
                    n_steps = batch_size * batches
                    model = PPO(policy=policy, env=env, learning_rate=learning_rate,
                                n_steps=n_steps, batch_size=batch_size, n_epochs=epochs,
                                tensorboard_log=LOG_DIR, verbose=0, device=device)
                    model.learn(total_timesteps=25000)

                    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
                finally:
                    # Always shut the worker processes down, even if training fails.
                    env.close()

                hyper_ps = [policy, learning_rate, batches, epochs, mean_reward]
                params.append(hyper_ps)
                # NOTE(review): the file name embeds the list repr (brackets,
                # quotes, spaces); kept unchanged so existing saves still match.
                SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(hyper_ps))
                model.save(SAVE_PATH)
                elapsed_time = time.time() - start_time

                print(f'finished architecture {hyper_ps} at {elapsed_time} seconds.')

finished architecture ['CnnPolicy', 0.1, 16, 50, -2000.0] at 524.5853402614594 seconds.
finished architecture ['CnnPolicy', 0.1, 16, 100, -2000.0] at 645.3203458786011 seconds.


In [None]:
# Display the accumulated [policy, learning_rate, batches, epochs, mean_reward] rows.
params

## Hyperparameter Testing: Gamma, GAE Lambda, and Clip Range