In [16]:
import retro
import time
import torch

torch.cuda.device_count()

1

In [17]:
#env = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis')
#env.close()

In [18]:
#env.action_space.sample()

In [19]:
def run(env, n_games=1):
    """Play episodes with uniformly random actions, rendering every frame.

    Args:
        env: Environment exposing the interface used below: ``reset()``
            returning ``(obs, info)``, ``step(action)`` returning
            ``(obs, reward, terminated, truncated, info)``, ``render()`` and
            ``action_space.sample()``.
        n_games: Number of episodes to play. Defaults to 1.

    Every non-zero reward is printed as soon as it is received.
    """
    for _ in range(n_games):
        # Reset once per game, before the first step. The previous in-loop
        # reset was guarded by `if terminated` inside `while not terminated`
        # and therefore could never run.
        env.reset()
        done = False
        while not done:
            env.render()
            _, reward, terminated, truncated, _ = env.step(env.action_space.sample())
            if reward != 0:
                print(reward)
            # An episode also ends when it is truncated; checking only
            # `terminated` would keep stepping a finished episode forever.
            done = terminated or truncated

In [20]:
from gymnasium import Env
from gymnasium.spaces import MultiBinary, Box
import numpy as np
import cv2
from matplotlib import pyplot as plt

In [21]:
class StreetFighter(Env):
    """Gymnasium environment wrapping the stable-retro Street Fighter II game.

    Observations are declared as 84x84 single-channel uint8 images. ``reset``
    returns the preprocessed first frame; ``step`` returns the difference
    between the current and the previous preprocessed frame. The reward is the
    per-step change of a shaped score computed from ``info`` (see ``step``).
    """

    def __init__(self):
        """Declare the observation/action spaces and create the retro game."""
        super().__init__()
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)

        self.game = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis',
                               use_restricted_actions=retro.Actions.FILTERED)

    def reset(self, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: Forwarded to the underlying game's ``reset``.
            options: Accepted so that callers using the Gymnasium
                ``reset(seed=..., options=...)`` signature do not fail;
                currently ignored.

        Returns:
            ``(obs, info)`` where ``obs`` is the preprocessed first frame.
        """
        obs, info = self.game.reset(seed=seed)

        obs = self.preprocess(obs)
        # Baseline for the frame delta computed in the first step().
        self.previous_frame = obs

        # Shaped score of the previous step; rewards are differences of it.
        self.score = 0

        return obs, info

    def step(self, action):
        """Advance the game by one step.

        Returns:
            ``(frame_delta, reward, terminated, truncated, info)``.
            ``frame_delta`` is the current preprocessed frame minus the
            previous one. ``reward`` is the change since the last step of
            ``score + (health - enemy_health)
            + 1000 * (matches_won - enemy_matches_won)``, all read from
            ``info``; the reward returned by the underlying game is discarded.
        """
        obs, reward, terminated, truncated, info = self.game.step(action)

        obs = self.preprocess(obs)

        # NOTE(review): if the frames are uint8 (as observation_space
        # declares), this subtraction wraps modulo 256 instead of going
        # negative - confirm that this is intended.
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs

        # plain score-based reward already implemented in default stable-retro config
        # modify to include health
        # score = info["score"]
        score = (
            info["score"]
            + (info["health"] - info["enemy_health"])
            + (info["matches_won"] - info["enemy_matches_won"]) * 1000
        )
        reward = score - self.score
        self.score = score

        return frame_delta, reward, terminated, truncated, info

    def render(self, *args, **kwargs):
        """Render via the underlying game and pass its return value through."""
        return self.game.render(*args, **kwargs)

    def preprocess(self, img):
        """Convert a colour frame to an 84x84x1 grayscale array."""
        # NOTE(review): BGR2GRAY assumes BGR channel order; confirm the
        # channel order of the frames the game produces.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_CUBIC)
        # Add the trailing channel axis required by observation_space.
        channels = np.reshape(resize, (84, 84, 1))
        return channels

    def seed(self, seed):
        """No-op; seeding is done through ``reset(seed=...)``."""
        pass

    def close(self):
        """Close the underlying game."""
        self.game.close()
    

In [22]:
env = StreetFighter()

In [23]:
env.action_space.sample()

array([1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1], dtype=int8)

In [24]:
#run(env)

In [25]:
env.close()

# Hyperparameter tuning

In [26]:
import os

import optuna

from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack, SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env

In [27]:
# Directory used for Monitor and TensorBoard logs.
LOG_DIR = './logs/'
# Directory under which the model of each Optuna trial is saved.
OPT_DIR = './opt/'

In [28]:
def optimize_ppo(trial, n_cpus):
    """Sample one set of PPO hyperparameters from an Optuna-style ``trial``.

    The rollout length ``n_steps`` is drawn as a whole number of minibatches
    (32 to 128 of them, 64 samples each), so it is always a multiple of
    ``batch_size``.

    Args:
        trial: Object providing ``suggest_int``, ``suggest_float`` and
            ``suggest_categorical``.
        n_cpus: Accepted but not used by the sampling below.

    Returns:
        dict with the keys ``policy``, ``batch_size``, ``n_steps``, ``gamma``,
        ``learning_rate``, ``clip_range`` and ``gae_lambda``.
    """
    minibatch_size = 64
    # Drawn before every other parameter, as the trial sees the suggestions
    # in call order.
    minibatch_count = trial.suggest_int('n_batches', 32, 128)

    params = {}
    params['policy'] = trial.suggest_categorical('policy', ['CnnPolicy', 'MlpPolicy'])
    params['batch_size'] = minibatch_size
    params['n_steps'] = minibatch_count * minibatch_size
    params['gamma'] = trial.suggest_float('gamma', 0.8, 0.9999, log=True)
    params['learning_rate'] = trial.suggest_float('learning_rate', 1e-5, 1e-4, log=True)
    # Recorded in the trial under the name 'clip', returned as 'clip_range'.
    params['clip_range'] = trial.suggest_float('clip', 0.1, 0.4)
    params['gae_lambda'] = trial.suggest_float('gae_lambda', 0.8, 0.99)
    return params

In [29]:
# https://stable-baselines3.readthedocs.io/en/master/guide/examples.html#multiprocessing-unleashing-the-power-of-vectorized-environments
def optimize_agent(trial):
    """Optuna objective: train PPO with sampled hyperparameters and score it.

    Builds four ``StreetFighter`` environments in subprocesses, stacks four
    frames, trains PPO for 30000 timesteps, evaluates it for five episodes on
    the same vectorized environment and saves the model under ``OPT_DIR``.

    Args:
        trial: The Optuna trial; used for hyperparameter sampling and for
            ``trial.number`` in the log and model file names.

    Returns:
        The mean evaluation reward, or -1000 if any ``Exception`` was raised
        (the exception is printed, not re-raised, so the study continues).
    """
    env = None
    try:
        n_cpus = 4
        model_params = optimize_ppo(trial, n_cpus)

        def make_env(rank):
            # Give every worker its own Monitor file. Passing the bare
            # LOG_DIR to each Monitor makes all subprocesses write to the
            # same file.
            monitor_path = os.path.join(
                LOG_DIR, 'trial_{}_env_{}'.format(trial.number, rank))
            return lambda: Monitor(StreetFighter(), monitor_path)

        # https://github.com/Farama-Foundation/stable-retro/blob/master/retro/examples/ppo.py
        env = SubprocVecEnv([make_env(rank) for rank in range(n_cpus)])
        env = VecFrameStack(env, 4, channels_order='last')

        # https://stable-baselines3.readthedocs.io/en/master/guide/custom_policy.html
        model = PPO(env=env, tensorboard_log=LOG_DIR, verbose=0, **model_params, device='cuda')
        model.learn(total_timesteps=30000)

        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward
    except Exception as e:
        print(e)
        return -1000
    finally:
        # Single place where the worker processes are shut down; it runs on
        # success, on failure and on KeyboardInterrupt alike.
        if env is not None:
            env.close()
    

In [30]:
# Maximise the value returned by optimize_agent over 100 trials, run one at a
# time (n_jobs=1). The study is created without a storage argument.
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=100, n_jobs=1)

[I 2024-04-20 15:14:32,072] A new study created in memory with name: no-name-b9dc71df-f83c-4c90-9359-e91db739237b
Process ForkServerProcess-27:
Process ForkServerProcess-26:
Process ForkServerProcess-28:
Process ForkServerProcess-25:
Process ForkServerProcess-23:
Process ForkServerProcess-22:
Process ForkServerProcess-21:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ulkryxx/.local/lib/python3.10/site-packages/stable_baselines3/common/vec_env/subproc_vec_env.py", line 33, in _worker
    cmd, data = remote.recv()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 414, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.10/multiprocessing/con

KeyboardInterrupt: 