# Setup StreetFighter

In [9]:
import retro
import time
import matplotlib.pyplot as plt
from tqdm import tqdm
from itertools import cycle

In [2]:
# see all the available game
retro.data.list_games()

['1942-Nes',
 '1943-Nes',
 '3NinjasKickBack-Genesis',
 '8Eyes-Nes',
 'AaahhRealMonsters-Genesis',
 'AbadoxTheDeadlyInnerWar-Nes',
 'AcceleBrid-Snes',
 'ActRaiser2-Snes',
 'ActionPachio-Snes',
 'AddamsFamily-GameBoy',
 'AddamsFamily-Genesis',
 'AddamsFamily-Nes',
 'AddamsFamily-Sms',
 'AddamsFamily-Snes',
 'AddamsFamilyPugsleysScavengerHunt-Nes',
 'AddamsFamilyPugsleysScavengerHunt-Snes',
 'AdvancedBusterhawkGleylancer-Genesis',
 'Adventure-Atari2600',
 'AdventureIsland-GameBoy',
 'AdventureIsland3-Nes',
 'AdventureIslandII-Nes',
 'AdventuresOfBatmanAndRobin-Genesis',
 'AdventuresOfBayouBilly-Nes',
 'AdventuresOfDinoRiki-Nes',
 'AdventuresOfDrFranken-Snes',
 'AdventuresOfKidKleets-Snes',
 'AdventuresOfMightyMax-Genesis',
 'AdventuresOfMightyMax-Snes',
 'AdventuresOfRockyAndBullwinkleAndFriends-Genesis',
 'AdventuresOfRockyAndBullwinkleAndFriends-Nes',
 'AdventuresOfRockyAndBullwinkleAndFriends-Snes',
 'AdventuresOfStarSaver-GameBoy',
 'AdventuresOfYogiBear-Snes',
 'AeroFighters-Snes',
 

In [None]:
# python -m retro.import, use this in command in the rom folder

In [15]:
# use this command because it cant run parallel and will cause an error
env.close() 

In [None]:
# sample the observation space
env.observation_space.sample()


In [None]:
# sample the actions that are available - Multibinary
env.action_space.sample()

In [15]:
# reset game to starting state
obs = env.reset()
# flag indicating whether died/finished or not
done = False


# how many games to play
for game in range(1):
    # until not dead/finished
    while not done:
        if done:
            obs = env.reset()
        env.render()
        # while not done, render our game, and add information to env.step
        obs, reward, done, info = env.step(env.action_space.sample())


# Setup Environment

In [10]:
from gym import Env

# import for shape changer
from gym.spaces import MultiBinary, Box

# import numpy for frame delta
import numpy as np

# import cv for gray-scale
import cv2

from matplotlib import pyplot as plt

# create custom environment
class StreetFighter(Env):
    def __init__(self):
        super().__init__()
        # reduce the observation space to smaller pixel
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        self.game = retro.make(
            game="StreetFighterIISpecialChampionEdition-Genesis",
            use_restricted_actions=retro.Actions.FILTERED, # only does actions that are within the game button
            state = "ryuvsryu" # initial state is ryu vs ryu
        )    
        
    def reset(self):
        # return first frame
        obs = self.game.reset()
        obs = self.preprocess(obs)
        self.enemy_health = 176
        self.health = 176
        self.left_over_enemy_health = 0
        self.left_over_health = 0
        self.consecutive_attacks = 0
        self.enemy_consecutive_attacks = 0
        return obs

    def step(self, action):
        # passes unprocessed obs, reward and takes a step
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs)

        # reward for damage given, penalty for damage taken
        # extra reward for consecutive attacks, extra penalty for consecutive enemy attack
        # health starts at 176, if dead, goes to -1, then 0, then 176
        if info["enemy_health"] == 176 and info["health"] == 176:
            self.enemy_health = 176
            self.health = 176
            self.left_over_enemy_health = 0
            self.left_over_health = 0
            self.consecutive_attacks = 0
            self.enemy_consecutive_attacks = 0

        elif (info["enemy_health"] > 0 and info["health"] > 0) and (
            info["enemy_health"] < 176 or info["health"] < 176
        ):
            enemy_health_reward = self.enemy_health - info["enemy_health"]
            health_penalty = self.health - info["health"]
            if enemy_health_reward - health_penalty > 0:
                self.consecutive_attacks += 1
                self.enemy_consecutive_attacks = 0
                if self.consecutive_attacks > 1:
                    reward = (
                        enemy_health_reward
                        - health_penalty
                        + (
                            5
                            * (
                                self.consecutive_attacks
                                - self.enemy_consecutive_attacks
                            )
                        )
                    )
                else:
                    reward = enemy_health_reward - health_penalty

            elif enemy_health_reward - health_penalty < 0:
                self.consecutive_attacks = 0
                self.enemy_consecutive_attacks += 1
                if self.enemy_consecutive_attacks > 1:
                    reward = (
                        enemy_health_reward
                        - health_penalty
                        + (
                            5
                            * (
                                self.consecutive_attacks
                                - self.enemy_consecutive_attacks
                            )
                        )
                    )
                else:
                    reward = enemy_health_reward - health_penalty

            else:
                reward = enemy_health_reward - health_penalty

            self.enemy_health = info["enemy_health"]
            self.health = info["health"]

        elif info["enemy_health"] < 0:
            left_over_health = info["health"] - self.left_over_health
            enemy_health_reward = self.enemy_health - info["enemy_health"]
            self.consecutive_attacks += 1
            self.enemy_consecutive_attacks = 0
            if self.consecutive_attacks > 1:
                reward = (
                    left_over_health
                    + enemy_health_reward
                    + (5 * (self.consecutive_attacks - self.enemy_consecutive_attacks))
                )
            else:
                reward = left_over_health + enemy_health_reward
            self.enemy_health = info["enemy_health"]
            self.left_over_health = info["health"]
            self.consecutive_attacks = 0
            self.enemy_consecutive_attacks = 0

        elif info["health"] < 0:
            left_over_enemy_health = info["enemy_health"] - self.left_over_enemy_health
            health_penalty = self.health - info["health"]
            self.consecutive_attacks = 0
            self.enemy_consecutive_attacks += 1
            if self.enemy_consecutive_attacks > 1:
                reward = -(
                    left_over_enemy_health
                    + health_penalty
                    + (5 * (self.enemy_consecutive_attacks - self.consecutive_attacks))
                )
            else:
                reward = -(left_over_enemy_health + health_penalty)
            self.health = info["health"]
            self.left_over_enemy_health = info["enemy_health"]
            self.consecutive_attacks = 0
            self.enemy_consecutive_attacks = 0

        else:
            reward = 0

        self.enemy_health = info["enemy_health"]
        self.health = info["health"]

        return obs, reward, done, info

    def preprocess(self, observation):
        # gray-scaling
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        # resize
        resize = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_CUBIC)
        channels = np.reshape(resize, (84, 84, 1))
        return channels

    def render(self, *arg, **kwargs):
        self.game.render()

    def close(self):
        self.game.close()

In [4]:
env.close()

In [3]:
env = StreetFighter()


# Hyperparameter Tuning

In [11]:
# Import optuna for HPO optimization frame
import optuna
# Import PPO for algos
from stable_baselines3 import PPO, A2C
# Evaluate Policy
from stable_baselines3.common.evaluation import evaluate_policy
# Import the sb3 monitor for logging to access rewards
from stable_baselines3.common.monitor import Monitor
# import wrappers
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
import os


In [12]:
# save model and reload without retraining
LOG_DIR = './logs/'
OPT_DIR = './opt/'


In [5]:
# function to return hyperparameters - define objective function
# ppo hyperparameters to tune
# n_steps = batch size (frame) factor of 64
# gamma = discount rate for calculating returns
# learning rate = learning coefficient for optimizier
# clip_range = clipping amount for advantage calc
# gae_lambda = advantage smooth parameter

def optimize_ppo(trial):
    return {
        'n_steps': trial.suggest_int('n_steps', 0, 256),
        'gamma': trial.suggest_loguniform('gamma', 0.8, 0.9999),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1e-6),
        'clip_range': trial.suggest_uniform('clip_range', 0.1, 0.3),
        'gae_lambda': trial.suggest_uniform('gae_lambda', 0.8, .99)
    }


In [6]:
# function to return hyperparameters - define objective function
# ppo hyperparameters to tune
# n_steps = batch size (frame) factor of 64
# gamma = discount rate for calculating returns
# learning rate = learning coefficient for optimizier
# clip_range = clipping amount for advantage calc
# gae_lambda = advantage smooth parameter


def optimize_a2c(trial):
    return {
        "n_steps": trial.suggest_int("n_steps", 0, 128),
        "gamma": trial.suggest_loguniform("gamma", 0.8, 0.9999),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 1e-3),
        "gae_lambda": trial.suggest_uniform("gae_lambda", 0.8, 0.99),
        "ent_coef": trial.suggest_uniform("ent_coef", 0, 0),
        "vf_coef": trial.suggest_uniform("vf_coef", 0.5, 0.5),
        "max_grad_norm": trial.suggest_uniform("max_grad_norm", 0.5, 0.5),
        "rms_prop_eps": trial.suggest_uniform("rms_prop_eps", 1e-05, 1e-05),
    }

In [7]:
def optimize_agent(trial):
    # change model_params based on PPO or A2C
    model_params = optimize_a2c(trial)
    env = StreetFighter()
    env = Monitor(env, LOG_DIR)
    env = DummyVecEnv([lambda: env])
    env = VecFrameStack(env, 4, channels_order='last')
    model = A2C('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
    model.learn(total_timesteps=300000)
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10)
    env.close()
    SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
    model.save(SAVE_PATH)
    return mean_reward


In [9]:
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=20)

[32m[I 2022-05-05 11:51:46,450][0m A new study created in memory with name: no-name-83840d9f-0de7-45f7-81a6-fc8e22d07101[0m
[32m[I 2022-05-05 12:08:31,396][0m Trial 0 finished with value: -941.0 and parameters: {'n_steps': 40, 'gamma': 0.9339204357275219, 'learning_rate': 0.0008096568129955273, 'gae_lambda': 0.9599343248815659, 'ent_coef': 0.0, 'vf_coef': 0.5, 'max_grad_norm': 0.5, 'rms_prop_eps': 1e-05}. Best is trial 0 with value: -941.0.[0m
[32m[I 2022-05-05 12:24:15,605][0m Trial 1 finished with value: -1515.0 and parameters: {'n_steps': 95, 'gamma': 0.9983058505768979, 'learning_rate': 0.00011008059667272203, 'gae_lambda': 0.8455438961518725, 'ent_coef': 0.0, 'vf_coef': 0.5, 'max_grad_norm': 0.5, 'rms_prop_eps': 1e-05}. Best is trial 0 with value: -941.0.[0m
[32m[I 2022-05-05 12:40:40,613][0m Trial 2 finished with value: -931.0 and parameters: {'n_steps': 128, 'gamma': 0.9175192241118756, 'learning_rate': 1.940985569890767e-05, 'gae_lambda': 0.8261502901815325, 'ent_coe

In [10]:
# see the best hyperparams
study.best_params

{'n_steps': 97,
 'gamma': 0.9123708231325064,
 'learning_rate': 1.7586069563249184e-05,
 'gae_lambda': 0.8103898505539483,
 'ent_coef': 0.0,
 'vf_coef': 0.5,
 'max_grad_norm': 0.5,
 'rms_prop_eps': 1e-05}

In [10]:
# n-step must be factor of 64 for PPO
import math
model_params = study.best_params
#n_steps_var = math.trunc(model_params["n_steps"]/64)
#model_params["n_steps"]=n_steps_var*64

NameError: name 'study' is not defined

In [5]:
model_params={
    "n_steps": 97,
    "gamma": 0.9123708231325064,
    "learning_rate": 1.7586069563249184e-09,
    "gae_lambda": 0.8103898505539483,
    "ent_coef": 0.0,
    "vf_coef": 0.5,
    "max_grad_norm": 0.5,
    "rms_prop_eps": 1e-05,
}

# Setup Callback

In [13]:
from stable_baselines3.common.callbacks import BaseCallback

In [14]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True



In [15]:
CHECKPOINT_DIR = "./train/"

In [16]:
callback = TrainAndLoggingCallback(check_freq=10000, save_path = CHECKPOINT_DIR)

In [21]:
env.close()

# Training the Model

In [10]:
env = StreetFighter()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

In [11]:
model = A2C('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)


Using cpu device
Wrapping the env in a VecTransposeImage.


In [14]:
model.load(os.path.join(OPT_DIR, "trial_11_best_model.zip"))

<stable_baselines3.a2c.a2c.A2C at 0x7f963633e278>

In [12]:
model.load('./train/best_model_9450000.zip')

<stable_baselines3.a2c.a2c.A2C at 0x7f92d32c50b8>

In [13]:
model.learn(total_timesteps=10000000, callback=callback)

Logging to ./logs/A2C_22
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.16e+03 |
|    ep_rew_mean        | -668     |
| time/                 |          |
|    fps                | 336      |
|    iterations         | 100      |
|    time_elapsed       | 28       |
|    total_timesteps    | 9700     |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | -0.00322 |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 99       |
|    policy_loss        | 2.48     |
|    value_loss         | 11.9     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 6.26e+03 |
|    ep_rew_mean        | -476     |
| time/                 |          |
|    fps                | 336      |
|    iterations         | 200      |
|    time_elapsed       | 57       |
|    total_timesteps    | 19400    |
| train/     

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 6e+03    |
|    ep_rew_mean        | -484     |
| time/                 |          |
|    fps                | 336      |
|    iterations         | 1400     |
|    time_elapsed       | 403      |
|    total_timesteps    | 135800   |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | 0.00268  |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 1399     |
|    policy_loss        | -19.2    |
|    value_loss         | 45.6     |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 6.02e+03  |
|    ep_rew_mean        | -491      |
| time/                 |           |
|    fps                | 336       |
|    iterations         | 1500      |
|    time_elapsed       | 431       |
|    total_timesteps    | 145500    |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.78e+03 |
|    ep_rew_mean        | -551     |
| time/                 |          |
|    fps                | 336      |
|    iterations         | 2700     |
|    time_elapsed       | 777      |
|    total_timesteps    | 261900   |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | 0.00344  |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 2699     |
|    policy_loss        | -17.7    |
|    value_loss         | 61.6     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.75e+03 |
|    ep_rew_mean        | -550     |
| time/                 |          |
|    fps                | 337      |
|    iterations         | 2800     |
|    time_elapsed       | 805      |
|    total_timesteps    | 271600   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.66e+03 |
|    ep_rew_mean        | -570     |
| time/                 |          |
|    fps                | 337      |
|    iterations         | 4100     |
|    time_elapsed       | 1178     |
|    total_timesteps    | 397700   |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | -2.56    |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 4099     |
|    policy_loss        | -0.574   |
|    value_loss         | 0.00614  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.69e+03 |
|    ep_rew_mean        | -571     |
| time/                 |          |
|    fps                | 337      |
|    iterations         | 4200     |
|    time_elapsed       | 1207     |
|    total_timesteps    | 407400   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.66e+03 |
|    ep_rew_mean        | -589     |
| time/                 |          |
|    fps                | 337      |
|    iterations         | 5500     |
|    time_elapsed       | 1579     |
|    total_timesteps    | 533500   |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | -1.51    |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 5499     |
|    policy_loss        | -0.498   |
|    value_loss         | 0.00832  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.65e+03 |
|    ep_rew_mean        | -589     |
| time/                 |          |
|    fps                | 337      |
|    iterations         | 5600     |
|    time_elapsed       | 1607     |
|    total_timesteps    | 543200   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.51e+03 |
|    ep_rew_mean        | -617     |
| time/                 |          |
|    fps                | 338      |
|    iterations         | 6900     |
|    time_elapsed       | 1979     |
|    total_timesteps    | 669300   |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | -1.19    |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 6899     |
|    policy_loss        | -0.474   |
|    value_loss         | 0.0072   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.5e+03  |
|    ep_rew_mean        | -615     |
| time/                 |          |
|    fps                | 338      |
|    iterations         | 7000     |
|    time_elapsed       | 2008     |
|    total_timesteps    | 679000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.41e+03 |
|    ep_rew_mean        | -601     |
| time/                 |          |
|    fps                | 338      |
|    iterations         | 8200     |
|    time_elapsed       | 2351     |
|    total_timesteps    | 795400   |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | -1.21    |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 8199     |
|    policy_loss        | -0.24    |
|    value_loss         | 0.00516  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.43e+03 |
|    ep_rew_mean        | -606     |
| time/                 |          |
|    fps                | 338      |
|    iterations         | 8300     |
|    time_elapsed       | 2379     |
|    total_timesteps    | 805100   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.47e+03 |
|    ep_rew_mean        | -595     |
| time/                 |          |
|    fps                | 338      |
|    iterations         | 9500     |
|    time_elapsed       | 2722     |
|    total_timesteps    | 921500   |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | -1.86    |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 9499     |
|    policy_loss        | -0.281   |
|    value_loss         | 0.00984  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.48e+03 |
|    ep_rew_mean        | -594     |
| time/                 |          |
|    fps                | 338      |
|    iterations         | 9600     |
|    time_elapsed       | 2751     |
|    total_timesteps    | 931200   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.68e+03 |
|    ep_rew_mean        | -585     |
| time/                 |          |
|    fps                | 339      |
|    iterations         | 10800    |
|    time_elapsed       | 3087     |
|    total_timesteps    | 1047600  |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | -2.52    |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 10799    |
|    policy_loss        | -0.432   |
|    value_loss         | 0.0077   |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 5.61e+03  |
|    ep_rew_mean        | -585      |
| time/                 |           |
|    fps                | 339       |
|    iterations         | 10900     |
|    time_elapsed       | 3115      |
|    total_timesteps    | 1057300   |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.72e+03 |
|    ep_rew_mean        | -572     |
| time/                 |          |
|    fps                | 340      |
|    iterations         | 12100    |
|    time_elapsed       | 3451     |
|    total_timesteps    | 1173700  |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | 0.638    |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 12099    |
|    policy_loss        | 0.231    |
|    value_loss         | 0.00277  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.71e+03 |
|    ep_rew_mean        | -572     |
| time/                 |          |
|    fps                | 340      |
|    iterations         | 12200    |
|    time_elapsed       | 3479     |
|    total_timesteps    | 1183400  |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.89e+03 |
|    ep_rew_mean        | -550     |
| time/                 |          |
|    fps                | 322      |
|    iterations         | 13500    |
|    time_elapsed       | 4058     |
|    total_timesteps    | 1309500  |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | 0.0108   |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 13499    |
|    policy_loss        | -8.5     |
|    value_loss         | 15.3     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.88e+03 |
|    ep_rew_mean        | -548     |
| time/                 |          |
|    fps                | 322      |
|    iterations         | 13600    |
|    time_elapsed       | 4086     |
|    total_timesteps    | 1319200  |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.98e+03 |
|    ep_rew_mean        | -531     |
| time/                 |          |
|    fps                | 324      |
|    iterations         | 14800    |
|    time_elapsed       | 4423     |
|    total_timesteps    | 1435600  |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | 0.545    |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 14799    |
|    policy_loss        | -0.422   |
|    value_loss         | 0.00437  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.95e+03 |
|    ep_rew_mean        | -534     |
| time/                 |          |
|    fps                | 324      |
|    iterations         | 14900    |
|    time_elapsed       | 4451     |
|    total_timesteps    | 1445300  |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.9e+03  |
|    ep_rew_mean        | -534     |
| time/                 |          |
|    fps                | 326      |
|    iterations         | 16200    |
|    time_elapsed       | 4815     |
|    total_timesteps    | 1571400  |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | 0.448    |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 16199    |
|    policy_loss        | 0.389    |
|    value_loss         | 0.00438  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.92e+03 |
|    ep_rew_mean        | -531     |
| time/                 |          |
|    fps                | 326      |
|    iterations         | 16300    |
|    time_elapsed       | 4843     |
|    total_timesteps    | 1581100  |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.7e+03  |
|    ep_rew_mean        | -528     |
| time/                 |          |
|    fps                | 327      |
|    iterations         | 17600    |
|    time_elapsed       | 5207     |
|    total_timesteps    | 1707200  |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | 0.431    |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 17599    |
|    policy_loss        | 0.404    |
|    value_loss         | 0.00457  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.68e+03 |
|    ep_rew_mean        | -533     |
| time/                 |          |
|    fps                | 327      |
|    iterations         | 17700    |
|    time_elapsed       | 5235     |
|    total_timesteps    | 1716900  |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.56e+03 |
|    ep_rew_mean        | -559     |
| time/                 |          |
|    fps                | 329      |
|    iterations         | 18900    |
|    time_elapsed       | 5570     |
|    total_timesteps    | 1833300  |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | -0.00564 |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 18899    |
|    policy_loss        | -17.3    |
|    value_loss         | 59.5     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.55e+03 |
|    ep_rew_mean        | -561     |
| time/                 |          |
|    fps                | 329      |
|    iterations         | 19000    |
|    time_elapsed       | 5598     |
|    total_timesteps    | 1843000  |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.57e+03 |
|    ep_rew_mean        | -577     |
| time/                 |          |
|    fps                | 330      |
|    iterations         | 20300    |
|    time_elapsed       | 5961     |
|    total_timesteps    | 1969100  |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | -1.94    |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 20299    |
|    policy_loss        | -0.532   |
|    value_loss         | 0.00932  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.65e+03 |
|    ep_rew_mean        | -574     |
| time/                 |          |
|    fps                | 330      |
|    iterations         | 20400    |
|    time_elapsed       | 5989     |
|    total_timesteps    | 1978800  |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.71e+03 |
|    ep_rew_mean        | -577     |
| time/                 |          |
|    fps                | 331      |
|    iterations         | 21600    |
|    time_elapsed       | 6325     |
|    total_timesteps    | 2095200  |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | -1.74    |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 21599    |
|    policy_loss        | -0.243   |
|    value_loss         | 0.00447  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.75e+03 |
|    ep_rew_mean        | -571     |
| time/                 |          |
|    fps                | 331      |
|    iterations         | 21700    |
|    time_elapsed       | 6353     |
|    total_timesteps    | 2104900  |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.73e+03 |
|    ep_rew_mean        | -585     |
| time/                 |          |
|    fps                | 332      |
|    iterations         | 23000    |
|    time_elapsed       | 6716     |
|    total_timesteps    | 2231000  |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | -2.84    |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 22999    |
|    policy_loss        | -0.287   |
|    value_loss         | 0.00846  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.69e+03 |
|    ep_rew_mean        | -585     |
| time/                 |          |
|    fps                | 332      |
|    iterations         | 23100    |
|    time_elapsed       | 6744     |
|    total_timesteps    | 2240700  |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.64e+03 |
|    ep_rew_mean        | -592     |
| time/                 |          |
|    fps                | 332      |
|    iterations         | 24300    |
|    time_elapsed       | 7079     |
|    total_timesteps    | 2357100  |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | -0.542   |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 24299    |
|    policy_loss        | -0.389   |
|    value_loss         | 0.00391  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.63e+03 |
|    ep_rew_mean        | -588     |
| time/                 |          |
|    fps                | 332      |
|    iterations         | 24400    |
|    time_elapsed       | 7107     |
|    total_timesteps    | 2366800  |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.61e+03 |
|    ep_rew_mean        | -586     |
| time/                 |          |
|    fps                | 333      |
|    iterations         | 25700    |
|    time_elapsed       | 7471     |
|    total_timesteps    | 2492900  |
| train/                |          |
|    entropy_loss       | -8.32    |
|    explained_variance | -1.58    |
|    learning_rate      | 1.76e-09 |
|    n_updates          | 25699    |
|    policy_loss        | -0.044   |
|    value_loss         | 0.00985  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.58e+03 |
|    ep_rew_mean        | -588     |
| time/                 |          |
|    fps                | 201      |
|    iterations         | 25800    |
|    time_elapsed       | 12393    |
|    total_timesteps    | 2502600  |
| train/                |          |
|

KeyboardInterrupt: 

In [None]:
# open tensorboard from the logs folder in terminal
# tensorboard --logdir=.

# Test the Model

In [8]:
model = A2C.load('./train/best_model_3000000.zip')

FileNotFoundError: [Errno 2] No such file or directory: 'train/best_model_3000000.zip.zip'

In [None]:
mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=1)

In [None]:
env.close()

In [22]:
# reset game to starting state 
obs = env.reset()
# flag indicating whether died/finished or not
done = False

# how many games to play
for game in range(1):
    # until not dead/finished
    while not done:
        if done:
            obs = env.reset()
        env.render()
        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)
        # while not done, render our game, and add information to env.step
        time.sleep(0.01)




KeyboardInterrupt: 