In [0]:
#Used in Google Colab
#Install Dependencies
!pip install tensorflow==1.15
!pip install stable-baselines
!pip install gym-retro atari_py



In [0]:
#Import ROMs
!python -m retro.import .

Importing 20 potential games...
Importing StreetFighterIISpecialChampionEdition-Genesis
Importing MortalKombat3-Genesis
Imported 2 games


In [0]:
import gym
import retro
import numpy as np
import os
import tensorflow as tf

from stable_baselines.common.policies import MlpPolicy, CnnPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2, A2C, results_plotter, ACKTR, ACER
from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results, ts2xy
from gym.wrappers.gray_scale_observation import GrayScaleObservation
from gym.wrappers.resize_observation import ResizeObservation

tf.logging.set_verbosity(tf.logging.ERROR)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [0]:
def evaluate(model, env, num_steps=10):
  """
  Evaluate a RL agent
  :param model: (BaseRLModel object) the RL Agent
  :param num_steps: (int) number of timesteps to evaluate it
  :return: (float) Mean reward for the last 100 episodes
  """
  episode_rewards = [0.0]
  obs = env.reset()
  for i in range(num_steps):
      # _states are only useful when using LSTM policies
      action, _states = model.predict(obs)
      # here, action, rewards and dones are arrays
      # because we are using vectorized env
      obs, rewards, dones, info = env.step(action)
      
      # Stats
      episode_rewards[-1] += rewards[0]
      if dones[0]:
          obs = env.reset()
          episode_rewards.append(0.0)
  # Compute mean reward for the last 100 episodes
  mean_100ep_reward = round(np.mean(episode_rewards[-100:]), 1)
  print("Mean reward:", mean_100ep_reward, "Num episodes:", len(episode_rewards))
  
  return mean_100ep_reward

In [0]:
best_mean_reward, n_steps = -np.inf, 0

def callback(_locals, _globals):
    """
    Callback called at each step (for DQN an others) or after n steps (see ACER or PPO2)
    :param _locals: (dict)
    :param _globals: (dict)
    """
    global n_steps, best_mean_reward
    # Print stats every 1000 calls
    if (n_steps + 1) % 1000 == 0:
        # Evaluate policy training performance
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            #print(x[-1], 'timesteps')
            #print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(best_mean_reward, mean_reward))

            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                #print("Saving new best model")
                _locals['self'].save(log_dir + 'best_model.pkl')
    n_steps += 1
    # Returning False will stop training early
    return True

In [0]:
log_dir = "/tmp/gym/"

def create_env(env_id):
  os.makedirs(log_dir, exist_ok=True)

  # Create and wrap the environment
  env = retro.make(game=env_id)

  # wrap the environment - preprocesssing
  # Originally the shape is 84 84
  shape = (42,42)
  env = ResizeObservation(env, shape)  
  env = GrayScaleObservation(env)

  # Logs will be saved in log_dir/monitor.csv
  env = Monitor(env, log_dir, allow_early_resets=True)

  # Because we use parameter noise, we should use a MlpPolicy with layer normalization
  return DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run

In [0]:
#Change the 2 dictionaries and num_steps_learn below to change parameters of search
#Dictionaries are hyperparameters, num_steps_learn is length of test
params_grid_PPO2 = {
    'policy':['MlpPolicy'],
    'gamma':[0.5],
    'n_steps':[256],
    'ent_coef':[0.001],
    'learning_rate':[.001],
    'vf_coef':[0.75]
}

params_grid_A2C = {
    'policy':['MlpPolicy'],
    'gamma':[0.99],
    'n_steps':[5, 10],
    'ent_coef':[.01, 0.0125],
    'learning_rate':[0.0005, 0.00025],
    'vf_coef':[0.15, 0.35]
}

num_steps_learn = 100000
num_steps_evaluate = int(num_steps_learn/20)
env = create_env('MortalKombat3-Genesis')
#Grid Search for hyperparam tuning
def GridSearch(model, env):
  num_tests = 0
  best_reward = 0
  best_params = {}
  for pol in params_grid_PPO2['policy']:
    for gam in params_grid_PPO2['gamma']:
      for n in params_grid_PPO2['n_steps']:
        for ent in params_grid_PPO2['ent_coef']:
          for rate in params_grid_PPO2['learning_rate']:
            for vf in params_grid_PPO2['vf_coef']:
              num_tests += 1
              print('On test #' + str(num_tests))
              print('Pol: ' + pol)
              print('Gam: ' + str(gam))
              print('N: ' + str(n))
              print('Ent: ' + str(ent))
              print('Rate: ' + str(rate))
              print('Vf: ' + str(vf))
              if model == 'PPO2':
                model = PPO2(pol, env, gamma=gam, n_steps=n, ent_coef=ent, learning_rate=rate, vf_coef=vf, verbose=0)
              elif model == 'A2C':
                model = A2C(pol, env, gamma=gam, n_steps=n, ent_coef=ent, learning_rate=rate, vf_coef=vf, verbose=0)
              mean_reward_before_train = evaluate(model, env, num_steps=num_steps_evaluate)
              model.learn(total_timesteps=int(num_steps_learn), callback=callback)
              mean_reward_after_train = evaluate(model, env, num_steps=num_steps_evaluate)
              if mean_reward_after_train > best_reward:
                best_reward = mean_reward_after_train
                best_params = {
                    'policy':pol,
                    'gamma':gam,
                    'n_steps':n,
                    'ent_coef':ent,
                    'learning_rate':rate,
                    'vf_coef':vf
                }
  return best_params, best_reward

In [0]:
model = PPO2('MlpPolicy', env, gamma=0.5, n_steps=256, ent_coef=0.001, learning_rate=0.001, vf_coef=0.75)
mean_reward_before_train = evaluate(model, env, num_steps=5000)
model.learn(total_timesteps=int(50000), callback=callback)
mean_reward_after_train = evaluate(model, env, num_steps=5000)
print('Reward Before: ' + str(mean_reward_before_train))
print('Reward After 1 million time steps: ' + str(mean_reward_after_train))

Mean reward: 157.0 Num episodes: 2
Mean reward: 130.0 Num episodes: 2
Reward Before: 157.0
Reward After 1 million time steps: 130.0


In [0]:
# Save the agent
import os
#os.mkdir('models')
model.save("models/ppo2_mk3")
del model  # delete trained model to demonstrate loading

In [0]:
model = PPO2.load("models/ppo2_mk3")

In [0]:
env.close()

In [0]:
import gym
from stable_baselines.common.vec_env import VecVideoRecorder

env_id = 'MortalKombat3-Genesis'
video_folder = 'logs/videos/'
video_length = 100000

#env = create_env('MortalKombat3-Genesis')

obs = env.reset()

# Record the video starting at the first step
env = VecVideoRecorder(env, video_folder,
                       record_video_trigger=lambda x: x == 0, video_length=video_length,
                       name_prefix="random-agent-{}".format(env_id))

env.reset()

array([[[ 64,  55,  64, ...,  64,  55,  64],
        [ 64,  55,  64, ...,  64,  55,  64],
        [ 55,  56,  94, ..., 106,  56,  55],
        ...,
        [ 44,  43,  45, ...,  45,  43,  44],
        [ 46,  45,  44, ...,  44,  45,  46],
        [ 47,  43,  46, ...,  46,  43,  47]]], dtype=uint8)

In [0]:
import imageio

model.set_env(env)

images = []
obs = model.env.reset()
img = model.env.render(mode='rgb_array')
done = False
while not done:
    images.append(img)
    action, _ = model.predict(obs)
    obs, _, done ,_ = model.env.step(action)
    img = model.env.render(mode='rgb_array')

In [0]:
imageio.mimsave('stf2_ac2_300kEpisodes.gif', [np.array(img) for i, img in enumerate(images) if i%2 == 0], fps=29)