<a href="https://colab.research.google.com/github/laurelkeys/machine-learning/blob/master/assignment-4/Atari.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install dependencies for video displaying
_obs.:_ This may require a runtime restart

In [0]:
!apt-get update                               > /dev/null 2>&1
!apt-get install cmake                        > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get install x11-utils                    > /dev/null 2>&1 # fixes 'xdpyinfo was not found' when importing Display from pyvirtualdisplay

# !pip install --upgrade setuptools 2>&1
!pip install gym[atari] pyvirtualdisplay ez_setup > /dev/null 2>&1

In [0]:
# Colab has stable-baselines version 2.2.1, but the 'reset_num_timesteps' argument of model.learn() was introduced in v2.4.1
# NOTE(review): the upgrade is not pinned, so the installed version depends on when this cell is run
!pip install --upgrade stable-baselines > /dev/null 2>&1
# Show which stable-baselines version ended up installed
!pip list | grep stable-baselines

# Mount Drive
Remember to mount your drive on your academic account for more storage ;)

In [0]:
from IPython.display import Audio, display
# Embed an autoplaying audio clip loaded from the URL below (an audible cue that this cell ran)
display(Audio(url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav', autoplay=True))

from google.colab import drive
# Mount Google Drive under /content/drive (force_remount=True is passed on every run)
drive.mount('/content/drive', force_remount=True)

# Variables to set
> `ALGORITHM` : [RL Algorithms](https://stable-baselines.readthedocs.io/en/master/guide/algos.html) | `POLICY` : [Policy Networks](https://stable-baselines.readthedocs.io/en/master/modules/policies.html) | `ENVIRONMENT`: [Table of environments](https://github.com/openai/gym/wiki/Table-of-environments) \( [Atari Environments](https://gym.openai.com/envs/#atari) \)

In [0]:
# https://stable-baselines.readthedocs.io/en/master/guide/algos.html
# ALGORITHM is used as a key into the IMPL dict built later in this notebook
ALGORITHM = "PPO2"
# Policy identifier handed to the algorithm's constructor
POLICY = "CnnPolicy"

# https://gym.openai.com/envs/#atari
ENVIRONMENT = "PongNoFrameskip-v4"

# NOTE we're not using RAM envs, but if you do, remember not to use a CNN policy ;)
# also, you probably should be using the NoFrameskip-v4 environments (instead of the -v0 or -v4)

**Remember** to:
* change the `ENVIRONMENT`
* set `INITIAL_EPOCH_` if `CONTINUE_TRAINING = True`
* set `MAX_EPOCHS` if `TRAIN_FOREVER = False`

In [0]:
CONTINUE_TRAINING = False  # Set this if you want to continue training from a saved model, and don't forget to set INITIAL_EPOCH_
TRAIN_FOREVER = True       # Set this if you want to train until the notebook disconnects

INITIAL_EPOCH_ = 0         # If CONTINUE_TRAINING, set this to load the model trained for this many epochs with the other set parameters
EPOCHS = 25000             # Number of epochs between model saving (checkpoint); passed as the first argument to model.learn()
MAX_EPOCHS = None          # If not TRAIN_FOREVER, set this to halt after this many epochs (the value of INITIAL_EPOCH_ doesn't interfere); if left as None, training stops after the first checkpoint

Please don't forget to **mount your drive**, otherwise it will wait forever

# Import libs

In [0]:
#### Stable baselines only support TF 1.x for now ####
try:
    # Colab only
    # %tensorflow_version 2.x
    %tensorflow_version 1.x
except Exception:
    # Best effort: if the magic above fails, carry on with whatever TensorFlow is installed
    pass

import tensorflow as tf
from tensorflow import keras
# Print the TensorFlow version that was actually imported
print(tf.__version__)

In [0]:
import stable_baselines

from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv, VecFrameStack, VecVideoRecorder
from stable_baselines.bench import Monitor as SBMonitor

In [0]:
import warnings
from time import time, strftime, sleep

In [0]:
strftime("%X")

In [0]:
import os

PATH_TO_DATA = os.path.join("drive", "My Drive", "unicamp", "MC886", "atari")
os.makedirs(PATH_TO_DATA, exist_ok=True)

!ls drive/My\ Drive/unicamp/MC886/atari/

In [0]:
# Per-environment folder; checkpoints and the video folder both live under it
PATH_PREFIX = os.path.join(PATH_TO_DATA, ENVIRONMENT)
os.makedirs(PATH_PREFIX, exist_ok=True)

# Checkpoint name: algorithm, policy and the epoch count reached after one more round of EPOCHS
FILE_NAME_ = f"{ALGORITHM}_{POLICY}_ep{INITIAL_EPOCH_ + EPOCHS}"

# Checkpoint path (no file extension is added here)
SAVE_PATH_ = os.path.join(PATH_PREFIX, FILE_NAME_)

## Setup for video displaying
[Rendering OpenAi Gym in Google Colaboratory](https://star-ai.github.io/Rendering-OpenAi-Gym-in-Colaboratory/)

In [0]:
import gym
from gym.wrappers import Monitor as GymMonitor
from gym import logger as gymlogger
gymlogger.set_level(40) # error only

import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import math, glob, io, base64
from IPython.display import HTML
from IPython import display as ipythondisplay

from pyvirtualdisplay import Display
# Start a pyvirtualdisplay Display with visible=0.
# NOTE(review): this rebinds the name `display`, which the Drive cell imported from
# IPython.display; IPython's display module is still reachable as `ipythondisplay`.
display = Display(visible=0, size=(210, 160)) # images from Atari are 210x160 RGB
#display = Display(visible=0, size=(640, 480))
display.start()

# Functions to replay videos

In [0]:
# Folder where recorded videos are written and later looked up for replay
PATH_VIDEO = os.path.join(PATH_PREFIX, "video")
os.makedirs(PATH_VIDEO, exist_ok=True)

# Create log dir (relative to the working directory); the callback and TensorBoard logs use it
LOG_PATH = "tmp/"
os.makedirs(LOG_PATH, exist_ok=True)

In [0]:
def show_video(video_folder=PATH_VIDEO):
    """
    Display the most recently created .mp4 of a folder inline in the notebook.

    The video is embedded as a base64 data URI inside an HTML <video> tag.
    If the folder holds no .mp4 file, a message is printed instead.

    :param video_folder: (str) folder whose '*.mp4' files are considered
    """
    mp4list = glob.glob(video_folder + '/*.mp4')
    if not mp4list:
        print(f"Could not find any videos on folder '{video_folder}/*.mp4'")
        return

    mp4 = max(mp4list, key=os.path.getctime) # get the latest file
    # Read through a context manager so the file handle is closed right away
    with open(mp4, 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())
    ipythondisplay.display(HTML(data='''<video alt="test" 
                    controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                  </video>'''.format(encoded.decode())))

In [0]:
VIDEO_LENGTH = 1000  # value passed as `video_length` to the recorder


def wrap_env(env, video_name_prefix=None):
    """
    Wrap a vectorized env in a VecVideoRecorder that writes into PATH_VIDEO.

    :param env: the vectorized environment to record
    :param video_name_prefix: (str) prefix for the video file name;
        when None, "random-agent-<ENVIRONMENT>" is used
    :return: the wrapped environment
    """
    if video_name_prefix is None:
        prefix = f"random-agent-{ENVIRONMENT}"
    else:
        prefix = video_name_prefix

    def _is_first_step(curr_step):
        # Trigger recording only at step 0
        return curr_step == 0

    return VecVideoRecorder(
        env,
        video_folder=PATH_VIDEO,
        record_video_trigger=_is_first_step,
        video_length=VIDEO_LENGTH,
        name_prefix=prefix,
    )

In [0]:
def play_and_show(model, env, show_info=False):
    """
    Run `model` on `env` until the env reports `done`, then show the latest recorded video.

    The env is always closed, even if stepping raises or the run is interrupted.

    :param model: agent exposing `predict(observation)`, which returns `(action, state)`
    :param env: environment to play on; it is closed by this function
    :param show_info: (bool) if True, print each step's reward and info on a single line
    """
    steps = 0
    try:
        observation = env.reset()
        while True:
            env.render()
            # predict() returns (action, state); only the action belongs in env.step()
            action, _states = model.predict(observation)
            observation, reward, done, info = env.step(action)
            steps += 1
            if show_info:
                print(f"[step {steps}] reward: {reward} | info: {info}", end="\r")
            # NOTE(review): for a vectorized env `done` is an array, so this truth test
            # only works with a single sub-environment, as this notebook uses
            if done:
                print(f"Done after {steps} steps")
                break
    finally:
        env.close()

    # Short pause before looking for the video file (presumably to let it be finalized)
    sleep(1)
    show_video()

## Callback for model.learn()

In [0]:
import traceback
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines import results_plotter

BEST_MEAN_REWARD_ = -np.inf  # best mean reward seen by the callback so far
N_STEPS_ = 0                 # number of times the callback has been invoked


def callback(_locals, _globals):
    """
    Progress hook for model.learn(): on every 1000th call it reports the mean
    reward and stores the model whenever that mean is at least the best so far.

    Called at each step (for DQN and others) or after n steps (see ACER or PPO2).
    Any exception raised while reporting is printed and otherwise ignored.

    :param _locals: (dict) local variables of the caller; `_locals['self']` is saved
    :param _globals: (dict) global variables of the caller (unused)
    :return: (bool) always True
    """
    global N_STEPS_, BEST_MEAN_REWARD_

    report_due = (N_STEPS_ + 1) % 1000 == 0
    if report_due:
        try:
            # NOTE(review): results are read from LOG_PATH; confirm the env's Monitor writes there
            timesteps, episode_rewards = ts2xy(load_results(LOG_PATH), 'timesteps')
            if len(timesteps) > 0:
                # Mean over (at most) the last 100 entries
                mean_reward = np.mean(episode_rewards[-100:])
                print(timesteps[-1], 'timesteps')
                print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(BEST_MEAN_REWARD_, mean_reward))

                improved = mean_reward >= BEST_MEAN_REWARD_
                if improved:
                    BEST_MEAN_REWARD_ = mean_reward
                    print(f"Saving new best model (at N_STEPS_={N_STEPS_})")
                    best_model_path = os.path.join(LOG_PATH, 'best_model.pkl')
                    _locals['self'].save(best_model_path)

                print(strftime("%X"))
        except Exception as err:
            print("Exception raised:")
            traceback.print_exception(type(err), err, err.__traceback__)

    N_STEPS_ += 1
    return True

# Train

## Copying the `make_atari_env` code to avoid an error with the latest `gym` version
[Issue #51](https://github.com/araffin/rl-baselines-zoo/issues/51)

In [0]:
from stable_baselines.common.atari_wrappers import make_atari, wrap_deepmind
from stable_baselines.common import set_global_seeds
from stable_baselines import logger

def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None,
                   start_index=0, allow_early_resets=True, start_method=None):
    """
    Build a monitored, DeepMind-wrapped vectorized Atari environment.

    A single environment runs in-process (DummyVecEnv, rank 0); several
    environments are run in subprocesses (SubprocVecEnv).

    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments to create
    :param seed: (int) the initial seed for RNG; each env is seeded with seed + rank
    :param wrapper_kwargs: (dict) the parameters for the wrap_deepmind function
    :param start_index: (int) start rank index (used only when num_env != 1)
    :param allow_early_resets: (bool) allows early reset of the environment
    :param start_method: (str) method used to start the subprocesses.
        See SubprocVecEnv doc for more information
    :return: (Gym Environment) The atari environment
    """
    wrapper_kwargs = {} if wrapper_kwargs is None else wrapper_kwargs

    def _env_factory(rank):
        # Return a zero-argument constructor for the env with the given rank
        def _build():
            atari_env = make_atari(env_id)
            atari_env.seed(seed + rank)
            log_dir = logger.get_dir()
            # Monitor file lives in the logger dir when one is set; otherwise the falsy value is passed on
            monitor_target = log_dir and os.path.join(log_dir, str(rank))
            monitored = SBMonitor(atari_env, monitor_target,
                                  allow_early_resets=allow_early_resets)
            return wrap_deepmind(monitored, **wrapper_kwargs)
        return _build

    set_global_seeds(seed)

    if num_env != 1:
        constructors = [_env_factory(start_index + offset) for offset in range(num_env)]
        return SubprocVecEnv(constructors, start_method=start_method)

    # When using one environment, no need to start subprocesses
    return DummyVecEnv([_env_factory(0)])


## Model setup

In [0]:
def make_env():
    """Create one Atari env for ENVIRONMENT (seed 0) and stack its last 4 frames."""
    single_env = make_atari_env(env_id=ENVIRONMENT, num_env=1, seed=0)
    return VecFrameStack(single_env, n_stack=4)

In [0]:
# Env handed to the model for training
env = make_env()

# Wrapped env for recording
env_record = wrap_env(make_env(), video_name_prefix=FILE_NAME_)

# check out the action space, if both aren't the same something will probably go wrong
envs_are_eq = env.action_space == env_record.action_space
print(f"{env.action_space} == {env_record.action_space}? {envs_are_eq}")

In [0]:
from stable_baselines import PPO2, ACER, ACKTR
# Maps the ALGORITHM setting to the stable-baselines class implementing it
IMPL = {
    'PPO2': PPO2,
    'ACER': ACER,
    'ACKTR': ACKTR,
}
VERBOSITY = 0 # 0 none, 1 training information, 2 tensorflow debug

In [0]:
from IPython.display import clear_output

# Checkpoint written after INITIAL_EPOCH_ epochs, without a file extension
_saved_base = os.path.join(PATH_TO_DATA, ENVIRONMENT, f"{ALGORITHM}_{POLICY}_ep{INITIAL_EPOCH_}")

# The training loop calls model.save() with an extension-less path, so the extension on disk
# is chosen by stable-baselines and depends on the installed version. Probe the known
# candidates (".pkl" first, as before) instead of assuming ".pkl".
PATH_SAVED = next(
    (candidate
     for candidate in (_saved_base + ext for ext in (".pkl", ".zip", ""))
     if os.path.isfile(candidate)),
    _saved_base + ".pkl",  # nothing found: keep the previous default for the messages below
)

if CONTINUE_TRAINING and os.path.exists(PATH_SAVED):
    model = IMPL[ALGORITHM].load(PATH_SAVED, env=env, verbose=VERBOSITY) # Load the trained agent
    clear_output() # clear tf warnings
    print("Model loaded from:", PATH_SAVED)
else:
    # PPO2 is the only algorithm created with an explicit nminibatches=1
    model = (IMPL[ALGORITHM](POLICY, env, verbose=VERBOSITY) if ALGORITHM != 'PPO2'
        else IMPL[ALGORITHM](POLICY, env, verbose=VERBOSITY, nminibatches=1))
    clear_output() # clear tf warnings
    if CONTINUE_TRAINING:
        print("Model not found, training from scratch")
    else:
        print("obs.: CONTINUE_TRAINING is set to False")
        if os.path.exists(PATH_SAVED):
            print(f"      but a file at '{PATH_SAVED}' exists !!")
    # A fresh model always starts counting from epoch 0
    INITIAL_EPOCH_ = 0
    print("INITIAL_EPOCH_ set to 0\nModel created")

In [0]:
# Toggle for TensorBoard logging
TENSORBOARD_LOG = True
# https://stable-baselines.readthedocs.io/en/master/guide/tensorboard.html

if TENSORBOARD_LOG:
    # Set on the existing model object, so it applies to both a loaded and a new model
    model.tensorboard_log = os.path.join(LOG_PATH, f"tb_logs")
    print(f"Adding TensorBoard logs to '{model.tensorboard_log}/'")

In [0]:
# Preview the current model before training when continuing from a checkpoint.
# play_and_show() closes the env it is given, so env_record is closed afterwards.
if CONTINUE_TRAINING:
    play_and_show(model, env_record) # Show video of model

## Constants
Just printing out the constants we use to make sure they're right 😅

In [0]:
# Dump every setting in aligned columns, grouped as: run identity, training schedule,
# checkpoint paths, video/log paths, and model options.
print(f"ALGORITHM:         {ALGORITHM}")
print(f"POLICY:            {POLICY}")
print(f"ENVIRONMENT:       {ENVIRONMENT}")
print()
print(f"CONTINUE_TRAINING: {CONTINUE_TRAINING}")
print(f"TRAIN_FOREVER:     {TRAIN_FOREVER}")
print(f"INITIAL_EPOCH_:    {INITIAL_EPOCH_}")
print(f"EPOCHS:            {EPOCHS}")
print(f"MAX_EPOCHS:        {MAX_EPOCHS}")
print()
print(f"PATH_PREFIX:       {PATH_PREFIX}")
print(f"FILE_NAME_:        {FILE_NAME_}")
print(f"SAVE_PATH_:        {SAVE_PATH_}")
print()
print(f"PATH_VIDEO:        {PATH_VIDEO}")
print(f"LOG_PATH:          {LOG_PATH}")
print()
print(f"VERBOSITY:         {VERBOSITY}")
print(f"PATH_SAVED:        {PATH_SAVED}")
print(f"TENSORBOARD_LOG:   {TENSORBOARD_LOG}")
print()
print(f"obs.: variables ending in '_' may have their values changed") # they're not really constants

## Learn

In [0]:
strftime("%X")

In [0]:
if TENSORBOARD_LOG:
    %load_ext tensorboard
    %tensorboard --logdir tmp/tb_logs

In [0]:
# Train in rounds of EPOCHS: after each round save a checkpoint, record and show a video,
# and stop only when TRAIN_FOREVER is off and the epoch budget is used up.
FIRST_EPOCH = INITIAL_EPOCH_ # saves the value of INITIAL_EPOCH_ before we start training
while True:
    start = time()
    # Pass reset_num_timesteps=False to continue the training curve in tensorboard
    # (the expression below is True only on the first round of this loop)
    model = model.learn(EPOCHS, callback=callback, reset_num_timesteps=(INITIAL_EPOCH_ == FIRST_EPOCH))
    dt = time() - start
    print(f"\nTraining took {dt:.2f}s (~{(dt//60):.0f} minute{((dt//60) != 1) * 's'})")

    # Save model (the file name carries the total epoch count reached after this round)
    FILE_NAME_ = f"{ALGORITHM}_{POLICY}_ep{INITIAL_EPOCH_ + EPOCHS}"
    SAVE_PATH_ = os.path.join(PATH_PREFIX, FILE_NAME_)
    print("Saving to", SAVE_PATH_)
    model.save(SAVE_PATH_)
    INITIAL_EPOCH_ += EPOCHS

    # Wrapped env for recording (a new one each round, since play_and_show() closes it)
    env_record = wrap_env(make_env(), video_name_prefix=FILE_NAME_)

    # Show video of model
    play_and_show(model, env_record) # set show_info=True to prints each step's info and reward

    # Elapsed time for the whole round: training, saving and video playback
    dt = time() - start
    print(f"The iteration took {dt:.2f}s (~{(dt//60):.0f} minute{((dt//60) != 1) * 's'})")

    if not TRAIN_FOREVER:
        # With MAX_EPOCHS left as None the loop stops after this first round
        reached_max_epochs = MAX_EPOCHS is None or (INITIAL_EPOCH_ - FIRST_EPOCH) >= MAX_EPOCHS
        if reached_max_epochs:
            print(f"\nStop training (at epoch {INITIAL_EPOCH_}, started at {FIRST_EPOCH})")
            break

In [0]:
strftime("%X")