<a href="https://colab.research.google.com/github/laurelkeys/machine-learning/blob/master/assignment-4/Atari.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install dependencies for video displaying
_obs.:_ This may require a runtime restart

In [0]:
!pip install gym pyvirtualdisplay             > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get install x11-utils                    > /dev/null 2>&1 # fixes 'xdpyinfo was not found' when importing Display from pyvirtualdisplay

In [0]:
!apt-get update         > /dev/null 2>&1
!apt-get install cmake  > /dev/null 2>&1
!pip install --upgrade setuptools   2>&1
!pip install ez_setup   > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1

Requirement already up-to-date: setuptools in /usr/local/lib/python3.6/dist-packages (41.6.0)


# Mount Drive
Remember to mount your drive on your academic account for more storage ;)

In [0]:
from IPython.display import Audio, display
# Plays a sound clip as soon as the cell runs — presumably an audible cue that the Drive authorization below needs attention.
display(Audio(url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav', autoplay=True))

from google.colab import drive
# Mount Google Drive at /content/drive (the recorded output shows it asking for an authorization code).
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# Variables to set
> `ALGORITHM` : [RL Algorithms](https://stable-baselines.readthedocs.io/en/master/guide/algos.html) | `POLICY` : [Policy Networks](https://stable-baselines.readthedocs.io/en/master/modules/policies.html) | `ENVIRONMENT`: [Atari Environments](https://gym.openai.com/envs/#atari)

In [0]:
# Algorithm to train with; must be one of the keys of the IMPL mapping defined further down ('PPO2', 'ACER', 'ACKTR').
# https://stable-baselines.readthedocs.io/en/master/guide/algos.html
ALGORITHM = "PPO2"
# Policy identifier, passed as the first argument to the algorithm's constructor.
POLICY = "CnnLnLstmPolicy"

# Environment id handed to gym.make(); it also names the per-environment folder used for saved models and videos.
# https://gym.openai.com/envs/#atari
ENVIRONMENT = "Pong-v0"

**Remember** to:
* change the `ENVIRONMENT`
* set `INITIAL_EPOCH` if `CONTINUE_TRAINING = True`

In [0]:
CONTINUE_TRAINING = False # Set to True to resume training from a saved model (and remember to set INITIAL_EPOCH)
TRAIN_FOREVER = False      # Set to True to repeat the train/save/record cycle until the notebook disconnects

INITIAL_EPOCH = 0         # When CONTINUE_TRAINING is set: the "ep" number in the file name of the saved model to resume from
EPOCHS = 1000           # Amount of training between checkpoints; passed as the first argument of model.learn() — NOTE(review): stable-baselines names that argument total_timesteps, so this likely counts timesteps, not epochs

Please don't forget to **mount your drive**, otherwise it will wait forever

# Import libs

## TensorFlow 1.15, Stable Baselines and Drive

In [0]:
#### Stable baselines only support TF 1.x for now ####
try:
    # Colab only
    # Line magic that selects the TensorFlow 1.x runtime; it has to run before tensorflow is imported below.
    %tensorflow_version 1.x
except Exception:
    # Deliberate best-effort: any exception raised by the magic is ignored and the import below is attempted anyway.
    pass

import tensorflow as tf
from tensorflow import keras
# Print the version that actually got loaded so a wrong runtime is noticed immediately.
print(tf.__version__)

1.15.0


In [0]:
import stable_baselines

from stable_baselines.common.policies import MlpPolicy, CnnPolicy
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines.bench import Monitor as SBMonitor

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [0]:
import warnings
from time import time, strftime, sleep

In [0]:
# Current local time (locale's time representation), shown as the cell output to mark when the setup was run.
strftime("%X")

'21:41:12'

In [0]:
import os

PATH_TO_DATA = os.path.join("drive", "My Drive", "unicamp", "MC886", "atari")
os.makedirs(PATH_TO_DATA, exist_ok=True)

!ls drive/My\ Drive/unicamp/MC886/atari/

Breakout-v0  Freeway-v0  Pong-v0


In [0]:
# Per-environment folder inside PATH_TO_DATA.
PATH_PREFIX = os.path.join(PATH_TO_DATA, ENVIRONMENT)
os.makedirs(PATH_PREFIX, exist_ok=True)

# Checkpoint name: algorithm, policy and the count reached once the next training run finishes.
FILE_NAME = f"{ALGORITHM}_{POLICY}_ep{INITIAL_EPOCH + EPOCHS}"

# Where that checkpoint will be written (both names are recomputed inside the training loop).
SAVE_PATH = os.path.join(PATH_PREFIX, FILE_NAME)

## Setup for video displaying
[Rendering OpenAi Gym in Google Colaboratory](https://star-ai.github.io/Rendering-OpenAi-Gym-in-Colaboratory/)

In [0]:
import gym
from gym.wrappers import Monitor as GymMonitor
from gym import logger as gymlogger
gymlogger.set_level(40) # error only

import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import math, glob, io, base64
from IPython.display import HTML
from IPython import display as ipythondisplay

from pyvirtualdisplay import Display

# Start a virtual X display so environments can be rendered on a machine without a screen.
# Bound to `virtual_display` instead of `display` so it no longer shadows the
# IPython.display.display function imported in the Drive-mount cell.
# NOTE(review): Display takes size as (width, height) while Atari frames are 210 high x 160 wide —
# confirm whether (160, 210) was intended.
virtual_display = Display(visible=0, size=(210, 160)) # images from Atari are 210x160 RGB
#virtual_display = Display(visible=0, size=(640, 480))
virtual_display.start()

<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '210x160x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '210x160x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

# Functions to replay videos

In [0]:
# Folder under the environment's folder where recorded videos are written and later looked up for playback.
PATH_VIDEO = os.path.join(PATH_PREFIX, "video")
os.makedirs(PATH_VIDEO, exist_ok=True)

# Create log dir
# Relative folder that receives the Monitor results and the callback's best-model file.
LOG_PATH = "tmp/"
os.makedirs(LOG_PATH, exist_ok=True)

In [0]:
def show_video():
    """Embed the most recently created .mp4 found in PATH_VIDEO in the cell output.

    The file is read in full, base64-encoded and inlined into an HTML <video>
    element. When the folder holds no .mp4 file a message is printed instead.
    """
    mp4list = glob.glob(PATH_VIDEO + '/*.mp4')
    if not mp4list:
        print(f"Could not find any videos on folder '{PATH_VIDEO}/*.mp4'")
        return

    mp4 = max(mp4list, key=os.path.getctime) # get the latest file
    # The context manager closes the handle immediately; previously it was left open until garbage collection.
    with open(mp4, 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())
    ipythondisplay.display(HTML(data='''<video alt="test" 
                    controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                  </video>'''.format(encoded.decode())))

In [0]:
from stable_baselines.common.vec_env import VecVideoRecorder

VIDEO_LENGTH = 1000

def wrap_env(env, video_name_prefix=None):
    """Wrap a vectorized environment in a VecVideoRecorder writing into PATH_VIDEO.

    :param env: the vectorized environment to record
    :param video_name_prefix: (str) prefix of the video file name; when None,
        "random-agent-<ENVIRONMENT>" is used
    :return: the VecVideoRecorder wrapping `env`
    """
    if video_name_prefix is None:
        prefix = f"random-agent-{ENVIRONMENT}"
    else:
        prefix = video_name_prefix

    def starts_on_first_step(curr_step):
        # defines when to start recording: only at step 0
        return curr_step == 0

    return VecVideoRecorder(
        env,
        video_folder=PATH_VIDEO,
        record_video_trigger=starts_on_first_step,
        video_length=VIDEO_LENGTH,
        name_prefix=prefix,
    )

In [0]:
def play_and_show(model, env, show_info=False):
    """Run `model` on `env` until an episode ends, then show the recorded video.

    :param model: trained model; only its predict() method is used
    :param env: vectorized environment to play in; it is closed before returning
    :param show_info: (bool) print the info returned by every step
    """
    observation = env.reset()

    # predict() returns (action, next_state). The state must be fed back on every
    # call, otherwise a recurrent (LSTM) policy restarts from its initial state at
    # each step. None means "initial state" / "no mask yet".
    state = None
    done = None
    steps = 0
    while True:
        env.render()
        action, state = model.predict(observation, state=state, mask=done)
        observation, reward, done, info = env.step(action)
        # TODO print info and reward
        steps += 1
        if show_info:
            print(f"[step {steps}] info: {info}")
        if np.any(done): # a vectorized env returns one flag per sub-environment
            print(f"Done after {steps} steps")
            break

    env.close()

    # Short pause before looking for the video file — presumably to let the recorder finish writing it.
    sleep(1)
    show_video()

## Callback for model.learn()

In [0]:
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines import results_plotter

BEST_MEAN_REWARD_, N_STEPS_ = -np.inf, 0 # Copied code, pls don't judge

def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)

    Every 1000th call it loads the Monitor results from LOG_PATH, prints the mean
    reward over the last 100 episodes and, when that mean beats the best one seen
    so far, saves the model to LOG_PATH/best_model.pkl.

    :param _locals: (dict) `_locals['self']` is the model that gets saved
    :param _globals: (dict) unused
    :return: (bool) always True
    """
    global N_STEPS_, BEST_MEAN_REWARD_
    # Print stats every 1000 calls
    if (N_STEPS_ + 1) % 1000 == 0:
        # Evaluate policy training performance
        try:
            x, y = ts2xy(load_results(LOG_PATH), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                print(x[-1], 'timesteps')
                # Uses the module-level BEST_MEAN_REWARD_; the previous undefined name raised a
                # NameError here, which the except below swallowed, so no best model was ever saved.
                print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(BEST_MEAN_REWARD_, mean_reward))

                # New best model, you could save the agent here
                if mean_reward > BEST_MEAN_REWARD_:
                    BEST_MEAN_REWARD_ = mean_reward
                    # Example for saving best model
                    print("Saving new best model")
                    _locals['self'].save(os.path.join(LOG_PATH, 'best_model.pkl'))
        except Exception as e:
            # Best-effort reporting: a failure while reading the results must not interrupt training.
            print(f"Exception raised:\n{e}")
    N_STEPS_ += 1
    return True

# Train

In [0]:
# FIXME
def make_env():
    """Build the training environment.

    ENVIRONMENT is created with gym.make(), wrapped in the stable-baselines
    Monitor (file prefix "sb-results" inside LOG_PATH, early resets allowed)
    and placed in a DummyVecEnv holding that single environment.
    """
    monitored = SBMonitor(
        gym.make(ENVIRONMENT),
        os.path.join(LOG_PATH, "sb-results"),
        allow_early_resets=True,
    )
    # env = GymMonitor(env, os.path.join(LOG_PATH, "gym-results"), force=True)
    return DummyVecEnv([lambda: monitored])

In [0]:
# Environment the model trains on
env = make_env()

# Wrapped env for recording
recording_base_env = DummyVecEnv([lambda: gym.make(ENVIRONMENT)])
env_record = wrap_env(recording_base_env, video_name_prefix=FILE_NAME)

# check out the action space, if both aren't identical something will probably go wrong
action_spaces_match = env.action_space == env_record.action_space
print(env.action_space, env_record.action_space, action_spaces_match)

Discrete(6) Discrete(6) True


In [0]:
from stable_baselines import PPO2, ACER, ACKTR
# Lookup from the ALGORITHM setting to the class implementing it, keyed by class name
# ('PPO2', 'ACER', 'ACKTR').
IMPL = {algo_cls.__name__: algo_cls for algo_cls in (PPO2, ACER, ACKTR)}
VERBOSITY = 0 # 0 none, 1 training information, 2 tensorflow debug

In [0]:
PATH_SAVED = os.path.join(PATH_TO_DATA, ENVIRONMENT, f"{ALGORITHM}_{POLICY}_ep{INITIAL_EPOCH}")

# save() appends an extension when the path has none, so the checkpoint written by the
# training loop does not exist under the bare PATH_SAVED. Look for it with the extensions
# stable-baselines uses (.zip in newer releases, .pkl in older ones) as well.
saved_model_file = next(
    (candidate for candidate in (PATH_SAVED, PATH_SAVED + ".zip", PATH_SAVED + ".pkl")
     if os.path.exists(candidate)),
    None,
)

if CONTINUE_TRAINING and saved_model_file is not None:
    model = IMPL[ALGORITHM].load(saved_model_file, env=env, verbose=VERBOSITY) # Load the trained agent
else:
    if CONTINUE_TRAINING:
        # Resuming was requested but no checkpoint exists for INITIAL_EPOCH
        # (the original test was inverted and printed this when the file DID exist).
        print("Model not found, training from scratch\nINITIAL_EPOCH set to 0")
    # A model created from scratch has been trained for 0 epochs; resetting the counter
    # keeps the "ep" number of the checkpoints written later truthful.
    INITIAL_EPOCH = 0
    if ALGORITHM == 'PPO2':
        # nminibatches=1 because training uses a single environment — NOTE(review): assumed from
        # the recurrent-policy requirement in stable-baselines; confirm.
        model = IMPL[ALGORITHM](POLICY, env, nminibatches=1, verbose=VERBOSITY)
    else:
        model = IMPL[ALGORITHM](POLICY, env, verbose=VERBOSITY)





Instructions for updating:
Use `tf.cast` instead.





Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where







In [0]:
# When resuming, play one episode with the current model and show its video before training further.
# play_and_show() closes the env it is given; the training loop builds a fresh recording env for its own video.
if CONTINUE_TRAINING:
    play_and_show(model, env_record) # Show video of model

In [0]:
# Current local time, shown as the cell output to mark when training starts.
strftime("%X")

'21:41:39'

In [0]:
# Train -> checkpoint -> record cycle. Runs once, or keeps repeating when TRAIN_FOREVER is set.
while True:
    start = time()
    # NOTE(review): the first argument of learn() is named total_timesteps in stable-baselines,
    # so EPOCHS presumably counts timesteps here — confirm.
    model = model.learn(EPOCHS, callback=callback)
    print(f"Training took {(time() - start):.2f}s ({(time() - start) // 60} minutes)")

    # Save model
    # The file name carries the cumulative count (INITIAL_EPOCH + EPOCHS) trained so far.
    FILE_NAME = f"{ALGORITHM}_{POLICY}_ep{INITIAL_EPOCH + EPOCHS}"
    SAVE_PATH = os.path.join(PATH_PREFIX, FILE_NAME)
    print("Saving to", SAVE_PATH)
    model.save(SAVE_PATH)
    # Advance the counter so the next iteration's checkpoint gets a new name.
    INITIAL_EPOCH += EPOCHS

    # Wrapped env for recording
    # Built anew every iteration because play_and_show() closes the env it receives.
    env_record = wrap_env(DummyVecEnv([lambda: gym.make(ENVIRONMENT)]), video_name_prefix=FILE_NAME)

    # Show video of model
    play_and_show(model, env_record) # set show_info=True to print each step's info

    # Covers training, saving and the recorded playback.
    print(f"The iteration took {(time() - start):.2f}s")

    if not TRAIN_FOREVER:
        print("Stop training")
        break

Training took 27.60s (0.0 minutes)
Saving to drive/My Drive/unicamp/MC886/atari/Pong-v0/PPO2_CnnLnLstmPolicy_ep1000
Saving video to  /content/drive/My Drive/unicamp/MC886/atari/Pong-v0/video/PPO2_CnnLnLstmPolicy_ep1000-step-0-to-step-1000.mp4
Done after 1044 steps


The iteration took 37.81s
Stop training


In [0]:
# Current local time, shown as the cell output to mark when the run finished.
strftime("%X")

'21:42:17'