<a href="https://colab.research.google.com/github/laurelkeys/machine-learning/blob/master/assignment-4/Atari.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Variables to set

In [0]:
# Algorithm and policy names. ALGORITHM is only used to build checkpoint file
# names; POLICY is also passed to the PPO2 constructor further down.
# https://stable-baselines.readthedocs.io/en/master/guide/algos.html
ALGORITHM = "PPO2"
POLICY = "CnnLnLstmPolicy"

# Gym environment id passed to gym.make()
# https://gym.openai.com/envs/#atari
ENVIRONMENT = "Breakout-v0"

In [0]:
CONTINUE_TRAINING = True  # Set this to continue training from a saved model; don't forget to set INITIAL_EPOCH as well
TRAIN_FOREVER = True      # Set this to keep training until the notebook disconnects (or the cell is interrupted)

INITIAL_EPOCH = 1700000    # If CONTINUE_TRAINING: load the model whose file name ends in "_ep<INITIAL_EPOCH>" (same algorithm/policy/environment)
EPOCHS = 2000           # Value passed to model.learn() between model saves (checkpoints); presumably a timestep count - TODO confirm

Please don't forget to **mount your drive** when prompted; otherwise the notebook will wait forever at the mount step.

# Imports

In [0]:
#### Stable baselines only support TF 1.x for now ####
try:
    # Colab-only magic: select the TensorFlow 1.x runtime
    %tensorflow_version 1.x
except Exception:
    # Best effort: ignore the failure when the magic is unavailable (e.g. outside Colab)
    pass

In [0]:
import tensorflow as tf
from tensorflow import keras
# Sanity check: the version printed here should be 1.x (see the runtime selection above)
print(tf.__version__)

1.15.0


In [0]:
import stable_baselines

from stable_baselines.common.policies import MlpPolicy, CnnPolicy
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines import PPO2
from stable_baselines.bench import Monitor

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [0]:
import warnings
from time import time, strftime, sleep

In [0]:
# Show the current local time (presumably to record when this session was started)
strftime("%X")

'13:48:33'

In [0]:
from IPython.display import Audio, display
# Autoplay a short sound - presumably an audible cue that the mount step below needs attention
display(Audio(url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav', autoplay=True))

# Remember to mount the drive of your academic account for more storage ;)

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [0]:
import os

# Root directory for checkpoints and videos. The path is relative, so it only
# points inside the mounted Google Drive when the working directory is the one
# that contains "drive" (presumably /content on Colab).
PATH_TO_DATA = os.path.join("drive", "My Drive", "unicamp", "MC886", "atari")

# exist_ok=True keeps the cell safe to re-run and matches how LOG_PATH is
# created further down in this notebook.
os.makedirs(PATH_TO_DATA, exist_ok=True)

# List what is already stored in the data directory (one sub-folder per environment in the output below)
!ls drive/My\ Drive/unicamp/MC886/atari/

Breakout-ram-v0  Breakout-v0  CartPole-v1


In [0]:
# Per-environment directory that holds the checkpoints (and the video folder)
PATH_PREFIX = os.path.join(PATH_TO_DATA, ENVIRONMENT)

# exist_ok=True keeps the cell safe to re-run and matches how LOG_PATH is
# created further down in this notebook.
os.makedirs(PATH_PREFIX, exist_ok=True)

# Name of the checkpoint that the first training round will produce; the
# training loop recomputes both values after every round.
FILE_NAME = f"{ALGORITHM}_{POLICY}_ep{INITIAL_EPOCH + EPOCHS}"

SAVE_PATH = os.path.join(PATH_PREFIX, FILE_NAME)

# Setup for video displaying

[Rendering OpenAi Gym in Google Colaboratory](https://star-ai.github.io/Rendering-OpenAi-Gym-in-Colaboratory/)

In [0]:
# Install gym and pyvirtualdisplay, then the system packages xvfb, python-opengl and ffmpeg.
# All output (stdout and stderr) is discarded.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [0]:
# Refresh the apt package lists, install cmake, and install the Atari extras for gym.
# Output is discarded for every command except the setuptools upgrade.
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1

Requirement already up-to-date: setuptools in /usr/local/lib/python3.6/dist-packages (41.6.0)


In [0]:
import gym
from gym import logger as gymlogger
# from gym.wrappers import Monitor
gymlogger.set_level(40) # error only
import tensorflow as tf
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay

In [0]:
from pyvirtualdisplay import Display
# Start a hidden 640x480 virtual display (pyvirtualdisplay; the cell output shows it runs Xvfb).
# Named `virtual_display` so it does not shadow IPython's `display()` function
# imported earlier in the notebook - rebinding `display` made the audio cell
# fail when re-run after this one.
virtual_display = Display(visible=0, size=(640, 480))
virtual_display.start()

xdpyinfo was not found, X start can not be checked! Please install xdpyinfo!


<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '640x480x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '640x480x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

# Functions for video

In [0]:
# Folder where recorded episodes are written and where show_video() looks for them
PATH_VIDEO = os.path.join(PATH_PREFIX, "video")

# exist_ok=True keeps the cell safe to re-run and matches how LOG_PATH is
# created further down in this notebook.
os.makedirs(PATH_VIDEO, exist_ok=True)

In [0]:
def show_video():
    """Embed the most recent .mp4 from PATH_VIDEO in the notebook output.

    The newest file (by ``os.path.getctime``) is read, base64-encoded and
    rendered through an HTML ``<video>`` element. When PATH_VIDEO contains no
    .mp4 file, "Could not find video" is printed instead.
    """
    mp4list = glob.glob(PATH_VIDEO + '/*.mp4')
    if not mp4list:
        print("Could not find video")
        return

    mp4 = max(mp4list, key=os.path.getctime) # get latest file
    # Context manager so the file handle is closed as soon as the bytes are read
    with io.open(mp4, 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())
    ipythondisplay.display(HTML(data='''<video alt="test" 
                    controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                  </video>'''.format(encoded.decode())))

In [0]:
from stable_baselines.common.vec_env import VecVideoRecorder
# Forwarded to VecVideoRecorder as `video_length` by wrap_env()
video_length = 1000


def wrap_env(env, path=""):
    """Wrap `env` in a VecVideoRecorder that writes into PATH_VIDEO.

    :param env: environment handed to VecVideoRecorder as its first argument
    :param path: prefix for the video file name; an empty string selects
        ``random-agent-<ENVIRONMENT>``
    :return: the VecVideoRecorder wrapping `env`
    """
    def starts_at_first_step(curr_step):
        # defines when to start recording: only at step 0
        return curr_step == 0

    name_prefix = f"random-agent-{ENVIRONMENT}" if path == "" else path
    recorder_options = dict(
        video_folder=PATH_VIDEO,
        record_video_trigger=starts_at_first_step,
        video_length=video_length,
        name_prefix=name_prefix,
    )
    return VecVideoRecorder(env, **recorder_options)

In [0]:
def play_and_show(model, env):
    """Let `model` play on `env` until an episode ends, then display the video.

    :param model: object whose ``predict(obs, state=..., mask=...)`` returns an
        ``(action, state)`` pair (the stable-baselines model interface)
    :param env: environment providing reset/render/step/close; `done` may be a
        plain bool or an array with one flag per sub-environment
    """
    obs = env.reset()
    # Recurrent policies need their hidden state carried from one predict()
    # call to the next, together with a mask of finished episodes.
    state = None
    done = [False] * getattr(env, "num_envs", 1)

    steps = 0
    try:
        while True:
            env.render()
            # predict() returns (action, state); only the action goes to step()
            action, state = model.predict(obs, state=state, mask=done)
            # Keep `obs` current so the next prediction sees the new frame
            obs, _reward, done, _info = env.step(action)
            steps += 1
            if np.any(done):
                print(f"Done after {steps} steps")
                break
    finally:
        # Close even when interrupted so the recorder can finish its file
        env.close()

    # Short pause before reading the video file back from disk
    sleep(1)
    show_video()

## Functions

In [0]:
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines import results_plotter

# State shared with callback(): best mean reward seen so far and the number of
# callback invocations.
BEST_MEAN_REWARD_ = -np.inf
N_STEPS_ = 0

# Directory for the Monitor logs and the best model; created when missing
LOG_PATH = "tmp/"
os.makedirs(LOG_PATH, exist_ok=True)

def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)

    Every 1000th call it reads the Monitor logs in LOG_PATH, prints the mean
    reward of the last 100 episodes and saves the model to
    ``LOG_PATH + 'best_model.pkl'`` whenever that mean beats the best so far.

    :param _locals: (dict) must contain the model being trained under 'self'
    :param _globals: (dict) unused
    :return: (bool) always True
    """
    global N_STEPS_, BEST_MEAN_REWARD_
    N_STEPS_ += 1
    # Print stats every 1000 calls
    if N_STEPS_ % 1000 != 0:
        return True

    # Evaluate policy training performance
    try:
        x, y = ts2xy(load_results(LOG_PATH), 'timesteps')
    except Exception as err:
        # Best effort: unreadable or missing logs must not abort training,
        # but the actual cause is reported instead of being hidden.
        print(f"No monitor.csv found ({err!r})")
        return True

    if len(x) == 0:
        print("Empty monitor.csv")
        return True

    mean_reward = np.mean(y[-100:])
    print(x[-1], 'timesteps')
    print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(BEST_MEAN_REWARD_, mean_reward))

    # New best model: remember the score and save the agent
    if mean_reward > BEST_MEAN_REWARD_:
        BEST_MEAN_REWARD_ = mean_reward
        print("Saving new best model")
        try:
            _locals['self'].save(LOG_PATH + 'best_model.pkl')
        except Exception as err:
            # Keep training even if this checkpoint could not be written
            print(f"Could not save best model ({err!r})")
    return True

# Training

In [0]:
# Training env: a single Breakout instance wrapped in a Monitor that logs into
# LOG_PATH, then vectorized.
env = DummyVecEnv([
    lambda: Monitor(gym.make(ENVIRONMENT), LOG_PATH, allow_early_resets=True)
])

# Wrapped env for recording
env_record = wrap_env(
    DummyVecEnv([lambda: gym.make(ENVIRONMENT)]),
    path=FILE_NAME,
)

# check out the action space, if both aren't identical something will probably go wrong
print(
    env.action_space,
    env_record.action_space,
    env.action_space == env_record.action_space,
)

Discrete(4) Discrete(4) True


In [0]:
# Checkpoint written by a previous run whose file name ends in "_ep<INITIAL_EPOCH>"
PATH_SAVED = os.path.join(PATH_TO_DATA, ENVIRONMENT, f"{ALGORITHM}_{POLICY}_ep{INITIAL_EPOCH}")

# save() may append an extension to extension-less paths (".zip" or ".pkl",
# depending on the stable-baselines version - TODO confirm), so look for
# those file names as well as the bare path.
saved_model_exists = any(os.path.exists(PATH_SAVED + ext) for ext in ("", ".zip", ".pkl"))

if CONTINUE_TRAINING and saved_model_exists:
    model = PPO2.load(PATH_SAVED, env=env) # Load the trained agent
else:
    if CONTINUE_TRAINING:
        # Only reached when resuming was requested but no checkpoint exists
        print("Model not found, training from scratch")
    # A fresh model has seen no training, so checkpoint names must count from 0
    INITIAL_EPOCH = 0
    model = PPO2(POLICY, env, nminibatches=1)





Instructions for updating:
Use `tf.cast` instead.





Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where







In [0]:
# When resuming, record and show one episode of the current agent before training further.
# NOTE(review): `model` also exists when training starts from scratch; this guard only skips the preview.
if CONTINUE_TRAINING:
    play_and_show(model, env_record) # Show video of model

Done after 221 steps


In [0]:
# Train in rounds: learn, checkpoint, record and show one episode, then repeat
# for as long as TRAIN_FOREVER is set.
while True:
    start = time()
    model = model.learn(EPOCHS, callback=callback)
    training_seconds = time() - start
    print(f"Training took {training_seconds}s")

    # Checkpoint under a name that carries the accumulated step count
    FILE_NAME = f"{ALGORITHM}_{POLICY}_ep{INITIAL_EPOCH + EPOCHS}"
    SAVE_PATH = os.path.join(PATH_PREFIX, FILE_NAME)
    print("Saving to", SAVE_PATH)
    model.save(SAVE_PATH)
    INITIAL_EPOCH += EPOCHS

    # Fresh recording env per round (play_and_show closes it when done)
    recording_vec_env = DummyVecEnv([lambda: gym.make(ENVIRONMENT)])
    env_record = wrap_env(recording_vec_env, path=FILE_NAME)

    # Show video of model
    play_and_show(model, env_record)

    iteration_seconds = time() - start
    print(f"The iteration took {iteration_seconds}s")

    if TRAIN_FOREVER:
        continue
    print("Stop training")
    break

Training took 38.37638211250305s
Saving to drive/My Drive/unicamp/MC886/atari/Breakout-v0/PPO2_CnnLnLstmPolicy_ep1702000
Done after 327 steps


The iteration took 42.810248374938965s
Training took 16.055640697479248s
Saving to drive/My Drive/unicamp/MC886/atari/Breakout-v0/PPO2_CnnLnLstmPolicy_ep1704000
Done after 809 steps


The iteration took 22.578294277191162s
Training took 16.11047649383545s
Saving to drive/My Drive/unicamp/MC886/atari/Breakout-v0/PPO2_CnnLnLstmPolicy_ep1706000
Saving video to  /content/drive/My Drive/unicamp/MC886/atari/Breakout-v0/video/PPO2_CnnLnLstmPolicy_ep1706000-step-0-to-step-1000.mp4
Done after 1692 steps


The iteration took 26.83969497680664s


KeyboardInterrupt: ignored