In [1]:
import collections
import datetime
import math

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

import gym_super_mario_bros

from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros import actions

%load_ext tensorboard

In [2]:
# Shape of one preprocessed observation: (height, width, channels).
# Used as the network's input shape and as the resize target.
state_shape = (84, 84, 1)
# Step size handed to the Adam optimizer in create_model().
learning_rate = 0.00025
# Discount applied to the bootstrapped next-state value in the Bellman target.
discount_factor = 0.9999
# The target network is re-synced with the online network every this many episodes.
update_target_network_interval = 100
# Button combinations the agent chooses from (passed to JoypadSpace).
action_set = gym_super_mario_bros.actions.RIGHT_ONLY

In [3]:
# Each run logs into its own timestamped directory under logs/scalars/.
logdir = "logs/scalars/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# Keras callbacks handed to model.fit() by the training step.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)
# NOTE(review): `period` is reported as deprecated in favour of `save_freq` in
# newer tf.keras releases — confirm against the installed version.
saving_callback = tf.keras.callbacks.ModelCheckpoint('/tmp/mario0', period=1000)
# Make this writer the default so the tf.summary.* calls in the training step
# write their custom metrics into "<logdir>/metrics".
file_writer = tf.summary.create_file_writer(logdir + "/metrics")
file_writer.set_as_default()



In [4]:
def create_model():
    """Build and compile the convolutional Q-network.

    The network takes batches of observations shaped ``state_shape`` and
    outputs one Q-value per entry of ``action_set``. Both names, as well as
    ``learning_rate``, are read from module scope.

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam optimizer, MSE loss,
        MSE metric).
    """
    model = tf.keras.Sequential()
    # NOTE(review): unlike the two conv layers below, this one keeps the
    # default kernel initializer — confirm that is intentional.
    model.add(tf.keras.layers.Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
                                     input_shape=state_shape))
    model.add(tf.keras.layers.Conv2D(64, (4, 4), strides=(2, 2), activation='relu',
                                     kernel_initializer='he_normal'))
    model.add(tf.keras.layers.Conv2D(64, (3, 3), strides=(1, 1), activation='relu',
                                     kernel_initializer='he_normal'))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(512, activation='relu', use_bias=False))
    # One linear output per available action. Deriving the width from
    # action_set keeps the head in sync if a different action set is chosen
    # (it was previously hard-coded to 5).
    model.add(tf.keras.layers.Dense(len(action_set), use_bias=False))
    # `learning_rate` is the current keyword; `lr` is only a deprecated alias.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='mse', metrics=['mse'])
    return model
        

In [5]:
# Online network: the one that is fitted and queried for greedy actions.
model = create_model()
model.summary()
# Target network: same architecture, initialised with a copy of the online
# network's weights; it supplies the bootstrapped values for Bellman targets.
target_model = create_model()
target_model.set_weights(model.get_weights())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 20, 20, 32)        2080      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 9, 9, 64)          32832     
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 7, 7, 64)          36928     
_________________________________________________________________
flatten (Flatten)            (None, 3136)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               1605632   
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 2560      
Total params: 1,680,032
Trainable params: 1,680,032
Non-trainable params: 0
______________________________________________

In [6]:
def update_target_network(
    episode, update_target_network_interval, main_network, target_network):
    """Copy the main network's weights into the target network on schedule.

    The copy happens only when ``episode + 1`` is a multiple of
    ``update_target_network_interval``; otherwise the target is left untouched.

    Args:
        episode: zero-based episode index.
        update_target_network_interval: number of episodes between syncs.
        main_network: object exposing ``get_weights()``.
        target_network: object exposing ``set_weights(weights)``.

    Returns:
        ``target_network`` (the same object, updated in place when due).
    """
    sync_due = (episode + 1) % update_target_network_interval == 0
    if sync_due:
        fresh_weights = main_network.get_weights()
        target_network.set_weights(fresh_weights)
    return target_network

In [7]:
# Create the Super Mario Bros environment and wrap it so that the agent picks
# from the button combinations in action_set.
env = JoypadSpace(gym_super_mario_bros.make('SuperMarioBros-v0'), action_set)

In [8]:
def select_action(epsilon, state):
    """Pick an action index with an epsilon-greedy policy.

    With probability ``epsilon`` a uniformly random index into ``action_set``
    is returned; otherwise the index with the largest value predicted by the
    module-level ``model`` for ``state``.

    Args:
        epsilon: exploration probability.
        state: observation passed unchanged to ``model.predict``.

    Returns:
        Integer index of the chosen action.
    """
    explore = np.random.random() <= epsilon
    if explore:
        return np.random.choice(len(action_set))
    q_values = model.predict(state)
    return np.argmax(q_values)

In [9]:
def greyscale(state):
    """Convert a single RGB image to greyscale.

    The image is wrapped in a one-element batch for
    ``tf.image.rgb_to_grayscale`` and unwrapped again afterwards.
    """
    grey_batch = tf.image.rgb_to_grayscale([state])
    return grey_batch[0]

def resize(state):
    """Resize a single image to the height and width given by ``state_shape``.

    The image is wrapped in a one-element batch for the resize op and
    unwrapped again afterwards.
    """
    target_height, target_width = state_shape[0], state_shape[1]
    resized_batch = tf.compat.v1.image.resize_images(
        [state], (target_height, target_width))
    return resized_batch[0]

def normalize_image(state):
    """Standardize an image with ``tf.image.per_image_standardization``.

    Fixed-scale alternatives tried earlier for [0, 255] input, now unused:
    ``(state - 128) / 128.0`` and ``(state - 155) / 75.0``.
    """
    standardized = tf.image.per_image_standardization(state)
    return standardized

def downsample(state):
    """Turn a raw frame into a network-ready observation.

    Pipeline: resize -> greyscale -> standardize, then reshape to
    ``(1,) + state_shape`` (a batch of one) and cast to bfloat16.
    """
    processed = normalize_image(greyscale(resize(state)))
    batched = tf.reshape(processed, (1,) + state_shape)
    return tf.cast(batched, tf.dtypes.bfloat16)

def normalize_reward(reward):
    """Scale a raw environment reward by 1/15.

    Raw rewards are expected to lie in [-15, 15], which maps them to [-1, 1].
    """
    reward_bound = 15.0
    return reward / reward_bound

In [10]:
def compute_bellman_target(discount_factor, reward, model, state_next, done):
    """Compute the one-step Bellman target for a single transition.

    Args:
        discount_factor: weight applied to the best next-state value.
        reward: reward received for the transition.
        model: object whose ``predict(state_next)`` yields action values.
        state_next: successor state, forwarded to ``model.predict``.
        done: whether the transition ended the episode.

    Returns:
        ``reward`` alone for terminal transitions (the model is not queried);
        otherwise ``reward + discount_factor * max(model.predict(state_next))``.
    """
    if not done:
        best_next_q = np.max(model.predict(state_next))
        return reward + discount_factor * best_next_q
    return reward

In [11]:
# Schedule for `beta` (passed to the replay buffer's sample()): rises linearly
# from beta_start and is capped at 1.0.
beta_start = 0.4
beta_frames = 1000


def beta_by_frame(frame_idx):
    """Return beta for the given index: linear ramp from beta_start, capped at 1.0."""
    ramp = beta_start + frame_idx * (1.0 - beta_start) / beta_frames
    return min(1.0, ramp)


#TODO: rename frame_idx to episode_id
# Schedule for `alpha` (passed to the replay buffer's sample()): same linear
# ramp shape, starting at alpha_start.
alpha_start = 0.6
alpha_frames = 1000


def alpha_by_frame(frame_idx):
    """Return alpha for the given index: linear ramp from alpha_start, capped at 1.0."""
    ramp = alpha_start + frame_idx * (1.0 - alpha_start) / alpha_frames
    return min(1.0, ramp)


# Schedule for the exploration rate: exponential decay from epsilon_start
# towards epsilon_final with decay constant epsilon_decay.
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 500


def epsilon_by_frame(frame_idx):
    """Return epsilon for the given index (exponential decay towards epsilon_final)."""
    decay = math.exp(-1. * frame_idx / epsilon_decay)
    return epsilon_final + (epsilon_start - epsilon_final) * decay


# Current values; the training loop overwrites these after every episode.
alpha = alpha_start
beta = beta_start
epsilon = epsilon_start

In [12]:
class PrioritizedBuffer(object):
    """Fixed-capacity ring buffer of transitions with one sampling priority per slot."""

    def __init__(self, capacity):
        """Create an empty buffer that holds at most ``capacity`` transitions."""
        self.capacity = capacity
        self.position = 0  # slot the next append() writes to
        self.buffer = []
        self.priorities = np.zeros((capacity,), dtype=np.float32)

    def append(self, state, action, reward, state_next, done, info):
        """Store one transition, overwriting the oldest slot once the buffer is full.

        The new entry receives the largest priority currently stored
        (1.0 when the buffer is still empty).
        """
        transition = (state, action, reward, state_next, done, info)
        initial_priority = self.priorities.max() if self.buffer else 1.0

        if len(self.buffer) >= self.capacity:
            self.buffer[self.position] = transition
        else:
            self.buffer.append(transition)

        self.priorities[self.position] = initial_priority
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size, alpha=0.6, beta=0.4):
        """Draw ``batch_size`` transitions with probability proportional to priority ** alpha.

        Returns:
            ``(states, actions, rewards, states_next, dones, infos, indices, weights)``.
            The first six are tuples with one entry per drawn transition,
            ``indices`` holds the buffer slots that were drawn, and ``weights``
            is a float32 array ``(len(buffer) * p) ** -beta`` scaled so its
            maximum is 1.
        """
        stored = len(self.buffer)
        # Only slots that have been written take part in sampling.
        if stored == self.capacity:
            active = self.priorities
        else:
            active = self.priorities[:self.position]

        probabilities = active ** alpha
        probabilities /= probabilities.sum()

        indices = np.random.choice(stored, batch_size, p=probabilities)
        picked = [self.buffer[slot] for slot in indices]

        weights = (stored * probabilities[indices]) ** (-beta)
        weights /= weights.max()
        weights = np.array(weights, dtype=np.float32)

        # Transpose the list of transitions into one tuple per field.
        columns = list(zip(*picked))
        return (columns[0], columns[1], columns[2], columns[3],
                columns[4], columns[5], indices, weights)

    def update_priorities(self, batch_indices, batch_priorities):
        """Overwrite the priority of each listed slot with the matching new value."""
        for slot, new_priority in zip(batch_indices, batch_priorities):
            self.priorities[slot] = new_priority

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

In [13]:
# Running count of completed model.fit() calls. The training step passes it to
# Keras as the epoch index and increments it after every fit.
initial_epoch = 0

In [14]:
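# Optional: uncomment to resume from the checkpoint written by saving_callback
# and re-sync the target network with the restored weights.
# model = tf.keras.models.load_model('/tmp/mario0')
# target_model.set_weights(model.get_weights())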

In [15]:
def sample_from_replay_buffer_and_train_model(replay_buffer, batch_size,
                                              model, target_model, discount_factor):
    """Run one prioritized-replay training step on the online network.

    Nothing happens until the buffer holds at least ``batch_size`` transitions.
    Otherwise the function samples a batch (using the module-level ``alpha``
    and ``beta``), builds Bellman targets with ``target_model``, fits ``model``
    for a single Keras epoch, writes refreshed priorities back to the buffer
    and increments the module-level ``initial_epoch`` counter.

    Args:
        replay_buffer: buffer exposing ``__len__``, ``sample`` and
            ``update_priorities``.
        batch_size: number of transitions to train on.
        model: online network that is fitted.
        target_model: network used for the bootstrapped next-state values.
        discount_factor: discount used in the Bellman target.

    Returns:
        ``model`` (the same object that was passed in).
    """
    global initial_epoch
    if len(replay_buffer) < batch_size:
        return model

    (states, actions, rewards, states_next, dones, infos,
     indices, weights) = replay_buffer.sample(batch_size, alpha, beta)

    # Stored states come from downsample() and already carry a leading batch
    # axis of size 1, so one concat along axis 0 assembles the whole batch.
    # (Previously the batch was grown one state at a time from a float64
    # np.empty seed, re-copying the accumulated tensor on every iteration.)
    stacked_states = tf.concat(list(states), axis=0)

    # Start from the online network's own predictions so that only the entry
    # for the action actually taken differs from the prediction in the loss.
    target_q_values = model.predict(stacked_states)
    td_errors = np.zeros((len(states),), dtype=np.float32)
    for i in range(len(states)):
        updated_target = compute_bellman_target(
            discount_factor, rewards[i], target_model, states_next[i], dones[i])
        td_errors[i] = target_q_values[i, actions[i]] - updated_target
        target_q_values[i, actions[i]] = updated_target

    def summarize_q_values(epoch, logs):
        """Write diagnostic summaries for this batch (every 20th epoch only)."""
        # TODO: move this function out of the way.
        if epoch % 20 != 0:
            return
        # tf.reduce_mean keeps the input dtype, so integer data (action ids,
        # zero counts, positions, time) would be averaged with integer
        # division. np.mean yields the true floating-point mean.
        tf.summary.scalar('actions', data=np.mean(actions), step=epoch)
        tf.summary.scalar('rewards', data=tf.reduce_mean(rewards), step=epoch)
        bt = [target_q_values[i, actions[i]] for i in range(len(states))]
        tf.summary.scalar('bellman_target', data=tf.reduce_mean(bt), step=epoch)
        delta = [rewards[i] - bt[i] for i in range(len(states))]
        tf.summary.scalar('delta-reward-bt', data=tf.reduce_mean(delta), step=epoch)
        tf.summary.scalar('target_q_values', data=tf.reduce_mean(target_q_values), step=epoch)
        tf.summary.scalar('max_rewards', data=tf.reduce_max(rewards), step=epoch)
        tf.summary.scalar('max_q', data=tf.reduce_max(target_q_values), step=epoch)
        tf.summary.scalar('min_q', data=tf.reduce_min(target_q_values), step=epoch)
        zeros = [np.count_nonzero(t == 0) for t in target_q_values]
        tf.summary.scalar('zeros_q', data=np.mean(zeros), step=epoch)
        tf.summary.scalar('epsilon', data=epsilon, step=epoch)
        x, y, t = zip(*[(i['x_pos'], i['y_pos'], i['time']) for i in infos])
        tf.summary.scalar('x_pos', data=np.mean(x), step=epoch)
        tf.summary.scalar('y_pos', data=np.mean(y), step=epoch)
        tf.summary.scalar('time', data=np.mean(t), step=epoch)
        tf.summary.scalar('batch_weights', data=tf.reduce_mean(weights), step=epoch)
        # Logged during fit(), i.e. before the errors are turned into priorities.
        tf.summary.scalar('td_errors', data=tf.reduce_mean(td_errors), step=epoch)
        if epoch % 500 == 0:
            tf.summary.image('frames', data=colorize(stacked_states, cmap='viridis'), step=epoch)
        tf.summary.histogram('inputs', stacked_states, step=epoch)

    summarize = tf.keras.callbacks.LambdaCallback(on_epoch_begin=summarize_q_values)

    # Train for exactly one epoch, numbered `initial_epoch`, so that the
    # callbacks see a step counter that keeps increasing across calls.
    model.fit(stacked_states, target_q_values, sample_weight=weights,
              epochs=initial_epoch + 1, initial_epoch=initial_epoch,
              verbose=False,
              callbacks=[tensorboard_callback, saving_callback, summarize])

    # New priority: weighted squared TD error, plus a small constant so that
    # no sampled transition ends up with zero priority.
    new_priorities = (np.power(td_errors, 2) * weights) + 1e-5
    replay_buffer.update_priorities(indices, new_priorities)
    initial_epoch += 1
    return model

In [16]:
# Number of transitions drawn from the replay buffer per training step.
batch_size = 32
# Maximum number of transitions kept in the replay buffer.
replay_buffer_size = 10000
replay_buffer = PrioritizedBuffer(replay_buffer_size)
# Sliding window of the most recent raw rewards. The training loop ends an
# episode early once the window is full and every reward in it is below 1.
latest_rewards = collections.deque(maxlen=50)

In [17]:
import matplotlib
import matplotlib.cm
import tensorflow as tf

def colorize(value, vmin=None, vmax=None, cmap=None):
    """Map a scalar tensor onto RGB colors taken from a matplotlib colormap.

    Args:
        value: numeric tensor. Every size-1 dimension is squeezed away before
            the colors are looked up.
        vmin: value mapped to the low end of the colormap; defaults to the
            minimum of ``value``.
        vmax: value mapped to the high end of the colormap; defaults to the
            maximum of ``value``.
        cmap: matplotlib colormap name; defaults to ``'gray'``.

    Returns:
        A float32 tensor shaped like the squeezed ``value`` with a trailing
        axis of size 3 (RGB).
    """
    # normalize to the range vmin..vmax -> 0..1
    vmin = tf.reduce_min(value) if vmin is None else vmin
    vmax = tf.reduce_max(value) if vmax is None else vmax
    value = (value - vmin) / (vmax - vmin)
    # drop every size-1 dimension (e.g. a trailing channel axis)
    value = tf.squeeze(value)
    # quantize to 256 levels
    indices = tf.cast(tf.round(value * 255), tf.dtypes.int32)
    # Build a 256-entry RGB lookup table by sampling the colormap. Calling the
    # colormap works for every matplotlib colormap, whereas the `.colors`
    # attribute only exists on ListedColormap instances, so the default
    # 'gray' colormap used to raise AttributeError.
    cm = matplotlib.cm.get_cmap(cmap if cmap is not None else 'gray')
    lookup_table = cm(np.linspace(0.0, 1.0, 256))[:, :3]  # RGBA -> RGB
    colors = tf.constant(lookup_table, dtype=tf.float32)
    # gather
    return tf.gather(colors, indices)

In [18]:
%tensorboard --logdir logs/scalars

Reusing TensorBoard on port 6007 (pid 71376), started 1 day, 8:58:58 ago. (Use '!kill 71376' to kill it.)

In [None]:
# Main training loop: play episodes, store every transition in the replay
# buffer and take one training step per environment step.
for episode in range(1000):
    state = downsample(env.reset())
    done, low_rewards_rate = False, 0
    # Start every episode with an empty reward window. This is a plain
    # statement on purpose: folding it into a tuple assignment bound its None
    # result to `_`, which shadows IPython's last-output variable.
    latest_rewards.clear()

    while not done and low_rewards_rate < 1.00:
        action = select_action(epsilon, state)
        state_next, reward, done, info = env.step(action)

        # The window tracks raw rewards; the buffer stores normalized ones.
        latest_rewards.append(reward)
        state_next, reward = downsample(state_next), normalize_reward(reward)

        replay_buffer.append(state, action, reward, state_next, done, info)
        model = sample_from_replay_buffer_and_train_model(
            replay_buffer, batch_size, model, target_model, discount_factor)
        state = state_next
        # Dividing by maxlen (not the current length) means the rate can only
        # reach 1.0 once the window is full of rewards below 1.
        low_rewards_rate = sum(r < 1 for r in latest_rewards) / latest_rewards.maxlen

    # update_target_network() syncs target_model in place and returns it; bind
    # the result to the name the rest of the notebook uses.
    target_model = update_target_network(
        episode, update_target_network_interval, model, target_model)

    # Anneal the replay and exploration parameters once per episode.
    alpha = alpha_by_frame(episode)
    beta = beta_by_frame(episode)
    epsilon = epsilon_by_frame(episode)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: /tmp/mario0/assets




INFO:tensorflow:Assets written to: /tmp/mario0/assets


INFO:tensorflow:Assets written to: /tmp/mario0/assets
