In [None]:
import collections
import datetime
import math

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

import gym_super_mario_bros

from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros import actions

In [None]:
# Height and width that every frame is resized to before being fed to the network.
state_shape = (84, 84)
# Step size handed to the optimizer in create_model().
learning_rate = 0.00025
# Weight applied to the estimated next-state value when building Bellman targets.
discount_factor = 0.99
# Number of episodes between copies of the main network's weights into the target network.
update_target_network_interval = 10
# Button combinations exposed to the agent; its length is also the size of the network's output layer.
action_set = gym_super_mario_bros.actions.COMPLEX_MOVEMENT

In [None]:
# Timestamped directory so that each run of the notebook logs to its own folder.
logdir = "logs/scalars/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# Passed to model.fit() during training.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir, update_freq=500)
# NOTE(review): saving_callback is not referenced anywhere else in the visible notebook.
saving_callback = tf.keras.callbacks.ModelCheckpoint('models/mario0', period=1000, 
                                                     save_weights_only=True)
# Install a default summary writer so the tf.summary.scalar() calls made during
# training do not need an explicit writer.
file_writer = tf.summary.create_file_writer(logdir + "/metrics")
file_writer.set_as_default()

In [None]:
def create_model():
    """Build and compile the convolutional Q-network.

    The network takes inputs of shape ``(4,) + state_shape`` and outputs one
    value per entry of ``action_set``. It is compiled with Adam (step size
    ``learning_rate``) and a mean-squared-error loss.

    Returns:
        A compiled ``tf.keras.Sequential`` model.
    """
    # Every trainable layer shares the same initializers.
    init_kwargs = dict(kernel_initializer='he_normal', bias_initializer='ones')

    model = tf.keras.Sequential([
        # Conv2D with data_format='channels_first' doesn't have CPU kernel
        # without MKL.  Thus, the transpose:
        tf.keras.layers.Permute((2, 3, 1), input_shape=(4,) + state_shape),
        tf.keras.layers.Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
                               **init_kwargs),
        tf.keras.layers.Conv2D(64, (4, 4), strides=(2, 2), activation='relu',
                               **init_kwargs),
        tf.keras.layers.Conv2D(64, (3, 3), strides=(1, 1), activation='relu',
                               **init_kwargs),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(512, activation='relu', **init_kwargs),
        # Linear output layer: one unbounded value per action.
        tf.keras.layers.Dense(len(action_set), **init_kwargs),
    ])
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='mse', metrics=['mse'])
    return model
        

In [None]:
# Main network: the one that is trained and used to pick actions.
model = create_model()
model.summary()
# Target network: a second instance that starts with the main network's
# weights and is used when computing Bellman targets.
target_model = create_model()
target_model.set_weights(model.get_weights())

In [None]:
def update_target_network(
    episode, update_target_network_interval, main_network, target_network):
    """Periodically copy the main network's weights into the target network.

    The copy happens when ``episode + 1`` is a multiple of
    ``update_target_network_interval``; otherwise nothing is changed.

    Returns:
        The same ``target_network`` object that was passed in.
    """
    episodes_finished = episode + 1
    if episodes_finished % update_target_network_interval != 0:
        return target_network

    fresh_weights = main_network.get_weights()
    target_network.set_weights(fresh_weights)
    return target_network

In [None]:
# Create the emulator environment, then wrap it so that actions are integer
# indices into `action_set`.
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, action_set)

In [None]:
def select_action(epsilon, state):
    """Pick an action index with an epsilon-greedy rule.

    Args:
        epsilon: Probability threshold for taking a uniformly random action.
        state: Sequence of frames that is concatenated and reshaped to
            ``(1, 4) + state_shape`` before being passed to the global
            ``model``.

    Returns:
        An index into ``action_set``.
    """
    explore = np.random.random() <= epsilon
    if not explore:
        stacked = tf.concat(state, axis=0)
        network_input = tf.reshape(stacked, (1, 4) + state_shape)
        q_values = model.predict(network_input)
        return np.argmax(q_values)
    return np.random.choice(len(action_set))

In [None]:
def greyscale(state):
    """Convert a single RGB image to greyscale via a one-element batch."""
    as_batch = [state]
    grey_batch = tf.image.rgb_to_grayscale(as_batch)
    return grey_batch[0]

def resize(state):
    """Resize a single image to the height and width given by ``state_shape``."""
    target_size = (state_shape[0], state_shape[1])
    resized_batch = tf.compat.v1.image.resize_images([state], target_size)
    return resized_batch[0]

def normalize_image(state):
    """Apply TensorFlow's per-image standardization to ``state``."""
    standardized = tf.image.per_image_standardization(state)
    return standardized

def downsample(state):
    """Turn a raw frame into the compact form stored in the replay buffer.

    The frame is resized, converted to greyscale and standardized (in that
    order), reshaped to ``(1,) + state_shape`` and cast to bfloat16.
    """
    processed = normalize_image(greyscale(resize(state)))
    with_leading_axis = tf.reshape(processed, (1,) + state_shape)
    return tf.cast(with_leading_axis, tf.dtypes.bfloat16)

def normalize_reward(reward):
    """Scale a reward by dividing it by 12.0."""
    # rewards are in the [-15,15] range
    divisor = 12.0
    return reward / divisor

In [None]:
def compute_bellman_target(discount_factor, reward, model, state_next, done):
    """Compute the one-step Bellman target for a transition.

    Args:
        discount_factor: Multiplier applied to the best next-state value.
        reward: Reward observed for the transition.
        model: Object whose ``predict(state_next)`` returns action values.
        state_next: Input passed to ``model.predict``.
        done: If truthy, the target is just ``reward`` and ``model`` is not
            queried.

    Returns:
        ``reward`` when ``done``; otherwise ``reward`` plus the discounted
        maximum predicted value for ``state_next``.
    """
    if not done:
        best_next_value = np.max(model.predict(state_next))
        return reward + discount_factor * best_next_value
    return reward

In [None]:
# Schedules for the replay exponents (alpha, beta) and the exploration rate
# (epsilon). The training loop evaluates them once per episode.
beta_start = 0.4
beta_frames = 1000


def beta_by_frame(frame_idx):
    """Ramp linearly from beta_start to 1.0 over beta_frames steps, then stay at 1.0."""
    return min(1.0, beta_start + frame_idx * (1.0 - beta_start) / beta_frames)


#TODO: rename frame_idx to episode_id
alpha_start = 0.6
alpha_frames = 1000


def alpha_by_frame(frame_idx):
    """Ramp linearly from alpha_start to 1.0 over alpha_frames steps, then stay at 1.0."""
    return min(1.0, alpha_start + frame_idx * (1.0 - alpha_start) / alpha_frames)


epsilon_start = 0.99
epsilon_final = 0.01
epsilon_decay = 500


def epsilon_by_frame(frame_idx):
    """Decay exponentially from epsilon_start towards epsilon_final."""
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)


# Current values, overwritten by the training loop after every episode.
alpha = alpha_start
beta = beta_start
epsilon = epsilon_start

In [None]:
class PrioritizedBuffer(object):
    """Fixed-capacity replay memory that samples entries according to a priority.

    Entries are stored in a list that is overwritten in ring order once
    ``capacity`` is reached; a parallel float32 array holds one priority per
    slot.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.position = 0  # slot that the next append() writes to
        self.buffer = []
        self.priorities = np.zeros((capacity,), dtype=np.float32)

    def append(self, frame):
        """Store ``frame`` with the highest priority currently in the buffer.

        The very first entry gets priority 1.0. ``frame`` is expected to be a
        sequence of (state, action, reward, state_next, done, info) tuples.
        """
        if self.buffer:
            max_priority = self.priorities.max()
        else:
            max_priority = 1.0

        slot = self.position
        if len(self.buffer) >= self.capacity:
            # Buffer is full: overwrite the oldest slot.
            self.buffer[slot] = frame
        else:
            self.buffer.append(frame)

        self.priorities[slot] = max_priority
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size, alpha=0.6, beta=0.4):
        """Draw ``batch_size`` entries with probability proportional to priority ** alpha.

        Returns:
            A tuple ``(states, actions, rewards, states_next, dones, infos,
            indices, weights)``. ``states`` and ``states_next`` are lists of
            tensors reshaped to ``(1, 4) + state_shape``; ``actions``,
            ``rewards``, ``dones`` and ``infos`` come from the last transition
            of each entry; ``weights`` is ``(N * p) ** -beta`` divided by its
            maximum, as float32.
        """
        total = len(self.buffer)
        # Only the filled part of the priority array takes part in sampling.
        priorities = self.priorities if total == self.capacity else self.priorities[:self.position]

        probabilities = priorities ** alpha
        probabilities /= probabilities.sum()

        indices = np.random.choice(total, batch_size, p=probabilities)

        weights = (total * probabilities[indices]) ** (-beta)
        weights /= weights.max()
        weights = np.array(weights, dtype=np.float32)

        def stack_frames(frames):
            # Join the per-step frames and add a leading batch axis of size 1.
            return tf.reshape(tf.concat(list(frames), axis=0), (1, 4) + state_shape)

        states, actions, rewards, states_next, dones, infos = [], [], [], [], [], []
        for chosen in indices:
            # Transpose the list of transitions into per-field columns.
            columns = list(zip(*self.buffer[chosen]))
            states.append(stack_frames(columns[0]))
            actions.append(columns[1][-1])
            rewards.append(columns[2][-1])
            states_next.append(stack_frames(columns[3]))
            dones.append(columns[4][-1])
            infos.append(columns[5][-1])

        return states, actions, rewards, states_next, dones, infos, indices, weights

    def update_priorities(self, batch_indices, batch_priorities):
        """Overwrite the stored priority of each listed slot."""
        for slot, new_priority in zip(batch_indices, batch_priorities):
            self.priorities[slot] = new_priority

    def __len__(self):
        """Number of entries currently stored."""
        return len(self.buffer)

In [None]:
# Global count of completed training steps; incremented by
# sample_from_replay_buffer_and_train_model() and used as the Keras epoch number.
current_epoch = 0

In [None]:
def sample_from_replay_buffer_and_train_model(replay_buffer, batch_size, 
                                              model, target_model, discount_factor):
    """Run one training step on a prioritized batch, if enough data is buffered.

    Does nothing until ``replay_buffer`` holds at least ``batch_size``
    entries. Otherwise it samples a batch (using the global ``alpha`` and
    ``beta``), builds Bellman targets with ``target_model``, fits ``model``
    for one Keras epoch, writes new priorities back to the buffer and
    increments the global ``current_epoch``.

    Args:
        replay_buffer: Buffer providing ``__len__``, ``sample`` and
            ``update_priorities``.
        batch_size: Number of entries to sample.
        model: Network being trained.
        target_model: Network used to evaluate next states.
        discount_factor: Discount applied to the next-state value.

    Returns:
        ``model`` (the same object that was passed in).
    """
    global current_epoch
    if len(replay_buffer) < batch_size:
        return model

    (states, actions, rewards, states_next, dones, infos,
     indices, weights) = replay_buffer.sample(batch_size, alpha, beta)

    # Every sampled state already has a leading axis of size 1, so one concat
    # builds the batch (instead of re-copying a growing tensor per sample).
    stacked_states = tf.concat(states, axis=0)

    # Start from the current predictions and overwrite only the entry of the
    # action that was taken, so the other actions contribute zero error.
    target_q_values = model.predict(stacked_states)
    bellman_targets = np.zeros((len(states),), dtype=np.float32)
    td_errors = np.zeros((len(states),), dtype=np.float32)
    for i in range(len(states)):
        updated_target = compute_bellman_target(
            discount_factor, rewards[i], target_model, states_next[i], dones[i])
        bellman_targets[i] = updated_target
        td_errors[i] = updated_target - target_q_values[i, actions[i]]
        target_q_values[i, actions[i]] = updated_target

    def summarize_q_values(epoch, logs): 
        # TODO: move this function out of the way.
        if epoch % 100 != 0: return
        # Previously this read an undefined name (`bt`), which raised
        # NameError on the very first training step.
        tf.summary.scalar('bellman_target', data=tf.reduce_mean(bellman_targets), step=epoch)
        tf.summary.scalar('target_q_values', data=tf.reduce_mean(target_q_values), step=epoch)
        tf.summary.scalar('epsilon', data=epsilon, step=epoch)
        x, y, t = zip(*[(i['x_pos'],i['y_pos'],i['time']) for i in infos])
        tf.summary.scalar('x_pos', data=tf.reduce_mean(x), step=epoch)
        tf.summary.scalar('y_pos', data=tf.reduce_mean(y), step=epoch)
        tf.summary.scalar('time', data=tf.reduce_mean(t), step=epoch)
        tf.summary.scalar('td_errors', data=tf.reduce_mean(td_errors), step=epoch)
    summarize = tf.keras.callbacks.LambdaCallback(on_epoch_begin=summarize_q_values)

    # One Keras epoch per call; initial_epoch keeps the epoch counter (and
    # therefore the logging step) increasing across calls.
    model.fit(stacked_states, target_q_values, sample_weight=weights,
              epochs=current_epoch + 1, initial_epoch=current_epoch,
              verbose=False, 
              callbacks=[tensorboard_callback, 
                         summarize])

    # New priority: weighted squared TD error, offset so it is never zero.
    new_priorities = (np.power(td_errors, 2) * weights) + 1e-5
    replay_buffer.update_priorities(indices, new_priorities)
    current_epoch += 1
    return model

In [None]:
# Number of replay entries drawn per training step.
batch_size = 8
# Maximum number of entries kept in the prioritized replay buffer.
replay_buffer_size = 5000
replay_buffer = PrioritizedBuffer(replay_buffer_size)
# Sliding window over the four most recent transitions; once full, its
# contents are stored in the replay buffer as one entry.
overlapping_buffer = collections.deque(maxlen=4)

In [None]:
import matplotlib
import matplotlib.cm
import tensorflow as tf

def colorize(value, vmin=None, vmax=None, cmap=None):
    """Map scalar values to RGB colours using a matplotlib colormap.

    Args:
        value: Iterable whose items are indexed with ``[0]`` before use
            (presumably tensors with a leading axis of size 1 -- confirm
            against the caller).
        vmin: Value mapped to the first colour; defaults to the minimum of
            ``value``.
        vmax: Value mapped to the last colour; defaults to the maximum of
            ``value``.
        cmap: Name of a matplotlib colormap; defaults to ``'gray'``.

    Returns:
        A float32 tensor of colours gathered from a 256-entry lookup table.
    """
    # Convert up front so the arithmetic below also works when vmin/vmax are
    # plain Python numbers (a list cannot be subtracted from directly).
    value = tf.convert_to_tensor([b[0] for b in value])
    vmin = tf.reduce_min(value) if vmin is None else vmin
    vmax = tf.reduce_max(value) if vmax is None else vmax
    value = (value - vmin) / (vmax - vmin) # vmin..vmax
    value = tf.squeeze(value)
    indices = tf.cast(tf.round(value * 255), tf.dtypes.int32)
    cm = matplotlib.cm.get_cmap(cmap if cmap is not None else 'gray')
    # Only listed colormaps expose `.colors`; others (including the default
    # 'gray') do not, so sample 256 RGB entries by calling the colormap.
    palette = getattr(cm, 'colors', None)
    if palette is None:
        palette = cm(np.linspace(0.0, 1.0, 256))[:, :3]
    colors = tf.constant(palette, dtype=tf.float32)
    return tf.gather(colors, indices)

In [None]:
# Main training loop: one iteration per episode.
for episode in range(1000):
    state = env.reset()
    # Take action index 0 for a random number of steps so that episodes do not
    # all begin from exactly the same frame.
    for _ in range(np.random.randint(0, 133)):
        state, _, _, _ = env.step(0)
    state = downsample(state)
    # Until four real observations exist, act on the first frame repeated.
    frame_states = [state, state, state, state]
    # Drop transitions left over from the previous episode so that a replay
    # entry never stacks frames from two different episodes.
    overlapping_buffer.clear()
    
    done = False
    while not done:
        action = select_action(epsilon, frame_states)
        
        # Repeat the chosen action for up to four emulator steps and sum the
        # rewards.
        total_reward = 0
        for _ in range(4):
            state_next, reward, done, info = env.step(action)
            # Accumulate before checking `done`; otherwise the reward of the
            # terminal step would be silently discarded.
            total_reward += reward
            if done:
                break
            
        state_next, reward = downsample(state_next), normalize_reward(total_reward)
        overlapping_buffer.append((state, action, reward, state_next, done, info))
        # Slide the acting window so the next action is chosen from a stack
        # that ends with the newest observation, not the one before it.
        frame_states = frame_states[1:] + [state_next]
        
        if len(overlapping_buffer) == overlapping_buffer.maxlen:
            replay_buffer.append(list(overlapping_buffer))
        
        model = sample_from_replay_buffer_and_train_model(
            replay_buffer, batch_size, model, target_model, discount_factor)
        state = state_next
        
        # Checkpoint and reload every 1000 training steps. Epoch 0 is skipped:
        # it would otherwise trigger on every step before training begins.
        if current_epoch > 0 and current_epoch % 1000 == 0:
            model.save('models/mario0')
            model = tf.keras.models.load_model('models/mario0')
        
    # update_target_network() updates target_model in place and returns it.
    target_network = update_target_network(
            episode, update_target_network_interval, model, target_model)
    
    # Advance the per-episode schedules.
    alpha = alpha_by_frame(episode)
    beta = beta_by_frame(episode)
    epsilon = epsilon_by_frame(episode)