In [1]:
import collections
import random
import datetime

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

import gym_super_mario_bros

from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros import actions

%load_ext tensorboard

In [2]:
# Hyperparameters and environment configuration shared by the cells below.
state_shape = (56, 56, 1)  # shape of one preprocessed frame; first two entries are the resize target
learning_rate = 0.00025  # step size passed to the Adam optimizer in create_model
discount_factor = 0.99  # weight on the bootstrapped next-state value in the Bellman target
epsilon = 0.8  # initial probability of picking a random action in select_action
eps_decay = 0.999  # epsilon is multiplied by this once per episode (floored in the training loop)
update_target_network_interval = 100  # number of episodes between target-network weight syncs
action_set = gym_super_mario_bros.actions.RIGHT_ONLY  # button combinations exposed through JoypadSpace

In [3]:
# Each run logs into its own timestamped directory under logs/scalars/.
logdir = "logs/scalars/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# Keras callbacks passed to every model.fit call in the training function.
# NOTE(review): `write_grads` and `period` look like legacy arguments that newer
# TF 2.x releases deprecate or ignore — confirm against the installed version.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1, write_grads=True)
saving_callback = tf.keras.callbacks.ModelCheckpoint('/tmp/mario0', period=500)
# Default summary writer: the tf.summary.scalar calls made during training
# are written under <logdir>/metrics.
file_writer = tf.summary.create_file_writer(logdir + "/metrics")
file_writer.set_as_default()



In [4]:
def create_model(input_shape=None, num_actions=None):
    """Build and compile the convolutional Q-network.

    The network is three strided convolutions followed by a 512-unit dense
    layer and a linear output layer with one unit per action.

    Args:
        input_shape: Shape of a single preprocessed frame. Defaults to the
            notebook-level `state_shape`.
        num_actions: Width of the output layer. Defaults to
            `len(action_set)` so the network always matches the action set
            configured for the environment instead of a hard-coded 5.

    Returns:
        A compiled `tf.keras.Sequential` model (Adam optimizer, MSE loss).
    """
    if input_shape is None:
        input_shape = state_shape
    if num_actions is None:
        num_actions = len(action_set)

    model = tf.keras.Sequential()
    # NOTE(review): the first conv layer keeps the Keras default initializer
    # while the next two use 'he_normal' — confirm whether that is intentional.
    model.add(tf.keras.layers.Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
                                     input_shape=input_shape))
    model.add(tf.keras.layers.Conv2D(64, (4, 4), strides=(2, 2), activation='relu',
                                     kernel_initializer='he_normal'))
    model.add(tf.keras.layers.Conv2D(64, (3, 3), strides=(1, 1), activation='relu',
                                     kernel_initializer='he_normal'))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(512, activation='relu', use_bias=False))
    # Linear output: one Q-value per available action.
    model.add(tf.keras.layers.Dense(num_actions, use_bias=False))
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='mse', metrics=['mse'])
    return model
        

In [5]:
# Online network: the one trained by model.fit and used to pick actions.
model = create_model()
model.summary()
# Target network: same architecture, used for the bootstrapped Bellman targets.
target_model = create_model()
# Start both networks from identical weights.
target_model.set_weights(model.get_weights())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 13, 13, 32)        2080      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 5, 5, 64)          32832     
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 3, 3, 64)          36928     
_________________________________________________________________
flatten (Flatten)            (None, 576)               0         
_________________________________________________________________
dense (Dense)                (None, 512)               294912    
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 2560      
Total params: 369,312
Trainable params: 369,312
Non-trainable params: 0
__________________________________________________

In [6]:
def update_target_network(
    episode, update_target_network_interval, main_network, target_network):
    """Copy the main network's weights into the target network on schedule.

    The copy happens when `episode + 1` is a multiple of
    `update_target_network_interval`; otherwise the target is left untouched.

    Args:
        episode: Zero-based index of the episode that just finished.
        update_target_network_interval: Number of episodes between syncs.
        main_network: Object providing `get_weights()`.
        target_network: Object providing `set_weights()`; updated in place.

    Returns:
        The same `target_network` object that was passed in.
    """
    episodes_completed = episode + 1
    sync_due = (episodes_completed % update_target_network_interval) == 0
    if not sync_due:
        return target_network
    target_network.set_weights(main_network.get_weights())
    return target_network

In [7]:
# Create the Super Mario Bros environment.
env = gym_super_mario_bros.make('SuperMarioBros-v0')
# Wrap it so the agent chooses an index into `action_set`.
env = JoypadSpace(env, action_set)

In [8]:
def select_action(epsilon, state):
    """Pick an action index using an epsilon-greedy policy.

    With probability `epsilon` a uniformly random index into `action_set` is
    returned; otherwise the index of the largest value predicted by the
    notebook-level `model` for `state`.

    Args:
        epsilon: Exploration probability.
        state: Input passed unchanged to `model.predict`.

    Returns:
        An integer action index.
    """
    explore = np.random.random() <= epsilon
    if explore:
        return np.random.choice(len(action_set))
    q_values = model.predict(state)
    return np.argmax(q_values)

In [9]:
def greyscale(state):
    """Convert a single RGB frame to greyscale.

    The frame is wrapped in a one-element batch for
    `tf.image.rgb_to_grayscale` and then unwrapped again.
    """
    grey_batch = tf.image.rgb_to_grayscale([state])
    return grey_batch[0]

def resize(state):
    """Resize a single frame to the first two dimensions of `state_shape`.

    The frame is wrapped in a one-element batch for the resize op and then
    unwrapped again.
    """
    target_size = (state_shape[0], state_shape[1])
    resized_batch = tf.compat.v1.image.resize_images([state], target_size)
    return resized_batch[0]

def downsample(state):
    """Preprocess a raw environment frame into a network input.

    Steps: resize, convert to greyscale, rescale pixel values, add a leading
    batch dimension, and cast to bfloat16.

    Returns:
        A tensor of shape `(1,) + state_shape` with dtype bfloat16.
    """
    frame = greyscale(resize(state))
    # state is in the [0,255] range, so this maps it to roughly [-1, 1).
    normalized = (frame - 128) / 128
    batched = tf.reshape(normalized, (1,) + state_shape)
    return tf.cast(batched, tf.dtypes.bfloat16)

In [10]:
def compute_bellman_target(discount_factor, reward, model, state_next, done):
    """Compute the one-step Bellman target for a single transition.

    Args:
        discount_factor: Weight applied to the best next-state value.
        reward: Reward received for the transition.
        model: Object whose `predict(state_next)` returns the Q-values used
            for bootstrapping.
        state_next: State reached after the transition.
        done: True when the transition ended the episode.

    Returns:
        `reward` when `done` is true (the model is not queried); otherwise
        `reward + discount_factor * max(model.predict(state_next))`.
    """
    if not done:
        best_next_q = np.max(model.predict(state_next))
        return (reward + discount_factor * best_next_q)
    return reward

In [11]:
initial_epoch = 0


def _make_q_value_summary_callback(actions, rewards, target_q_values):
    """Build a Keras callback that logs statistics of one training batch as scalars."""
    taken_actions = np.asarray(actions)
    scaled_rewards = np.asarray(rewards)

    def summarize_q_values(epoch, logs):
        # Only every tenth epoch is logged to keep the event files small.
        if epoch % 10 != 0:
            return
        # Targets of the actions that were actually taken in each transition.
        bt = target_q_values[np.arange(len(taken_actions)), taken_actions]
        delta = scaled_rewards - bt
        zeros = np.count_nonzero(target_q_values == 0, axis=1)
        # Integer data is cast to float first: tf.reduce_mean on an integer
        # tensor performs integer division and would truncate the mean.
        tf.summary.scalar('actions', step=epoch,
                          data=tf.reduce_mean(tf.cast(taken_actions, tf.float32)))
        tf.summary.scalar('rewards', data=tf.reduce_mean(scaled_rewards), step=epoch)
        tf.summary.scalar('bellman_target', data=tf.reduce_mean(bt), step=epoch)
        tf.summary.scalar('delta-reward-bt', data=tf.reduce_mean(delta), step=epoch)
        tf.summary.scalar('target_q_values', data=tf.reduce_mean(target_q_values), step=epoch)
        tf.summary.scalar('max_rewards', data=tf.reduce_max(scaled_rewards), step=epoch)
        tf.summary.scalar('max_q', data=tf.reduce_max(target_q_values), step=epoch)
        tf.summary.scalar('min_q', data=tf.reduce_min(target_q_values), step=epoch)
        tf.summary.scalar('zeros_q', step=epoch,
                          data=tf.reduce_mean(tf.cast(zeros, tf.float32)))

    return tf.keras.callbacks.LambdaCallback(on_epoch_begin=summarize_q_values)


def sample_from_replay_buffer_and_train_model(replay_buffer, batch_size,
                                              model, target_model, discount_factor):
    """Run one training step on a random minibatch from the replay buffer.

    Nothing happens until the buffer holds at least `batch_size` transitions.
    Each transition is a tuple `(state, action, reward, state_next, done)`.

    Args:
        replay_buffer: Sequence of stored transitions.
        batch_size: Number of transitions to sample.
        model: Online network; trained in place via `model.fit`.
        target_model: Network used to evaluate the next states for the
            Bellman targets.
        discount_factor: Weight on the bootstrapped next-state value.

    Returns:
        The `model` that was passed in.

    Side effects:
        Increments the module-level `initial_epoch` after every fit so that
        Keras callbacks see a monotonically increasing epoch number.
    """
    global initial_epoch
    if len(replay_buffer) < batch_size:
        return model

    batch = random.sample(replay_buffer, batch_size)
    states, actions, raw_rewards, states_next, dones = zip(*batch)
    rewards = [reward / 15.0 for reward in raw_rewards]  # rewards are in the [-15,15] range.

    # Each stored state already carries a leading batch axis of size 1, so a
    # single concat builds the batch (the old per-item concat was quadratic).
    stacked_states = tf.concat(list(states), axis=0)
    stacked_states_next = tf.concat(list(states_next), axis=0)

    # The training targets start from the ONLINE network's own predictions so
    # that actions which were not taken contribute zero error; only the entry
    # of the taken action is overwritten with the Bellman target, which is
    # bootstrapped from the TARGET network.
    target_q_values = model.predict(stacked_states)
    # One batched forward pass instead of one predict call per transition.
    best_next_q = np.max(target_model.predict(stacked_states_next), axis=1)
    for i in range(batch_size):
        if dones[i]:
            # Terminal transition: there is no future value to bootstrap from.
            bellman_target = rewards[i]
        else:
            bellman_target = rewards[i] + discount_factor * best_next_q[i]
        target_q_values[i, actions[i]] = bellman_target

    summarize = _make_q_value_summary_callback(actions, rewards, target_q_values)

    # A single epoch per call; initial_epoch/epochs are advanced together so
    # the callbacks receive a global step rather than always epoch 0.
    model.fit(stacked_states, target_q_values,
              epochs=initial_epoch + 1, initial_epoch=initial_epoch,
              verbose=False,
              callbacks=[tensorboard_callback, saving_callback, summarize])
    initial_epoch += 1
    return model

In [12]:
batch_size = 16  # transitions sampled per training step
replay_buffer_size = 2000  # maximum number of transitions kept
# Bounded deque: once full, appending a new transition drops the oldest one.
replay_buffer = collections.deque(maxlen = replay_buffer_size)

In [13]:
%tensorboard --logdir logs/scalars

Reusing TensorBoard on port 6007 (pid 71376), started 0:35:43 ago. (Use '!kill 71376' to kill it.)

In [14]:
# Main training loop: one iteration per episode.
for episode in range(10000):
    state = downsample(env.reset())
    done = False
    
    while not done:
        # Act epsilon-greedily, store the transition, then train on a sampled
        # minibatch after every environment step.
        action = select_action(epsilon, state)
        state_next, reward, done, info = env.step(action)
        state_next = downsample(state_next)
        replay_buffer.append((state, action, reward, state_next, done))
        model = sample_from_replay_buffer_and_train_model(
            replay_buffer, batch_size, model, target_model, discount_factor)
        state = state_next 
        
    # update_target_network mutates target_model in place and returns that
    # same object, so `target_network` is just another name for target_model.
    target_network = update_target_network(
            episode, update_target_network_interval, model, target_model)
    # Decay exploration once per episode, never dropping below 0.01.
    epsilon = max(epsilon * eps_decay, 0.01)



KeyboardInterrupt: 