## Algorithm Design

1.0 CNN

In [2]:
# This is from Deep Mind. TODO:  update comments and compare with paper.
def cnn_model():
    # With the functional API we need to define the inputs.
    frames_input = layers.Input(ATARI_IMAGE_SHAPE, name='frames')
    actions_input = layers.Input((ACTION_OPTION_COUNT,), name='action_mask')

    # Assuming that the input frames are still encoded from 0 to 255. Transforming to [0, 1].
    normalized = layers.Lambda(lambda x: x / 255.0, name='normalization')(frames_input)

    # "The first hidden layer convolves 16 8×8 filters with stride 4 with the input image and applies a rectifier nonlinearity."
    conv_1 = layers.convolutional.Conv2D(
        16, (8, 8), strides=(4, 4), activation='relu'
    )(normalized)
    # "The second hidden layer convolves 32 4×4 filters with stride 2, again followed by a rectifier nonlinearity."
    conv_2 = layers.convolutional.Conv2D(
        32, (4, 4), strides=(2, 2), activation='relu'
    )(conv_1)
    # Flattening the second convolutional layer.
    conv_flattened = layers.core.Flatten()(conv_2)
    # "The final hidden layer is fully-connected and consists of 256 rectifier units."
    hidden = layers.Dense(256, activation='relu')(conv_flattened)
    # "The output layer is a fully-connected linear layer with a single output for each valid action."
    output = layers.Dense(ACTION_OPTION_COUNT)(hidden)
    # Finally, we multiply the output by the mask!
    filtered_output = layers.Multiply(name='QValue')([output, actions_input])

    model = Model(inputs=[frames_input, actions_input], outputs=filtered_output)
    model.summary()
    optimizer = RMSprop(lr=LEARNING_RATE, rho=0.95, epsilon=0.01)
    # model.compile(optimizer, loss='mse')
    # to changed model weights more slowly, uses MSE for low values and MAE(Mean Absolute Error) for large values
    model.compile(optimizer, loss=huber_loss)
    return model

### Initialization Methods

In [3]:
def pre_processing(observe):
    processed_observe = np.uint8(
        resize(rgb2gray(observe), (84, 84), mode='constant') * 255)
    return processed_observe

def get_log_dir():
    curr_time = datetime.utcnow().strftime("%Y%m%d%H%M%S")
    record_dir = "{}/run-{}-log".format(TRAIN_DIR, curr_time)
    
def init_file_writer_to_local_dir():
    return tf.summary.FileWriter(get_log_dir(), tf.get_default_graph())

def init_history(observe):
    # At start of game, there is no preceding frame
    # So just copy initial states to make history
    state = pre_processing(observe)
    # state = preprocess(observe)
    history = np.stack((state, state, state, state), axis=2)
    history = np.reshape([history], (1, 84, 84, 4))
    return history

def total_init_history(env):
    observe = env.reset()
    for _ in range(random.randint(1, NUM_OBSERVABLE_STEPS)):
        observe, _, _, _ = env.step(1)
    # At start of game, there is no preceding frame
    # So just copy initial states to make history
    state = pre_processing(observe)
    history = np.stack((state, state, state, state), axis=2)
    history = np.reshape([history], (1, 84, 84, 4))
    return history

def init_model_clone(model):
    # Copy model since actual model weights will get updated later TODO.  when?
    # Clone model using keras api function.
    model_clone = clone_model(model)
    # Clone model weights to new model separately
    model_clone.set_weights(model.get_weights())
    return model_clone

def init_config():
    env = gym.make('BreakoutDeterministic-v4')
    # Deque is imported from collections.  Set to a finite size.  New memory will overwrite old.
    memory = deque(maxlen=400000)
    # init
    epsilon = 1.0
    # Calc decay rate.
    epsilon_decay = ((1.0 - 0.1) / 1000000)
    total_steps = 0
    # Init at 0.  player_game -> step_count
    player_game = 0
    return (env, memory, epsilon, epsilon_decay, total_steps, player_game)

def init_game_config():
    done = False
    dead = False
    game_step = 0
    game_score = 0
    game_lives = 5
    game_loss = 0.0
    return (done, dead, game_step, game_score, game_lives, game_loss)

def init_batch_matrix():
    return np.zeros((BATCH_SIZE, ATARI_IMAGE_SHAPE[0], ATARI_IMAGE_SHAPE[1], ATARI_IMAGE_SHAPE[2]))


### Helper Methods

In [4]:
def huber_loss(y, q_value):
    error = K.abs(y - q_value)
    quadratic_part = K.clip(error, 0.0, 1.0)
    linear_part = error - quadratic_part
    loss = K.mean(0.5 * K.square(quadratic_part) + linear_part)
    return loss

def find_state_and_history(observed_state, history):
    next_state = pre_processing(observed_state)
    # next_state = preprocess(observed_state)
    next_state = np.reshape([next_state], (1, 84, 84, 1))
    next_history = np.append(next_state, history)
    return (next_state, next_history)

def get_one_hot_encoded_action_mask():
    np.ones(ACTION_OPTION_COUNT).reshape(1, ACTION_OPTION_COUNT)

# Action is random if it is an observed state or if by chance based on the epsilon threshold, it is.
# If action is not random it gets generated from the current model based on history data to this point.
# I select the best action from this result.
def get_action(history, epsilon, step, model):
    is_in_observed_state = (step <= NUM_OBSERVABLE_STEPS)
    rand_choice_is_under_epsilon_threshold = (np.random.rand() <= epsilon)
    if rand_choice_is_under_epsilon_threshold or is_in_observed_state:
        return random.randrange(ACTION_OPTION_COUNT)
    else:
        q_value = model.predict([history, get_one_hot_encoded_action_mask()()])
    # Offset for 0 indexing of one-hot encoding array location of values
    return np.argmax(q_value[0]) + 1

def update_epsilon(total_steps, epsilon, epsilon_decay):
    training = (total_steps > NUM_OBSERVABLE_STEPS)
    epsilon_declining = epsilon > 0.1
    if epsilon_declining and training:
        epsilon -= epsilon_decay
    return epsilon

def breakout_from_memory(memory):
    training_batch = random.sample(memory, BATCH_SIZE)
    
    history = init_batch_matrix()
    next_history = init_batch_matrix()
    action, reward, dead = [], [], []

    # Memory is stored in: indices 0 = history, 1 = action, 2 = reward, 3 = next_history, 4 = dead
    for index, val in enumerate(training_batch):
        print('val: ', val)
        print('history index: ', history[index])
        history[index] = val[0]
        next_history[index] = val[3]
        action.append(val[1])
        reward.append(val[2])
        dead.append(val[4])
        
    return (history, next_history, action, reward, dead)

# 210*160*3(color) --> 84*84(mono)
# float --> integer (to reduce the size of replay memory)
def to_grayscale(img):
    return np.mean(img, axis=2).astype(np.uint8)

def downsample(img):
    return img[::2, ::2]

def preprocess(img):
    return to_grayscale(downsample(img))


### Debugging Method
It is not necessary to run this but the training is so long it is useful to check in on how it is performing.
Some iterative logging function should be run in case the model quits while the programmer is sleeping or something.

In [5]:
# TODO: delete.
def maybe_log_stuff(model, total_steps, player_game, score, loss, step, memory, file_writer):
    if player_game % 100 == 0:
        print('player_game: {}, score: {}, total_steps: {}, avg loss: {}, step: {}, memory length: {}'
              .format(player_game, score, total_steps, loss / float(step), step, len(memory)))

    if player_game % 1000 == 0 or (player_game + 1) == NUM_TURNS:
        now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
        file_name = "training_{}.h5".format(now)
        model_path = os.path.join(TRAIN_DIR, file_name)
        model.save(model_path)

    # Add user custom data to TensorBoard
    loss_summary = tf.Summary(
        value=[tf.Summary.Value(tag="loss", simple_value=loss / float(step))])
    file_writer.add_summary(loss_summary, global_step=player_game)

    score_summary = tf.Summary(
        value=[tf.Summary.Value(tag="score", simple_value=score)])
    file_writer.add_summary(score_summary, global_step=player_game)


### Deep Q-Network Method

In [17]:
def train_memory_batch(memory, model):
    mini_batch = random.sample(memory, 32)
    history = np.zeros((32, ATARI_SHAPE[0],
                        ATARI_SHAPE[1], ATARI_SHAPE[2]))
    next_history = np.zeros((32, ATARI_SHAPE[0],
                             ATARI_SHAPE[1], ATARI_SHAPE[2]))
    target = np.zeros((32,))
    action, reward, dead = [], [], []

    # catj: memory is stored in:
        # catj: for indices 0 = history, 1 = action, 2 = reward, 3 = next_history, 4 = dead
    for idx, val in enumerate(mini_batch):
        history[idx] = val[0]
        next_history[idx] = val[3]
        action.append(val[1])
        reward.append(val[2])
        dead.append(val[4])

    actions_mask = np.ones((32, ACTION_SIZE))
    # catj: predict for each action since mask is all 1s.
    next_Q_values = model.predict([next_history, actions_mask])

    # like Q Learning, get maximum Q value at s'
    # But from target model
    for i in range(32):
        if dead[i]:
            target[i] = -1
            # target[i] = reward[i]
        else:
            # catj: Q(s, a) = r + gamma * max(Q(s', a'))
            target[i] = reward[i] + f_gamma * np.amax(next_Q_values[i])

    # catj get an action for each possible reward.
    action_one_hot = get_one_hot(action, ACTION_SIZE)
    # catj map each action to reward
    target_one_hot = action_one_hot * target[:, None]

    #tb_callback = TensorBoard(log_dir=log_dir, histogram_freq=0,
    #                          write_graph=True, write_images=False)

    h = model.fit(
        [history, action_one_hot], target_one_hot, epochs=1,
        batch_size=32, verbose=0)
        #batch_size=FLAGS.batch_size, verbose=0, callbacks=[tb_callback])

    #if h.history['loss'][0] > 10.0:
    #    print('too large')

    return h.history['loss'][0]

# This function parents the deep Q Network if the model has enough memory for batch training.
def deep_q_iteration_training(memory, total_steps, model_clone, model):
    has_reached_training_threshold = (total_steps > NUM_OBSERVABLE_STEPS)
    refresh_weights_threshold_met = (total_steps % MODEL_WEIGHTS_REFRESH_THRESOLD == 0)
    # When the model has sufficiently recorded enough memory for training, start batch training.
    if has_reached_training_threshold is True:
        model_loss = train_memory_batch(memory, model)
        # Weights on the model clone get piped through so they only get updated as often as 
        # the treshold dictates the cycle update them.
        if refresh_weights_threshold_met:
            model_clone.set_weights(model.get_weights())
        return model_loss
    return 0

def get_next_history(observed_state, history):
    # Keep state of MDP state.
    next_state, next_history = find_state_and_history(observed_state, history)
    return next_history

def update_game_lifecycle(game_lives, info):
    game_dead = game_lives > info['ale.lives']
    game_lives = info['ale.lives']
    return (game_dead, game_lives)

### Training Method

In [21]:
# Mostly this function keeps track of system states, memory, and flags
# It provides the opportunity to create logs for debugging
# Most importantly it takes an action and updates a score.
# It runs training on the model if all observation has been done.  This is Deep Q Learning.

def train():
    env, memory, epsilon, epsilon_decay, total_steps, player_games = init_config()
    # Get a copy of the cnn model with the architecture defined in a separate function.
    model = cnn_model()
    # Initialize file writer.  
    # This is just used for logging and storing the model iteratively to preserve work.
    file_writer = init_file_writer_to_local_dir()
    # The main model gets used in the Q learning training, and based on updated weights, 
    # then also updates the model clone.
    model_clone = init_model_clone(model)
    # This is just a loop to cover the range of the global number of games played.
    # The player games number is kept visible to the program for logging purposes.
    while player_games < NUM_TURNS:
        
        game_done, game_dead, game_step, game_score, game_lives, game_loss = init_game_config()
        observe = env.reset()

        for _ in range(random.randint(1, 30)):
            observe, _, _, _ = env.step(1)
        # At start of the game, there is no preceding frame
        # So just copy initial states to make history
        state = pre_processing(observe)
        # state = preprocess(observe)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, 84, 84, 4))
        
        while not game_done:
            # Epsilon has to decay a tiny bit with each iteration in the annealing method.
            epsilon = update_epsilon(total_steps, epsilon, epsilon_decay)

            # Get an action
            action = get_action(history, epsilon, total_steps, model_clone)

            # Take a step in the game
            observed_state, reward, game_done, info = env.step(action)
            
            # Update score based on agent action.
            # Move reward to the poles of 1 or -1 per the deep mind paper's suggestion
            game_reward = np.clip(reward, -1., 1.)
            game_score += game_reward
            
            # Deep Q learning begins if the observational state is complete.
            model_loss = deep_q_iteration_training(memory, total_steps, model_clone, model)
            game_loss += model_loss
            
            # Preprocess state to reduce image size, grayscale, and merge it with the history.
            # next history is the result of data augmentation and merge.
            # next_history = get_next_history(observed_state, history)
            next_state = pre_processing(observed_state)
            next_state = np.reshape([next_state], (1, 84, 84, 1))
            next_history = np.append(next_state, history)
            
            game_dead, game_lives = update_game_lifecycle(game_lives, info)
            # Store history, and result of action, especially the action and reward
            # This is used in the batch sampling to train the model during the training phase.
            # This step is always used in the observing phase
            memory.append((history, action, reward, next_history, game_dead))
            
            if not game_dead:
                # Update history to include the state if the agent didn't die.
                history = next_history

            # Update counts and state flags.
            dead = False
            # These are used more of less for logging and aren't too important to the system.
            total_steps += 1
            game_step += 1
            
            if game_done:
                maybe_log_stuff(model, total_steps, player_games, game_score, game_loss, game_step, memory, file_writer)
                player_games += 1

    file_writer.close()

### Test Method

In [13]:
def test():
    env = gym.make('BreakoutDeterministic-v4')

    episode_number = 0
    epsilon = 0.001
    global_step = NUM_OBSERVABLE_STEPS + 1
    model = load_model(RESTORE_FILE_PATH, custom_objects={'huber_loss': huber_loss})

    while player_games < NUM_TURNS:
        # init variables
        game_done, game_dead, game_step, game_score, game_lives, game_loss = init_game_config()
        
        observe = env.reset()

        # Copy in initial states to amount to initial four frame history
        observe, _, _, _ = env.step(1)
        history = init_history(observe)
        while not done:
            if RENDER:
                env.render()
                time.sleep(0.01)

            # Get action.
            action = get_action(history, epsilon, player_games, model)

            # Take a step in the game.
            observed_state, reward, done, info = env.step(action)
            
            # Preprocess state to reduce image size, grayscale, and merge it with the history.
            # next history is the result of data augmentation and merge.
            next_state, next_history = find_state_and_history(observed_state)

            game_dead, game_lives = update_game_lifecycle(game_lives, info)

            # move reward to the poles of 1 or -1 per the deep mind paper's suggestion
            game_reward = np.clip(reward, -1., 1.)

            game_score += game_reward

            if not game_dead:
                # Update history to include the state if the agent didn't die.
                history = next_history

            # Update counts and state flags.
            dead = False
            # This is used more of less for logging and aren't too important to the system.
            game_step += 1
            
            if game_done:
                player_games += 1
                print('episode: {}, score: {}'.format(player_games, game_score))


## Implementation

### 0.0 Imports

In [14]:
%%capture
import gym
import random
import numpy as np
import tensorflow as tf
from keras import layers
from keras.models import Model

from collections import deque
from keras.optimizers import RMSprop
from keras import backend as K
from skimage.color import rgb2gray
from skimage.transform import resize
from datetime import datetime
import os.path
import time
from keras.models import load_model
from keras.models import clone_model
from keras.callbacks import TensorBoard

### 1.0 Constants

In [15]:
TRAIN_DIR = 'openai_breakout_training_storage'
RESTORE_FILE_PATH = '/Users/catherinejohnson/Downloads/Project Announcement-20201115/openai_breakout_training_storage/training_20201128034747.h5'
NUM_TURNS = 100000
NUM_OBSERVABLE_STEPS = 50000
MODEL_WEIGHTS_REFRESH_THRESOLD = 10000
INIT_NO_OP_STEPS = 30
REGULATION_SCALE = 0.01
BATCH_SIZE = 32
LEARNING_RATE = 0.00025
GAMMA = 0.99
RESUME = False
RENDER = False
ATARI_IMAGE_SHAPE = (84, 84, 4)
ATARI_SHAPE = (84, 84, 4)
ACTION_OPTION_COUNT = 3

In [20]:
train()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
frames (InputLayer)             (None, 84, 84, 4)    0                                            
__________________________________________________________________________________________________
normalization (Lambda)          (None, 84, 84, 4)    0           frames[0][0]                     
__________________________________________________________________________________________________
conv2d_7 (Conv2D)               (None, 20, 20, 16)   4112        normalization[0][0]              
__________________________________________________________________________________________________
conv2d_8 (Conv2D)               (None, 9, 9, 32)     8224        conv2d_7[0][0]                   
____________________________________________________________________________________________

TypeError: 'NoneType' object is not callable

In [None]:
test()