In [None]:
!caffeinate -t (3600 * 37)

import gym
import random
import numpy as np
import tensorflow as tf
from keras import layers
from keras.models import Model

from collections import deque
from keras.optimizers import RMSprop
from keras import backend as K
from skimage.color import rgb2gray
from skimage.transform import resize
from datetime import datetime
import os.path
import time
from keras.models import load_model
from keras.models import clone_model
from keras.callbacks import TensorBoard


# ---------------------------------------------------------------------------
# Run configuration. Every f_* value below is read as a module-level global
# by the functions in this file.
# ---------------------------------------------------------------------------
# Directory that receives saved .h5 checkpoints and the TensorBoard run logs.
f_train_dir = 'tf_train_breakout'
# Checkpoint loaded by test() (machine-specific absolute path).
f_restore_file_path = '/Users/catherinejohnson/projects/CSCI_3202/deepQNetwork/tf_train_breakout/breakout_model_20201128034415.h5'
# Upper bound on the number of episodes played by train() and test().
f_num_episode = 100000
# Number of initial global steps during which actions are purely random and
# no training batches are run.
f_observe_step_num = 50000
# Number of steps over which epsilon is annealed from f_init_epsilon down to
# f_final_epsilon.
f_epsilon_step_num = 1000000
# The target model's weights are refreshed whenever the global step is a
# multiple of this value (once training has started).
f_refresh_target_model_num = 10000
# Maximum number of transitions kept in the replay deque.
f_replay_memory = 400000
# Upper bound for the random number of env.step(1) calls made at the start of
# each training episode.
f_no_op_steps = 30
# NOTE(review): not referenced anywhere in the visible code.
f_regularizer_scale = 0.01
# Number of transitions sampled from replay memory per training step.
f_batch_size = 32
# Learning rate handed to the RMSprop optimizer in atari_model().
f_learning_rate = 0.00025
# Starting and final values of the exploration rate epsilon.
f_init_epsilon = 1.0
f_final_epsilon = 0.1
# Discount factor applied to the bootstrapped next-state Q-value.
f_gamma = 0.99
# When True, train() starts from a saved model instead of a fresh network.
f_resume = False
# When True, train() renders the environment while playing.
f_render = False

ATARI_SHAPE = (84, 84, 4)  # input image size to model
# Number of Q-value outputs of the network (one per action the agent can pick).
ACTION_SIZE = 3

def pre_processing(observe):
    """Turn a raw colour frame into an 84x84 ``uint8`` grayscale image.

    The frame is converted with ``rgb2gray``, resized to 84x84 with
    ``mode='constant'``, multiplied by 255 and cast to ``np.uint8``.
    Storing integers instead of floats keeps replay memory small.

    Args:
        observe: Colour frame as returned by the environment.

    Returns:
        The processed frame as a ``uint8`` array of shape (84, 84).
    """
    gray = rgb2gray(observe)
    shrunk = resize(gray, (84, 84), mode='constant')
    # Scale before the integer cast so the fractional values are not lost.
    return np.uint8(shrunk * 255)

# 210*160*3(color) --> 84*84(mono)
# float --> integer (to reduce the size of replay memory)
def to_grayscale(img):
    """Collapse axis 2 of ``img`` by averaging and cast the result to uint8.

    Args:
        img: Array with at least three dimensions; axis 2 is averaged away.

    Returns:
        ``uint8`` array of the per-pixel means (fractions are truncated by
        the cast).
    """
    averaged = np.mean(img, axis=2)
    return averaged.astype(np.uint8)

def downsample(img):
    """Keep every second element along the first two axes of ``img``.

    Args:
        img: Array-like supporting two-axis slicing.

    Returns:
        The strided selection ``img[::2, ::2]``.
    """
    every_other = slice(None, None, 2)
    return img[every_other, every_other]

def preprocess(img):
    """Downsample ``img`` and then convert the result to grayscale.

    Args:
        img: Colour frame to shrink.

    Returns:
        Result of ``to_grayscale`` applied to ``downsample(img)``.
    """
    halved = downsample(img)
    return to_grayscale(halved)

def huber_loss(y, q_value):
    """Huber loss with threshold 1 between targets and predicted Q-values.

    Absolute errors up to 1 contribute quadratically (``0.5 * e**2``);
    whatever exceeds 1 contributes linearly.

    Args:
        y: Target tensor.
        q_value: Predicted tensor.

    Returns:
        Mean loss over all elements.
    """
    abs_error = K.abs(y - q_value)
    # Portion of the error that falls inside the quadratic region.
    inside = K.clip(abs_error, 0.0, 1.0)
    # Remainder beyond the threshold is penalised linearly.
    beyond = abs_error - inside
    return K.mean(0.5 * K.square(inside) + beyond)


def atari_model():
    """Build and compile the masked Q-network.

    The network takes two inputs: a stack of frames of shape ``ATARI_SHAPE``
    and an action mask of length ``ACTION_SIZE``. The frames are divided by
    255, passed through two convolutions (16 8x8 filters with stride 4, then
    32 4x4 filters with stride 2, both ReLU), flattened, fed to a 256-unit
    ReLU layer and finally to a linear layer with one output per action.
    The outputs are multiplied element-wise by the mask, so only the
    Q-values of the selected action(s) reach the loss.

    Returns:
        The compiled Keras model (RMSprop optimizer, ``huber_loss``).
    """
    # Functional API: both inputs have to be declared explicitly.
    frames_in = layers.Input(ATARI_SHAPE, name='frames')
    mask_in = layers.Input((ACTION_SIZE,), name='action_mask')

    # Frames arrive encoded as 0..255; bring them into [0, 1].
    x = layers.Lambda(lambda x: x / 255.0, name='normalization')(frames_in)

    # First hidden layer: 16 filters of 8x8, stride 4, rectifier.
    x = layers.convolutional.Conv2D(
        16, (8, 8), strides=(4, 4), activation='relu'
    )(x)
    # Second hidden layer: 32 filters of 4x4, stride 2, rectifier.
    x = layers.convolutional.Conv2D(
        32, (4, 4), strides=(2, 2), activation='relu'
    )(x)
    x = layers.core.Flatten()(x)
    # Final hidden layer: fully connected, 256 rectifier units.
    x = layers.Dense(256, activation='relu')(x)
    # Linear output layer with a single value per action.
    q_values = layers.Dense(ACTION_SIZE)(x)
    # Zero out every output the mask does not select.
    masked_q = layers.Multiply(name='QValue')([q_values, mask_in])

    model = Model(inputs=[frames_in, mask_in], outputs=masked_q)
    model.summary()

    # Huber loss behaves like MSE for small errors and like MAE for large
    # ones, which keeps individual weight updates from becoming too large.
    rmsprop = RMSprop(lr=f_learning_rate, rho=0.95, epsilon=0.01)
    model.compile(rmsprop, loss=huber_loss)
    return model


# get action from model using epsilon-greedy policy
# cj: annealing
def get_action(history, epsilon, step, model):
    """Pick an environment action with an epsilon-greedy policy.

    A random action is chosen while ``step`` is still within the observation
    phase (``step <= f_observe_step_num``) or with probability ``epsilon``;
    otherwise the action with the highest predicted Q-value is taken.

    Both branches return the action in the *environment's* numbering, i.e.
    the network output index shifted by one, giving a value in
    ``1..ACTION_SIZE``. Previously only the greedy branch applied the shift,
    so random actions came back as ``0..ACTION_SIZE-1`` and the two branches
    disagreed about what the returned number meant.
    NOTE(review): the +1 shift presumably skips Breakout's NOOP action (0) --
    confirm against the environment's action meanings.

    Args:
        history: Stacked frames of shape (1, 84, 84, 4) fed to the model.
        epsilon: Current exploration probability.
        step: Global step counter.
        model: Keras model with ``predict([frames, action_mask])``.

    Returns:
        int: Environment action in ``1..ACTION_SIZE``. Subtract 1 to obtain
        the network output index (e.g. for one-hot encoding).
    """
    # The random draw is evaluated first, exactly as before, so the sequence
    # of random numbers consumed does not change.
    if np.random.rand() <= epsilon or step <= f_observe_step_num:
        action_index = random.randrange(ACTION_SIZE)
    else:
        # An all-ones mask makes the network expose every action's Q-value.
        q_value = model.predict([history, np.ones((1, ACTION_SIZE))])
        action_index = np.argmax(q_value[0])
    return int(action_index) + 1


# save sample <s,a,r,s'> to the replay memory
def store_memory(memory, history, action, reward, next_history, dead):
    """Append one transition to the replay memory.

    The transition is stored as the tuple
    ``(history, action, reward, next_history, dead)``.

    Args:
        memory: Container with an ``append`` method.
        history: State before the action.
        action: Action that was taken.
        reward: Reward received.
        next_history: State after the action.
        dead: Whether a life was lost on this step.
    """
    transition = (history, action, reward, next_history, dead)
    memory.append(transition)


def get_one_hot(targets, nb_classes):
    """One-hot encode integer class indices.

    Args:
        targets: Integer indices (any shape; flattened before encoding).
        nb_classes: Number of classes, i.e. the width of each encoded row.

    Returns:
        Float array of shape ``(len(flattened targets), nb_classes)`` whose
        rows are the matching rows of the identity matrix.
    """
    identity = np.eye(nb_classes)
    flat_indices = np.array(targets).reshape(-1)
    # Row i of the identity matrix is the one-hot vector for class i.
    return identity[flat_indices]


# train model by radom batch
def train_memory_batch(memory, model, log_dir):
    """Fit ``model`` on one random mini-batch drawn from replay memory.

    Each stored transition is a tuple
    ``(history, action, reward, next_history, dead)``. For every sample the
    regression target is ``-1`` when a life was lost, otherwise
    ``reward + f_gamma * max_a Q(next_history, a)``. The next-state Q-values
    are predicted with the ``model`` passed in. The target is placed in the
    slot of the action that was taken (one-hot mask), so only that output
    contributes to the loss.

    Args:
        memory: Replay memory to sample ``f_batch_size`` transitions from.
        model: Keras model used both for the bootstrap prediction and the fit.
        log_dir: Accepted for call compatibility; not used in this function.

    Returns:
        Loss value reported by Keras for the single training epoch.
    """
    samples = random.sample(memory, f_batch_size)

    frame_dims = (ATARI_SHAPE[0], ATARI_SHAPE[1], ATARI_SHAPE[2])
    history = np.zeros((f_batch_size,) + frame_dims)
    next_history = np.zeros((f_batch_size,) + frame_dims)
    target = np.zeros((f_batch_size,))
    action, reward, dead = [], [], []

    # Unpack the sampled transitions into batch arrays / parallel lists.
    for row, (state, taken, gained, state_after, lost_life) in enumerate(samples):
        history[row] = state
        next_history[row] = state_after
        action.append(taken)
        reward.append(gained)
        dead.append(lost_life)

    # An all-ones mask yields the Q-value of every action for each sample.
    all_actions = np.ones((f_batch_size, ACTION_SIZE))
    next_q = model.predict([next_history, all_actions])

    for row, (gained, lost_life) in enumerate(zip(reward, dead)):
        if lost_life:
            # Losing a life is given a fixed target of -1.
            target[row] = -1
        else:
            # Q(s, a) = r + gamma * max(Q(s', a'))
            target[row] = gained + f_gamma * np.amax(next_q[row])

    # Put each target into the column of the action that was actually taken.
    action_one_hot = get_one_hot(action, ACTION_SIZE)
    target_one_hot = action_one_hot * target[:, np.newaxis]

    fit_result = model.fit(
        [history, action_one_hot], target_one_hot, epochs=1,
        batch_size=f_batch_size, verbose=0)

    return fit_result.history['loss'][0]

def init_history(observe):
    """Build the initial frame stack for a new episode.

    There is no preceding frame at the start of an episode, so the first
    processed frame is repeated four times.

    Args:
        observe: Raw frame from the environment.

    Returns:
        Array of shape (1, 84, 84, 4) holding four copies of the processed
        frame along the last axis.
    """
    frame = pre_processing(observe)
    stacked = np.stack([frame] * 4, axis=2)
    # Add the leading batch dimension expected by the model.
    return np.reshape(stacked, (1, 84, 84, 4))

def find_state_and_history(observe, history):
    """Process a new frame and push it onto the front of the frame stack.

    Args:
        observe: Raw frame from the environment.
        history: Current frame stack of shape (1, 84, 84, 4).

    Returns:
        Tuple ``(next_state, next_history)``: the processed frame reshaped to
        (1, 84, 84, 1), and the new stack made of that frame followed by the
        first three frames of ``history``.
    """
    frame = pre_processing(observe)
    next_state = np.reshape(frame, (1, 84, 84, 1))
    # Newest frame goes first; the last frame of the old stack is dropped.
    kept_frames = history[:, :, :, :3]
    next_history = np.concatenate((next_state, kept_frames), axis=3)
    return next_state, next_history

def update_epsilon(global_step, epsilon, epsilon_decay=None):
    """Return the exploration rate to use after one more step.

    Epsilon is only reduced once the observation phase is over
    (``global_step > f_observe_step_num``) and while it is still above
    ``f_final_epsilon``; otherwise it is returned unchanged.

    The previous body referenced ``f_final`` and a global ``epsilon_decay``,
    neither of which exists at module level, so every call raised NameError.
    The floor is now ``f_final_epsilon`` and the per-step decrement is either
    passed in or derived from the configuration constants.

    Args:
        global_step: Number of environment steps taken so far.
        epsilon: Current exploration rate.
        epsilon_decay: Amount subtracted per step. Defaults to
            ``(f_init_epsilon - f_final_epsilon) / f_epsilon_step_num``.

    Returns:
        The (possibly reduced) exploration rate, never below
        ``f_final_epsilon`` after a reduction.
    """
    if epsilon_decay is None:
        epsilon_decay = (f_init_epsilon - f_final_epsilon) / f_epsilon_step_num

    past_observation = global_step > f_observe_step_num
    if past_observation and epsilon > f_final_epsilon:
        # Clamp so floating-point error (or a large custom decay) cannot push
        # epsilon below its configured floor.
        epsilon = max(f_final_epsilon, epsilon - epsilon_decay)
    return epsilon

def q_iteration(env, model, state, iteration, memory):
    """Play one environment step, store the transition and train if due.

    The previous body read and assigned names that were neither parameters
    nor module globals (``global_step``, ``epsilon``, ``history``,
    ``model_target``, ``real_action``, ``start_life``, ``dead``, ``loss``,
    ``score``, ``log_dir``), so it failed on every call. That bookkeeping now
    travels in ``state``, a dict that is updated in place, and ``iteration``
    is used as the global step counter.

    Args:
        env: Gym environment whose ``step`` returns
            ``(observe, reward, done, info)`` with ``info['ale.lives']``.
        model: Online Keras model that is trained.
        state: Mutable dict. Keys read and updated:
            ``'history'`` (frame stack, shape (1, 84, 84, 4)),
            ``'epsilon'``, ``'start_life'``, ``'score'``, ``'loss'``.
            Keys only read: ``'model_target'`` (model used for action
            selection and refreshed from ``model``), ``'log_dir'``, and
            optionally ``'epsilon_decay'`` (defaults to the value derived
            from the configuration constants).
            Keys written: ``'done'`` and ``'dead'`` for the step just played.
        iteration: Global step counter (total environment steps so far).
        memory: Replay memory supporting ``append``.

    Returns:
        The episode score after adding this step's reward (also stored in
        ``state['score']``).
    """
    training_started = iteration > f_observe_step_num

    # Anneal epsilon once the observation phase is over.
    if training_started and state['epsilon'] > f_final_epsilon:
        decay = state.get(
            'epsilon_decay',
            (f_init_epsilon - f_final_epsilon) / f_epsilon_step_num)
        state['epsilon'] -= decay

    history = state['history']
    model_target = state['model_target']

    # get_action is expected to return the environment action in
    # 1..ACTION_SIZE; the network output index is one less.
    real_action = get_action(history, state['epsilon'], iteration, model_target)
    action = real_action - 1

    # Play one game step.
    observe, reward, done, info = env.step(real_action)

    # Push the new frame onto the front of the frame stack.
    next_state = np.reshape(pre_processing(observe), (1, 84, 84, 1))
    next_history = np.append(next_state, history[:, :, :, :3], axis=3)

    # A drop in remaining lives means the agent missed the ball; the episode
    # itself is not necessarily over.
    lives = info['ale.lives']
    dead = state['start_life'] > lives
    if dead:
        state['start_life'] = lives

    memory.append((history, action, reward, next_history, dead))

    if training_started:
        state['loss'] += train_memory_batch(memory, model, state['log_dir'])
        if iteration % f_refresh_target_model_num == 0:
            model_target.set_weights(model.get_weights())

    # The frame stack is not advanced on the step where a life was lost.
    if not dead:
        state['history'] = next_history

    state['dead'] = dead
    state['done'] = done
    state['score'] += reward
    return state['score']
    
def log_stuff(model, global_step, episode_number, score, loss, step, memory, file_writer):
    """Report progress for a finished episode and checkpoint the model.

    Every 100th episode a status line is printed, and the model is saved
    under ``f_train_dir`` with a UTC timestamp in its file name; the model is
    also saved on the final episode. For every episode the average loss per
    step and the score are written to ``file_writer`` as scalar summaries
    indexed by ``episode_number``, and scores of 5 or more are printed.

    Args:
        model: Keras model to save.
        global_step: Total environment steps so far (selects the phase label).
        episode_number: Index of the episode that just ended.
        score: Episode score.
        loss: Loss accumulated over the episode.
        step: Number of steps in the episode (divisor for the average loss).
        memory: Replay memory; only its length is reported.
        file_writer: Summary writer receiving the loss and score summaries.
    """
    exploration_end = f_observe_step_num + f_epsilon_step_num
    if global_step <= f_observe_step_num:
        phase = "observe"
    elif global_step <= exploration_end:
        phase = "explore"
    else:
        phase = "train"

    on_reporting_episode = episode_number % 100 == 0
    is_last_episode = (episode_number + 1) == f_num_episode

    if on_reporting_episode:
        print('state: {}, episode: {}, score: {}, global_step: {}, avg loss: {}, step: {}, memory length: {}'
              .format(phase, episode_number, score, global_step, loss / float(step), step, len(memory)))

    if on_reporting_episode or is_last_episode:
        timestamp = datetime.utcnow().strftime("%Y%m%d%H%M%S")
        checkpoint = os.path.join(f_train_dir, "breakout_model_{}.h5".format(timestamp))
        model.save(checkpoint)

    def scalar_summary(tag, value):
        # Wrap a single number as a TensorBoard scalar summary.
        return tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])

    # Custom scalars for TensorBoard, indexed by episode.
    file_writer.add_summary(scalar_summary("loss", loss / float(step)),
                            global_step=episode_number)

    score_summary = scalar_summary("score", score)
    if score >= 5:
        print('SCORE: ', score)
    file_writer.add_summary(score_summary, global_step=episode_number)


def train():
    """Train the DQN agent on Breakout, logging and checkpointing as it goes.

    Per step: choose an epsilon-greedy action with the target model, anneal
    epsilon, play the action, push the new frame onto the frame stack, store
    the transition and, once the observation phase is over, fit the online
    model on a replay batch and periodically refresh the target model.
    At the end of each episode ``log_stuff`` is called.

    Fixes relative to the previous version:
      * the resume branch used the undefined names ``l_restore_file_path`` /
        ``l_final_epsilon``; it now uses the ``f_*`` constants and passes the
        custom ``huber_loss`` to ``load_model`` (as ``test()`` does);
      * the step loop called ``q_iteration`` with undefined ``state`` /
        ``iteration`` and then read ``next_history``, which was never
        assigned; the per-step logic is performed inline here instead;
      * the summary writer is now closed even if training is interrupted.

    ``get_action`` is expected to return the environment action in
    ``1..ACTION_SIZE``; the value stored in replay memory is the matching
    network output index (action - 1), which is what the one-hot encoding in
    ``train_memory_batch`` needs.
    """
    env = gym.make('BreakoutDeterministic-v4')

    # Bounded deque: once full, each append discards the oldest transition.
    memory = deque(maxlen=f_replay_memory)
    episode_number = 0
    epsilon = f_init_epsilon
    epsilon_decay = (f_init_epsilon - f_final_epsilon) / f_epsilon_step_num
    global_step = 0

    if f_resume:
        # The model was compiled with a custom loss, so Keras needs to be
        # told how to resolve it when deserializing.
        model = load_model(f_restore_file_path,
                           custom_objects={'huber_loss': huber_loss})
        # Assume a restored model has already finished annealing epsilon.
        epsilon = f_final_epsilon
    else:
        model = atari_model()

    now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
    log_dir = "{}/run-{}-log".format(f_train_dir, now)
    file_writer = tf.summary.FileWriter(log_dir, tf.get_default_graph())

    # clone_model copies only the architecture; the weights are copied
    # explicitly.
    model_target = clone_model(model)
    model_target.set_weights(model.get_weights())

    try:
        while episode_number < f_num_episode:
            done = False
            # 1 episode = 5 lives
            step, score, start_life = 0, 0, 5
            loss = 0.0
            observe = env.reset()

            # Take a random number of fixed steps at the start of the episode
            # so that episodes do not all begin from the same state.
            for _ in range(random.randint(1, f_no_op_steps)):
                observe, _, _, _ = env.step(1)
            history = init_history(observe)

            while not done:
                if f_render:
                    env.render()
                    time.sleep(0.01)

                # Environment action (1..ACTION_SIZE) and the network output
                # index it corresponds to.
                real_action = get_action(history, epsilon, global_step, model_target)
                action = real_action - 1

                # Epsilon only starts to decrease after the observation phase.
                if global_step > f_observe_step_num and epsilon > f_final_epsilon:
                    epsilon -= epsilon_decay

                observe, reward, done, info = env.step(real_action)
                _, next_history = find_state_and_history(observe, history)

                # A drop in remaining lives means the agent missed the ball;
                # the episode is not over yet.
                dead = start_life > info['ale.lives']
                if dead:
                    start_life = info['ale.lives']

                store_memory(memory, history, action, reward, next_history, dead)

                if global_step > f_observe_step_num:
                    loss += train_memory_batch(memory, model, log_dir)
                    if global_step % f_refresh_target_model_num == 0:
                        model_target.set_weights(model.get_weights())

                score += reward

                # The frame stack is not advanced on the step where a life
                # was lost.
                if not dead:
                    history = next_history

                global_step += 1
                step += 1

            log_stuff(model, global_step, episode_number, score, loss, step,
                      memory, file_writer)
            episode_number += 1
    finally:
        file_writer.close()


def test():
    """Replay a saved model in a rendered Breakout environment.

    Loads the checkpoint at ``f_restore_file_path`` and plays up to
    ``f_num_episode`` episodes with a near-greedy policy (epsilon 0.001),
    printing the score after each one. Losing a life counts as a reward of
    -1, and every reward is clipped to [-1, 1] before being added to the
    score.
    """
    env = gym.make('BreakoutDeterministic-v4')

    episodes_played = 0
    epsilon = 0.001
    # Start beyond the observation phase so get_action consults the model.
    global_step = f_observe_step_num + 1
    # The checkpoint was compiled with a custom loss, so it has to be
    # supplied when loading.
    model = load_model(f_restore_file_path, custom_objects={'huber_loss': huber_loss})

    # For reference, a deep copy of a model takes two steps, because
    # clone_model copies only the structure and not the weight values:
    #     model_copy = clone_model(model)
    #     model_copy.set_weights(model.get_weights())

    while episodes_played < f_num_episode:
        done = False
        # 1 episode = 5 lives
        score, lives_left = 0, 5
        observe = env.reset()

        observe, _, _, _ = env.step(1)
        # No earlier frames exist yet, so the first frame fills the stack.
        history = init_history(observe)

        while not done:
            env.render()
            time.sleep(0.01)

            # Choose an action for the current stack and advance one step.
            action = get_action(history, epsilon, global_step, model)
            observe, reward, done, info = env.step(action)
            _, next_history = find_state_and_history(observe, history)

            # Losing a life is punished and the new life count remembered.
            lost_life = lives_left > info['ale.lives']
            if lost_life:
                lives_left = info['ale.lives']
                reward = -1

            score += np.clip(reward, -1., 1.)

            # The frame stack only advances when no life was lost.
            if not lost_life:
                history = next_history
            global_step += 1

            if done:
                episodes_played += 1
                print('episode: {}, score: {}'.format(episodes_played, score))


def main(argv=None):
    """Program entry point: currently evaluates a saved model.

    Args:
        argv: Command-line arguments; accepted but not used.
    """
    # To train instead of evaluating, call train() here:
    #train()
    test()


# Script entry point: hand control to TensorFlow's app runner.
# NOTE(review): tf.app.run() is expected to invoke the module-level
# main(argv) defined above and then exit the interpreter -- confirm against
# the TensorFlow 1.x documentation, especially when run inside a notebook.
if __name__ == '__main__':
    tf.app.run()

/bin/sh: -c: line 0: syntax error near unexpected token `('
/bin/sh: -c: line 0: `caffeinate -t (3600 * 37)'


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


W1128 20:22:11.132919 4509007296 deprecation.py:323] From /Users/catherinejohnson/anaconda3/envs/tensorflow_env/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where





W1128 20:22:11.411345 4509007296 deprecation_wrapper.py:119] From /Users/catherinejohnson/anaconda3/envs/tensorflow_env/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.



episode: 1, score: 7.0


RecursionError: maximum recursion depth exceeded

In [9]:
print(os.getcwd())

/Users/catherinejohnson/Downloads/Project Announcement-20201115
