## Algorithm Design

### CNN

In [15]:
def cnn_model():
    return None

### Initialization Methods

In [16]:
def init_test_config():
    env = gym.make('BreakoutDeterministic-v4')
    player_games = 0
    episode_number = 0
    epsilon = 0.001
    global_step = NUM_OBSERVABLE_STEPS + 1
    model = load_model(LOGS_FILE_PATH, custom_objects={'huber_loss': huber_loss})
    return (env, player_games, episode_number, epsilon, global_step, model)

def init_model_clone(model):
    # Copy model since actual model weights will get updated later TODO.  when?
    # Clone model using keras api function.
    model_clone = clone_model(model)
    # Clone model weights to new model separately
    model_clone.set_weights(model.get_weights())
    return model_clone

def init_config():
    env = gym.make('BreakoutDeterministic-v4')
    # Deque is imported from collections.  Set to a finite size.  New memory will overwrite old.
    memory = deque(maxlen=400000)
    # init
    epsilon = 1.0
    # Calc decay rate.
    epsilon_decay = ((1.0 - 0.1) / 1000000)
    total_steps = 0
    # Init at 0.
    player_game = 0
    return (env, memory, epsilon, epsilon_decay, total_steps, player_game)

def init_game_config():
    done = False
    dead = False
    game_step = 0
    game_score = 0
    game_lives = 5
    game_loss = 0.0
    return (done, dead, game_step, game_score, game_lives, game_loss)


### Helper Methods

In [None]:
# Action is random if it is an observed state or if by chance based on the epsilon threshold, it is.
# If action is not random it gets generated from the current model based on history data to this point.
# I select the best action from this result.
def get_action(history, epsilon, model, is_in_observed_state):
    # is_in_observed_state = (step <= 1000)
    # is_in_observed_state = (step <= NUM_OBSERVABLE_STEPS)
    rand_choice_is_under_epsilon_threshold = (np.random.rand() <= epsilon)
    if rand_choice_is_under_epsilon_threshold or is_in_observed_state:
    #if is_in_observed_state:
        return random.randrange(ACTION_OPTION_COUNT)
    else:
        q_value = model.predict([history, np.ones(ACTION_SIZE).reshape(1, ACTION_SIZE)])
        #q_value = model.predict([history, get_one_hot_encoded_action_mask()()])
    # Offset for 0 indexing of one-hot encoding array location of value
    return np.argmax(q_value[0]) + 1

def update_epsilon(total_steps, epsilon, epsilon_decay):
    training = (total_steps > NUM_OBSERVABLE_STEPS)
    epsilon_declining = epsilon > 0.1
    if epsilon_declining and training:
        epsilon -= epsilon_decay
    return epsilon


### Debugging Method
It is not necessary to run this but the training is so long it is useful to check in on how it is performing.
Some iterative logging function should be run in case the model quits while the programmer is sleeping or something.

### Deep Q-Network Method

### Training Method

In [27]:
# # Mostly this function keeps track of system states, memory, and flags
# # It provides the opportunity to create logs for debugging
# # Most importantly it takes an action and updates a score.
# # It runs training on the model if all observation has been done.  This is Deep Q Learning.
def train():
    # Initialize global states.
    env, memory, epsilon, epsilon_decay, total_steps, player_games = init_config()
    # Initialize storage for collecting data on model performance.
    total_score = 0
    avg_game_scores = []
    # Get a copy of the cnn model with the architecture defined in a separate function.
    # model = cnn_model()
    # Initialize file writer for logging.  
    
    # The main model gets used in the Q learning training, and based on updated weights, 
    # then also updates the model clone.  
    # Targeted Network update.
    # model_clone = init_model_clone(model)
    
    # This is just a loop to cover the range of the global number of games played.
    # The player games number is kept visible to the program for logging purposes.
    while player_games < NUM_TURNS:
        # Define global game states.
        game_done, player_dead, game_step, game_score, game_lives, game_loss = init_game_config()
        # Reset the environment at the beginning of each game.
        observe = env.reset()

        # Prefill the start state, 4 frames.
        for _ in range(random.randint(1, INIT_NO_OP_STEPS)):
            observe, _, _, _ = env.step(1)
        # TODO

        while not game_done:
            
            # Epsilon has to decay a tiny bit with each iteration in the annealing method.
            epsilon = update_epsilon(total_steps, epsilon, epsilon_decay)

            # Get an action
            action = get_action(history, epsilon, model_clone, is_in_observed_state)

            # Take a step in the game
            observed_state, reward, game_done, info = env.step(action)
            
            # Update score based on agent action.
            game_score += game_reward
            
            # Preprocess result state image data and merge result state with history.
            
            # Update memory.
            memory.append((history, action, reward, next_history, player_dead))
            
            # Deep Q learning begins if the observational state is complete.
            # When the model has sufficiently recorded enough memory for training, start batch training.
            
            # Update counts and state flags.
            player_dead = False
            # These are used more of less for logging and aren't too important to the system.
            total_steps += 1
            game_step += 1
            
            if game_done:
                total_score += game_score
                player_games += 1
                # update average game score log
                if total_steps % 100 == 0:
                    avg_game_score = total_score/100
                    avg_game_scores = avg_game_scores + avg_game_score
                    total_score = 0
    file_writer.close()
    return avg_game_scores



### Test Method

In [28]:
def test():
    env, player_games, episode_number, epsilon, global_step, model = init_test_config()
    
    while player_games < NUM_TURNS:
        # init variables
       
        # Copy in initial states to amount to initial four frame history
        while not done:
            if RENDER:
                env.render()
                time.sleep(0.01)
                
            # Get action.
            action = get_action(history, epsilon, global_step, model)
            
            # Update epsilon.
            epsilon = update_epsilon(player_games, epsilon, epsilon_decay)

            # Take a step in the game.
            observed_state, reward, done, info = env.step(action)
            
            # Preprocess state to reduce image size, grayscale, and merge it with the history.
            next_state, next_history = find_state_and_history(observed_state)

            # Update Reward
            game_score += game_reward

            # Update counts and state flags.
            dead = False
            # This is used more of less for logging and aren't too important to the system.
            game_step += 1
            
            if game_done:
                player_games += 1
                print('Games Played: ', player_games)
                print('Score: ', game_score)


## Implementation

### Imports

In [29]:
%%capture
import gym
import random
import numpy as np
import tensorflow as tf

from collections import deque
import os.path
import matplotlib.pyplot as plt

### Constants

In [30]:
TRAIN_DIR = 'openai_breakout_training_storage'
LOGS_FILE_PATH = '/Users/catherinejohnson/Downloads/Project Announcement-20201115/openai_breakout_training_storage/training_20201128034747.h5'
# suggested by Deep Mind Paper
NUM_TURNS = 1000
#NUM_TURNS = 100000
# suggested by Deep Mind Paper
NUM_OBSERVABLE_STEPS = 500
# NUM_OBSERVABLE_STEPS = 50000
MODEL_WEIGHTS_REFRESH_THRESOLD = 100
# MODEL_WEIGHTS_REFRESH_THRESOLD = 10000
# suggested by Deep Mind Paper
INIT_NO_OP_STEPS = 10
# INIT_NO_OP_STEPS = 30
REGULATION_SCALE = 0.01
# suggested by Deep Mind Paper
BATCH_SIZE = 32
LEARNING_RATE = 0.00025
GAMMA = 0.99
# suggested by Deep Mind Paper
ATARI_IMAGE_SHAPE = (84, 84, 4)

# suggested by Deep Mind Paper
LAYER_1_SIZE = 16
LAYER_1_FILTER = (8, 8)
LAYER_1_STRIDES = (4, 4)
LAYER_2_SIZE = 32
LAYER_2_FILTER = (4, 4)
LAYER_2_STRIDES = (2, 2)
ACTIVATION_FUNCTION = 'relu'

In [31]:
avg_game_scores_sesh = train()

TypeError: only size-1 arrays can be converted to Python scalars

In [32]:
plt.plot(avg_game_scores_sesh)
print(avg_game_scores_sesh)
plt.plot([i for i, v enumerate(avg_game_scores_sesh)])
plt.title('plot')
plt.ylabel('avg scores')
plt.xlabel('time')
# plt.legend(['train', 'test'], loc='upper left')
plt.show()

SyntaxError: invalid syntax (<ipython-input-32-b7186f13edb0>, line 3)

In [None]:
# manually update file in RESTORE_FILE_PATH for testing.
test()