In [1]:
# Importing everything at once

from __future__ import print_function, division
import numpy as np
import tensorflow as tf
import gym
import scipy.ndimage
# import skimage.
import sys
import os
import time
import csv
import math
# import argparse
# import download
import matplotlib.pyplot as plt
import keras
%matplotlib inline


Using TensorFlow backend.


In [2]:
os.getcwd()

'/media/jitin/Data/Misc/Exploration/Implementations'

In [5]:
# Gym environment the agent is trained and evaluated on.
env_name = 'Breakout-v0'

# Root directory for checkpoints.
# Note: the model backups taken during checkpoint training land here as well.
checkpoint_base_dir = 'checkpoints/'

# Optional overrides. Leaving one as None selects the default derived below.
checkpoint_dir = None       # sub-directory inserted between base-dir and environment-name
log_reward_path = None      # file name of the reward log, relative to checkpoint_dir
log_q_values_path = None    # file name of the Q-value log, relative to checkpoint_dir

# Resolve the checkpoint directory: <base>/<env> by default,
# <base>/<override>/<env> when an override was given.
if checkpoint_dir is not None:
    checkpoint_dir = os.path.join(checkpoint_base_dir,
                                  '{}/{}'.format(checkpoint_dir, env_name))
else:
    checkpoint_dir = os.path.join(checkpoint_base_dir, env_name)

# Create the directory the first time it is needed.
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Full path of the reward log inside the checkpoint directory.
log_reward_path = os.path.join(
    checkpoint_dir,
    "log_reward.txt" if log_reward_path is None else log_reward_path)

# Full path of the Q-value log inside the checkpoint directory.
log_q_values_path = os.path.join(
    checkpoint_dir,
    "log_q_values.txt" if log_q_values_path is None else log_q_values_path)
    


In [6]:
#### Setting the working and environment variables

# Create the game-environment using OpenAI Gym.
# env_name is the environment id selected in the configuration cell.
env = gym.make(env_name)

# The number of possible actions that the agent may take in every step,
# read from the environment's action-space. It sizes the second axis of the
# Q-value arrays and the output layer of the network.
num_actions = env.action_space.n
# num_actions = 4

# Whether we are training (True) or testing (False).
training = True

# Whether to render each image-frame of the game-environment to screen.
render = False

# Whether to use logging during training.
# NOTE(review): presumably gates writing to log_reward_path / log_q_values_path;
# no reader of this flag is visible in this notebook - confirm.
use_logging = False

# Height of each image-frame in the state.
state_height = 105

# Width of each image-frame in the state.
state_width = 80

# Size of each image in the state.
state_img_size = np.array([state_height, state_width])

# Number of images in the state.
state_channels = 4

# Shape of the state-array.
state_shape = [state_height, state_width, state_channels]

# RAM budget (in GiB) for the array of stored states in the replay-memory.
replay_ram_space = 4

# Bytes needed for one stored state; states are kept as 8-bit integers,
# i.e. one byte per pixel per channel.
bytes_per_state = state_height * state_width * state_channels

# Number of states that fit into the budget.
# The previous formula mixed units (1024*1024 together with a 100000/100
# factor) and rounded *up* to a multiple of 100000, which produced 200000
# states (~6.7 GB) for a 4 GB budget. Rounding down keeps the state array
# inside the budget; the per-state bookkeeping arrays (Q-values, rewards,
# flags) add only a few dozen bytes per state on top of this.
rp_size = int((replay_ram_space * 1024 ** 3) // bytes_per_state)



In [7]:
# Setting replay-memory size, states, Q-values and outcomes.
# Every array below has rp_size entries along its first axis; entry k of each
# array describes the same stored state.

# Array for the previous states of the game-environment.
states = np.zeros(shape=[rp_size] + state_shape, dtype=np.uint8)

# Array for the Q-values corresponding to the states.
# (np.float / np.int / np.bool were removed from NumPy, so explicit dtypes
# are used throughout.)
q_values = np.zeros(shape=[rp_size, num_actions], dtype=np.float64)

# Array for the Q-values before being updated.
# This is used to compare the Q-values before and after the update.
q_values_old = np.zeros(shape=[rp_size, num_actions], dtype=np.float64)

# Actions taken for each of the states in the memory.
actions = np.zeros(shape=rp_size, dtype=np.int64)

# Rewards observed for each of the states in the memory.
rewards = np.zeros(shape=rp_size, dtype=np.float64)

# Whether the life had ended in each state of the game-environment.
end_lifes = np.zeros(shape=rp_size, dtype=np.bool_)

# Whether the episode had ended (aka. game over) in each state.
end_episodes = np.zeros(shape=rp_size, dtype=np.bool_)

# Estimation errors for the Q-values.
estimation_errors = np.zeros(shape=rp_size, dtype=np.float64)

# Discount-factor for calculating Q-values.
discount_factor = 0.97

# Reset the number of used states in the replay-memory.
replay_num_used = 0

# Threshold for splitting between low and high estimation errors.
error_threshold = 0.1


# Defining list for replay_memory.
# This method might be time consuming but is easy to comprehend.
# The list previously named `action`, `reward`, `end_life` and `end_episode`,
# which do not exist at module level and raised NameError on a fresh kernel.
# It now holds the storage arrays in the same order as the parameters of
# add_to_replay().
obj = [states, q_values, actions, rewards, end_lifes, end_episodes]
replay_memory = []

# Defining functions that will play with replay memory ;)
def add_to_replay(state, q_value, action, reward, end_life, end_episode):
    """
    Append one observed step to the module-level replay-memory arrays.

    :param state: State of the game-environment, stored in `states`.
    :param q_value: Q-values estimated for that state, stored in `q_values`.
    :param action: Action taken in that state, stored in `actions`.
    :param reward: Reward observed; stored clipped to [-1.0, 1.0] in `rewards`.
    :param end_life: Whether a life ended in that state.
    :param end_episode: Whether the episode ended in that state.

    When the memory already holds rp_size entries the call does nothing.
    """
    # Only the counter is re-bound here; the arrays are written in place.
    global replay_num_used

    # A full replay-memory silently ignores further steps.
    if replay_num_used == rp_size:
        return

    # Claim the next free slot before writing into it.
    slot = replay_num_used
    replay_num_used = slot + 1

    # Store all the values for this slot.
    states[slot] = state
    q_values[slot] = q_value
    actions[slot] = action
    end_lifes[slot] = end_life
    end_episodes[slot] = end_episode

    # The reward is limited to [-1, 1]; this is done to stabilize the
    # training of the Neural Network.
    rewards[slot] = np.clip(reward, -1.0, 1.0)

def update_q_values():
    """
    Recompute the stored Q-values from the observed rewards.

    The current Q-values are first copied into `q_values_old`. The used part
    of the replay-memory is then swept from the second-newest entry back to
    the oldest one. For each entry only the Q-value of the action that was
    taken is replaced:

    * by the reward alone when a life or the episode ended in that state,
    * otherwise by reward + discount_factor * max(Q-values of the next entry).

    The absolute change of that Q-value is recorded in `estimation_errors`.
    All arrays are module-level and are modified in place.
    """
    # Snapshot of the estimates before they are overwritten.
    q_values_old[:] = q_values[:]

    # The newest entry has no successor to look at, so the sweep starts one
    # entry before it and runs backwards so that updated values propagate.
    for k in range(replay_num_used - 2, -1, -1):
        action = actions[k]
        reward = rewards[k]

        if end_lifes[k] or end_episodes[k]:
            # Nothing follows a lost life / finished episode.
            target = reward
        else:
            target = reward + discount_factor * np.max(q_values[k + 1])

        estimation_errors[k] = abs(target - q_values[k, action])
        q_values[k, action] = target

def get_replay_random_sample(repaly_memory,num_used,batch=64,samples=0.5):
    """
    Draw a uniform random sample, without replacement, from the used part of
    the replay-memory.

    :param repaly_memory: Not used; kept (with its spelling) so existing calls
        keep working.
    :param num_used: Not used; the module-level `replay_num_used` decides how
        many entries are eligible.
    :param batch: Not used.
    :param samples: Fraction of the used entries to draw. The count is rounded
        up and capped at the number of used entries.
    :return: Tuple (states, q_values) holding the sampled rows of the
        module-level arrays.
    """
    # Indexes of all entries that have been filled so far.
    indexes = np.arange(replay_num_used)

    # np.random.choice needs an integer size (np.ceil returns a float), and
    # sampling without replacement cannot draw more than the population.
    num_samples = min(int(np.ceil(len(indexes) * samples)), len(indexes))

    # Random indexes without repetition. (The original referenced the
    # undefined name `indexs` here.)
    idx = np.random.choice(indexes, size=num_samples, replace=False)

    return states[idx],q_values[idx]
    
def get_replay_stratified_sample(samples = 0.5):
    """
    Draw a random sample, without replacement, from the used part of the
    replay-memory.

    TODO: stratification is not implemented yet; at the moment this is a
    plain uniform sample over the used entries.

    :param samples: Fraction of the used entries to draw. The count is rounded
        up and capped at the number of used entries.
    :return: Tuple (states, q_values) holding the sampled rows of the
        module-level arrays.
    """
    # Indexes of all entries that have been filled so far.
    indexes = np.arange(replay_num_used)

    # np.random.choice needs an integer size (np.ceil returns a float), and
    # sampling without replacement cannot draw more than the population.
    num_samples = min(int(np.ceil(len(indexes) * samples)), len(indexes))

    # Random indexes without repetition. (The original referenced the
    # undefined name `indexs` here.)
    idx = np.random.choice(indexes, size=num_samples, replace=False)

    return states[idx],q_values[idx]

def estimate_q_values(model,batch_size=128):
    """
    Re-estimate the Q-values of every used replay-memory entry with `model`.

    The used part of the module-level `states` array is fed to
    `model.get_q_values` in consecutive slices of at most `batch_size`
    entries, and each result is written into the matching slice of the
    module-level `q_values` array.

    :param model: Object providing `get_q_values(states)`.
    :param batch_size: Maximum number of states passed to the model at once.
    """
    start = 0

    # Work through the used entries one batch at a time.
    while start < replay_num_used:
        # The last batch is cut short so it stays inside the used entries.
        stop = min(start + batch_size, replay_num_used)

        # Score this slice and store the estimates in place.
        q_values[start:stop] = model.get_q_values(states[start:stop])

        # The next batch begins where this one ended.
        start = stop

        

In [8]:
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [9]:
# Defining Learning Rate, Loss Limit, Max Epochs, Replay Fraction and Epsilon Decay Greedy Iterators

# Learning Rate iterator
def learning_rate_itr(iteration, start_value=1e-3, end_value=1e-5, num_iterations=5e6, repeat=False):
    """
    Learning-rate schedule: a straight line from start_value to end_value.

    :param iteration: Current iteration counter.
    :param start_value: Value returned at iteration 0.
    :param end_value: Value returned once num_iterations is reached.
    :param num_iterations: Length of the linear segment.
    :param repeat: When True the counter wraps around at num_iterations, so
        the line restarts from start_value.
    :return: The learning-rate for this iteration.
    """
    # Wrap the counter so the schedule repeats.
    if repeat:
        iteration %= num_iterations

    # Still on the line: interpolate.
    if iteration < num_iterations:
        slope = (end_value - start_value) / num_iterations
        return iteration * slope + start_value

    # Past the end of the line: hold the final value.
    return end_value

# Loss Limit Iterator
def loss_limit_itr(iteration, start_value=0.1, end_value=0.015, num_iterations=5e6, repeat=False):
    """
    Loss-limit schedule: a straight line from start_value to end_value.

    :param iteration: Current iteration counter.
    :param start_value: Value returned at iteration 0.
    :param end_value: Value returned once num_iterations is reached.
    :param num_iterations: Length of the linear segment.
    :param repeat: When True the counter wraps around at num_iterations.
    :return: The loss-limit for this iteration.
    """
    if repeat:
        # Restart the schedule every num_iterations steps.
        iteration %= num_iterations

    if iteration < num_iterations:
        # Change per iteration along the line.
        per_step = (end_value - start_value) / num_iterations
        limit = iteration * per_step + start_value
    else:
        # The line has been fully traversed.
        limit = end_value

    return limit

# Max Epochs Iterator
def max_epoch_itr(iteration, start_value=5, end_value=10, num_iterations=5e6, repeat=False):
    """
    Max-epochs schedule: a straight line from start_value to end_value.

    The interpolated value is returned as computed, i.e. it is not rounded
    to a whole number of epochs.

    :param iteration: Current iteration counter.
    :param start_value: Value returned at iteration 0.
    :param end_value: Value returned once num_iterations is reached.
    :param num_iterations: Length of the linear segment.
    :param repeat: When True the counter wraps around at num_iterations.
    :return: The maximum number of epochs for this iteration.
    """
    # Optional wrap-around of the counter.
    if repeat:
        iteration %= num_iterations

    # Hold the final value once the line is exhausted.
    if not iteration < num_iterations:
        return end_value

    # Linear interpolation along the line.
    growth = (end_value - start_value) / num_iterations
    return iteration * growth + start_value

# Replay Fraction Iterator
def replay_frac_itr(iteration, start_value=0.1, end_value=1, num_iterations=5e6, repeat=False):
    """
    Replay-fraction schedule: a straight line from start_value to end_value.

    :param iteration: Current iteration counter.
    :param start_value: Value returned at iteration 0.
    :param end_value: Value returned once num_iterations is reached.
    :param num_iterations: Length of the linear segment.
    :param repeat: When True the counter wraps around at num_iterations.
    :return: The replay fraction for this iteration.
    """
    # Optional wrap-around of the counter.
    if repeat:
        iteration %= num_iterations

    on_ramp = iteration < num_iterations
    if on_ramp:
        # Linear interpolation between the two end points.
        fraction = iteration * ((end_value - start_value) / num_iterations) + start_value
        return fraction

    # Beyond the ramp the final value is held.
    return end_value

# Epsilon-greedy iterator
def eplsilon_greedy_itr(iteration, num_actions, q_values, start_value=1, end_value=0.1,
                        num_iterations=5e6, repeat=False):
    """
    Epsilon-greedy action selection with a linearly changing epsilon.

    Epsilon moves on a straight line from start_value (iteration 0) to
    end_value (iteration num_iterations) and is held at end_value afterwards.
    With probability epsilon a uniformly random action is returned, otherwise
    the action with the highest Q-value.

    :param iteration: Current iteration counter.
    :param num_actions: Number of possible actions.
    :param q_values: Q-values of the current state, one per action.
    :param start_value: Epsilon at iteration 0.
    :param end_value: Epsilon once num_iterations is reached.
    :param num_iterations: Length of the linear segment.
    :param repeat: When True the counter wraps around at num_iterations.
    :return: Tuple (action, epsilon).
    """
    # Optional wrap-around of the counter.
    if repeat:
        iteration %= num_iterations

    # Current exploration probability.
    if iteration < num_iterations:
        slope = (end_value - start_value) / num_iterations
        epsilon = iteration * slope + start_value
    else:
        epsilon = end_value

    # One uniform draw decides between exploring and exploiting.
    explore = np.random.random() < epsilon
    if explore:
        chosen = np.random.randint(low=0, high=num_actions)
    else:
        chosen = np.argmax(q_values)

    return chosen, epsilon


# Epsilon decay iterator
def eplsilon_decay_itr(iteration, num_actions, q_values, decay, start_value=1, end_value=0.1,
                       num_iterations=5e6, repeat=False):
    """
    Epsilon-greedy action selection; intended to use a decay factor.

    TODO: the decay factor is not implemented yet. `decay` is accepted but
    ignored, and epsilon follows the same straight line from start_value to
    end_value as in eplsilon_greedy_itr.

    :param iteration: Current iteration counter.
    :param num_actions: Number of possible actions.
    :param q_values: Q-values of the current state, one per action.
    :param decay: Currently unused.
    :param start_value: Epsilon at iteration 0.
    :param end_value: Epsilon once num_iterations is reached.
    :param num_iterations: Length of the linear segment.
    :param repeat: When True the counter wraps around at num_iterations.
    :return: Tuple (action, epsilon).
    """
    # Optional wrap-around of the counter.
    if repeat:
        iteration %= num_iterations

    # Current exploration probability.
    epsilon = end_value
    if iteration < num_iterations:
        epsilon = iteration * ((end_value - start_value) / num_iterations) + start_value

    # Exploit unless the uniform draw falls below epsilon.
    if np.random.random() < epsilon:
        picked = np.random.randint(low=0, high=num_actions)
    else:
        picked = np.argmax(q_values)

    return picked, epsilon

In [18]:
# Implementing the motion tracer for an arbitrary number of channels.
# Some comments have been removed for easier comprehension.

def _rgb_to_grayscale(image):
    """
    Convert an RGB-image into gray-scale using a formula from Wikipedia:
    https://en.wikipedia.org/wiki/Grayscale
    """

    # Get the separate colour-channels.
    r, g, b = image[:, :, 0], image[:, :, 1], image[:, :, 2]

    # Convert to gray-scale using the Wikipedia formula.
    img_gray = 0.2990 * r + 0.5870 * g + 0.1140 * b

    return img_gray


def _pre_process_image(image):
    """Pre-process a raw image from the game-environment."""

    # Convert image to gray-scale.
    img = _rgb_to_grayscale(image)

    # Resize to the desired size using SciPy for convenience.
    img = scipy.misc.imresize(img, size=state_img_size, interp='bicubic')

    return img


class MotionTracer:
    """
    Detects motion between consecutive image-frames of the game-environment
    and keeps a decaying trace of it.

    NOTE(review): `imageset` is allocated for `num_images` frames, but only
    its last slot is filled in `__init__`; `process` does not update it and
    `get_state` always returns two channels (last input, motion-trace). The
    "arbitrary channels" part looks unfinished - confirm the intended state
    layout before relying on `num_images`.
    """

    def __init__(self, image, decay=0.75, num_images=4):
        """
        
        :param image:
            First image from the game-environment,
            used for resetting the motion detector.
        :param decay:
            Parameter for how long the tail should be on the motion-trace.
            This is a float between 0.0 and 1.0 where higher values means
            the trace / tail is longer.
        :param num_images:
            Number of image slots allocated in `imageset`.
        """
        
        # Pre-process the image so it is gray-scale and resized.
        img = _pre_process_image(image=image)
        
        # One zero-filled slot per image. The shape has to be built from
        # list(img.shape): adding the array itself to a list broadcasts the
        # addition instead of concatenating, which is what made the original
        # np.zeros call fail.
        self.imageset = np.zeros(shape=[num_images] + list(img.shape), dtype=np.float64)
        
        # The first frame goes into the last slot; the others stay zero.
        self.imageset[num_images-1] = img.astype(np.float64)
        self.last_input = img.astype(np.float64)

        # Set the last output to zero.
        self.last_output = np.zeros_like(img)
        
        # Store the inputs on the instance.
        self.decay = decay
        self.num_images = num_images

    def process(self, image):
        """
        Process a raw image-frame from the game-environment.

        :param image: Raw image-frame.
        :return: The updated motion-trace, with values in [0.0, 255.0].
        """

        # Pre-process the image so it is gray-scale and resized.
        img = _pre_process_image(image=image)

        # Subtract the previous input. This only leaves the
        # pixels that have changed in the two image-frames.
        img_dif = img - self.last_input

        # Copy the contents of the input-image to the last input.
        self.last_input[:] = img[:]

        # If the pixel-difference is greater than a threshold then
        # set the output pixel-value to the highest value (white),
        # otherwise set the output pixel-value to the lowest value (black).
        # So that we merely detect motion, and don't care about details.
        img_motion = np.where(np.abs(img_dif) > 20, 255.0, 0.0)

        # Add some of the previous output. This recurrent formula
        # is what gives the trace / tail.
        output = img_motion + self.decay * self.last_output

        # Ensure the pixel-values are within the allowed bounds.
        output = np.clip(output, 0.0, 255.0)

        # Set the last output.
        self.last_output = output

        return output

    def get_state(self):
        """
        Get a state that can be used as input to the Neural Network.
        It is basically just the last input and the last output of the
        motion-tracer. This means it is the last image-frame of the
        game-environment, as well as the motion-trace. This shows
        the current location of all the objects in the game-environment
        as well as trajectories / traces of where they have been.

        :return: uint8 array with two channels along the last axis:
            channel 0 is the last input, channel 1 the motion-trace.
        """

        # Stack the last input and output images.
        state = np.dstack([self.last_input, self.last_output])

        # Convert to 8-bit integer.
        # This is done to save space in the replay-memory.
        state = state.astype(np.uint8)

        return state

In [None]:
# Now we will build the neural network architecture with keras

from keras.layers import Conv2D, Dense, Dropout, Flatten
from keras.models import Sequential

def neural_net(state_shape, num_actions):
    """
    Build the Keras network that estimates Q-values from a state.

    Architecture: three 3x3 convolutions (16, 32 and 64 filters with strides
    2, 2 and 1, 'same' padding, ReLU), a flatten layer, two dense layers of
    1024 units, dropout of 0.2, two dense layers of 512 units (all ReLU), and
    a linear output layer with one unit per action.

    Besides the model, the function creates TensorFlow placeholders and the
    state / episode counters in the default graph. They are not connected to
    the Keras model here.
    NOTE(review): tf.placeholder / tf.assign are TensorFlow 1.x graph-mode
    calls - confirm the installed TensorFlow version.

    :param state_shape: Shape of one state, [height, width, channels].
    :param num_actions: Number of possible actions (size of the output layer).
    :return: The (uncompiled) keras Sequential model.
    """

    # Placeholder variable for inputting states into the Neural Network.
    x = tf.placeholder(dtype=tf.float32, shape=[None] + list(state_shape), name='x')

    # Placeholder variable for inputting the learning-rate to the optimizer.
    learning_rate = tf.placeholder(dtype=tf.float32, shape=[])

    # Placeholder variable for inputting the target Q-values
    # that we want the Neural Network to be able to estimate.
    q_values_new = tf.placeholder(tf.float32, shape=[None, num_actions], name='q_values_new')

    # This is a hack that allows us to save/load the counter for
    # the number of states processed in the game-environment.
    # We will keep it as a variable in the TensorFlow-graph
    # even though it will not actually be used by TensorFlow.
    count_states = tf.Variable(initial_value=0, trainable=False, dtype=tf.int64, name='count_states')

    # Similarly, this is the counter for the number of episodes.
    count_episodes = tf.Variable(initial_value=0, trainable=False, dtype=tf.int64, name='count_episodes')

    # TensorFlow operation for increasing count_states.
    # (This is a plain function, so the counters are locals, not attributes
    # of a `self` object.)
    count_states_increase = tf.assign(count_states, count_states + 1)

    # TensorFlow operation for increasing count_episodes.
    count_episodes_increase = tf.assign(count_episodes, count_episodes + 1)

    # Now we will build the architecture.

    # Initializer shared by the hidden layers.
    init = keras.initializers.TruncatedNormal(mean=0.0, stddev=0.02, seed=None)

    # Making the network. The first layer declares the input shape so that
    # the Sequential model can be built.
    model = Sequential()
    model.add(Conv2D(filters=16, kernel_size=3, strides=2, padding='same', activation='relu',
                     kernel_initializer=init, input_shape=tuple(state_shape)))
    model.add(Conv2D(filters=32, kernel_size=3, strides=2, padding='same', activation='relu',
                     kernel_initializer=init))
    model.add(Conv2D(filters=64, kernel_size=3, strides=1, padding='same', activation='relu',
                     kernel_initializer=init))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu', kernel_initializer=init))
    model.add(Dense(1024, activation='relu', kernel_initializer=init))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu', kernel_initializer=init))
    model.add(Dense(512, activation='relu', kernel_initializer=init))

    # Linear output: one estimated Q-value per action.
    model.add(Dense(num_actions, activation=None))

    return model
    
    
   
    
    
    
    

In [20]:
2e-2

0.02