In [1]:
# getting in all the imports
from __future__ import print_function, division

import functools
import json
import os
import random
import time
from collections import namedtuple

import gym
import keras
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy.ndimage

# import tensorflow as tf


%matplotlib inline

Using TensorFlow backend.


In [2]:
# util functions
from PIL import Image
def time_it(some_function):
    """
    Decorator that reports how long ``some_function`` takes to execute.

    The returned wrapper forwards any positional and keyword arguments to
    ``some_function`` and keeps its name and docstring.

    :param some_function:
        Callable to be timed.
    :return:
        A wrapper whose return value is the timing message (a string).
        The value returned by ``some_function`` itself is discarded.
    """

    @functools.wraps(some_function)
    def wrapper(*args, **kwargs):
        t1 = time.time()
        some_function(*args, **kwargs)
        t2 = time.time()
        return "Time it took to run the function: " + str((t2 - t1)) + "\n"
    return wrapper

In [3]:
# iterators

# Epsilon-greedy action selection
def select_action(state, iteration, num_actions, start_value=1, end_value=0.1,
                  num_iterations=5e6, decay=0.1, repeat=False):
    """
    Pick an action with an epsilon-greedy rule and a decaying epsilon.

    :param state:
        Array handed to ``np.argmax`` when the greedy branch is taken, so
        it is presumably a vector of per-action values (one entry per
        action) -- TODO confirm against the caller.
    :param iteration:
        Current step counter used to position epsilon on its schedule.
    :param num_actions:
        Exclusive upper bound for the randomly drawn action index.
    :param start_value:
        Epsilon of the linear ramp at iteration 0.
    :param end_value:
        Epsilon of the linear ramp at ``num_iterations``; also the base
        value used once the ramp is over.
    :param num_iterations:
        Length of the linear ramp.
    :param decay:
        Rate of the exponential factor multiplied onto epsilon.
    :param repeat:
        When true, ``iteration`` is wrapped modulo ``num_iterations``.
    :return:
        Tuple ``(action, epsilon)``.
    """
    # Optionally restart the schedule every `num_iterations` steps.
    step = iteration % num_iterations if repeat else iteration

    # Exponential factor applied in both phases of the schedule.
    damping = np.exp(-step*decay/num_iterations)

    if step < num_iterations:
        # Linear interpolation from start_value towards end_value.
        slope = (end_value - start_value) / num_iterations
        epsilon = (step * slope + start_value) * damping
    else:
        epsilon = end_value * damping

    explore = np.random.random() < epsilon
    if explore:
        # Uniformly random action index.
        chosen = np.random.randint(low=0, high=num_actions)
    else:
        # Index of the largest entry of `state`.
        chosen = np.argmax(state)

    return chosen, epsilon
    

In [28]:
#### Setting the working and environment variables

# Game to play with
env_name = 'Breakout-v0'

# Create the game-environment using OpenAI Gym.
env = gym.make(env_name)

# The number of possible actions that the agent may take in every step.
num_actions = env.action_space.n

# getting actions names
# action_names = env.unwrapped.get_action_meanings()


# Height of each image-frame in the state.
state_height = 105

# Width of each image-frame in the state.
state_width = 80

# Number of images in the state.
# (A dead `state_channels = 2` assignment that was immediately overwritten
# has been removed; the effective value was always 4.)
# NOTE(review): MotionTracer.get_state() stacks two images and buildmodel()
# defaults to img_channels=2, while this value is 4. In this cell it only
# feeds state_shape and the replay-size estimate below -- confirm intent.
state_channels = 4

# Shape of the state-array.
state_shape = [state_height, state_width, state_channels]

# Set replay memory size based on RAM to be used
# (presumably gigabytes -- TODO confirm the unit).
replay_ram_space = 4

# getting replay size based on RAM space available:
# ceil(budget / (100 * values per state)) scaled by 100000.
rp_size = int(np.ceil((replay_ram_space*1024*1024)/(state_height*state_width*state_channels*100))*100000)


[2017-12-20 22:53:11,016] Making new env: Breakout-v0


In [29]:
# Implementing a motion tracer for an arbitrary number of channels

def _pre_process_image(image,state_img_size):
    """Pre-process a raw image from the game-environment."""

    # Convert image to gray-scale.
    # Get the separate colour-channels.
    r, g, b = image[:, :, 0], image[:, :, 1], image[:, :, 2]

    # Convert to gray-scale using the Wikipedia formula.
    img = 0.2990 * r + 0.5870 * g + 0.1140 * b

    # Resize to the desired size using SciPy for convenience.
    img = scipy.misc.imresize(img, size=state_img_size, interp='bicubic')

    return img


class MotionTracer:
    """
    Turns consecutive game frames into a two-channel state.

    The tracer keeps the most recent pre-processed frame (``last_input``)
    and a decaying trace of the pixels that changed between frames
    (``last_output``).
    """

    def __init__(self, image, decay=0.75, num_images=2, state_height=105, state_width = 80):
        """

        :param image:
            First image from the game-environment,
            used for resetting the motion detector.
        :param decay:
            Parameter for how long the tail should be on the motion-trace.
            This is a float between 0.0 and 1.0 where higher values means
            the trace / tail is longer.
        :param num_images:
            Stored on the instance. None of the methods of this class read
            it; the state returned by ``get_state`` always has two
            channels.
        :param state_height:
            Height of the pre-processed frames.
        :param state_width:
            Width of the pre-processed frames.
        """

        # Size of each image in the state.
        self.state_img_size = np.array([state_height, state_width])

        # Preprocessing the image
        img = _pre_process_image(image,self.state_img_size)

        # Keep the frame as floats. `np.float64` is used because the
        # `np.float` alias no longer exists in current NumPy releases.
        self.last_input = img.astype(np.float64)

        # The trace starts empty and shares the dtype of the frames, so it
        # does not change type after the first call to `process`.
        self.last_output = np.zeros_like(self.last_input)

        # Storing the inputs to class
        self.decay = decay
        self.num_images = num_images

    def process(self, image):
        """
        Process a raw image-frame from the game-environment.

        :param image:
            Raw frame, in the format accepted by ``_pre_process_image``.
        :return:
            The updated motion-trace (float array, values in 0..255).
        """

        # Pre-process the image so it is gray-scale and resized.
        img = _pre_process_image(image,self.state_img_size).astype(np.float64)

        # Subtract the previous input. This only leaves the
        # pixels that have changed in the two image-frames.
        img_dif = img - self.last_input

        # Copy the contents of the input-image to the last input.
        self.last_input[:] = img

        # If the pixel-difference is greater than a threshold then
        # set the output pixel-value to the highest value (white),
        # otherwise set the output pixel-value to the lowest value (black).
        # So that we merely detect motion, and don't care about details.
        img_motion = np.where(np.abs(img_dif) > 20, 255.0, 0.0)

        # Add some of the previous output. This recurrent formula
        # is what gives the trace / tail.
        output = img_motion + self.decay * self.last_output

        # Ensure the pixel-values are within the allowed bounds.
        output = np.clip(output, 0.0, 255.0)

        # Set the last output.
        self.last_output = output

        return output

    def get_state(self):
        """
        Get a state that can be used as input to the Neural Network.
        It is basically just the last input and the last output of the
        motion-tracer. This means it is the last image-frame of the
        game-environment, as well as the motion-trace. This shows
        the current location of all the objects in the game-environment
        as well as trajectories / traces of where they have been.

        :return:
            ``uint8`` array of shape ``(height, width, 2)``.
        """

        # Stack the last input and output images.
        state = np.dstack([self.last_input, self.last_output])

        # Convert to 8-bit integer.
        # This is done to save space in the replay-memory.
        state = state.astype(np.uint8)

        return state

In [30]:
# Cart screen helpers
# Width, in pixels, assumed for the rendered screen.
screen_width = 600


def get_cart_location(env):
    """
    Return the horizontal pixel position of the middle of the cart.

    Reads ``env.x_threshold`` (half of the world width) and ``env.state[0]``
    (the cart position in world units) and maps the position onto a screen
    that is ``screen_width`` pixels wide and centred on zero.
    """
    pixels_per_unit = screen_width / (env.x_threshold * 2)
    cart_x = env.state[0]
    return int(cart_x * pixels_per_unit + screen_width / 2.0)  # MIDDLE OF CART

def get_screen(env):
    """
    Render the environment and return a cropped, pre-processed view
    centred on the cart.

    The previous version converted the frame with ``torch``, which this
    notebook never imports, and handed a channel-first array to
    ``_pre_process_image``, which indexes the channel on the last axis.
    The frame is now kept as a NumPy array in (height, width, channel)
    order throughout.

    NOTE(review): relies on ``get_cart_location``, which reads
    ``env.x_threshold`` and ``env.state`` -- confirm the environment in use
    actually exposes them.

    :param env:
        Environment supporting ``render(mode='rgb_array')``.
    :return:
        Result of ``_pre_process_image`` for the cropped frame with the
        requested size ``[600, 300]``.
    """
    # Rendered frame, rows x columns x colour channels.
    screen = np.asarray(env.render(mode='rgb_array'))
    # Strip off the top and bottom of the screen
    screen = screen[160:320]
    view_width = 320
    half_view = view_width // 2
    cart_location = get_cart_location(env)
    if cart_location < half_view:
        # Cart near the left border: take the left-most window.
        slice_range = slice(view_width)
    elif cart_location > (screen_width - half_view):
        # Cart near the right border: take the right-most window.
        slice_range = slice(-view_width, None)
    else:
        slice_range = slice(cart_location - half_view,
                            cart_location + half_view)
    # Strip off the edges, so that we have an image centered on the cart
    screen = screen[:, slice_range]
    # Convert to float and rescale to 0..1
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    # Gray-scale and resize
    return _pre_process_image(screen,np.array([600, 300]))

# True when the name of the active matplotlib backend contains 'inline'.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    # IPython's display module is only imported for inline backends.
    from IPython import display

# Switch matplotlib to interactive mode.
plt.ion()

In [31]:

from keras.layers import Conv2D, Dense,Flatten, Dropout
from keras.models import Sequential
from keras.optimizers import Adagrad

def buildmodel(num_actions, batch_size, img_rows=105, img_cols=80, img_channels = 2):
    """
    Build and compile the convolutional network.

    :param num_actions:
        Number of units of the final (linear) Dense layer.
    :param batch_size:
        Accepted but not used by this function.
    :param img_rows:
        Height of the input images.
    :param img_cols:
        Width of the input images.
    :param img_channels:
        Number of channels of the input images (channels-last layout).
    :return:
        The Sequential model, compiled with a mean-squared-error loss and
        an Adagrad optimizer.
    """
    weight_init = keras.initializers.TruncatedNormal(mean=0.0, stddev=0.02, seed=None)

    def conv(filters, kernel_size, strides, **extra):
        # 'same'-padded ReLU convolution sharing the initializer above.
        return Conv2D(filters=filters, kernel_size=kernel_size, strides=strides,
                      padding='same', activation='relu',
                      kernel_initializer=weight_init, **extra)

    def hidden(units):
        # Fully connected ReLU layer sharing the initializer above.
        return Dense(units, activation='relu', kernel_initializer=weight_init)

    # Layers are created in the order in which they are stacked.
    layers = [
        conv(16, 3, 2, data_format="channels_last",
             input_shape=(img_rows, img_cols, img_channels)),
        conv(32, 3, 2),
        conv(64, 2, 1),
        Flatten(),
        hidden(1024),
        hidden(1024),
        Dropout(0.2),
        hidden(512),
        hidden(512),
        # Linear output, one unit per action, default initializer.
        Dense(num_actions, activation=None),
    ]

    model = Sequential()
    for layer in layers:
        model.add(layer)

    optimizer = Adagrad(lr=1e-6)
    model.compile(loss='mse', optimizer=optimizer)
    print("Model is built")
    return model

In [32]:
class ExperienceReplay(object):
    """
    Bounded store of transitions that also builds training batches.

    Each entry of ``memory`` has the layout
    ``[[state_t, action_t, reward_t, state_t+1], game_over]``.
    """

    def __init__(self, max_memory=100, discount=.9):
        """
        :param max_memory:
            Maximum number of transitions kept; the oldest one is dropped
            when the limit is exceeded.
        :param discount:
            Discount factor applied to the best next-state value.
        """
        self.max_memory = max_memory
        self.memory = list()
        self.discount = discount

    def remember(self, states, game_over):
        """
        Store one transition.

        :param states:
            ``[state_t, action_t, reward_t, state_t+1]``.
        :param game_over:
            Whether the transition ended the episode.
        """
        # memory[i] = [[state_t, action_t, reward_t, state_t+1], game_over?]
        self.memory.append([states, game_over])
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def get_batch(self, model, batch_size=10):
        """
        Sample transitions (with replacement) and build training targets.

        For every sampled transition the target row is the model's current
        prediction for ``state_t``, with the entry of the action taken
        replaced by ``reward_t`` (terminal transition) or by
        ``reward_t + discount * max_a Q(state_t+1, a)``.

        The previous implementation called ``model.predict`` on the whole,
        partially filled batch inside the loop and always read row 0, so
        every target was derived from the first sample. Predictions are
        now made once per batch and read per sample.

        :param model:
            Object providing ``output_shape`` and ``predict``.
        :param batch_size:
            Upper bound for the number of sampled transitions.
        :return:
            Tuple ``(inputs, targets)``: ``inputs`` is a ``uint8`` array of
            the sampled ``state_t`` values, ``targets`` a float array of
            shape ``(n, num_actions)``.
        :raises IndexError:
            If the memory is empty.
        """
        len_memory = len(self.memory)
        if len_memory == 0:
            raise IndexError("cannot sample a batch from an empty replay memory")

        n_samples = min(len_memory, batch_size)
        num_actions = model.output_shape[-1]
        env_dim = list(self.memory[0][0][0].shape)

        inputs = np.zeros(shape=[n_samples] + env_dim, dtype=np.uint8)
        next_inputs = np.zeros(shape=[n_samples] + env_dim, dtype=np.uint8)
        actions = np.zeros(n_samples, dtype=np.intp)
        rewards = np.zeros(n_samples)
        terminal = np.zeros(n_samples, dtype=bool)

        for i, idx in enumerate(np.random.randint(0, len_memory, size=n_samples)):
            (state_t, action_t, reward_t, state_tp1), game_over = self.memory[idx]
            inputs[i] = state_t
            next_inputs[i] = state_tp1
            actions[i] = action_t
            rewards[i] = reward_t
            terminal[i] = game_over

        # There should be no target values for actions not taken, so start
        # from the model's own predictions (copied into a fresh array).
        targets = np.array(model.predict(inputs), dtype=np.float64)
        targets = targets.reshape(n_samples, num_actions)

        # max_a' Q(s', a') for every sampled transition.
        best_next = np.max(model.predict(next_inputs), axis=1)

        # reward_t for terminal transitions,
        # reward_t + gamma * max_a' Q(s', a') otherwise.
        targets[np.arange(n_samples), actions] = np.where(
            terminal, rewards, rewards + self.discount * best_next)

        return inputs, targets

    

    
# Record type for a single stored transition.
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])


class ReplayMemory(object):
    """Fixed-capacity ring buffer of ``Transition`` records."""

    def __init__(self, capacity):
        """
        :param capacity:
            Maximum number of transitions held at once.
        """
        self.capacity = capacity
        self.memory = []
        # Index of the slot the next push writes to.
        self.position = 0

    def push(self, *args):
        """Saves a transition, overwriting the oldest one once full."""
        has_room = len(self.memory) < self.capacity
        if has_room:
            # Reserve a slot; it is filled just below.
            self.memory.append(None)
        slot = self.position
        self.memory[slot] = Transition(*args)
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        stored = self.memory
        return random.sample(stored, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [33]:
# env_initate()
# model_load()
# def run():
#     env_execute()
#     train_model()
#     update_q_values()
#     save_mode()
    

In [36]:
def run_dqn(env,model,memory,episodes,iterations,batch_size = 32):
    """
    Play the environment and train the model from replayed transitions.

    Changes compared with the previous version:

    * the greedy action is chosen from the model's predicted values for
      the current state instead of from the raw image state;
    * epsilon is scheduled on a step counter that runs across episodes
      instead of restarting at zero every episode;
    * the frame returned by ``env.reset()`` is used when rendering fails,
      instead of resetting a second time behind a bare ``except``;
    * the blank frame fed to the motion tracer on game over has the shape
      of the raw frame, not of the two-channel state;
    * an episode ends as soon as the environment reports game over.

    :param env:
        Environment with ``reset``, ``render`` and a ``step`` that returns
        ``(frame, reward, game_over, info)``.
    :param model:
        Model providing ``predict``, ``train_on_batch``, ``save_weights``
        and ``to_json``.
    :param memory:
        Replay memory providing ``remember`` and ``get_batch``.
    :param episodes:
        Number of episodes to play.
    :param iterations:
        Maximum number of steps per episode.
    :param batch_size:
        Batch size requested from the replay memory.
    :return:
        None. Writes ``model.h5`` and ``model.json`` to the working
        directory.
    """

    num_action = env.action_space.n

    # Steps taken over all episodes; positions epsilon on its schedule.
    total_steps = 0

    for e in range(episodes):
        loss = []
        netReward = 0

        # Prefer the rendered frame, fall back to the reset observation.
        first_frame = env.reset()
        try:
            screen = env.render(mode='rgb_array')
        except Exception:
            screen = first_frame
        motion_tracer = MotionTracer(screen)

        for itr in range(iterations):
            state = motion_tracer.get_state()

            # Per-action values for the current state (batch of one).
            q_values = model.predict(state[np.newaxis])[0]
            action,_ = select_action(q_values,total_steps,num_action)
            total_steps += 1

            img, reward, game_over, info = env.step(action=action)
            netReward += reward

            if not game_over:
                motion_tracer.process(img)
            else:
                # Blank frame with the layout of a raw frame.
                motion_tracer.process(np.zeros_like(img))
            next_state = motion_tracer.get_state()

            # Adding to replay memory
            memory.remember([state, action, reward, next_state], game_over)

            # adapt model
            inputs, targets = memory.get_batch(model, batch_size=batch_size)
            loss.append(model.train_on_batch(inputs, targets))
            if len(loss) % 10 == 0:
                 print("Episode {:02d} | Iteration {:03d} | Loss {:.8f} | Win count {}".format(
                     e, itr,np.mean(loss), netReward))

            env.render()

            if game_over:
                # Do not keep stepping a finished episode.
                break

        # Episode summary; skipped when no step was taken.
        if loss:
            print("Episode {:02d} | Iteration {:03d} | Loss {:.8f} | Win count {}".format(
                         e, itr,np.mean(loss), netReward))

    # Save trained model weights and architecture, this will be used by the visualization code
    model.save_weights("model.h5", overwrite=True)
    with open("model.json", "w") as outfile:
        json.dump(model.to_json(), outfile)

In [None]:
# Build the network for the environment's action count.
model = buildmodel(num_actions=num_actions, batch_size=32)

# Replay memory with the capacity derived from the RAM budget.
memory = ExperienceReplay(max_memory=rp_size)

# Train: 10 episodes, iterations=100, batches of 32.
run_dqn(env=env, model=model, memory=memory,
        episodes=10, iterations=100, batch_size=32)

Model is built


`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.
  


Episode 00 | Iteration 009 | Loss 0.00019140 | Win count 0.0
Episode 00 | Iteration 019 | Loss 0.00019742 | Win count 0.0
Episode 00 | Iteration 029 | Loss 0.00018928 | Win count 0.0
Episode 00 | Iteration 039 | Loss 0.00018398 | Win count 0.0
Episode 00 | Iteration 049 | Loss 0.00017931 | Win count 0.0
