In [1]:
#grid_walker.ipynb
#
#by Joe Hahn
#jmh.datasciences@gmail.com
#3 February 2018
#
#grid_walker uses Q-learning to teach a neural net AI how to navigate an agent about
#a 6x6 grid, guiding it towards a goal while avoiding obstacles and hazards

In [2]:
#game settings
#the grid_walker game spans a 6x6 grid of cells
grid_size = 6
#the grid_walker agent is to be placed in a random location in this grid
init = 'random_agent'

In [3]:
#fix the seed of numpy's global random number generator
import numpy as np
rn_seed = 15
np.random.seed(rn_seed)

In [4]:
#import the grid_walker game, naming every helper that this notebook calls so that
#the origin of each function is explicit (instead of a wildcard import)
from grid_walker import (
    initialize_environment, initialize_state, make_grid, move_agent, get_reward,
    get_game_state, state2vector, build_model, train, test_model)

In [5]:
#create the environment, which is just a dictionary that contains all the
#parameters describing the grid_walker game: the action codes and their names,
#the grid size, the initialization mode, the move limit, and the list of objects
environment = initialize_environment(grid_size, init)
environment

{'actions': [0, 1, 2, 3],
 'acts': ['up', 'down', 'left', 'right'],
 'grid_size': 6,
 'init': 'random_agent',
 'max_moves': 36,
 'objects': ['agent', 'goal', 'pit', 'wall']}

In [6]:
#generate the system's initial state, which is another dict that maps each
#object's name to the x,y coordinates of that object
state = initialize_state(environment)
state

{'agent': {'x': 0, 'y': 5},
 'goal': {'x': 4, 'y': 4},
 'pit': {'x': 4, 'y': 2},
 'wall': {'x': 1, 'y': 4}}

In [7]:
#This system is composed of 4 objects. The locations of the goal, pit, and wall are constant
#while the agent is mobile and can roam about the 6x6 grid
objects = environment['objects']
objects

['agent', 'goal', 'pit', 'wall']

In [8]:
#the agent has 4 possible actions: it can move up when action=0, down (action=1),
#left (action=2), or right (action=3)
actions = environment['actions']
acts = environment['acts']
#single-argument print() calls behave the same under Python 2 and Python 3; the double
#space reproduces the output of the former "print 'label = ', value" statements
print('actions =  {}'.format(actions))
print('acts =  {}'.format(acts))

actions =  [0, 1, 2, 3]
acts =  ['up', 'down', 'left', 'right']


In [9]:
#a 6x6 numpy array is used to indicate the x,y locations of agent A, wall W, pit P, and goal G.
#Note that a numpy array printed to the screen will appear to have its x,y axes flipped,
#so a transpose and rotation of the grid array is used to undo that flip.
#Here the agent is also manually placed at x=3,y=3 for convenience
state['agent'] = {'x':3, 'y':3}
grid = make_grid(state, environment)
#single-argument print() calls behave the same under Python 2 and Python 3
print(np.rot90(grid.T))
print('state:')
print('     {}'.format(state))

[['' '' '' '' '' '']
 ['' 'W' '' '' 'G' '']
 ['' '' '' 'A' '' '']
 ['' '' '' '' 'P' '']
 ['' '' '' '' '' '']
 ['' '' '' '' '' '']]
state:
     {'wall': {'y': 4, 'x': 1}, 'pit': {'y': 2, 'x': 4}, 'goal': {'y': 4, 'x': 4}, 'agent': {'y': 3, 'x': 3}}


In [10]:
#Now move the agent one grid-cell to the right, and note that this move generates a reward of -1.
#Single-argument print() calls are used so the cell runs under both Python 2 and Python 3.
action = 3
print('act =  {}'.format(acts[action]))
next_state = move_agent(state, action, environment)
grid = make_grid(next_state, environment)
print(np.rot90(grid.T))
print('next_state:')
print('     {}'.format(next_state))
print('reward =  {}'.format(get_reward(next_state, state)))
state = next_state
#this is the first move of the demonstration game
N_moves = 1
print('game_state =  {}'.format(get_game_state(state, N_moves, environment)))

act =  right
[['' '' '' '' '' '']
 ['' 'W' '' '' 'G' '']
 ['' '' '' '' 'A' '']
 ['' '' '' '' 'P' '']
 ['' '' '' '' '' '']
 ['' '' '' '' '' '']]
next_state:
     {'wall': {'y': 4, 'x': 1}, 'pit': {'y': 2, 'x': 4}, 'goal': {'y': 4, 'x': 4}, 'agent': {'y': 3, 'x': 4}}
reward =  -1
game_state =  running


In [11]:
#walk the agent up to goal G (which is then displayed as *); that move generates a reward of 10.
#Note that the game_state has now changed to 'goal' to signify the end of a game.
#Single-argument print() calls are used so the cell runs under both Python 2 and Python 3.
action = 0
print('act =  {}'.format(acts[action]))
next_state = move_agent(state, action, environment)
grid = make_grid(next_state, environment)
print(np.rot90(grid.T))
print('next_state:')
print('     {}'.format(next_state))
print('reward =  {}'.format(get_reward(next_state, state)))
state = next_state
N_moves += 1
print('game_state =  {}'.format(get_game_state(state, N_moves, environment)))

act =  up
[['' '' '' '' '' '']
 ['' 'W' '' '' '*' '']
 ['' '' '' '' '' '']
 ['' '' '' '' 'P' '']
 ['' '' '' '' '' '']
 ['' '' '' '' '' '']]
next_state:
     {'wall': {'y': 4, 'x': 1}, 'pit': {'y': 2, 'x': 4}, 'goal': {'y': 4, 'x': 4}, 'agent': {'y': 4, 'x': 4}}
reward =  10
game_state =  goal


In [12]:
#Now walk the agent down two cells, from the goal into pit P (now displayed as @),
#which generates a reward of -10. Only the second move is gridded.
#Single-argument print() calls are used so the cell runs under both Python 2 and Python 3.
action = 1
print('act =  {}'.format(acts[action]))
next_state = move_agent(state, action, environment)
state = next_state
print('act =  {}'.format(acts[action]))
next_state = move_agent(state, action, environment)
grid = make_grid(next_state, environment)
print(np.rot90(grid.T))
print('next_state:')
print('     {}'.format(next_state))
print('reward =  {}'.format(get_reward(next_state, state)))
state = next_state

act =  down
act =  down
[['' '' '' '' '' '']
 ['' 'W' '' '' 'G' '']
 ['' '' '' '' '' '']
 ['' '' '' '' '@' '']
 ['' '' '' '' '' '']
 ['' '' '' '' '' '']]
next_state:
     {'wall': {'y': 4, 'x': 1}, 'pit': {'y': 2, 'x': 4}, 'goal': {'y': 4, 'x': 4}, 'agent': {'y': 2, 'x': 4}}
reward =  -10


In [13]:
#the grid_walker game does not let the agent walk into wall W or beyond the 6x6 grid,
#and such moves generate a reward of -3

In [14]:
#A grid_walker game terminates when the agent walks into goal G, pit P, or walks for more
#than grid_size**2 = 36 moves

In [15]:
#assemble the neural network that the agent will use to navigate the grid. The simple neural
#network used here has two dense hidden layers having grid_size**2 = 36 neurons in each layer.
#When trained, this model uses epsilon-greedy Q-learning to estimate which of the agent's
#four possible moves (up, down, left, or right) maximizes the agent's future rewards.
state_vector = state2vector(state, environment)
N_inputs = state_vector.shape[1]
N_outputs = len(actions)
model = build_model(N_inputs, grid_size, N_outputs)
#summary() prints the layer table itself and returns None, so wrapping it in a print
#only appended a stray 'None' line after the table
model.summary()

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 36)                108       
_________________________________________________________________
activation_1 (Activation)    (None, 36)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 36)                1332      
_________________________________________________________________
activation_2 (Activation)    (None, 36)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 4)                 148       
_________________________________________________________________
activation_3 (Activation)    (None, 4)                 0         
Total params: 1,588
Trainable params: 1,588
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
#Use Q-learning to train neural network to walk agent A towards goal G...for details see
#the train() function in grid_walker.py...about two minutes to execute.
N_training_games = 1000                #number of games to play while training model
gamma = 0.85                           #discount for future rewards
memories_size = 300                    #size of memory queue, for experience replay
#number of memories to use when fitting the model; floor division keeps this an
#integer (20) under Python 3 as well as Python 2
batch_size = memories_size//15
debug = False                          #set to True to grid the last move of every training game
#single-argument print() calls behave the same under Python 2 and Python 3
print('training model...')
trained_model = train(environment, model, N_training_games, gamma, memories_size, batch_size, debug=debug)
print('training done.')

training model...
training done.


In [17]:
#test the trained neural network by using it to guide the agent from an initial random location to goal G,
#with each step gridded along the way (display_stats=True requests that per-move printout)
display_stats = True
initial_state, final_state, N_moves, game_state = test_model(trained_model, environment, display_stats=display_stats)

initial state:
[['' '' '' '' '' '']
 ['' 'W' '' '' 'G' '']
 ['' '' '' '' '' '']
 ['' '' '' '' 'P' '']
 ['' '' 'A' '' '' '']
 ['' '' '' '' '' '']]
 move : 1    action: up
reward: -1
[['' '' '' '' '' '']
 ['' 'W' '' '' 'G' '']
 ['' '' '' '' '' '']
 ['' '' 'A' '' 'P' '']
 ['' '' '' '' '' '']
 ['' '' '' '' '' '']]
 move : 2    action: up
reward: -1
[['' '' '' '' '' '']
 ['' 'W' '' '' 'G' '']
 ['' '' 'A' '' '' '']
 ['' '' '' '' 'P' '']
 ['' '' '' '' '' '']
 ['' '' '' '' '' '']]
 move : 3    action: right
reward: -1
[['' '' '' '' '' '']
 ['' 'W' '' '' 'G' '']
 ['' '' '' 'A' '' '']
 ['' '' '' '' 'P' '']
 ['' '' '' '' '' '']
 ['' '' '' '' '' '']]
 move : 4    action: up
reward: -1
[['' '' '' '' '' '']
 ['' 'W' '' 'A' 'G' '']
 ['' '' '' '' '' '']
 ['' '' '' '' 'P' '']
 ['' '' '' '' '' '']
 ['' '' '' '' '' '']]
 move : 5    action: right
reward: 10
[['' '' '' '' '' '']
 ['' 'W' '' '' '*' '']
 ['' '' '' '' '' '']
 ['' '' '' '' 'P' '']
 ['' '' '' '' '' '']
 ['' '' '' '' '' '']]
game_state: goal


In [18]:
#run multiple tests until agent has started from all possible starting positions
display_stats = False
state = initialize_state(environment)
game_states = make_grid(state, environment)
#make_grid also marks the agent's own cell with 'A', so that cell would count as already
#tested and could be left unscored when the loop ends; blank it so it must be visited too.
#NOTE(review): assumes test_model can start the agent at any cell initialize_state can pick
agent = state['agent']
if game_states[agent['y'], agent['x']] == 'A':
    game_states[agent['y'], agent['x']] = ''
#keep playing test games while any starting cell is still unscored
while (game_states == '').any():
    initial_state, final_state, N_moves, game_state = \
        test_model(trained_model, environment, display_stats=display_stats)
    agent = initial_state['agent']
    x = agent['x']
    y = agent['y']
    #record the first letter of the outcome at the cell where this game started
    game_states[y, x] = game_state[0]

In [19]:
#cells containing 'g' indicate that agent successfully navigated to Goal from those starting points,
#while cells containing 'm' indicate where agent fails to find Goal before game ends due to 'max_moves',
#while 'p' means agent blundered into pit P from those initial cells.
#A single-argument print() call behaves the same under Python 2 and Python 3.
print(np.rot90(game_states.T))

[['g' 'g' 'g' 'g' 'g' 'g']
 ['m' 'W' 'g' 'g' 'G' 'g']
 ['g' 'g' 'g' 'g' 'g' 'g']
 ['g' 'g' 'g' 'g' 'P' 'g']
 ['g' 'g' 'g' 'g' 'p' 'p']
 ['g' 'g' 'g' 'g' 'g' 'g']]


In [20]:
#rebuild, retrain, and retest the model, this time using three times as many training games
model = build_model(N_inputs, grid_size, N_outputs)
N_training_games = 3000
#single-argument print() calls behave the same under Python 2 and Python 3
print('training...')
trained_model = train(environment, model, N_training_games, gamma, memories_size, batch_size, debug=debug)
state = initialize_state(environment)
game_states = make_grid(state, environment)
#make_grid also marks the agent's own cell with 'A', so that cell would count as already
#tested and could be left unscored when the loop ends; blank it so it must be visited too.
#NOTE(review): assumes test_model can start the agent at any cell initialize_state can pick
agent = state['agent']
if game_states[agent['y'], agent['x']] == 'A':
    game_states[agent['y'], agent['x']] = ''
#keep playing test games while any starting cell is still unscored
while (game_states == '').any():
    initial_state, final_state, N_moves, game_state = \
        test_model(trained_model, environment, display_stats=display_stats)
    agent = initial_state['agent']
    x = agent['x']
    y = agent['y']
    #record the first letter of the outcome at the cell where this game started
    game_states[y, x] = game_state[0]
print(np.rot90(game_states.T))

training...
[['g' 'g' 'g' 'g' 'g' 'g']
 ['g' 'W' 'g' 'g' 'G' 'g']
 ['g' 'g' 'g' 'g' 'g' 'g']
 ['g' 'g' 'g' 'g' 'P' 'g']
 ['g' 'g' 'g' 'g' 'g' 'g']
 ['g' 'g' 'g' 'g' 'g' 'g']]
