# Gridworld Construction
This notebook tests the implementation of a gridworld environment using numpy.

## Initial setup

In [178]:
import numpy as np
from copy import copy
from random import random

## Define states

In [100]:
def coord_to_idx(*coordinates):
    """Pack coordinate pairs into a 2 x N integer index array.

    Each positional argument is one ``[y, x]`` pair. The result is laid out
    as ``[[y1, y2, ...], [x1, x2, ...]]`` with dtype ``int32``.
    """
    n_points = len(coordinates)
    packed = np.empty((2, n_points), dtype=np.int32)
    # Every column is overwritten below, so no uninitialised value survives.
    for column in range(n_points):
        packed[:, column] = coordinates[column]
    return packed

In [166]:
def coord_to_state(coord):
    """Map grid coordinates to flat, row-major state indices.

    Parameters
    ----------
    coord : array-like
        Either a single ``[y, x]`` pair or an ``(n, 2)`` collection of
        pairs ``[[y1, x1], ..., [yn, xn]]``. Lists and tuples are accepted
        as well as numpy arrays.

    Returns
    -------
    One integer index for a single pair, or an array of ``n`` indices,
    computed as ``y * width + x`` with the module-level ``width``.
    """
    coord = np.asarray(coord)
    # The ellipsis selects along the trailing (y, x) axis, so one expression
    # serves both the single-pair and the batched case.
    flat = coord[..., 0] * width + coord[..., 1]
    # np.intp is numpy's native indexing integer. The earlier int16 cast
    # wrapped around silently once a grid had more than 32767 cells.
    return flat.astype(np.intp)

In [153]:
# Grid dimensions in cells (rows, columns).
height = 8
width = 8

# Flat, row-major vector holding one cell-type code per grid cell; all zeros to begin with.
states = np.zeros(height * width, dtype=np.int16)

In [154]:
# Special cells, each given as a [row, column] pair.
starts = np.array([[0, 0]])
goals = np.array([[0, 7]])

# Two straight wall segments. The float fill values keep `walls` a float64
# array, exactly as the stacked arange/ones construction produced.
walls = np.concatenate([
    np.column_stack([np.arange(5, 8), np.full(3, 2.0)]),  # column 2, rows 5-7
    np.column_stack([np.arange(5), np.full(5, 5.0)]),     # column 5, rows 0-4
])

holes = np.array([[2, 1], [3, 3], [7, 4], [3, 7]])

In [155]:
# Paint the cell-type codes into the flat state vector:
# 1 = start, 2 = goal, 3 = wall, 4 = hole (unmarked cells stay 0).
# The categories are written in this order, so a later one wins on overlap.
for cell_code, cells in enumerate((starts, goals, walls, holes), start=1):
    states[coord_to_state(cells)] = cell_code
states

array([1, 0, 0, 0, 0, 3, 0, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 3, 0,
       0, 0, 0, 0, 4, 0, 3, 0, 4, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0,
       0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 0, 0], dtype=int16)

In [156]:
# View the flat state vector as a (rows, columns) map of the grid.
states.reshape(height, width)

array([[1, 0, 0, 0, 0, 3, 0, 2],
       [0, 0, 0, 0, 0, 3, 0, 0],
       [0, 4, 0, 0, 0, 3, 0, 0],
       [0, 0, 0, 4, 0, 3, 0, 4],
       [0, 0, 0, 0, 0, 3, 0, 0],
       [0, 0, 3, 0, 0, 0, 0, 0],
       [0, 0, 3, 0, 0, 0, 0, 0],
       [0, 0, 3, 0, 4, 0, 0, 0]], dtype=int16)

In [168]:
def is_terminal_state(pos):
    """Report whether the cell at ``pos`` ends an episode.

    The cell code is looked up in the module-level ``states`` vector;
    codes 2 and 4 count as terminal.
    """
    cell_code = states[coord_to_state(pos)]
    return bool(cell_code == 2 or cell_code == 4)

## Define actions

In [199]:
def make_action(action, pos):
    """Apply a move to ``pos`` and return the resulting grid position.

    Parameters
    ----------
    action : int
        0 = left, 1 = down, 2 = right, 3 = up.
    pos : sequence of two ints
        Current ``[row, column]`` position. It is not modified.

    Returns
    -------
    The new ``[row, column]`` position. A move that would leave the grid is
    clamped to the border. A move into a wall cell (code 3) is rejected and
    the original ``pos`` object is returned.

    Raises
    ------
    ValueError
        If ``action`` is not one of the four defined indices.
    """
    # Create placeholder for the new position so the caller's array is untouched
    new_pos = copy(pos)

    # Modify according to action. Index 0 is the row, which is bounded by
    # `height`; index 1 is the column, which is bounded by `width`. The two
    # bounds used to be swapped, which only worked because the grid is square.
    if action == 0:  # move left
        new_pos[1] = max(pos[1] - 1, 0)
    elif action == 1:  # move down
        new_pos[0] = min(pos[0] + 1, height - 1)
    elif action == 2:  # move right
        new_pos[1] = min(pos[1] + 1, width - 1)
    elif action == 3:  # move up
        new_pos[0] = max(pos[0] - 1, 0)
    else:
        raise ValueError("Undefined action index %d" % action)

    # A wall blocks the move: stay where we were
    if states[coord_to_state(new_pos)] == 3:
        return pos
    else:
        return new_pos

## Define rewards

In [159]:
def get_reward(pos):
    """Return the reward for standing on the cell at ``pos``.

    The goal (code 2) pays 1.0, a hole (code 4) pays -1.0, and normal,
    start and wall cells (codes 0, 1 and 3) pay 0.0. Any other code falls
    through and yields ``None``.
    """
    cell_code = states[coord_to_state(pos)]

    if cell_code == 0 or cell_code == 1:  # normal, start
        return 0.0
    if cell_code == 2:  # goal
        return 1.0
    if cell_code == 3:  # wall
        return 0.0
    if cell_code == 4:  # hole
        return -1.0
    return None

## Simulate random trajectories

In [169]:
num_actions = 4  # left, down, right, up (see make_action)
steps = 100      # upper bound on the length of the random walk

# Draw the starting cell uniformly from the available start positions.
start_idx = np.random.randint(starts.shape[0])
start_pos = starts[start_idx]
pos = start_pos

for i in range(steps):
    # Report the cell we are leaving before the move is applied.
    print("Old position: (%d, %d)" % (pos[0], pos[1]), end="   ")

    # Take one uniformly random action and observe where it leads.
    random_action = np.random.randint(num_actions)
    pos = make_action(random_action, pos)
    reward = get_reward(pos)

    print("Action: %d" % random_action, end="   ")
    print("New position: (%d, %d)" % (pos[0], pos[1]), end="   ")
    print("Reward: %d" % reward)

    # Reaching a goal or a hole ends the walk early.
    is_terminal = is_terminal_state(pos)
    if is_terminal:
        break

Old position: (0, 0)   Action: 2   New position: (0, 1)   Reward: 0
Old position: (0, 1)   Action: 3   New position: (0, 1)   Reward: 0
Old position: (0, 1)   Action: 3   New position: (0, 1)   Reward: 0
Old position: (0, 1)   Action: 2   New position: (0, 2)   Reward: 0
Old position: (0, 2)   Action: 1   New position: (1, 2)   Reward: 0
Old position: (1, 2)   Action: 3   New position: (0, 2)   Reward: 0
Old position: (0, 2)   Action: 2   New position: (0, 3)   Reward: 0
Old position: (0, 3)   Action: 2   New position: (0, 4)   Reward: 0
Old position: (0, 4)   Action: 0   New position: (0, 3)   Reward: 0
Old position: (0, 3)   Action: 2   New position: (0, 4)   Reward: 0
Old position: (0, 4)   Action: 0   New position: (0, 3)   Reward: 0
Old position: (0, 3)   Action: 0   New position: (0, 2)   Reward: 0
Old position: (0, 2)   Action: 1   New position: (1, 2)   Reward: 0
Old position: (1, 2)   Action: 1   New position: (2, 2)   Reward: 0
Old position: (2, 2)   Action: 1   New position:

## Learn with a Q-table

In [202]:
# One row of action values per flat state index, one column per action.
Q = np.zeros((height * width, num_actions))
alpha = 1.0    # learning rate
gamma = 0.9    # discount factor
epsilon = 0.3  # fixed; could change to linear decay
learning_steps = 100000
verbose = False

# Begin the first episode at a randomly drawn start cell.
start_idx = np.random.randint(starts.shape[0])
start_pos = starts[start_idx]
pos = start_pos

for i in range(learning_steps):
    # Flat index of the state the agent currently occupies
    s1 = pos[0] * width + pos[1]

    # Epsilon-greedy action selection
    if random() <= epsilon:
        # Explore: uniformly random action
        a = np.random.randint(num_actions)
    else:
        # Exploit: pick among the best-valued actions uniformly at random
        # (a plain np.argmax would always favour the lowest index on ties)
        q_row = Q[s1]
        a = np.random.choice(np.flatnonzero(q_row == q_row.max()))

    # Apply the action and observe reward and successor state
    new_pos = make_action(a, pos)
    r = get_reward(new_pos)
    s2 = new_pos[0] * width + new_pos[1]
    is_terminal = is_terminal_state(new_pos)

    # Update Q-table according to the Bellman equation:
    # Q(s,a) <-- Q(s,a) + α*ΔQ(s,a), where ΔQ(s,a) = (r + γ*max_a' Q(s',a')) - Q(s,a)
    td_target = r + gamma * Q[s2].max()
    Q[s1, a] += alpha * (td_target - Q[s1, a])

    if verbose:
        print("Step %d" % i)
        print("Old position: (%d, %d)" % (pos[0], pos[1]), end="   ")
        print("Action: %d" % a, end="   ")
        print("New position: (%d, %d)" % (new_pos[0], new_pos[1]), end="   ")
        print("Reward: %d" % r)

    elif i % 1000 == 0:
        print("Step %d" % i)

    # Keep walking unless the episode just ended; in that case restart
    # from a freshly drawn start cell.
    if not is_terminal:
        pos = new_pos
    else:
        start_idx = np.random.randint(starts.shape[0])
        start_pos = starts[start_idx]
        pos = start_pos

Step 0
Step 1000
Step 2000
Step 3000
Step 4000
Step 5000
Step 6000
Step 7000
Step 8000
Step 9000
Step 10000
Step 11000
Step 12000
Step 13000
Step 14000
Step 15000
Step 16000
Step 17000
Step 18000
Step 19000
Step 20000
Step 21000
Step 22000
Step 23000
Step 24000
Step 25000
Step 26000
Step 27000
Step 28000
Step 29000
Step 30000
Step 31000
Step 32000
Step 33000
Step 34000
Step 35000
Step 36000
Step 37000
Step 38000
Step 39000
Step 40000
Step 41000
Step 42000
Step 43000
Step 44000
Step 45000
Step 46000
Step 47000
Step 48000
Step 49000
Step 50000
Step 51000
Step 52000
Step 53000
Step 54000
Step 55000
Step 56000
Step 57000
Step 58000
Step 59000
Step 60000
Step 61000
Step 62000
Step 63000
Step 64000
Step 65000
Step 66000
Step 67000
Step 68000
Step 69000
Step 70000
Step 71000
Step 72000
Step 73000
Step 74000
Step 75000
Step 76000
Step 77000
Step 78000
Step 79000
Step 80000
Step 81000
Step 82000
Step 83000
Step 84000
Step 85000
Step 86000
Step 87000
Step 88000
Step 89000
Step 90000
Step 91000
S

In [203]:
Q

array([[ 0.16677182,  0.18530202,  0.18530202,  0.16677182],
       [ 0.16677182,  0.20589113,  0.20589113,  0.18530202],
       [ 0.18530202,  0.22876792,  0.22876792,  0.20589113],
       [ 0.20589113,  0.25418658,  0.25418658,  0.22876792],
       [ 0.22876792,  0.28242954,  0.25418658,  0.25418658],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.9       ,  0.81      ,  1.        ,  0.9       ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.18530202,  0.20589113,  0.20589113,  0.16677182],
       [ 0.18530202, -1.        ,  0.22876792,  0.18530202],
       [ 0.20589113,  0.25418658,  0.25418658,  0.20589113],
       [ 0.22876792,  0.28242954,  0.28242954,  0.22876792],
       [ 0.25418658,  0.3138106 ,  0.28242954,  0.25418658],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.81      ,  0.729     ,  0.9       ,  0.9       ],
       [ 0.81      ,  0.81      ,  0.9       ,  1.        ],
       [ 0.20589113,  0.

## Test Q-table

In [218]:
start_idx = np.random.randint(starts.shape[0])
start_pos = starts[start_idx, :]
pos = start_pos
num_episodes = 10
# Upper bound on greedy steps per episode. A greedy policy that never reaches
# a goal or hole (for example an under-trained Q-table that bounces between
# two cells) would otherwise keep the inner loop running forever.
max_episode_steps = 10 * height * width
trajectories = []

for i in range(num_episodes):
    is_terminal = False
    episode_step = 0
    trajectories.append([])

    while not is_terminal and episode_step < max_episode_steps:
        # Get current state
        s1 = pos[0]*width + pos[1]
        trajectories[i].append(s1)

        # Follow the greedy policy; ties are broken uniformly at random
        # (np.argmax would always pick the lowest index)
        a = np.random.choice(np.flatnonzero(Q[s1, :] == np.max(Q[s1, :])))
        new_pos = make_action(a, pos)
        r = get_reward(new_pos)
        s2 = new_pos[0]*width + new_pos[1]
        is_terminal = is_terminal_state(new_pos)

        if verbose:
            # `i` counts episodes here, so label it as such
            print("Episode %d, step %d" % (i, episode_step))
            print("Old position: (%d, %d)" % (pos[0], pos[1]), end="   ")
            print("Action: %d" % a, end="   ")
            print("New position: (%d, %d)" % (new_pos[0], new_pos[1]), end="   ")
            print("Reward: %d" % r)

        pos = new_pos
        episode_step += 1

    # Record the final state reached (terminal unless the step cap was hit)
    trajectories[i].append(s2)
    if not is_terminal:
        print("Episode %d stopped after %d steps without reaching a terminal state"
              % (i, episode_step))

    # Reset to a randomly drawn start cell for the next episode
    start_idx = np.random.randint(starts.shape[0])
    start_pos = starts[start_idx, :]
    pos = start_pos

In [221]:
# Overlay the first greedy trajectory on a copy of the map: every visited
# cell is marked with -1, and the original `states` vector is left untouched.
world = states.copy()
world[trajectories[0]] = -1
world.reshape(height, width)

array([[-1, -1,  0,  0,  0,  3,  0, -1],
       [ 0, -1, -1,  0,  0,  3,  0, -1],
       [ 0,  4, -1, -1, -1,  3, -1, -1],
       [ 0,  0,  0,  4, -1,  3, -1,  4],
       [ 0,  0,  0,  0, -1,  3, -1,  0],
       [ 0,  0,  3,  0, -1, -1, -1,  0],
       [ 0,  0,  3,  0,  0,  0,  0,  0],
       [ 0,  0,  3,  0,  4,  0,  0,  0]], dtype=int16)

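Optimal behavior! In the run shown above, the greedy trajectory reaches the goal in 17 moves. That is the minimum possible, because the wall in column 5 can only be passed in rows 5–7. The path also avoids every hole.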