<a href="https://colab.research.google.com/github/ibnerasheed/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/AML5204_Exercise_ValueIteration_Gridworld_Srikanth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import itertools
import numpy as np

# 5 sections of Value Iteration Assignment
1. Common functions reusable across 4x4 and 5x5
2. 4x4 specific gridworld definition, P_t and reward functions
3. 4x4 specific optimal policy calc  
4. 5x5 specific gridworld definition, P_t and reward functions
5. 5x5 specific optimal policy calc  

# Common reusable functions
1. action to unicode conversion
2. value iteration

In [None]:
## Map actions to unicode characters representing arrows
def action_to_unicode(action_dict):
  """Render each state's list of actions as a string of arrow characters.

  Args:
    action_dict: mapping from a state to an iterable of action tuples, each
      one of (-1, 0), (0, 1), (1, 0), (0, -1).

  Returns:
    A new dict with the same keys, in the same order, whose values are the
    concatenated arrows for that state (for example [(-1, 0), (0, 1)]
    becomes '↑→'; an empty action list becomes ''). The input mapping is not
    modified, so it remains usable as a policy and the function can be
    called on it again.

  Raises:
    KeyError: if an action is not one of the four moves listed above.
  """
  arrows = {(-1, 0): '\u2191', (0, 1): '\u2192', (1, 0): '\u2193', (0, -1): '\u2190'}
  # Build a fresh dict instead of overwriting the caller's values: replacing
  # the action lists with strings in place would destroy the policy and make
  # a second call fail.
  return {
      state: ''.join(arrows[action] for action in actions)
      for state, actions in action_dict.items()
  }

In [None]:
# Input arguments to this function are functions themselves - either 4x4 or 5x5 grid specific functions are passed.
# this will work as long as these function arguments have the same input and output signature (to the reusable extent)
def value_iteration(gridworld_func, P_t_func, reward_func):
  """Run value iteration on a gridworld described by three callables.

  Args:
    gridworld_func: zero-argument callable returning an 8-tuple
      (M, N, S, A, nstates, gamma, _, _). Only the state list S, the action
      list A, nstates and the discount gamma are used here.
    P_t_func: callable (start_state, action, end_state) -> transition
      probability.
    reward_func: callable (start_state, action, end_state) -> reward.

  Returns:
    (v_new, pi_optimal): v_new is a numpy array of length nstates holding the
    state values of the last sweep, ordered like S. pi_optimal maps each
    state to the list of actions that attained the maximum backup in that
    sweep; ties are detected with exact floating-point equality.

  Notes:
    Sweeps stop once the Euclidean norm of the change in values is at most
    1e-05, or after 1000 sweeps. P_t_func and reward_func are evaluated once
    per (state, action, next state) triple before the first sweep, so they
    are assumed to be deterministic.
  """
  _, _, S, A, nstates, gamma, _, _ = gridworld_func()

  # One independent list per state. dict.fromkeys(S, []) would bind every
  # key to the same list object, so an append for one state would leak into
  # all the others.
  pi_optimal = {state: [] for state in S}

  # The model does not change between sweeps: tabulate probability and reward
  # for every (state, action, next state) once instead of on every sweep.
  model = [
      [
          [(P_t_func(state, action, next_state),
            reward_func(state, action, next_state))
           for next_state in S]
          for action in A
      ]
      for state in S
  ]

  # Start from an all-zero state value function
  v_old = np.zeros(nstates)
  tol = 1e-05 # stopping tolerance
  normdiff = np.inf

  maxiter = 1000
  sweep = 1

  while normdiff > tol and sweep <= maxiter:
    # -inf guarantees the first action examined for a state becomes the
    # current best.
    v_new = np.full(nstates, -np.inf)
    for i, state in enumerate(S):
      for action, transitions in zip(A, model[i]):
        # Expected one-step return of taking `action` in `state`
        innersum = 0
        for j, (prob, reward) in enumerate(transitions):
          innersum += prob * (reward + gamma * v_old[j])
        if v_new[i] < innersum:
          # Strictly better action: restart the list of maximisers
          v_new[i] = innersum
          pi_optimal[state] = [action]
        elif innersum == v_new[i]:
          # Exact tie with the current best: keep both actions
          pi_optimal[state].append(action)
    sweep += 1
    normdiff = np.linalg.norm(v_new - v_old)
    v_old = np.copy(v_new)

  return v_new, pi_optimal

# Gridworld 4x4 

## Gridworld 4x4 specific functions
1. grid definition function
2. P_t function
3. reward function

In [None]:
## Define the 4 x 4 gridworld
def gridworld_4x4():
  """Build the configuration of the 4 x 4 gridworld.

  Returns:
    An 8-tuple (M, N, S, A, nstates, gamma, terminal_states, None):
      M, N: number of rows and columns (both 4).
      S: list of (row, col) states in row-major order.
      A: list of the four moves as (row delta, col delta) tuples.
      nstates: len(S).
      gamma: discount factor, 1.0.
      terminal_states: [(0, 0), (3, 3)].
      The final None is a placeholder that keeps the tuple the same length
      as the one returned by the 5x5 configuration function.
  """
  n_rows, n_cols = 4, 4  # rows are counted from the top-left corner
  states = [(row, col) for row in range(n_rows) for col in range(n_cols)]
  moves = [(-1, 0), (0, 1), (1, 0), (0, -1)]  # N, E, S, W
  discount = 1.0
  terminals = [(0, 0), (3, 3)]
  return n_rows, n_cols, states, moves, len(states), discount, terminals, None

In [None]:
## Function to return the transition probability
def P_t_4x4(start_state, action, end_state):
  """Return the probability of moving from start_state to end_state.

  Moves are deterministic. An action that would leave the grid keeps the
  agent in start_state.

  Args:
    start_state: (row, col) tuple of the current cell.
    action: (row delta, col delta) tuple.
    end_state: (row, col) tuple of the candidate next cell.

  Returns:
    1 if taking action in start_state lands on end_state, otherwise 0.
    Always 0 when start_state is a terminal state.
  """
  M, N, _, _, _, _, terminal_states, _ = gridworld_4x4()

  # No transition leaves a terminal state
  if start_state in terminal_states:
    return 0

  row = start_state[0] + action[0]
  col = start_state[1] + action[1]

  # Check each coordinate against the limit of its own axis. Testing whether
  # M, N or -1 appears anywhere in the proposed state only works while the
  # grid is square.
  if 0 <= row < M and 0 <= col < N:
    landing_state = (row, col)
  else:
    landing_state = start_state  # illegal move: the agent stays where it is

  return 1 if landing_state == end_state else 0

In [None]:
## Function to return reward
def reward_4x4(start_state, action, end_state):
  """Return the reward for a transition in the 4 x 4 gridworld.

  Only start_state affects the result; action and end_state are accepted
  but not used.

  Returns:
    0 if start_state is a terminal state, otherwise -1.
  """
  terminal_states = gridworld_4x4()[6]
  return 0 if start_state in terminal_states else -1

## Compute the optimal policy and state values (v_new) for the 4x4 gridworld

In [None]:
## Get optimal policy for 4x4 grid world
def optimal_policy_4x4():
  """Run value iteration on the 4 x 4 gridworld and print the results.

  Prints the state values reshaped to the grid, followed by a grid of arrow
  strings showing the maximising action(s) of each state. Returns None.
  """
  n_rows, n_cols = gridworld_4x4()[:2]
  values, policy = value_iteration(gridworld_4x4, P_t_4x4, reward_4x4)
  print(values.reshape((n_rows, n_cols)))
  # The policy dict is keyed in state order, so its values line up with the
  # row-major grid layout.
  arrows = list(action_to_unicode(policy).values())
  print(np.array(arrows).reshape((n_rows, n_cols)))

In [None]:
# Solve the 4x4 gridworld and print its value table and arrow policy.
optimal_policy_4x4()

[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]
[['↑→↓←' '←' '←' '↓←']
 ['↑' '↑←' '↑→↓←' '↓']
 ['↑' '↑→↓←' '→↓' '↓']
 ['↑→' '→' '→' '↑→↓←']]


# Gridworld 5x5

## Gridworld 5x5 specific functions
1. grid definition function
2. P_t function
3. reward function

In [None]:
## Define the 5 x 5 gridworld
def gridworld_5x5():
  """Build the configuration of the 5 x 5 gridworld.

  Returns:
    An 8-tuple (M, N, S, A, nstates, gamma, special_states,
    special_state_rewards):
      M, N: number of rows and columns (both 5).
      S: list of (row, col) states in row-major order.
      A: list of the four moves as (row delta, col delta) tuples.
      nstates: len(S).
      gamma: discount factor, 0.9.
      special_states: dict with parallel lists under 'start' and 'end';
        the i-th 'start' state is paired with the i-th 'end' state.
      special_state_rewards: dict mapping each 'start' state to its reward.
  """
  n_rows, n_cols = 5, 5  # rows are counted from the top-left corner
  states = [(row, col) for row in range(n_rows) for col in range(n_cols)]
  moves = [(-1, 0), (0, 1), (1, 0), (0, -1)]  # N, E, S, W
  discount = 0.9
  # Special states A, B ('start') and A', B' ('end')
  jumps = {'start': [(0, 1), (0, 3)], 'end': [(4, 1), (2, 3)]}
  jump_rewards = {(0, 1): 10, (0, 3): 5}
  return n_rows, n_cols, states, moves, len(states), discount, jumps, jump_rewards

In [None]:
# Function to return the transition probability
def P_t_5x5(start_state, action, end_state):
  """Return the probability of moving from start_state to end_state.

  Moves are deterministic. From a special 'start' state every action leads
  to the paired 'end' state. From any other state, an action that would
  leave the grid keeps the agent in start_state.

  Returns:
    1 if taking action in start_state lands on end_state, otherwise 0.
  """
  _, _, S, _, _, _, special_states, _ = gridworld_5x5()

  # Special states ignore the action and jump to their paired destination
  jump_sources = special_states['start']
  if start_state in jump_sources:
    destination = special_states['end'][jump_sources.index(start_state)]
    return 1 if end_state == destination else 0

  # Cell the action points at; it only counts if it is a state of the grid
  moved = tuple(np.array(start_state) + np.array(action))
  landing_state = moved if moved in S else start_state
  return 1 if landing_state == end_state else 0

In [None]:
## Function to return reward
def reward_5x5(start_state, action, end_state):
  """Return the reward for taking action in start_state on the 5 x 5 grid.

  end_state is accepted but not used.

  Returns:
    The state's entry in special_state_rewards if start_state is a special
    'start' state; otherwise -1 if the action would leave the grid and 0 if
    it stays inside.
  """
  _, _, S, _, _, _, special_states, special_state_rewards = gridworld_5x5()

  if start_state in special_states['start']:
    return special_state_rewards[start_state]

  # Cell the action points at; leaving the grid is penalised
  moved = tuple(np.array(start_state) + np.array(action))
  return 0 if moved in S else -1

## Compute the optimal policy and state values (v_new) for the 5x5 gridworld

In [None]:
## Get optimal policy for 5x5 grid world
def optimal_policy_5x5():
  """Run value iteration on the 5 x 5 gridworld and print the results.

  Prints the state values reshaped to the grid, followed by a grid of arrow
  strings showing the maximising action(s) of each state. Returns None.
  """
  n_rows, n_cols = gridworld_5x5()[:2]
  values, policy = value_iteration(gridworld_5x5, P_t_5x5, reward_5x5)
  print(values.reshape((n_rows, n_cols)))
  # The policy dict is keyed in state order, so its values line up with the
  # row-major grid layout.
  arrows = list(action_to_unicode(policy).values())
  print(np.array(arrows).reshape((n_rows, n_cols)))

In [None]:
# Solve the 5x5 gridworld and print its value table and arrow policy.
optimal_policy_5x5()

[[21.97747666 24.41941851 21.97747666 19.41941851 17.47747666]
 [19.77972899 21.97747666 19.77972899 17.80175609 16.02157612]
 [17.80175609 19.77972899 17.80175609 16.02157612 14.41941851]
 [16.02157612 17.80175609 16.02157612 14.41941851 12.97747666]
 [14.41941851 16.02157612 14.41941851 12.97747666 11.67972899]]
[['→' '↑→↓←' '←' '↑→↓←' '←']
 ['↑→' '↑' '↑←' '←' '←']
 ['↑→' '↑' '↑←' '↑←' '↑←']
 ['↑→' '↑' '↑←' '↑←' '↑←']
 ['↑→' '↑' '↑←' '↑←' '↑←']]
