In [11]:
import numpy as np
import matplotlib.pyplot as plt
from enum import IntEnum

In [12]:
class Action(IntEnum):
  UP = 0
  DOWN = 1
  LEFT = 2
  RIGHT = 3

In [13]:
class State(IntEnum):
  ACCESSIBLE_GRID = 0
  INACCESSIBLE_GRID = -2
  LOSER_GRID = -1
  WINNER_GRID = 1

In [14]:
STATE_PROBS = [0.7, 0.2, 0.1] # prob of accessible grid, prob of inaccessible grid, prob of loser grid
STATES = [State.ACCESSIBLE_GRID, State.INACCESSIBLE_GRID, State.LOSER_GRID, State.WINNER_GRID]
UNKNOWN_POLICY = -2 # the policy is unknown for now, the policies are going to be determined after creating the gridworld
ACTIONS = [Action.UP, Action.DOWN, Action.LEFT, Action.RIGHT]
ROW_SIZE = 10
COLUMN_SIZE = 10
THRESHOLD = 1e-3
DISCOUNT_FACTOR = 0.9

In [15]:
class Gridworld:
  def __init__(self, row_size, column_size, start_position):
    assert(row_size * column_size >= 10) # creating a complex gridworld
    self.gridworld, self.winner_position = self.create_gridworld(row_size, column_size)
    row, column = start_position
    assert(self.gridworld[row, column][0] == 0) # the starting position should be accessible and should not cause the termination of the game
    self.start_position = start_position
    self.position = start_position
    # print(self.gridworld) # For debug purposes
    self.available_moves = self.check_available_moves(row_size, column_size)
    # print(self.available_moves) # For debug purposes
    self.create_policy(row_size, column_size)
    print(self.gridworld) # For debug purposes

  def create_gridworld(self, row_size, column_size):
    out = np.empty((row_size, column_size), dtype=object)
    total_number_of_grids = row_size * column_size
    number_of_accessible_grid = int(total_number_of_grids * STATE_PROBS[State.ACCESSIBLE_GRID])
    number_of_inaccessible_grid = int(total_number_of_grids * STATE_PROBS[State.INACCESSIBLE_GRID])
    # We subtract the number of winner grid which is 1.
    number_of_loser_grid = total_number_of_grids - number_of_accessible_grid - number_of_inaccessible_grid - 1
    # create a distribution of states
    new_state_probs = np.array([number_of_accessible_grid, number_of_inaccessible_grid, number_of_loser_grid]) / (total_number_of_grids - 1)
    # determine the winner state for policy creation
    winner_position_row = np.random.choice(row_size - 1)
    winner_position_column = np.random.choice(column_size - 1)
    state = np.random.choice(STATES[:-1], p=new_state_probs, size=(ROW_SIZE, COLUMN_SIZE))
    out[winner_position_row, winner_position_column] = [1, UNKNOWN_POLICY, UNKNOWN_POLICY]
    for i in range(row_size):
      for j in range(column_size):
        if i == winner_position_row and j == winner_position_column:
          continue
        out[i, j] = [state[i, j], UNKNOWN_POLICY, UNKNOWN_POLICY]
    return out, (winner_position_row, winner_position_column)

  def game_over(self):
    row, column = self.starting_position
    return self.gridworld[row, column][0] % 10 == 0 # whether we lost or won the game terminates

  # helper method which doesn't take extreme cases into consideration
  def move_calculate_position(self, position, action):
    if action == Action.UP:
      return [position[0] - 1, position[1]]
    elif action == Action.DOWN:
      return [position[0] + 1, position[1]]
    elif action == Action.LEFT:
      return [position[0], position[1] - 1]
    elif action == Action.RIGHT:
      return [position[0], position[1] + 1]

  def move_simulation(self, position, action):
    can_move = True
    not_going_to_move_up_or_down = (position[0] == 0 and action == Action.UP) or (position[0] == ROW_SIZE - 1 and action == Action.DOWN)
    not_going_to_move_left_or_right = (position[1] == 0 and action == Action.LEFT) or (position[1] == COLUMN_SIZE - 1 and action == Action.RIGHT)
    can_move = can_move and not(not_going_to_move_up_or_down or not_going_to_move_left_or_right)
    if can_move:
      row, column = self.move_calculate_position(position, action)
      # check if we step into the inaccessible grid
      can_move = can_move and self.gridworld[row, column][0] != -2
    return can_move, [row, column] if can_move else position

  def create_transition_probs(self, position, action):
    can_move = True
    not_going_to_move_up_or_down = (position[0] == 0 and action == Action.UP) or (position[0] == ROW_SIZE - 1 and action == Action.DOWN)
    not_going_to_move_left_or_right = (position[1] == 0 and action == Action.LEFT) or (position[1] == COLUMN_SIZE - 1 and action == Action.RIGHT)
    can_move = can_move and not(not_going_to_move_up_or_down or not_going_to_move_left_or_right)
    if can_move:
      row, column = self.move_calculate_position(position, action)
      # check if we step into the invalid grid
      can_move = can_move and self.gridworld[row, column][0] == 0
    return can_move, (row, column) if can_move else position

  def move(self, action):
    self.position = list(self.move_simulation(self.position, action))

  def check_available_moves(self, row_size, column_size):
    moves = np.empty((row_size, column_size), dtype=object)
    for i in range(row_size):
      for j in range(column_size):
        move_list = []
        if self.gridworld[i, j][0] != 0:
          moves[i, j] = move_list
          continue # non-playable grid
        if self.move_simulation([i, j], Action.DOWN)[0]:
          move_list.append(Action.DOWN)
        if self.move_simulation([i, j], Action.UP)[0]:
          move_list.append(Action.UP)
        if self.move_simulation([i, j], Action.LEFT)[0]:
          move_list.append(Action.LEFT)
        if self.move_simulation([i, j], Action.RIGHT)[0]:
          move_list.append(Action.RIGHT)
        moves[i, j] = move_list
    return moves

  def create_policy(self, row_size, column_size):
    winner_row, winner_column = self.winner_position
    for i in range(row_size):
      for j in range(column_size):
        action_up_down = -1
        action_left_right = -1
        if self.gridworld[i, j][0] < 0:
          continue # non-playable grid
        elif winner_row > i and self.move_simulation([i, j], Action.DOWN)[0]:
          action_up_down = Action.DOWN
        elif winner_row < i and self.move_simulation([i, j], Action.UP)[0]:
          action_up_down = Action.UP
        if winner_column < j and self.move_simulation([i, j], Action.LEFT)[0]:
          action_left_right = Action.LEFT
        elif winner_column > j and self.move_simulation([i, j], Action.RIGHT)[0]:
          action_left_right = Action.RIGHT
        self.gridworld[i, j][1] = action_up_down
        self.gridworld[i, j][2] = action_left_right

In [32]:
gridworld = Gridworld(ROW_SIZE, COLUMN_SIZE, [ROW_SIZE - 1, 0]);

[[list([0, <Action.DOWN: 1>, <Action.RIGHT: 3>])
  list([0, <Action.DOWN: 1>, -1]) list([-2, -2, -2]) list([-2, -2, -2])
  list([0, <Action.DOWN: 1>, -1]) list([-2, -2, -2])
  list([0, <Action.DOWN: 1>, <Action.RIGHT: 3>])
  list([0, <Action.DOWN: 1>, <Action.RIGHT: 3>])
  list([0, <Action.DOWN: 1>, -1]) list([-1, -2, -2])]
 [list([0, -1, <Action.RIGHT: 3>]) list([0, -1, -1]) list([-2, -2, -2])
  list([0, <Action.DOWN: 1>, <Action.RIGHT: 3>])
  list([0, <Action.DOWN: 1>, <Action.RIGHT: 3>])
  list([0, <Action.DOWN: 1>, <Action.RIGHT: 3>])
  list([0, -1, <Action.RIGHT: 3>]) list([0, -1, <Action.RIGHT: 3>])
  list([0, <Action.DOWN: 1>, -1]) list([-2, -2, -2])]
 [list([-2, -2, -2]) list([-2, -2, -2]) list([0, -1, <Action.RIGHT: 3>])
  list([0, -1, <Action.RIGHT: 3>]) list([0, -1, <Action.RIGHT: 3>])
  list([0, <Action.DOWN: 1>, -1]) list([-2, -2, -2]) list([-2, -2, -2])
  list([0, <Action.DOWN: 1>, -1]) list([0, -1, <Action.LEFT: 2>])]
 [list([0, <Action.DOWN: 1>, <Action.RIGHT: 3>])
  li

In [33]:
transition_probs = {}
rewards = {}
value_function = {}
for i in range(ROW_SIZE):
  for j in range(COLUMN_SIZE):
    state = (i, j)
    value_function[state] = 0
    for action in ACTIONS:
      can_move, next_state = gridworld.create_transition_probs(state, action)
      if can_move:
        transition_probs[(state, action, next_state)] = 1
        rewards[(state, action, next_state)] = - 1

In [34]:
it = 0
while True:
  error = 0
  for i in range(ROW_SIZE):
    for j in range(COLUMN_SIZE):
      state = (i, j)
      old_value = value_function[state]
      new_value = 0
      grid = gridworld.gridworld[i, j]
      policies = [grid[1], grid[2]]
      for action in ACTIONS:
        can_move, next_state = gridworld.create_transition_probs(state, action)
        if can_move:
          # The action probability is deterministic.
          action_prob = 1 if action in policies else 0
          reward = rewards.get((state, action, next_state), 0)
          new_value += action_prob * transition_probs.get((state, action, next_state), 0) * (reward + DISCOUNT_FACTOR * value_function[next_state])
      value_function[state] = new_value
      error = max(error, np.abs(old_value - new_value))
  print(f"Iteration: {it + 1}, Error: {error}")
  it += 1

  if error < THRESHOLD:
    break

Iteration: 1, Error: 2.9
Iteration: 2, Error: 3.5999999999999996
Iteration: 3, Error: 6.48
Iteration: 4, Error: 11.664
Iteration: 5, Error: 20.339100000000002
Iteration: 6, Error: 27.753030000000003
Iteration: 7, Error: 21.78908099999999
Iteration: 8, Error: 14.348907000000025
Iteration: 9, Error: 0


In [35]:
def print_value_function(value_function):
  values = np.zeros((ROW_SIZE, COLUMN_SIZE))
  for key, value in value_function.items():
    i, j = key
    values[i, j] = value
  print(values)

In [36]:
print_value_function(value_function)

[[ -3.8       -1.         0.         0.        -7.20559    0.
   -9.317     -5.42      -1.9        0.      ]
 [ -1.         0.         0.        -9.91559   -6.8951    -4.439
   -2.71      -1.9       -1.         0.      ]
 [  0.         0.        -2.71      -1.9       -1.         0.
    0.         0.         0.        -1.      ]
 [-97.945418 -36.54136    0.         0.         0.         0.
    0.         0.         0.         0.      ]
 [-70.06466  -39.4904   -20.5364    -9.569     -3.8       -1.
    0.         0.         0.        -6.707   ]
 [-36.137    -21.1196   -11.027     -4.61      -1.         0.
    0.        -2.71       0.        -5.23    ]
 [-16.8104   -10.217     -5.42      -1.9        0.         0.
    0.        -1.9       -1.        -4.7     ]
 [ -6.239     -3.71      -1.9       -1.         0.         0.
    0.         0.         0.        -2.      ]
 [ -1.         0.         0.         0.         0.         0.
    0.         0.         0.         0.      ]
 [ -6.968     -4