# Homework 7

## Imports and Utilities
**Note**: these imports and functions are available in catsoop. You do not need to copy them in.

In [None]:
from collections import defaultdict
import abc
import numpy as np


class MDP:
    """Base interface for a Markov Decision Process.

    Subclasses provide the state set, the action set, the reward function
    and the transition model. This class does not use ``abc.ABCMeta``, so
    the ``abc.abstractmethod`` markers are not enforced at instantiation
    time; a missing override surfaces as ``NotImplementedError`` when the
    member is actually used.
    """

    @property
    @abc.abstractmethod
    def state_space(self):
        """The MDP's states (a set, unless a subclass says otherwise)."""
        raise NotImplementedError("Override me")

    @property
    @abc.abstractmethod
    def action_space(self):
        """The MDP's actions (a set, unless a subclass says otherwise)."""
        raise NotImplementedError("Override me")

    @property
    def temporal_discount_factor(self):
        """Discount factor gamma; 1 unless overridden."""
        return 1.0

    @property
    def horizon(self):
        """Horizon H; infinite unless overridden."""
        return np.inf

    def state_is_terminal(self, state):
        """Tell whether ``state`` is a terminal (done) state.

        The base implementation treats no state as terminal.

        Args:
            state: A state.

        Returns:
            is_terminal : A bool.
        """
        return False

    @abc.abstractmethod
    def get_reward(self, state, action, next_state):
        """Deterministic reward for one transition.

        Args:
            state: The state the action was executed in.
            action: The executed action.
            next_state: The state that was reached.

        Returns:
            reward : Single time step reward.
        """
        raise NotImplementedError("Override me")

    @abc.abstractmethod
    def get_transition_distribution(self, state, action):
        """Distribution over the states reachable from ``state`` via ``action``.

        Unless a subclass says otherwise, the result is a dictionary from
        next states to probabilities, e.g. ``{0: 0.3, 1: 0.2, 2: 0.5}``
        for a state space of ``{0, 1, 2}``.

        Args:
            state: A current state.
            action: An action.

        Returns:
            next_state_distribution: Distribution over next states.
        """
        raise NotImplementedError("Override me")

    def sample_next_state(self, state, action, rng=np.random):
        """Draw one next state according to the transition distribution.

        Subclasses may override this when the explicit distribution is too
        large to enumerate.

        Args:
            state: A state from the state space.
            action: An action from the action space.
            rng: A random number generator exposing ``choice``.

        Returns:
            next_state: A sampled next state from the state space.
        """
        distribution = self.get_transition_distribution(state, action)
        # Split into parallel tuples so we can sample an index; the states
        # themselves may be arbitrary objects (e.g. nested tuples).
        candidates, weights = zip(*distribution.items())
        return candidates[rng.choice(len(candidates), p=weights)]


class SingleRowMDP(MDP):
    """Debugging MDP on a 1x5 strip of cells numbered 0 through 4.

    The agent is meant to start in the middle. Reaching the rightmost cell
    yields +10 and reaching the leftmost cell yields -10; both end the
    episode. The two actions are left (0) and right (1), and the chosen
    direction is reversed with probability 0.1.
    """

    @property
    def state_space(self):
        # A state is the agent's cell index along the strip.
        return set(range(5))

    @property
    def action_space(self):
        # 0 = left, 1 = right.
        return {0, 1}

    def get_transition_distribution(self, state, action):
        """Return ``{cell: prob}`` for the intended and the reversed move."""
        step = -1 if action != 1 else 1

        def clamp(cell):
            # Keep the agent on the strip.
            return min(max(cell, 0), 4)

        forward = clamp(state + step)
        backward = clamp(state - step)
        # The two outcomes are distinct for every cell on the strip.
        assert forward != backward
        return {forward: 0.9, backward: 0.1}

    def get_reward(self, state, action, next_state):
        """Return -10 / +10 on reaching the left / right end, else -1."""
        if next_state == 0:
            return -10
        # Any cell other than the two ends costs a living penalty.
        return 10 if next_state == 4 else -1

    def state_is_terminal(self, state):
        """The two ends of the strip are terminal."""
        return state in {0, 4}


class MarshmallowMDP(MDP):
    """The Marshmallow MDP described in lecture.

    A state is ``(hunger_level, marshmallow_remains)`` with hunger in
    ``{0, 1, 2}``. The horizon is 4 and the reward of a step is minus the
    square of the next hunger level.
    """

    @property
    def state_space(self):
        # (hunger level, marshmallow remains)
        return {(h, m) for h in {0, 1, 2} for m in {True, False}}

    @property
    def action_space(self):
        return {"eat", "wait"}

    @property
    def horizon(self):
        return 4

    def get_reward(self, state, action, next_state):
        """Return ``-(next hunger level) ** 2``."""
        next_hunger_level = next_state[0]
        return -(next_hunger_level**2)

    def get_transition_distribution(self, state, action):
        """Return a ``defaultdict`` mapping next states to probabilities.

        States that are absent from the result have probability 0.

        Args:
            state: A ``(hunger_level, marshmallow_remains)`` pair.
            action: ``"eat"`` or ``"wait"``.

        Returns:
            next_state_distribution: Distribution over next states.
        """
        hunger, has_marshmallow = state[0], state[1]

        # Update marshmallow deterministically: eating always leaves none.
        next_m = False if action == "eat" else has_marshmallow

        # Initialize next state distribution dict
        # Any state not included assumed to have 0 prob
        dist = defaultdict(float)

        # Update hunger
        if action == "wait" or not has_marshmallow:
            # Nothing is actually eaten (waiting, or "eating" with no
            # marshmallow left), so hunger drifts upward.
            # With 0.75 probability, hunger stays the same
            dist[(hunger, next_m)] += 0.75
            # With 0.25 probability, hunger increases by 1 (capped at 2);
            # at hunger 2 both outcomes land on the same key and sum to 1.
            dist[(min(hunger + 1, 2), next_m)] += 0.25
        else:
            assert action == "eat" and has_marshmallow
            # Hunger deterministically set to 0 after eating
            dist[(0, next_m)] = 1.0

        return dist


class ZitsMDP(MDP):
    """The Zits MDP described in lecture.

    States are the integers 0 through 4, the actions are ``"apply"`` and
    ``"sleep"``, and rewards are discounted by 0.9.
    """

    @property
    def state_space(self):
        return set(range(5))

    @property
    def action_space(self):
        return {"apply", "sleep"}

    @property
    def temporal_discount_factor(self):
        return 0.9

    def get_reward(self, state, action, next_state):
        """Return minus the next state, with an extra -1 for ``"apply"``."""
        if action != "apply":
            assert action == "sleep"
            return -next_state
        return -1 - next_state

    def get_transition_distribution(self, state, action):
        """Return ``{next_state: prob}`` for the given action.

        ``"apply"`` ignores the current state and leads to 0 (prob 0.8) or
        4 (prob 0.2). ``"sleep"`` moves one state up (prob 0.4) or one
        state down (prob 0.6), clamped to the range 0..4.
        """
        if action != "apply":
            assert action == "sleep"
            higher = min(state + 1, 4)
            lower = max(state - 1, 0)
            return {higher: 0.4, lower: 0.6}
        return {0: 0.8, 4: 0.2}


class ChaseMDP(MDP):
    """A 2D grid bunny chasing MDP.

    A state is a pair ``(agent_pos, goal_pos)`` where each position is a
    ``(row, col)`` cell of the grid described by ``obstacles`` and
    ``goal_pos`` is the bunny's cell. The agent's move is deterministic,
    the bunny moves randomly, and a state is terminal once both occupy the
    same cell.
    """

    @property
    def obstacles(self):
        """Grid array whose truthy cells are blocked."""
        return np.zeros((2, 3))  # by default, 2x3 grid with no obstacles

    @property
    def goal_reward(self):
        """Reward for a step that ends with the agent on the bunny's cell."""
        return 1

    @property
    def living_reward(self):
        """Reward for every other step."""
        return 0

    @property
    def height(self):
        """Number of grid rows."""
        return self.obstacles.shape[0]

    @property
    def width(self):
        """Number of grid columns."""
        return self.obstacles.shape[1]

    @property
    def state_space(self):
        """All ``(agent_pos, goal_pos)`` pairs over all grid cells."""
        # Read the dimensions once; each access rebuilds the obstacle grid.
        height, width = self.height, self.width
        pos = [(r, c) for r in range(height) for c in range(width)]
        return {(p1, p2) for p1 in pos for p2 in pos}

    @property
    def action_space(self):
        return {'up', 'down', 'left', 'right'}

    @property
    def temporal_discount_factor(self):
        return 0.9

    def action_to_delta(self, action):
        """Map an action name to its ``(d_row, d_col)`` offset."""
        return {
            'up': (-1, 0),  # up,
            'down': (1, 0),  # down,
            'left': (0, -1),  # left,
            'right': (0, 1),  # right,
        }[action]

    @staticmethod
    def _move(pos, delta, height, width, obstacles):
        """Return the cell reached from ``pos`` via ``delta``, or ``pos`` if blocked."""
        row, col = pos
        r, c = row + delta[0], col + delta[1]
        in_bounds = 0 <= r < height and 0 <= c < width
        # Stay in place if out of bounds or obstacle
        if in_bounds and not obstacles[r, c]:
            return (r, c)
        return (row, col)

    def get_transition_distribution(self, state, action):
        """Return a ``defaultdict`` mapping next states to probabilities.

        The agent moves one cell in the chosen direction (staying put if
        the move is blocked). The bunny stays with probability 0.5 and
        otherwise attempts one of the four moves uniformly at random; a
        blocked bunny move also leaves it in place.

        Args:
            state: An ``(agent_pos, goal_pos)`` pair.
            action: One of ``'up'``, ``'down'``, ``'left'``, ``'right'``.

        Returns:
            next_state_distribution: Distribution over next states.
        """
        # height/width/obstacles each rebuild the grid array on access, so
        # read them once per call instead of once per bounds check.
        height, width, obstacles = self.height, self.width, self.obstacles

        # Discrete distributions, represented with a dict
        # mapping next states to probs.
        next_state_dist = defaultdict(float)

        agent_pos, goal_pos = state

        # Get next agent state
        next_agent_pos = self._move(
            agent_pos, self.action_to_delta(action), height, width, obstacles)

        # Get next bunny state
        # Stay in same place with probability 0.5
        next_state_dist[(next_agent_pos, goal_pos)] += 0.5
        # Otherwise move
        for delta in [(-1, 0), (1, 0), (0, -1), (0, 1)]:
            next_goal_pos = self._move(goal_pos, delta, height, width, obstacles)
            next_state_dist[(next_agent_pos, next_goal_pos)] += 0.5*0.25

        return next_state_dist

    def get_reward(self, state, action, next_state):
        """Return ``goal_reward`` if the agent caught the bunny, else ``living_reward``."""
        agent_pos, goal_pos = next_state
        if agent_pos == goal_pos:
            return self.goal_reward
        return self.living_reward

    def state_is_terminal(self, state):
        """A state is terminal when agent and bunny share a cell."""
        agent_pos, goal_pos = state
        return agent_pos == goal_pos


class LargeChaseMDP(ChaseMDP):
    """Bunny chasing MDP on an 8x10 grid that contains obstacles."""

    @property
    def obstacles(self):
        """Return the 8x10 obstacle grid; cells equal to 1 are blocked."""
        layout = [
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 1, 0, 0, 0, 0, 1, 1],
            [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
            [0, 1, 0, 1, 1, 0, 1, 0, 0, 0],
            [0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
            [0, 1, 1, 0, 0, 0, 0, 1, 1, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        ]
        return np.array(layout)



## Problems

### Wait, Bellman, Backup!
Complete the implementation of the Bellman backup for an infinite or indefinite horizon MDP.

For reference, our solution is **12** lines of code.

In [None]:
def bellman_backup(s, V, mdp):
    """Look ahead one step and propose an update for the value of s.

    You can assume that the mdp is either infinite or indefinite
    horizon (that is, mdp.horizon is inf).

    It is possible to handle terminal states either here or in
    value iteration. For consistency with our solution, please
    handle terminal states in value iteration, not here.

    Args:
        s: A state.
        V: A dict, V[state] -> value. It is only read, never modified.
        mdp: An MDP.

    Returns:
        vs: new value estimate for s, i.e. the largest expected
            one-step return over all actions (-inf if the MDP has no
            actions).
    """
    gamma = mdp.temporal_discount_factor
    # Start from -inf rather than a finite sentinel so that arbitrarily
    # negative action values are never clipped.
    best = float("-inf")
    for action in mdp.action_space:
        # Accumulate with an explicit loop in distribution order so the
        # floating-point result does not depend on how sum() is implemented.
        expected_return = 0
        transition = mdp.get_transition_distribution(s, action)
        for next_state, prob in transition.items():
            reward = mdp.get_reward(s, action, next_state)
            expected_return += prob * (reward + gamma * V[next_state])
        best = max(best, expected_return)
    return best




Tests

In [None]:
def test1_bellman_backup():
    """Check single backups on SingleRowMDP from an all-zero value function."""
    mdp = SingleRowMDP()
    V = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}
    new_V_s = bellman_backup(3, V, mdp)
    # Bellman backup should not change V
    assert V == {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}
    # Compare with a tolerance: the backup sums floating-point products, so
    # exact equality would depend on the order of accumulation.
    assert abs(new_V_s - (0.9 * 10 + 0.1 * -1)) < 1e-5
    new_V_s = bellman_backup(2, V, mdp)
    assert abs(new_V_s - -1.) < 1e-5

test1_bellman_backup()
def test2_bellman_backup():
    """Check single backups on ZitsMDP from an all-zero value function."""
    mdp = ZitsMDP()
    V = {s : 0 for s in mdp.state_space}
    expected = {0: -0.4, 1: -0.8, 2: -1.8, 3: -1.8, 4: -1.8}
    for s, target in expected.items():
        # Compare with a tolerance: the backup sums floating-point products, so
        # exact equality would depend on the order of accumulation.
        assert abs(bellman_backup(s, V, mdp) - target) < 1e-5

test2_bellman_backup()
def test3_bellman_backup():
    """Check single backups on ZitsMDP from a non-trivial value function."""
    mdp = ZitsMDP()
    V = {0 : -0.1, 1: 0.1, 2: 5, 3: -4, 4: -2.2}
    expected = {0: -0.418, 1: 0.946, 2: -2.268, 3: -0.892, 4: -2.268}
    for s, target in expected.items():
        assert abs(bellman_backup(s, V, mdp) - target) < 1e-5

test3_bellman_backup()
print('Tests passed.')

Tests passed.


### There's Value in that Iteration
Complete the implementation of value iteration for an infinite or indefinite horizon MDP.

For reference, our solution is **19** lines of code.

In addition to all of the utilities defined at the top of the colab notebook, the following functions are available in this question environment: `bellman_backup`. You may not need to use all of them.

In [None]:
def value_iteration(mdp, max_num_iters=1000, change_threshold=0.0001):
    """Run value iteration for a certain number of iterations or until
    the max change between iterations is below a threshold.

    Specifically, this terminates when:
        (max_{s} |V(s) - V'(s)|) < change_threshold
    where V is the old value function estimate, V' is the new one,
    and |*| denotes absolute value.

    You can assume that the mdp is either infinite or indefinite
    horizon (that is, mdp.horizon is inf).

    Terminal states are never backed up: their value stays at its
    initial 0 for the whole run.

    Args:
        mdp: An MDP.
        max_num_iters: An int representing the maximum number of
            iterations to run value iteration before giving up.
        change_threshold: A float used to determine when value iteration
            has converged and it is safe to terminate.

    Returns:
        V:  A dict, V[state] -> value.
    """
    states = mdp.state_space
    values = {s: 0 for s in states}
    non_terminal = [s for s in states if not mdp.state_is_terminal(s)]

    for _ in range(max_num_iters):
        # Synchronous sweep: every backup reads the previous estimate, and
        # the new estimates are only committed once the sweep is complete.
        new_values = {s: bellman_backup(s, values, mdp) for s in non_terminal}
        changed = any(
            abs(new_values[s] - values[s]) >= change_threshold
            for s in non_terminal
        )
        values.update(new_values)
        if not changed:
            break

    return values
      


Tests

In [None]:
def test1_value_iteration():
    """Converged values on SingleRowMDP match the reference solution."""
    mdp = SingleRowMDP()
    expected_V = {0: 0.0, 1: 5.58531, 2: 8.31706, 3: 9.73170, 4: 0.0}
    V = value_iteration(mdp)
    for state in mdp.state_space:
        error = abs(V[state] - expected_V[state])
        assert error < 1e-4

test1_value_iteration()
def test2_value_iteration():
    """Converged values on ZitsMDP match the reference solution."""
    mdp = ZitsMDP()
    expected_V = {0: -6.40530, 1: -7.07368, 2: -7.81918, 3: -7.81918, 4: -7.81918}
    V = value_iteration(mdp)
    for state in mdp.state_space:
        error = abs(V[state] - expected_V[state])
        assert error < 1e-4


test2_value_iteration()
def test3_value_iteration():
    """Both stopping rules end the run after exactly one sweep."""
    mdp = SingleRowMDP()
    expected_V = {0: 0.0, 1: -1.9, 2: -1.0, 3: 8.9, 4: 0.0}
    # First stop via the iteration cap, then via an always-satisfied threshold.
    stopping_rules = (
        {"max_num_iters": 1},
        {"change_threshold": float("inf")},
    )
    for kwargs in stopping_rules:
        V = value_iteration(mdp, **kwargs)
        for state in mdp.state_space:
            assert abs(V[state] - expected_V[state]) < 1e-4

test3_value_iteration()
def test4_value_iteration():
    """Spot-check converged values on the default ChaseMDP."""
    partial_expected_V = {
        ((0, 1), (0, 1)): 0.0,
        ((0, 1), (1, 0)): 0.87506,
        ((1, 0), (0, 2)): 0.80601,
        ((0, 2), (1, 2)): 0.96536,
        ((1, 1), (0, 1)): 0.94896,
    }
    V = value_iteration(ChaseMDP())
    for state, target in partial_expected_V.items():
        assert abs(V[state] - target) < 1e-4

test4_value_iteration()
print('Tests passed.')

{0: 0, 1: -1.9, 2: -1.0, 3: 8.9, 4: 0}
{0: 0, 1: -2.8, 2: 6.82, 3: 8.8, 4: 0}
{0: 0, 1: 4.238, 2: 6.640000000000001, 3: 9.582, 4: 0}
{0: 0, 1: 4.0760000000000005, 2: 8.047600000000001, 3: 9.564, 4: 0}
{0: 0, 1: 5.342840000000001, 2: 8.0152, 3: 9.70476, 4: 0}
{0: 0, 1: 5.313680000000001, 2: 8.268568, 3: 9.70152, 4: 0}
{0: 0, 1: 5.5417112, 2: 8.262736, 3: 9.7268568, 4: 0}
{0: 0, 1: 5.5364624000000005, 2: 8.30834224, 3: 9.7262736, 4: 0}
{0: 0, 1: 5.577508016, 2: 8.307292480000001, 3: 9.730834224, 4: 0}
{0: 0, 1: 5.576563232000001, 2: 8.315501603200001, 3: 9.730729248, 4: 0}
{0: 0, 1: 5.583951442880001, 2: 8.315312646399999, 3: 9.73155016032, 4: 0}
{0: 0, 1: 5.583781381759999, 2: 8.316790288576, 3: 9.73153126464, 4: 0}
{0: 0, 1: 5.5851112597184, 2: 8.316756276351999, 3: 9.7316790288576, 4: 0}
{0: 0, 1: 5.585080648716799, 2: 8.317022251943682, 3: 9.7316756276352, 4: 0}
{0: 0, 1: 5.585320026749313, 2: 8.31701612974336, 3: 9.731702225194368, 4: 0}
{0: 0, 1: 5.585314516769024, 2: 8.31706400534

### Expectimax Search
Complete the implementation of expectimax search for a finite horizon MDP.

For reference, our solution is **15** lines of code.

In [None]:
def expectimax_search(initial_state, mdp, horizon):
    """Use expectimax search to determine a next action.

    Note that we're just computing the single next action to
    take, we do not need to store the entire partial V.

    Horizon is given as a separate argument so that we can use
    expectimax search with receding horizon control, for example,
    even if mdp.horizon is inf.

    Args:
        initial_state: A state in the mdp.
        mdp: An MDP.
        horizon: An int horizon. It must be finite: the search recurses
            until this depth is reached.

    Returns:
        action: An action in the mdp with the largest expected return
            over the lookahead; ties go to the action encountered first
            when iterating over mdp.action_space. None if the action
            space is empty.
    """
    gamma = mdp.temporal_discount_factor

    def state_value(state, t):
        # Nothing more can be collected once the lookahead is exhausted.
        # ">=" (rather than "==") also terminates the recursion when
        # horizon is 0.
        if t >= horizon:
            return 0
        # Start from -inf rather than a finite sentinel so arbitrarily
        # negative values are never clipped.
        best = float("-inf")
        for action in mdp.action_space:
            best = max(best, action_value(state, action, t))
        return best

    def action_value(state, action, t):
        # Accumulate with an explicit loop in distribution order so the
        # floating-point result does not depend on how sum() is implemented.
        total = 0
        transition = mdp.get_transition_distribution(state, action)
        for next_state, prob in transition.items():
            reward = mdp.get_reward(state, action, next_state)
            total += prob * (reward + gamma * state_value(next_state, t + 1))
        return total

    # max() evaluates the key exactly once per action, so each (expensive)
    # root action value is computed a single time.
    return max(
        mdp.action_space,
        key=lambda action: action_value(initial_state, action, 0),
        default=None,
    )

Tests

In [None]:
def test1_expectimax_search():
    """Check the chosen action on MarshmallowMDP for several horizons."""
    mdp = MarshmallowMDP()
    # (initial state, horizon, expected action)
    cases = [
        ((0, True), mdp.horizon, "wait"),
        ((0, True), 1, "eat"),
        ((1, True), mdp.horizon, "eat"),
        ((2, True), mdp.horizon, "eat"),
        ((1, True), 10, "wait"),
    ]
    for initial_state, horizon, expected_action in cases:
        assert expectimax_search(initial_state, mdp, horizon) == expected_action

test1_expectimax_search()
def test2_expectimax_search():
    """Check the chosen action on the default ChaseMDP."""
    mdp = ChaseMDP()
    # (initial state, horizon, acceptable actions)
    cases = [
        (((0, 0), (0, 1)), 1, ["right"]),
        (((0, 0), (0, 2)), 2, ["right"]),
        (((0, 0), (1, 0)), 1, ["down"]),
        (((0, 0), (1, 2)), 2, ["right", "down"]),
        (((1, 2), (0, 0)), 2, ["up", "left"]),
    ]
    for initial_state, horizon, acceptable in cases:
        assert expectimax_search(initial_state, mdp, horizon) in acceptable

test2_expectimax_search()
print('Tests passed.')

Tests passed.
