# Write-up and code for Jan 18

## Policy Evaluation

In [1]:
def policy_eval(mdp: MDP, policy: Policy, n_iter: int) -> state_value_function:
    # implementation of policy evaluation
    vf = {s: 0. for s in mdp.States}
    for i in range(n_iter):
        new_vf = {}
        for s in mdp.States:
            new_vf[s] = 0
            for a in policy[s]:
                new_vf[s] += policy[s][a]*mdp.R[s][a]
                for sp in mdp.P[s][a]:
                    new_vf[s] += policy[s][a]*mdp.gamma*mdp.P[s][a][sp]*vf[sp]
        vf = new_vf
    return vf

NameError: name 'MDP' is not defined

### Example Policy Evaluation
We continue in the example from the last assignment.

In [5]:
def is_in_grid(state: Tuple[int, int], size: int) -> bool:
    # helper function to check whether a state is in the grid
    return  state[0] >= 0 and state[0] < size and state[1] >= 0 and state[1] < size


def get_neighbor_states(state: Tuple[int, int], size: int) -> Set[Tuple[int, int]]:
    # function to return a set of neighboring states in the grid
    nbr_states = set()
    
    up_state = s[0]-1, s[1]
    if is_in_grid(up_state, size):
        nbr_states.add(up_state)
        
    down_state = s[0]+1, s[1]
    if is_in_grid(down_state, size):
        nbr_states.add(down_state)
        
    left_state = s[0], s[1]-1
    if is_in_grid(left_state, size):
        nbr_states.add(left_state)
        
    right_state = s[0], s[1]+1
    if is_in_grid(right_state, size):
        nbr_states.add(right_state)
    
    return nbr_states


def get_neighbor_direction(s: S, sp: S) -> int:
    # function to figure out in which direction the state sp is from state s
    # assume that both states are adjacent
    if s[1] > sp[1]:
        # sp is to the left of s
        return 1
    elif s[1] < sp[1]:
        # sp is to the right of s
        return 2
    elif s[0] > sp[0]:
        # sp is above s
        return 3
    elif s[0] < sp[0]:
        # sp is below s
        return 4
    else:
        # sp is equal to s
        return 0

In [6]:
# define the gridworld parameters
States = set()
P = {}
A = set()
for i in range(4):
    # 1 is move left, 2 is move right, 3 is up, 4 is down
    A.add(i+1)
    for j in range(4):
        state = (i, j)
        States.add(state)

for s in States:
    P[s] = {}
    nbrs = get_neighbor_states(s, 4)
    for a in A:
        P[s][a] = {}
        if s == (0,0) or s == (3,3):
            P[s][a][s] = 1.0
        else:
            agg_p = 0
            for sp in nbrs:
                if get_neighbor_direction(s, sp) == a:
                    P[s][a][sp] = 0.7
                    agg_p += 0.7
                else:
                    P[s][a][sp] = 0.1
                    agg_p += 0.1
            if len(nbrs) < 4:
                P[s][a][s] = 1. - agg_p

In [7]:
# here the reward is just a function of the current state
R = {}
for s in States:
    R[s] = {}
    for a in A:
        if s == (0,0) or s == (3,3):
            R[s][a] = 3.
        elif s == (1,2):
            R[s][a] = -2.
        else:
            R[s][a] = 0.
gamma = 0.9

In [8]:
mdp = MDP(States, P, A, R, gamma)

In [9]:
policy = {}
for s in mdp.States:
    policy[s] = {}
    for a in A:
        if a == 2:
            policy[s][a] = 1.0
        else:
            policy[s][a] = 0.

In [10]:
vf = policy_eval(mdp, policy, 10)
for s in sorted(vf):
    print(s, vf[s])

(0, 0) 19.539646797000003
(0, 1) 1.5913445137792583
(0, 2) -0.13172112051440504
(0, 3) -0.09178616625516067
(1, 0) 1.3081295711322876
(1, 1) -0.7832546586645136
(1, 2) -1.782277895789884
(1, 3) 0.20173080195345502
(2, 0) 2.229352425175423
(2, 1) 2.8046556591403595
(2, 2) 3.612483600758209
(2, 3) 4.372277018613442
(3, 0) 6.560786640943631
(3, 1) 9.496772213748145
(3, 2) 13.770260155040685
(3, 3) 19.539646797000003


In [11]:
vf = policy_eval(mdp, policy, 100)
for s in sorted(vf):
    print(s, vf[s])

(0, 0) 29.99920315803337
(0, 1) 4.338507931135843
(0, 2) 1.6312225534949574
(0, 3) 1.5574269668684466
(1, 0) 5.457869447378502
(1, 1) 2.45013666158467
(1, 2) 1.2537218449109215
(1, 3) 3.214849992506807
(2, 0) 8.03575457318074
(2, 1) 8.652085375090703
(2, 2) 9.567927444793996
(2, 3) 10.406227956156409
(3, 0) 15.228759092018938
(3, 1) 18.673717484120644
(3, 2) 23.56172219688203
(3, 3) 29.99920315803337


We can see how the value function converges. After 10 iterations it differs significantly from the exact value which we saw in the last assignment. After 100 iterations it is very close to the true value function.

## Policy Iteration

In [12]:
def policy_iter(mdp: MDP, policy: Policy, n_iter: int) -> Policy:
    for i in range(n_iter):
        new_policy = policy
        v = policy_eval(mdp, policy, n_iter)
        for s in mdp.States:
            best_value = -1000000
            best_a: A
            for a in mdp.P[s]:
                # reinitialize the new policy
                new_policy[s][a] = 0.
                value = mdp.R[s][a]
                for sp in mdp.P[s][a]:
                    value += mdp.gamma*mdp.P[s][a][sp]*v[sp]
                if value > best_value:
                    best_value = value
                    best_a = a
            # make the policy deterministic
            new_policy[s][best_a] = 1.0
                    
        policy = new_policy
    return policy

### Example Policy Iteration
Implement Policy Iteration on the previous example.

In [13]:
new_policy = policy_iter(mdp, policy, 100)
value_policy = policy_eval(mdp, new_policy, 100)

In [14]:
for s in sorted(new_policy):
    print(s, new_policy[s], value_policy[s])

(0, 0) {1: 1.0, 2: 0.0, 3: 0.0, 4: 0.0} 29.99920315803337
(0, 1) {1: 1.0, 2: 0.0, 3: 0.0, 4: 0.0} 24.915096445321854
(0, 2) {1: 1.0, 2: 0.0, 3: 0.0, 4: 0.0} 20.729548395641018
(0, 3) {1: 1.0, 2: 0.0, 3: 0.0, 4: 0.0} 18.20145751300594
(1, 0) {1: 0.0, 2: 0.0, 3: 1.0, 4: 0.0} 24.93875485970657
(1, 1) {1: 1.0, 2: 0.0, 3: 0.0, 4: 0.0} 21.196223380787025
(1, 2) {1: 0.0, 2: 0.0, 3: 0.0, 4: 1.0} 16.99251986118538
(1, 3) {1: 0.0, 2: 0.0, 3: 0.0, 4: 1.0} 20.729548395641018
(2, 0) {1: 0.0, 2: 0.0, 3: 1.0, 4: 0.0} 20.968761252197577
(2, 1) {1: 0.0, 2: 1.0, 3: 0.0, 4: 0.0} 19.03557817536556
(2, 2) {1: 0.0, 2: 0.0, 3: 0.0, 4: 1.0} 21.19622338078703
(2, 3) {1: 0.0, 2: 0.0, 3: 0.0, 4: 1.0} 24.915096445321854
(3, 0) {1: 0.0, 2: 1.0, 3: 0.0, 4: 0.0} 18.411498069982436
(3, 1) {1: 0.0, 2: 1.0, 3: 0.0, 4: 0.0} 20.968761252197577
(3, 2) {1: 0.0, 2: 1.0, 3: 0.0, 4: 0.0} 24.93875485970657
(3, 3) {1: 1.0, 2: 0.0, 3: 0.0, 4: 0.0} 29.99920315803337


Above we can see how the value of being in each state increases significantly compared to the original policy of just moving right.

## Value Iteration

In [15]:
def value_iter(mdp: MDP, n_iter: int) -> state_value_function:
    # implementation of value iteration, the code is very similar to that of policy iteration
    # the difference is what kind of information we store
    v = {}
    for s in mdp.States:
        # initialize the value function
        v[s] = 0
    
    for i in range(n_iter):
        # initialize new dictionary to store the values for this iteration
        new_v = {}
        for s in mdp.States:
            # variable to store the best value when looping over the actions
            best_value = -1000000
            for a in mdp.P[s]:
                # variable for storing the value for action a
                value = mdp.R[s][a]
                for sp in mdp.P[s][a]:
                    value += mdp.gamma*mdp.P[s][a][sp]*v[sp]
                if value > best_value:
                    best_value = value
            # store the best value
            new_v[s] = best_value
        # copy the value function    
        v = new_v

    return v

### Example Value Iteration
We will once again reuse the previous gridworld example.

In [16]:
# compare the value funcitons we get by using value iteration and policy iteration and evaluation
value_vi = value_iter(mdp, 100)
value_policy = policy_eval(mdp, new_policy, 100)

In [17]:
for s in sorted(value_vi):
    print("State: {}, ".format(s), "value iteration: {0:.4f}, policy iter/eval: {0:.4f}".format(value_vi[s], value_policy[s]))

State: (0, 0),  value iteration: 29.9992, policy iter/eval: 29.9992
State: (0, 1),  value iteration: 24.9151, policy iter/eval: 24.9151
State: (0, 2),  value iteration: 20.7295, policy iter/eval: 20.7295
State: (0, 3),  value iteration: 18.2015, policy iter/eval: 18.2015
State: (1, 0),  value iteration: 24.9388, policy iter/eval: 24.9388
State: (1, 1),  value iteration: 21.1962, policy iter/eval: 21.1962
State: (1, 2),  value iteration: 16.9925, policy iter/eval: 16.9925
State: (1, 3),  value iteration: 20.7295, policy iter/eval: 20.7295
State: (2, 0),  value iteration: 20.9688, policy iter/eval: 20.9688
State: (2, 1),  value iteration: 19.0356, policy iter/eval: 19.0356
State: (2, 2),  value iteration: 21.1962, policy iter/eval: 21.1962
State: (2, 3),  value iteration: 24.9151, policy iter/eval: 24.9151
State: (3, 0),  value iteration: 18.4115, policy iter/eval: 18.4115
State: (3, 1),  value iteration: 20.9688, policy iter/eval: 20.9688
State: (3, 2),  value iteration: 24.9388, policy

We can clearly see that both methods produce the same result.

## Appendix code
Code needed from previous assignments

In [1]:
from typing import TypeVar

S = TypeVar('S')
A = TypeVar('A')

In [2]:
from typing import NamedTuple, Any, Dict, Tuple, Set, Union
import numpy as np

class MP(NamedTuple):
    States: Set[S]
    P: Dict[S, Dict[S, float]]
        
        
class MRP(NamedTuple):
    mp: MP
    R: Union[Dict[S, float], Dict[S, Dict[S, float]]]
    gamma: float

In [3]:
class MDP(NamedTuple):
    States: Set[S]
    # the transitions depend on s, a, and s'
    # mapping from a state to a mapping of an action to a mapping of a state to a float (probability)
    P: Dict[S, Dict[A, Dict[S, float]]]
    Actions: A
    # reward is a function of the current state,  and the action
    R: Union[Dict[S, Dict[A, float]], Dict[S, Dict[A, Dict[S, float]]]]
    gamma: float

        
class Policy(NamedTuple):
    # state to action to a probability
    pi: Dict[S, Dict[A, float]]
        

class state_value_function(NamedTuple):
    vf: Dict[S,  float]
        
        
class action_value_function(NamedTuple):
    vf: Dict[S, Dict[A, float]]