# Exercise 4.1. 
Implement policy evaluation (PE) in Matlab and compute the value function
for each of the four deterministic policies and problem parameters given in
**Exercise 3.3**. Check that the result obtained through PE is consistent with
the theoretical result obtained in the previous chapter. (This notebook
implements the exercise in Python with NumPy instead of Matlab.)

![img](imgs/Screenshot_2018-04-20_11-23-41.png)

In [1]:
import numpy as np
from numpy.linalg import inv
import itertools as IT
# Print arrays with two decimals and without scientific notation.
np.set_printoptions(precision=2, suppress=True)

# --- Problem parameters ---------------------------------------------------

# Problem sizes and the number of sweeps run by the iterative evaluators.
N_states = 2
N_actions = 2
N_steps = 400

# Discount factor.
gamma = 0.9

# Rewards and transition probabilities, one row per (state, action) pair.
# Rows are ordered state-major: row index = state * N_actions + action.
# The columns of P are the next states.
R = np.matrix([
    [-1.0],
    [0.6],
    [0.5],
    [-0.9],
])
P = np.matrix([
    [0.8, 0.2],
    [0.2, 0.8],
    [0.3, 0.7],
    [0.9, 0.1],
])

# The four deterministic policies. Row s of a policy holds a single 1 in the
# column of the (state, action) pair selected in state s.
pi1 = [
    [1, 0, 0, 0],
    [0, 0, 1, 0],
]
pi2 = [
    [0, 1, 0, 0],
    [0, 0, 1, 0],
]
pi3 = [
    [1, 0, 0, 0],
    [0, 0, 0, 1],
]
pi4 = [
    [0, 1, 0, 0],
    [0, 0, 0, 1],
]

Helper Functions

In [2]:
# function to evaluate the state value(V) function of a certain policy
def eval_v(policy):
    """Return the closed-form state values of `policy`.

    Computes (I - gamma * policy @ P)^-1 @ (policy @ R) with the module-level
    `P`, `R` and `gamma`.

    Args:
        policy: matrix-like that left-multiplies `P` and `R`; in this notebook
            one row per state with one column per (state, action) pair.

    Returns:
        The result of the matrix product above (a column with one value per
        row of `policy`).
    """
    # Reward and transition model averaged under the policy.
    reward_pi = np.matmul(policy, R)
    transition_pi = np.matmul(policy, P)

    n_rows = transition_pi.shape[0]
    system = np.identity(n_rows) - gamma * transition_pi
    return np.matmul(inv(system), reward_pi)


# function to evaluate the state-action(Q) value function of a certain policy
def eval_q(policy):
    """Return the closed-form state-action values of `policy`.

    Computes (I - gamma * P @ policy)^-1 @ R with the module-level `P`, `R`
    and `gamma`.

    Args:
        policy: matrix-like that right-multiplies `P`.

    Returns:
        The result of the matrix product above (one value per row of `R`).
    """
    # Transition model between (state, action) pairs: environment step
    # followed by the policy's choice in the next state.
    pair_transition = np.matmul(P, policy)

    size = pair_transition.shape[0]
    system = np.identity(size) - gamma * pair_transition
    return np.matmul(inv(system), R)

#rewards for a state
def R_state(state):
    """Return the N_actions consecutive rows of R that belong to `state`."""
    first = N_actions * state
    last = first + N_actions
    return R[first:last]

# transition probabilities for a state
def P_state(state):
    """Return the N_actions consecutive rows of P that belong to `state`."""
    first = N_actions * state
    rows = slice(first, first + N_actions)
    return P[rows, ]

def R_(state, action=None):
    """Look up rewards for `state`, optionally narrowed to one action.

    Args:
        state: index of the state.
        action: index of the action within the state, or None.

    Returns:
        `R_state(state)` when `action` is None, otherwise
        `R_state(state)[action]`.
    """
    rewards = R_state(state)
    # Identity test: `== None` would be evaluated elementwise if a NumPy
    # value were ever passed as `action`.
    if action is None:
        return rewards
    return rewards[action]
    
def P_(state, action=None, state_t1=None):
    """Look up transition probabilities for `state`.

    Args:
        state: index of the current state.
        action: index of the action within the state, or None for all actions.
        state_t1: index of the next state, or None for all next states.

    Returns:
        Indexing `rows = P_state(state)`:
        - `rows` when both `action` and `state_t1` are None,
        - `rows[:, state_t1]` when only `action` is None,
        - `rows[action, ]` when only `state_t1` is None,
        - `rows[action, state_t1]` when both are given.
    """
    rows = P_state(state)
    # Identity tests: index 0 is a valid action/state and must not be
    # confused with "not given"; `== None` is also unsafe for NumPy values.
    if action is None and state_t1 is None:
        return rows
    if action is None:
        return rows[:, state_t1]
    if state_t1 is None:
        return rows[action, ]
    return rows[action, state_t1]

def policy_(policy, state, action):
    """Return the entry of `policy` for taking `action` in `state`.

    Row `state` of the policy is read at column `state * N_actions + action`.
    """
    column = state * N_actions + action
    state_row = policy[state]
    return state_row[column]

![img](imgs/Screenshot_2018-05-07_13-33-43.png)

In [3]:
# We define the algorithm for policy evaluation with the state value function
def policy_evaluation_v(policy, tol=None):
    """Iterative policy evaluation for the state-value function.

    Sweeps over the states, updating each value in place with

        v[s] = sum_a policy_(policy, s, a)
               * (R_(s, a) + gamma * sum_s' P_(s, a, s') * v[s'])

    Args:
        policy: policy table in the layout read by `policy_`.
        tol: optional threshold on the largest per-state change in a sweep.
            When given, iteration stops after the first sweep whose largest
            change is below `tol`. With the default None, exactly `N_steps`
            sweeps are run.

    Returns:
        1-D float array with one value per state.
    """
    # 1
    v = np.zeros(N_states)
    # 2: N_steps is the (maximum) number of sweeps
    for _ in range(N_steps):
        # 3
        delta = 0.0
        # 4
        for state in range(N_states):
            # 5
            v_old = v[state]
            # 6
            v_new = 0.0
            for action in range(N_actions):
                # R_ can return a size-1 matrix/array; take the plain scalar
                # so the accumulator stays a float instead of a 1x1 matrix.
                reward = np.asarray(R_(state, action)).item()
                expected_next = sum(
                    P_(state, action, state_t1) * v[state_t1]
                    for state_t1 in range(N_states)
                )
                v_new += policy_(policy, state, action) * (
                    reward + gamma * expected_next
                )
            v[state] = v_new
            # 7
            delta = max(delta, abs(v_old - v[state]))
        # 8: optional early stop once a whole sweep barely changes v
        if tol is not None and delta < tol:
            break

    # 9
    return v

In [4]:
# Iterative state-value estimate for policy pi2.
policy_evaluation_v(pi2)

array([5.34, 5.25])

In [5]:
# Closed-form state values for policy pi2, for comparison with the iterative result.
eval_v(pi2)

matrix([[5.34],
        [5.25]])

![img](imgs/Screenshot_2018-05-07_13-34-23.png)

In [6]:
# We define the algorithm for policy evaluation with the state-action
# value function (Q)
def policy_evaluation_q(policy, tol=None):
    """Iterative policy evaluation for the state-action value function.

    Sweeps over all (state, action) pairs, updating each value in place with

        q[s, a] = R_(s, a) + gamma * sum_s' P_(s, a, s')
                  * sum_a' policy_(policy, s', a') * q[s', a']

    where the pair (s, a) is stored at flat index `s * N_actions + a`.

    Args:
        policy: policy table in the layout read by `policy_`.
        tol: optional threshold on the largest per-entry change in a sweep.
            When given, iteration stops after the first sweep whose largest
            change is below `tol`. With the default None, exactly `N_steps`
            sweeps are run.

    Returns:
        1-D float array of length `N_states * N_actions`.
    """
    # 1
    q = np.zeros(N_states * N_actions)
    # 2: N_steps is the (maximum) number of sweeps
    for _ in range(N_steps):
        # 3
        delta = 0.0
        # 4
        for state, action in IT.product(range(N_states), range(N_actions)):
            index = state * N_actions + action
            # 5
            q_old = q[index]
            # 6
            # R_ can return a size-1 matrix/array; start from the plain
            # scalar so the accumulator is a float, not a list or array.
            q_new = np.asarray(R_(state, action)).item()
            for state_t1 in range(N_states):
                next_value = sum(
                    policy_(policy, state_t1, action_t1)
                    * q[state_t1 * N_actions + action_t1]
                    for action_t1 in range(N_actions)
                )
                q_new += gamma * P_(state, action, state_t1) * next_value
            q[index] = q_new
            # 7
            delta = max(delta, abs(q_old - q[index]))
        # 8: optional early stop once a whole sweep barely changes q
        if tol is not None and delta < tol:
            break

    # 9
    return q

In [7]:
# Iterative state-action value estimate for policy pi2.
policy_evaluation_q(pi2)

array([3.79, 5.34, 5.25, 3.9 ])

In [8]:
# Closed-form state-action values for policy pi2, for comparison with the iterative result.
eval_q(pi2)

matrix([[3.79],
        [5.34],
        [5.25],
        [3.9 ]])