# Exercise 4.2.
Implement PI in Matlab and compute the optimal policy and
optimal value function for the problem described in Exercise 3.3. Compare
the results obtained through PI with the theoretical result obtained in Exercise 3.3.
![img](imgs/Screenshot_2018-04-20_11-23-41.png)

In [1]:
import numpy as np
from random import uniform
np.set_printoptions(precision=2, suppress=True)
# Problem definition of the two-state, two-action MDP.
# The helper functions below index the rows of R and P by
# state * N_actions + action; the columns of P are the successor states.
N_states = 2
N_actions = 2

# Factor applied to the next-state values in every Bellman backup.
gamma = 0.9

# Reward of each (state, action) pair, one row per pair.
R = np.matrix([
    [-1],
    [0.6],
    [0.5],
    [-0.9],
])

# Transition probabilities of each (state, action) pair, one row per pair.
P = np.matrix([
    [0.8, 0.2],
    [0.2, 0.8],
    [0.3, 0.7],
    [0.9, 0.1],
])

# Number of sweeps performed by the iterative policy evaluation routines.
N_steps = 400
# NOTE(review): the two budgets below are not referenced by the visible code.
N_steps_pe = 20
N_steps_pi = 50

Helper Functions

In [2]:
from numpy.linalg import inv
import itertools as IT


def eval_v(policy):
    """Return the state-value function of ``policy`` in closed form.

    The module-level ``P`` and ``R`` are left-multiplied by ``policy`` to get
    the transition matrix and reward vector induced by the policy, and the
    linear system v = R_pi + gamma * P_pi * v is solved by inverting
    (I - gamma * P_pi).
    """
    induced_transitions = np.matmul(policy, P)
    induced_rewards = np.matmul(policy, R)

    size = induced_transitions.shape[0]
    system = np.identity(size) - gamma * induced_transitions
    return np.matmul(inv(system), induced_rewards)


def eval_q(policy):
    """Return the state-action value function of ``policy`` in closed form.

    Solves q = R + gamma * (P * policy) * q by inverting
    (I - gamma * P * policy), using the module-level ``P``, ``R`` and
    ``gamma``.
    """
    # Transition matrix between (state, action) pairs under the policy.
    pair_transitions = np.matmul(P, policy)

    size = pair_transitions.shape[0]
    system = np.identity(size) - gamma * pair_transitions
    return np.matmul(inv(system), R)


def R_state(state):
    """Return the rows of ``R`` belonging to ``state`` (one row per action)."""
    first_row = N_actions * state
    last_row = first_row + N_actions
    return R[first_row:last_row]


def P_state(state):
    """Return the rows of ``P`` belonging to ``state`` (one row per action)."""
    first_row = N_actions * state
    last_row = first_row + N_actions
    return P[first_row:last_row, :]


def R_(state, action=None):
    """Return the reward(s) of ``state``.

    Without ``action`` the whole reward block of the state is returned, as
    produced by ``R_state``. With ``action`` only the entry for that action
    is returned, obtained by indexing that block with ``action``.
    """
    rewards = R_state(state)
    # Identity test: action 0 is a valid action and must not be mistaken for
    # "no action given".
    if action is None:
        return rewards
    return rewards[action]


def P_(state, action=None, state_t1=None):
    """Return transition probabilities out of ``state``.

    The result is taken from the block returned by ``P_state(state)``:

    * neither ``action`` nor ``state_t1`` given: the whole block;
    * only ``state_t1`` given: the column for that successor state;
    * only ``action`` given: the row for that action;
    * both given: the single entry ``[action, state_t1]``.
    """
    block = P_state(state)
    # Identity tests: index 0 is a valid action / successor state and must
    # not be mistaken for "not given".
    if action is None and state_t1 is None:
        return block
    if action is None:
        return block[:, state_t1]
    if state_t1 is None:
        return block[action, :]
    return block[action, state_t1]


def policy_(policy, state, action=None):
    """Return the action probabilities that ``policy`` assigns in ``state``.

    Row ``state`` of ``policy`` is read at columns
    ``state * N_actions + action``. With ``action`` given, that single entry
    is returned; otherwise a list with the entries of all ``N_actions``
    actions is returned.
    """
    offset = state * N_actions
    if action is None:
        return [policy[state][offset + index] for index in range(N_actions)]
    return policy[state][offset + action]
        

# Iterative policy evaluation for the state value function
def policy_evaluation_v(policy, tol=None):
    """Estimate the state-value function of ``policy`` by repeated sweeps.

    Every sweep updates each state in place with the Bellman expectation
    backup, so later states in the same sweep already see the new values of
    earlier ones.

    Parameters
    ----------
    policy
        Policy table read through ``policy_``.
    tol
        Optional stop threshold. When given, iteration stops after the first
        sweep whose largest absolute value change is below ``tol``. With the
        default ``None`` exactly ``N_steps`` sweeps are performed.

    Returns
    -------
    numpy.ndarray
        One value per state, shape ``(N_states,)``.
    """
    # 1
    v = np.zeros(N_states)
    # 2, 8: at most N_steps sweeps; early exit only when tol is given
    for _ in range(N_steps):
        # 3
        delta = 0.0
        # 4
        for state in range(N_states):
            # 5
            v_old = v[state]
            # 6
            v_new = 0.0
            for action in range(N_actions):
                expected_next = sum(
                    P_(state, action, state_t1) * v[state_t1]
                    for state_t1 in range(N_states))
                # R_ may hand back a 1x1 matrix (R is an np.matrix); take the
                # plain scalar so that storing into v never relies on NumPy's
                # deprecated implicit size-1-array-to-scalar conversion.
                reward = np.asarray(R_(state, action)).item()
                v_new += policy_(policy, state, action) * (
                    reward + gamma * expected_next)
            v[state] = v_new
            # 7
            delta = max(delta, abs(v_old - v[state]))
        # 8
        if tol is not None and delta < tol:
            break

    # 9
    return v


# Iterative policy evaluation for the state-action value function
def policy_evaluation_q(policy, tol=None):
    """Estimate the state-action value function of ``policy`` by sweeps.

    Every sweep updates each (state, action) pair in place with the Bellman
    expectation backup; the entry of a pair is stored at index
    ``state * N_actions + action``.

    Parameters
    ----------
    policy
        Policy table read through ``policy_``.
    tol
        Optional stop threshold. When given, iteration stops after the first
        sweep whose largest absolute value change is below ``tol``. With the
        default ``None`` exactly ``N_steps`` sweeps are performed.

    Returns
    -------
    numpy.ndarray
        One value per (state, action) pair, shape ``(N_states * N_actions,)``.
    """
    # 1
    q = np.zeros(N_states * N_actions)
    # 2, 8: at most N_steps sweeps; early exit only when tol is given
    for _ in range(N_steps):
        # 3
        delta = 0.0
        # 4
        for state, action in IT.product(range(N_states), range(N_actions)):
            index = state * N_actions + action
            # 5
            q_old = q[index]
            # 6
            # R_ may hand back a 1x1 matrix (R is an np.matrix); start from
            # the plain scalar instead of wrapping it in a list, so the
            # accumulation stays scalar arithmetic all the way through.
            q_new = np.asarray(R_(state, action)).item()
            for state_t1 in range(N_states):
                next_value = sum(
                    policy_(policy, state_t1, action_t1)
                    * q[state_t1 * N_actions + action_t1]
                    for action_t1 in range(N_actions))
                q_new += gamma * P_(state, action, state_t1) * next_value
            q[index] = q_new
            # 7
            delta = max(delta, abs(q_old - q[index]))
        # 8
        if tol is not None and delta < tol:
            break

    return q

## Generalized Policy Iteration framework
![img](imgs/Screenshot_2018-05-07_14-07-30.png)

## Policy Iteration for V functions
![img](imgs/Screenshot_2018-05-07_14-07-51.png)

In [3]:
# Policy iteration for state value function
def policy_iteration_v(policy, debug=False):
    """Run policy iteration driven by the state-value function.

    Alternates ``policy_evaluation_v`` with a greedy improvement sweep over
    all states, and stops after the first sweep that leaves the action
    probabilities of every state unchanged.

    ``policy`` is modified in place (the row of each visited state is
    overwritten with a one-hot row) and the same object is returned. With
    ``debug=True`` the intermediate values are printed.
    """
    def trace(*parts):
        # Print only when step-by-step output was requested.
        if debug:
            print(*parts)

    v = np.zeros(N_states)  # 1
    policy_stable = False
    while not policy_stable:  # 2,3
        v = policy_evaluation_v(policy)  # 4-9
        trace('\n v:', v)
        policy_stable = True  # 10
        trace('For each state: ')
        for s in range(N_states):  # 11
            trace('\n s:', s)
            old_probs = policy_(policy, s)  # 12
            trace('a:', old_probs)

            # 13: one-step lookahead value of every action, then greedy pick
            lookahead = []
            for a_t1 in range(N_actions):
                expected_v = np.sum(
                    [np.dot(P_(s, a_t1, s_t1), v[s_t1]) for s_t1 in range(N_states)])
                lookahead.append(R_(s, a_t1) + gamma * expected_v)
            arg_max = np.argmax(lookahead)
            trace('arg_max:', arg_max)
            policy[s] = [0] * len(policy[s])
            policy[s][N_actions * s + arg_max] = 1

            trace('pi[s]:', policy[s])
            new_probs = policy_(policy, s)
            trace('policy_(policy, s):', new_probs)
            if old_probs != new_probs:
                policy_stable = False  # 14
                trace('a not equal to policy_(policy, s)')

            trace('theta:', policy_stable)
        trace('\n pi:', policy)
    return policy  # 15

*-  I leave the debug prints in the code, guarded by the `debug` flag, in case they are needed for debugging.*

*-  Writing this method was pretty intricate: expressing line #13 as a single expression would have required nesting a lot of calls. The way it is written is a compromise between readability and space (which also affects readability, after all).*

*- To show that they work, I will print their inner workings step by step below.*

In [4]:
# Draw, for each of the two states, the probability of taking its first action.
p, q = uniform(0, 1), uniform(0, 1)
# Build the policy table from those two probabilities; the zero entries keep
# each state from selecting the columns that belong to the other state.
pi_random = [
    [p, 1 - p, 0, 0],
    [0, 0, q, 1 - q],
]
print('pi_random: \n', np.array(pi_random))
policy_iteration_v(pi_random)
print('\n policy iteration with v: \n',
      np.array(policy_iteration_v(pi_random)))

pi_random: 
 [[0.68 0.32 0.   0.  ]
 [0.   0.   0.56 0.44]]

 policy iteration with v: 
 [[0 1 0 0]
 [0 0 1 0]]


## Policy Iteration for Q functions
![img](imgs/Screenshot_2018-05-07_14-08-03.png)

In [5]:
# Policy iteration for state-action value function
def policy_iteration_q(policy, debug=False):
    """Run policy iteration driven by the state-action value function.

    Alternates ``policy_evaluation_q`` with a greedy improvement sweep over
    all states, and stops after the first sweep that leaves the action
    probabilities of every state unchanged.

    ``policy`` is modified in place (the row of each visited state is
    overwritten with a one-hot row) and the same object is returned. With
    ``debug=True`` the intermediate values are printed.
    """
    def trace(*parts):
        # Print only when step-by-step output was requested.
        if debug:
            print(*parts)

    q = np.zeros(N_states * N_actions)  # 1
    policy_stable = False
    while not policy_stable:  # 2,3
        q = policy_evaluation_q(policy)  # 4-9
        trace('\n q:', q)
        policy_stable = True  # 10
        for s in range(N_states):  # 11
            trace('\n s:', s)
            old_probs = policy_(policy, s)  # 12
            trace('a:', old_probs)

            # 13: the q entries of state s are stored contiguously
            first = s * N_actions
            arg_max = np.argmax(q[first:first + N_actions])
            trace('arg_max:', arg_max)
            policy[s] = [0] * len(policy[s])
            policy[s][N_actions * s + arg_max] = 1

            trace('pi[s]:', policy[s])
            new_probs = policy_(policy, s)
            trace('policy_(policy, s):', new_probs)
            if old_probs != new_probs:
                policy_stable = False  # 14
                trace('a not equal to policy_(policy, s)')

            trace('theta:', policy_stable)
        trace('pi:', policy)
    return policy  # 15


*- This was much easier to implement because we don't have to loop over both actions and states, only over actions: q already contains this information.*

*- To show that they work, I will print their inner workings step by step below.*

In [6]:
# Generate, for each of the two states, the probability of taking its first action.
p = uniform(0, 1)
q = uniform(0, 1)
# Build the policy from those 2 probabilities, making sure no invalid actions are taken.
pi_random = [[p, 1 - p, 0, 0], [0, 0, q, 1 - q]]
print('pi_random: \n', np.array(pi_random))
# This cell demonstrates the Q-based algorithm, so both the label and the call
# refer to policy_iteration_q. It updates pi_random in place and returns it,
# so a single call is enough.
print('\n policy iteration with q: \n', np.array(policy_iteration_q(pi_random)))

pi_random: 
 [[0.19 0.81 0.   0.  ]
 [0.   0.   0.28 0.72]]

 policy iteration with v: 
 [[0 1 0 0]
 [0 0 1 0]]


## Step by step policy iteration with V

In [None]:
# Draw, for each of the two states, the probability of taking its first action.
p, q = uniform(0, 1), uniform(0, 1)
# Build the policy table from those two probabilities; the zero entries keep
# each state from selecting the columns that belong to the other state.
pi_random = [
    [p, 1 - p, 0, 0],
    [0, 0, q, 1 - q],
]
print('pi_random: \n', np.array(pi_random))

# debug=True makes the routine print every intermediate step.
print('\n policy iteration with v: \n',
      np.array(policy_iteration_v(pi_random, True)))

## Step by step policy iteration with Q

In [None]:
# Generate, for each of the two states, the probability of taking its first action.
p = uniform(0, 1)
q = uniform(0, 1)
# Build the policy from those 2 probabilities, making sure no invalid actions are taken.
pi_random = [[p, 1 - p, 0, 0], [0, 0, q, 1 - q]]
print('pi_random: \n', np.array(pi_random))

# This cell is the step-by-step trace of the Q-based algorithm, so it calls
# policy_iteration_q (with debug output enabled) and labels the result with q.
print('\n policy iteration with q: \n', np.array(policy_iteration_q(pi_random, True)))