# Exercise 5.3. 

![img](imgs/Screenshot_2018-05-21_20-37-36.png)
![img](imgs/Screenshot_2018-05-21_20-38-07.png)

In [1]:
import numpy as np
import itertools as IT
np.set_printoptions(precision=2, suppress=True)
from set_up_random_walk import Random_Walk

env = Random_Walk()

#initial parameters
gamma = env.gamma
alpha = env.alpha
R = env.R
P = env.P

N_states = env.N_states
N_actions = env.N_actions
terminal_states = env.terminal_states

v_ini = env.v_ini
q_ini = env.q_ini

pi_rp = env.pi_rp
pi_opt = env.pi_opt
pi_off = env.pi_off

In [2]:
from numpy.linalg import inv

# function to evaluate the state value(V) function of a certain policy
def eval_v(policy):
    P_pi = np.matmul(policy, P)
    R_pi = np.matmul(policy, R)
    return np.matmul(inv(np.identity(P_pi.shape[0]) - gamma * P_pi), R_pi)

# function to evaluate the state-action(Q) value function of a certain policy
def eval_q(policy):
    # product of the transition matrix with the policy
    P_aux = np.matmul(P, policy)
    return np.matmul(inv(np.identity(P_aux.shape[0]) - gamma * P_aux), R)

# rewards for a state
def R_state(state):
    return R[N_actions * state: N_actions * state + N_actions]

# transition probabilities for a state
def P_state(state):
    return np.flip(P[N_actions * state: N_actions * (state+ 1) ,],0)
    #return P[N_actions * state: N_actions * (state+ 1) ,]

def R_(state, action=None):
    if action == None:
        return R_state(state)
    else:
        return R_state(state)[action]

def P_(state, action=None, state_t1=None):
    if action == None:
        if state_t1 == None:
            return P_state(state)
        else:
            return P_state(state)[:, state_t1]
    else:
        if state_t1 == None:
            return P_state(state)[action,]
        else:
            return P_state(state)[action, state_t1]

def policy_(policy, state, action=None): 
    # We have to treat them different if they are lists or numpy matrices
    if type(policy)==list:
        if action is not None:
            return policy[state][state * N_actions + action]
        else:
            return [policy[state][state * N_actions + action_] for action_ in reversed(range(N_actions))]
    else:
        if action is not None:
            return policy[state,state * N_actions + action]
        else:
            return [policy[state,state * N_actions + action_] for action_ in reversed(range(N_actions))]

def generate_episode(policy, starting_pos=None):
    episode = []
    #if we dont have a starting position we choose randomly from the states
    s_t0 = np.zeros(N_states)
    if starting_pos==None:
        random_start = np.random.randint(N_states)
        s_t0[random_start] = 1
    elif isinstance(starting_pos, list): # when given an array
        #print('array')
        s_t0 = starting_pos
    elif isinstance(starting_pos, int):  # when given a number
        #print('number')
        random_start = starting_pos
        s_t0[random_start] = 1
    else:                                # assume is ndarray, lazy programming, sry
        #print('ndarray')                 # it wont work tho
        s_t0 = starting_pos
     
    s = np.random.choice(N_states, p=s_t0)
    r = None
    while s not in terminal_states:
        a = np.random.choice(N_actions, size=1, p=policy_(policy,s))
        r = R_(s,a)
        episode.append({'s':s,'a':a[0],'r':np.array(r)[0,0]})
        
        #next state is chosen from the transition matrix from state s taken action a
        #s = np.random.choice(N_states, p=np.asarray(P_(s,a)).ravel())
        s = np.random.choice(N_states, p=np.asarray(P_(s,a)).ravel())
    episode.append({'s':s,'a':0,'r':0})
    return episode

![img](imgs/4.3.3.png)

The TD method

In [3]:
N_episodes = 800

def temporal_difference_v(policy):
    v = np.zeros(N_states)#1
    for _ in range(N_episodes):#2,10
        random_start = np.random.randint(N_states)#3
        s = random_start
        while s not in terminal_states:#4,9
            a = np.random.choice(N_actions, size=1, p=policy_(policy,s))#5
            r = R_(s,a)#6
            s_t1 = np.random.choice(N_states, p=np.asarray(P_(s,a)).ravel())#6
            v[s] = v[s] + alpha*(r+v[s_t1]-v[s])#7
            s = s_t1#8
    return v #11

The montecarlo method

In [4]:
def every_visit_MC_policy_evaluation_v(policy,starting_pos=None):
    i=0
    v = np.zeros(N_states)#1
    N = np.zeros(N_states)#2
    delta = float('inf')
    epsilon = 0.0001
    g=0
    while delta > epsilon:#3,13
        delta = 0 #4
        """Generate an episode using pi: {s0,a0,r1,....,sT-1,aT-2,rT} 
        Each episode is a list of dictionaries containing 's_t','a_t','r_t+1'
        
        episode = [{'s_t':s,'a_t':a,'r_t+1':r},....]
        T is the length of the episode
        """
        episode = generate_episode(policy,starting_pos)#5
        i+=1
        T = len(episode)
        #print('episode ', i,' length T: ',T)
        g = 0 #6
        for t in reversed(range(T)): #7
            
            s = episode[t]['s']
            r = episode[t]['r']
            g = gamma*g + r #8
            #print( '   s?',s,'a?',episode[t]['a'],'r?',r,'g?', g)
            N[s] += 1 #9
            v_old = v[s] #10
            v[s] = v[s] + (1/N[s])*(g - v[s]) #11
            delta = max(delta, np.abs(v_old-v[s])) #12
            
        #print(delta)
    return v  #14

In [15]:
v = temporal_difference_v(pi_rp)
print('\nSolving the with TD methods:\n')
print(v)
#If the V function is 0 repeat because it has not found any reward in the first episode and the delta still is 0
v_MC = every_visit_MC_policy_evaluation_v(pi_rp)
print('\nMonte-Carlo Policy evaluation every visit:\n')
print(v_MC)


Solving the with TD methods:

[0.   0.12 0.26 0.41 0.59 0.8  0.  ]

Monte-Carlo Policy evaluation every visit:

[0. 0. 0. 0. 0. 0. 0.]


*Again the unreliability of MC methods with this implementation makes it so you have to repeat some times before working*