# Exercise 5.4. 

![img](imgs/5.4.1.png)
![img](imgs/5.4.2.png)


In [1]:
import numpy as np
import itertools as IT
np.set_printoptions(precision=2, suppress=True)
from set_up_random_walk import Random_Walk

env = Random_Walk()

#initial parameters
gamma = env.gamma
alpha = env.alpha
R = env.R
P = env.P

N_states = env.N_states
N_actions = env.N_actions
terminal_states = env.terminal_states

v_ini = env.v_ini
q_ini = env.q_ini

pi_rp = env.pi_rp
pi_opt = env.pi_opt
pi_off = env.pi_off

In [2]:
from numpy.linalg import inv

# function to evaluate the state value(V) function of a certain policy
def eval_v(policy):
    P_pi = np.matmul(policy, P)
    R_pi = np.matmul(policy, R)
    return np.matmul(inv(np.identity(P_pi.shape[0]) - gamma * P_pi), R_pi)

# function to evaluate the state-action(Q) value function of a certain policy
def eval_q(policy):
    # product of the transition matrix with the policy
    P_aux = np.matmul(P, policy)
    return np.matmul(inv(np.identity(P_aux.shape[0]) - gamma * P_aux), R)

# rewards for a state
def R_state(state):
    return R[N_actions * state: N_actions * state + N_actions]

# transition probabilities for a state
def P_state(state):
    return np.flip(P[N_actions * state: N_actions * (state+ 1) ,],0)
    #return P[N_actions * state: N_actions * (state+ 1) ,]

def R_(state, action=None):
    if action == None:
        return R_state(state)
    else:
        return R_state(state)[action]

def P_(state, action=None, state_t1=None):
    if action == None:
        if state_t1 == None:
            return P_state(state)
        else:
            return P_state(state)[:, state_t1]
    else:
        if state_t1 == None:
            return P_state(state)[action,]
        else:
            return P_state(state)[action, state_t1]

def policy_(policy, state, action=None): 
    # We have to treat them different if they are lists or numpy matrices
    if type(policy)==list:
        if action is not None:
            return policy[state][state * N_actions + action]
        else:
            return [policy[state][state * N_actions + action_] for action_ in reversed(range(N_actions))]
    else:
        if action is not None:
            return policy[state,state * N_actions + action]
        else:
            return [policy[state,state * N_actions + action_] for action_ in reversed(range(N_actions))]

def generate_episode(policy, starting_pos=None):
    episode = []
    #if we dont have a starting position we choose randomly from the states
    s_t0 = np.zeros(N_states)
    if starting_pos==None:
        random_start = np.random.randint(N_states)
        s_t0[random_start] = 1
    elif isinstance(starting_pos, list): # when given an array
        #print('array')
        s_t0 = starting_pos
    elif isinstance(starting_pos, int):  # when given a number
        #print('number')
        random_start = starting_pos
        s_t0[random_start] = 1
    else:                                # assume is ndarray, lazy programming, sry
        #print('ndarray')                 # it wont work tho
        s_t0 = starting_pos
     
    s = np.random.choice(N_states, p=s_t0)
    r = None
    while s not in terminal_states:
        a = np.random.choice(N_actions, size=1, p=policy_(policy,s))
        r = R_(s,a)
        episode.append({'s':s,'a':a[0],'r':np.array(r)[0,0]})
        
        #next state is chosen from the transition matrix from state s taken action a
        #s = np.random.choice(N_states, p=np.asarray(P_(s,a)).ravel())
        s = np.random.choice(N_states, p=np.asarray(P_(s,a)).ravel())
    episode.append({'s':s,'a':0,'r':0})
    return episode

# 1.

In [3]:
np.array(eval_v(pi_rp).T)

array([[-0.  ,  0.07,  0.15,  0.26,  0.43,  0.69,  0.  ]])

# 2.

In [4]:
#fsolve is broken ¯\_( ツ )_/¯ 

# 3.

In [5]:
N_steps = 400
# We define the algorithm for policy evaluation with the state value function
def policy_evaluation_v(policy):
    # 1 
    v = np.zeros(N_states)
    # 2, 8 stop condition is not stated, instead we do 400 iterations
    for i in range(N_steps):
        # 3
        delta = 0
        # 4
        for state in range(N_states):
            # 5 
            v_old = v[state]
            # 6 
            v_aux = 0
            for action in range(N_actions):
                v_aux += policy_(policy, state, action) * (R_(state, action) + gamma * sum(
                    [P_(state, action, state_t1) * v[state_t1] for state_t1 in range(N_states)]))
            v[state] = v_aux
            # 7
            # delta = np.max(delta, np.abs(v_old - v[state]))

    # 8,9    
    return v

In [6]:
policy_evaluation_v(pi_rp)

array([0.  , 0.07, 0.15, 0.26, 0.43, 0.69, 0.  ])

# 4.

In [7]:
# Policy iteration for state value function
def policy_iteration_v(pi_input, debug=False):
    policy = np.copy(pi_input)
    v = np.zeros(N_states) #1
    theta = False
    while not theta: #2,3
        v = policy_evaluation_v(policy) # 4-9
        if debug : print('\n v:',v)
        theta = True #10
        if debug : print('For each state: ')
        for s in range(N_states): #11
            if debug : print('\n s:',s)
            a = policy_(policy, s) #12
            if debug : print('a:',a)  
            
            #13
            arg_max = np.argmax([(R_(s, a_t1) + gamma *
                                 np.sum([np.dot(P_(s, a_t1, s_t1), v[s_t1]) for s_t1 in range(N_states)]))
                                 for a_t1 in range(N_actions)])
            if debug : print('arg_max:',arg_max)
            policy[s] = [0] * len(policy[s])
            policy[s,N_actions * s + arg_max] = 1
            
            if debug : print('pi[s]:',policy[s])
            if debug : print('policy_(policy, s):',policy_(policy, s))
            if not (a == policy_(policy, s)): 
                theta = False #14
                if debug : print('a not equal to policy_(policy, s)')
                
            if debug : print('theta:',theta)
        if debug : print('\n pi:',policy)
    return policy #15

In [8]:
env.plot_policy(policy_iteration_v(pi_rp))

['⬅️', '➡️', '➡️', '➡️', '➡️', '➡️', '⬅️']


In [9]:
def value_iteration_v(debug=False):
    if debug : print('Value Iteration with V:')
    policy = np.zeros([N_states,(N_states*N_actions)]) 
    v = np.zeros(N_states) #1
    epsilon = 0.01
    delta = 10000000
    while delta>epsilon: #2
        delta = 0 #3
        
        if debug : print('\n v:',v)
        for s in range(N_states): #4
            if debug : print('\n s:',s)
            
            v_old = v[s] #5
            if debug : print('\n v_old:',v_old)
            #6
            v[s]=np.max([(R_(s, a) + gamma *
                         np.sum([np.dot(P_(s, a, s_t1), v[s_t1]) for s_t1 in range(N_states)]))
                         for a in range(N_actions)])
            if debug : print('\n v[s]:',v[s])
            delta = max(delta, np.abs(v_old-v[s]))#8
            if debug : print('delta:',delta)

    # (out of the while) 
    if debug : print('\n For each state: \n')
    for s in range(N_states): #9
        #10
        arg_max = np.argmax([(R_(s, a) + gamma *
                             np.sum([np.dot(P_(s, a, s_t1), v[s_t1]) for s_t1 in range(N_states)]))
                             for a in range(N_actions)])
        if debug : print('arg_max:',arg_max)
        policy[s] = [0] * (N_states*N_actions)
        policy[s,N_actions * s + arg_max] = 1
            
        if debug : print('pi[s]:',policy[s])


    if debug : print('pi:',policy)
    return policy #11

In [10]:
env.plot_policy(value_iteration_v())

['⬅️', '➡️', '➡️', '➡️', '➡️', '➡️', '⬅️']


# 5.

In [11]:
N_episodes = 800

def temporal_difference_v(policy):
    v = np.zeros(N_states)#1
    for _ in range(N_episodes):#2,10
        random_start = np.random.randint(N_states)#3
        s = random_start
        while s not in terminal_states:#4,9
            a = np.random.choice(N_actions, size=1, p=policy_(policy,s))#5
            r = R_(s,a)#6
            s_t1 = np.random.choice(N_states, p=np.asarray(P_(s,a)).ravel())#6
            v[s] = v[s] + alpha*(r+v[s_t1]-v[s])#7
            #print( '   s?',s,'a?',a,'r?',r,'v[s]?', v[s])
            s = s_t1#8
    return v #11

In [12]:
v = temporal_difference_v(pi_rp)
print('\nSolving the value function with TD methods:\n')
print(v)


Solving the value function with TD methods:

[0.   0.15 0.26 0.42 0.65 0.83 0.  ]


# 6.

![img](imgs/5.4.4.png)

In [13]:
N_episodes = 1400

def e_greedy_sample(q_of_s,epsilon):
    if np.random.rand() <= epsilon:
        return np.random.randint(N_actions)
    else:
        return np.argmax(q_of_s)


def q_learning_e_greedy(epsilon=0.1,step_size=alpha):
    q = np.random.random(N_states * N_actions) #1
    for terminal in terminal_states:
        q[terminal*N_actions:(terminal+1)*N_actions] *= 0 #2
    
    aprox_policy = np.zeros((N_states,N_states*N_actions))
    
    for _ in range(N_episodes):#3
        random_start = np.random.randint(N_states)#4
        s = random_start
        
        while s not in terminal_states:#5,10
            a = e_greedy_sample(q[s*N_actions:(s+1)*N_actions],epsilon)#6
            r = R_(s,a)#7
            s_t1 = np.random.choice(N_states, p=np.asarray(P_(s,a)).ravel())#7
            
            q[s*N_actions+a] += step_size*(r+gamma*(
                    max([q[s_t1*N_actions+a_t1] for a_t1 in range(N_actions)])
                    - q[s*N_actions+a])) #8
            
            #print( '   s?',s,'a?',a,'r?',r,'v[s]?', v[s])
            s = s_t1#9
            
        for s in range(N_states):#11
            #print('state ', s)
            arg_max = np.argmax(q[s*N_actions:(s+1)*N_actions])
            #print('argmax ',arg_max)
            aprox_policy[s]=np.zeros(N_states*N_actions)
            aprox_policy[s][s*N_actions+arg_max]=1  #12
    print(q)        
    return aprox_policy #13



In [14]:
pi_q_learn = q_learning_e_greedy()
print(pi_q_learn)
env.plot_policy(pi_q_learn)

[0.   0.   0.17 1.05 0.68 1.11 0.87 1.11 1.   1.11 0.64 1.11 0.   0.  ]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]]
['⬅️', '➡️', '➡️', '➡️', '➡️', '➡️', '⬅️']


*Here if the iterations are too low it doesn't converge to the optimal policy,with experiments with 800 episodes it didn't converge always. But with 1400 it works perfectly fine*

# 7.

![img](imgs/5.4.3.png)

In [15]:
def sarsa_e_greedy(epsilon=0.1,step_size=alpha):
    q = np.random.random(N_states * N_actions) #1
    for terminal in terminal_states:
        q[terminal*N_actions:(terminal+1)*N_actions] *= 0 #2
    
    aprox_policy = np.zeros((N_states,N_states*N_actions))
    
    for _ in range(N_episodes):#3
        random_start = np.random.randint(N_states)#4
        s = random_start
        a = e_greedy_sample(q[s*N_actions:(s+1)*N_actions],epsilon)#6
        
        while s not in terminal_states:#6,11
            
            r = R_(s,a)#7
            s_t1 = np.random.choice(N_states, p=np.asarray(P_(s,a)).ravel())#7
            a_t1 = e_greedy_sample(q[s_t1*N_actions:(s_t1+1)*N_actions],epsilon)#8
            q[s*N_actions+a] += step_size*(r+gamma*q[s_t1*N_actions+a_t1]
                    - q[s*N_actions+a]) #9
            
            #print( '   s?',s,'a?',a,'r?',r,'v[s]?', v[s])
            s = s_t1#10
            a = a_t1#10
        for s in range(N_states):#11
            #print('state ', s)
            arg_max = np.argmax(q[s*N_actions:(s+1)*N_actions])
            #print('argmax ',arg_max)
            aprox_policy[s]=np.zeros(N_states*N_actions)
            aprox_policy[s][s*N_actions+arg_max]=1  #12
    print(q)        
    return aprox_policy #13


In [16]:
pi_sarsa = sarsa_e_greedy()
print(pi_sarsa)
env.plot_policy(pi_sarsa)

[0.   0.   0.23 0.56 0.29 0.66 0.4  0.76 0.49 0.87 0.53 1.   0.   0.  ]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]]
['⬅️', '➡️', '➡️', '➡️', '➡️', '➡️', '⬅️']
