In [1]:
import numpy as np

# --- Tiger problem: states, actions, observations ---------------------------
# States: the door the tiger is behind.
X = ['Tiger Left', 'Tiger Right']

# Actions the agent can take.
A = ['Open Left', 'Open Right', 'Listen']

# Observations (presumably what the agent hears when it listens).
Z = ['Tiger Left', 'Tiger Right']

# --- Transition matrices, one per action (row = current state, column = next state)
# Opening a door resets the world: the tiger is placed behind either door
# with equal probability.
P_OL = np.full((len(X), len(X)), 1 / len(X))
P_OR = P_OL  # alias of P_OL (same array object), not a copy
# Listening does not move the tiger.
P_L = np.identity(len(X))

# --- Observation matrices, one per action ------------------------------------
# After opening a door the observation is uniform, i.e. it carries no
# information about the tiger's position.
O_OL = np.full((len(X), len(X)), 1 / len(X))
O_OR = O_OL  # alias of O_OL (same array object), not a copy
# Listening reports the tiger's true position with probability 0.85.
O_L = np.array([
    [0.85, 0.15],
    [0.15, 0.85],
])

# Stack the per-action matrices in the same order as A
# (0 = Open Left, 1 = Open Right, 2 = Listen).
P = np.stack((P_OL, P_OR, P_L))
O = np.stack((O_OL, O_OR, O_L))

# Immediate costs, rows = states (X), columns = actions (A):
# cost 1 for opening the door the tiger is behind, 0 for the other door,
# and a small cost of 0.1 for listening.
C = np.array([
    [1, 0, 0.1],
    [0, 1, 0.1],
])

In [2]:
# 101 evenly spaced probabilities in [0, 1], kept as a column vector.
p = np.linspace(0, 1, 101).reshape(-1, 1)

# Discrete belief space: each row is a belief [p, 1 - p] over the two states.
B = np.concatenate((p, 1 - p), axis=1)

In [3]:
def Policy_Iteration(P, c, gamma):
    """Compute a cost-minimising policy for a finite MDP by policy iteration.

    Parameters
    ----------
    P : ndarray
        Transition matrices stacked along the first axis, one per action.
        As used in this notebook the shape is (n_actions, n_states, n_states),
        with P[a, x, y] the probability of moving from state x to state y
        under action a.
    c : ndarray
        Immediate costs. As used in this notebook the shape is
        (n_actions, n_states, 1); the trailing axis lets the costs broadcast
        against P and against the value column vector.
    gamma : float
        Discount factor applied to future costs.

    Returns
    -------
    ndarray
        Policy with shape (n_states, n_actions) for the input shapes above.
        Each row holds the action probabilities for one state; actions that
        tie for the minimum Q-value share the probability equally.

    Notes
    -----
    Prints the number of policy-improvement iterations that were performed.
    """
    n_actions = c.shape[0]

    # Start from the uniformly random policy. pi_prev is set to something
    # different so the loop body runs at least once (when n_actions > 1).
    pi = np.ones(c.shape) / n_actions
    pi_prev = np.ones(c.shape)

    k = 0

    while not np.allclose(pi, pi_prev):

        pi_prev = pi

        # Transition matrix and cost vector induced by the current policy.
        P_pi = np.sum(P * pi, axis=0)
        c_pi = np.sum(c * pi, axis=0)

        # Policy evaluation: solve (I - gamma * P_pi) J = c_pi directly
        # instead of forming the explicit inverse, which is less accurate
        # and does more work.
        J = np.linalg.solve(np.eye(P_pi.shape[0]) - gamma * P_pi, c_pi)

        # Policy improvement: Q-values for every (action, state) pair.
        Q = c + gamma * np.matmul(P, J)

        # Greedy policy: mark every action that attains the minimum Q-value
        # in a state, then normalise so ties are split evenly.
        greedy = np.isclose(Q, np.min(Q, axis=0)).astype(int)
        pi = greedy / np.sum(greedy, axis=0)

        k += 1

    # Join the per-action columns side by side: (n_actions, n_states, 1)
    # becomes (n_states, n_actions).
    pi = np.concatenate(pi, axis=1)

    print(k)

    return pi

In [4]:
# Display the cost table in its original layout: rows = states, columns = actions.
C

array([[1. , 0. , 0.1],
       [0. , 1. , 0.1]])

In [5]:
# Policy_Iteration is called with costs shaped (n_actions, n_states, 1), so
# transpose the (n_states, n_actions) table and append a trailing axis.
# The ndim guard makes this cell safe to re-run: without it a second run would
# turn the already-converted (3, 2, 1) array into a wrong (2, 3, 1) one.
if C.ndim == 2:
    C = C.T.reshape([C.shape[1], C.shape[0], 1])

In [6]:
# Optimal policy of the fully observable tiger MDP with discount factor 0.75.
# The call also prints the number of policy-iteration steps.
pi_MDP = Policy_Iteration(P, C, gamma=0.75)

2


In [7]:
# Most likely state for every belief row: the index of its largest component
# (argmax returns the first index when both components are equal).
MLS = B.argmax(axis=1)

In [8]:
# Sanity check: there should be one most-likely-state index per belief row.
MLS.shape

(101,)

In [9]:
# Most-likely-state policy: for each belief row, use the MDP policy row of the
# state picked by MLS. Integer-array indexing gathers all rows in one step,
# replacing the Python loop and its manual counter; the cast keeps the result
# a float array, as the zero-initialised buffer used to be.
pi_MLS = pi_MDP[MLS, :].astype(float)

In [11]:
# Display the most-likely-state policy: one row per belief point, one column
# per action, in the order of A (Open Left, Open Right, Listen).
pi_MLS

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0