Reference:

https://mpatacchiola.github.io/blog/2016/12/09/dissecting-reinforcement-learning.html
https://github.com/mpatacchiola/dissecting-reinforcement-learning/tree/master/src/1

In [1]:
import numpy as np

In [3]:
# Transition model loaded from file.
# The array is indexed as T[s, s', a]: the probability of entering state s'
# when action a is taken in state s (its shape, printed in the next cell, is
# (12, 12, 4)).
# For this problem there are 3*4 = 12 states and 4 actions
# (0=Up, 1=Left, 2=Down, 3=Right).
# A forward slash keeps the path portable: "\T" only worked because it is not a
# recognised escape sequence, and a backslash is a path separator on Windows only.
T = np.load("resources/T.npy")

In [7]:
# Quick look at the loaded model: array dimensions first, then a single entry.
print(np.shape(T))
print(T[0][0][0])

(12, 12, 4)
0.9


In [10]:
# Exchange the last two axes of T; the printed shape confirms the new layout.
T_tran = np.swapaxes(T, 1, 2)
print(np.shape(T_tran))

(12, 4, 12)


In [11]:
# Utility estimate for each of the 12 states, four values per line.
u = np.array([
    0.812, 0.868, 0.918, 1.0,
    0.762, 0.0, 0.660, -1.0,
    0.705, 0.655, 0.611, 0.388,
])

In [12]:
# Reward vector for this scratch computation: -0.04 in the first eleven
# states and 1 in the last one.
r = np.array([-0.04] * 11 + [1])

In [21]:
# Both vectors must have one entry per state before they are combined.
print("u.shape: ", np.shape(u), "r.shape: ", np.shape(r))

u.shape:  (12,) r.shape:  (12,)


In [22]:
# Sum over the last axis of T_tran weighted by u: one number per (state, action).
u_temp = np.dot(T_tran, u)
print(np.shape(u_temp))

(12, 4)


In [23]:
# Keep, for every state, the largest value over the action axis.
u_max = u_temp.max(axis=1)
print(np.shape(u_max))

(12,)


In [25]:
# One backup with a discount factor of 1.0, shown as a 3x4 grid.
u_new = 1.0 * u_max + r
u_shaped = np.reshape(u_new, (3, 4))
print(u_shaped)

[[ 0.8118  0.868   0.9178 -0.04  ]
 [ 0.762  -0.04    0.6604 -0.04  ]
 [ 0.7056  0.655   0.6111  1.4276]]


## Value Iteration

In [30]:
# action: 0=Up, 1=Left, 2=Down, 3=Right
def state_value(current_value, T, reward, gamma):
    """
    Perform one backup of the value iteration algorithm.

    For every state the expected next-state value is computed for each action;
    the best action's value is discounted and added to the state's reward.

    params:
        current_value: current value of each state. shape: (state_num, )
        T: transition matrix. shape: (state_num, state_num, action_num)
        reward: reward in each state. shape: (state_num, )
        gamma: discount factor. 0 <= gamma <= 1.0

    return:
        tuple (new_value, policy)
        new_value: updated value of each state. shape: (state_num, )
        policy: index of the maximising action per state. shape: (state_num, )
    """
    # Reorder to (state_num, action_num, state_num) so the dot product below
    # sums over the successor-state axis.
    by_action = np.swapaxes(T, 1, 2)
    action_values = np.dot(by_action, current_value)  # shape: (state_num, action_num)

    greedy_action = action_values.argmax(axis=1)
    best_value = action_values.max(axis=1)
    return (reward + gamma * best_value, greedy_action)

In [27]:
# Per-state reward: -0.04 everywhere except the three special cells below.
reward = np.array([-0.04] * 12)
reward[3] = +1.0
reward[5] = 0.0
reward[7] = -1.0

In [46]:
def print_policy(policy, shape):
    """
    Print a policy as a grid of three-character symbols.

    param:
        policy: action code per state. 0=Up, 1=Left, 2=Down, 3=Right,
            -1=terminal, -2=obstacle; any other code is shown as " - ".
            Shape: (state_num, )
        shape: tuple. (row, column) the symbols are reshaped to before printing
    return: null
    """
    # Dispatch table from action code to its display symbol.
    glyphs = {
        -2: " # ",
        -1: " * ",
        0: " ^ ",
        1: " < ",
        2: " v ",
        3: " > ",
    }

    # Element-wise lookup; codes missing from the table fall back to " - ".
    to_glyph = np.vectorize(lambda code: glyphs.get(code, " - "))
    print(np.reshape(to_glyph(policy), shape))

In [51]:
state_num = 12
gamma = 0.999  # the optimal policy and values change with the discount factor
epsilon = 0.01  # small tolerance used by the stopping criterion

value_history = []
policy_history = []
value = np.zeros(state_num)

# At most 100 sweeps; `iteration` counts them starting from 1.
for iteration in range(1, 101):
    new_value, new_policy = state_value(value, T, reward, gamma)
    value_history.append(new_value)
    policy_history.append(new_policy)

    # Largest change of any single state's value in this sweep.
    delta = np.abs(new_value - value).max()
    value = new_value

    if delta < epsilon * (1-gamma) / gamma:
        print("Iteration: %d, delta=%f" % (iteration, delta))
        print(np.reshape(value, (3, 4)))
        print("Policy: ")
        # Replace the entries of the obstacle (-2) and terminal (-1) cells with
        # display markers. new_policy is the same array object that was just
        # appended to policy_history, so the last history entry shows them too.
        new_policy[[3, 7]] = -1
        new_policy[5] = -2
        print_policy(new_policy, (3, 4))
        break

Iteration: 26, delta=0.000010
[[ 0.80796343  0.86539911  0.91653199  1.        ]
 [ 0.75696619  0.          0.65836281 -1.        ]
 [ 0.69968237  0.64881928  0.60471582  0.38149581]]
Policy: 
[[' > ' ' > ' ' > ' ' * ']
 [' ^ ' ' # ' ' ^ ' ' * ']
 [' ^ ' ' < ' ' < ' ' < ']]


## Policy Iteration

In [54]:
# Random action (0..3) for every state, then mark the special cells:
# -1 at indices 3 and 7, -2 at index 5.
init_policy = np.random.randint(0, 4, size=12)
init_policy[[3, 7]] = -1
init_policy[5] = -2
print(init_policy)

[ 3  1  0 -1  1 -2  0 -1  0  1  3  2]


In [58]:
# Negative marker entries become action 0 so the array can be used as an index.
valid_policy = np.where(init_policy >= 0, init_policy, 0)
print(valid_policy)

[3 1 0 0 1 0 0 0 0 1 3 2]


In [71]:
# Value of every (state, action) pair under the utilities u.
u_temp = np.dot(T_tran, u)
print(u_temp)
# From each row pick the column named by that state's entry in valid_policy.
value_from_policy = u_temp[np.arange(u_temp.shape[0]), valid_policy]
print(value_from_policy)
print(np.shape(value_from_policy))

[[ 0.8176  0.807   0.7776  0.8518]
 [ 0.8674  0.8232  0.8674  0.908 ]
 [ 0.9212  0.8522  0.7148  0.9578]
 [ 0.      0.      0.      0.    ]
 [ 0.802   0.7613  0.7164  0.7613]
 [ 0.      0.      0.      0.    ]
 [ 0.7004  0.6809  0.4548 -0.6471]
 [ 0.      0.      0.      0.    ]
 [ 0.7456  0.7107  0.7     0.6707]
 [ 0.6556  0.695   0.6556  0.6198]
 [ 0.6323  0.6511  0.5931  0.4375]
 [-0.7001  0.4276  0.4103  0.2492]]
[ 0.8518  0.8232  0.9212  0.      0.7613  0.      0.7004  0.      0.7456
  0.695   0.4375  0.4103]
(12,)


In [82]:
# Boolean-mask selection of the negative (marker) entries.
invalid_policy = init_policy[np.less(init_policy, 0)]
print(invalid_policy)

[-1 -2 -1]


In [83]:
def iterate_policy(current_policy, current_value, T, reward, gamma):
    """
    Run one evaluation backup under the current policy, then pick the greedy
    policy with respect to the updated values.

    params:
        current_policy: action index per state; negative entries are markers
            that are carried over unchanged. shape: (state_num, )
        current_value: shape: (state_num, )
        T: transition matrix. shape: (state_num, state_num, action_num)
        reward: shape: (state_num, )
        gamma: discount factor
    return:
        new value and new policy. shape: (state_num, )
    """
    marked = current_policy < 0
    # Marker entries are evaluated as action 0 so they can be used as an index.
    actions = np.where(marked, 0, current_policy)  # shape: (state_num,)

    # (state_num, action_num, state_num) layout for the dot products below.
    by_action = np.swapaxes(T, 1, 2)

    # Expected next-state value of the action the current policy takes.
    rows = np.arange(len(actions))
    followed = np.dot(by_action, current_value)[rows, actions]
    new_value = reward + gamma * followed

    # Greedy improvement on the freshly updated values.
    new_policy = np.dot(by_action, new_value).argmax(axis=1)
    new_policy[marked] = current_policy[marked]

    return (new_value, new_policy)

In [84]:
state_num = 12
gamma = 0.999
epsilon = 0.0001

# Random starting policy with the special cells marked:
# -1 at indices 3 and 7, -2 at index 5.
policy = np.random.randint(0, 4, size=12)
policy[[3, 7]] = -1
policy[5] = -2

value = np.zeros(state_num)

# At most 100 rounds; `iteration` counts them starting from 1.
for iteration in range(1, 101):
    new_value, new_policy = iterate_policy(policy, value, T, reward, gamma)

    # Largest change of any single state's value in this round.
    delta = np.abs(new_value - value).max()

    value, policy = new_value, new_policy

    if delta < epsilon * (1-gamma)/gamma:
        print("Iteration: %d, delta=%f" % (iteration, delta))
        print(np.reshape(value, (3, 4)))
        print("Policy: ")
        print_policy(policy, (3, 4))
        break

Iteration: 33, delta=0.000000
[[ 0.80796344  0.86539911  0.91653199  1.        ]
 [ 0.75696624  0.          0.65836281 -1.        ]
 [ 0.69968297  0.64882108  0.60471974  0.38150427]]
Policy: 
[[' > ' ' > ' ' > ' ' * ']
 [' ^ ' ' # ' ' ^ ' ' * ']
 [' ^ ' ' < ' ' < ' ' < ']]
