## Example: Sam

### Value iteration and Q values

In [1]:
import numpy as np
nan=np.nan
T = np.array([
    [[0.95, 0.05], [0.7, 0.3]],
    [[0.5, 0.5], [0.1, 0.9]]
])
R = np.array([
    [[7., 7.], [10., 10.]],
    [[0., 0.], [2., 2.]]
])
#R2 = np.array([[7, 10], [0, 2]])
possible_actions = [[0, 1], [0, 1]]

In [2]:
Q = np.full((2, 2), -np.inf) # -inf for impossible actions
for state, actions in enumerate(possible_actions):
    Q[state, actions] = 0.0 # Initial value = 0.0, for all possible actions
discount_rate = 0.8
n_iterations = 1000
for iteration in range(n_iterations):
    Q_prev = Q.copy()
    for s in range(2):
        for a in possible_actions[s]:
            #Q[s, a] = R2[s, a] + np.sum([T[s, a, sp] * discount_rate * np.max(Q_prev[sp])
                                         #for sp in range(2)])
            Q[s, a] = np.sum([T[s, a, sp] * (R[s, a, sp] + discount_rate * np.max(Q_prev[sp])) for sp in range(2)])

In [3]:
Q

array([[ 35.0952381 ,  35.71428571],
       [ 23.80952381,  22.        ]])

### Q Learning

In [6]:
rnd = np.random
learning_rate0 = 0.7 # = (1 - alpha) in the lecture slides
learning_rate_decay = 0.1
n_iterations = 50000
s = 0 # start in state 0
Q = np.full((2, 2), -np.inf) # -inf for impossible actions
for state, actions in enumerate(possible_actions):
    Q[state, actions] = 0.0 # Initial value = 0.0, for all possible actions
for iteration in range(n_iterations):
    a = rnd.choice(possible_actions[s]) # choose an action (randomly)
    sp = rnd.choice(range(2), p=T[s, a]) # pick next state using T[s, a]
    reward = R[s, a, sp]
    learning_rate = learning_rate0 / (1 + iteration * learning_rate_decay)
    Q[s, a] = learning_rate * Q[s, a] + (1 - learning_rate) * (
        reward + discount_rate * np.max(Q[sp])
    )
    s = sp # move to next state

In [7]:
Q

array([[ 44.92561055,  39.55006615],
       [ 35.94052306,  37.94034806]])