### Recall the following Q-learning update rule:
![Q learning update rule](Q_learning_update_rule.png)

### And the following lecture example: 
![lecture example](example.png)

Assume $\gamma=0.9$. We then implement Q-learning for the example.

In [1]:
# import libraries
import numpy as np

# reward lookup: only arriving in the goal state pays off
def get_reward(state, action, next_state):
    """Return the reward for arriving in ``next_state``.

    ``state`` and ``action`` are accepted but not used; the reward depends
    only on the destination: 100 for index 5 (State G), 0 for every other
    state index.
    """
    reward_by_state = np.zeros(6, dtype=float)
    reward_by_state[5] = 100
    return reward_by_state[next_state]

# initial Q-table: every entry starts at zero
#   rows    -> state 1, state 2, state 3, state 4, state 5, state G
#   columns -> up, right, down, left, stay
Q = np.zeros((6, 5), dtype=float)

# index -> label lookups so printed transitions are readable
action_dict = dict(enumerate(["Up", "Right", "Down", "Left", "Stay"]))
state_dict = {index: f"State {label}" for index, label in enumerate("12345G")}

# compute the new Q-value for one (state, action, next_state) transition
def update(state, action, next_state, gamma):
    """Return ``reward + gamma * max(Q[next_state])`` for a transition.

    Reads the module-level table ``Q`` but does not write to it; the
    caller is responsible for storing the returned value. As a side
    effect, prints the transition labels and the arithmetic performed.
    """
    reward = get_reward(state, action, next_state)
    next_row = Q[next_state, :]
    new_q = reward + gamma * max(next_row)

    # show which transition was taken, then the computation itself
    labels = (state_dict[state], action_dict[action], state_dict[next_state])
    print(*labels)
    row_text = ",".join(str(q) for q in next_row)
    print(reward, "+", gamma, "* max(", row_text, ")", "=", new_q)

    return new_q

In [2]:
# discount factor
gamma = 0.9

# transitions to learn from, each given as [state, action, next_state]
paths = [
    # first route: 1 -> 2 -> 3 -> G
    [0, 1, 1],  # state 1, right, state 2
    [1, 1, 2],  # state 2, right, state 3
    [2, 0, 5],  # state 3, up, state G

    # second route: 1 -> 4 -> 5 -> G
    [0, 0, 3],  # state 1, up, state 4
    [3, 1, 4],  # state 4, right, state 5
    [4, 1, 5],  # state 5, right, state G
]

# show the Q-table before any update, then again after each transition
print(Q, "\n")
for state, action, next_state in paths:
    Q[state, action] = update(state, action, next_state, gamma)
    print(Q, "\n")

[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]] 

State 1 Right State 2
0.0 + 0.9 * max( 0.0,0.0,0.0,0.0,0.0 ) = 0.0
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]] 

State 2 Right State 3
0.0 + 0.9 * max( 0.0,0.0,0.0,0.0,0.0 ) = 0.0
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]] 

State 3 Up State G
100.0 + 0.9 * max( 0.0,0.0,0.0,0.0,0.0 ) = 100.0
[[  0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.]
 [100.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.]] 

State 1 Up State 4
0.0 + 0.9 * max( 0.0,0.0,0.0,0.0,0.0 ) = 0.0
[[  0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.]
 [100.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.]] 

State 4 Right State 5
0.0 + 0.9 * max( 0.0,0.0,0.0,0.0,0.0 