https://www.youtube.com/watch?v=LzaWrmKL1Z4

In [167]:
import numpy as np

In [168]:
# Reward matrix, indexed as R[state, action].
# An entry of -1 marks a move that is not available from that state;
# 0 and 100 are the rewards of the moves that are available.
R = np.matrix(
    [
        [-1, -1, -1, -1, 0, -1],    # from state 0
        [-1, -1, -1, 0, -1, 100],   # from state 1
        [-1, -1, -1, 0, -1, -1],    # from state 2
        [-1, 0, 0, -1, 0, -1],      # from state 3
        [-1, 0, 0, -1, -1, 100],    # from state 4
        [-1, 0, -1, -1, 0, 100],    # from state 5
    ]
)
R

matrix([[ -1,  -1,  -1,  -1,   0,  -1],
        [ -1,  -1,  -1,   0,  -1, 100],
        [ -1,  -1,  -1,   0,  -1,  -1],
        [ -1,   0,   0,  -1,   0,  -1],
        [ -1,   0,   0,  -1,  -1, 100],
        [ -1,   0,  -1,  -1,   0, 100]])

In [169]:
# Q table: one row per state, one column per action, all values start at zero.
Q = np.matrix(np.zeros(shape=(6, 6)))
Q

matrix([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [170]:
# Discount factor (gamma): the weight applied to the best next-state Q value
# in the Q-learning update formula.
gamma = 0.8

In [171]:
# State used for the single-step demonstration cells that follow.
initial_state = 1

In [172]:
def available_actions(state):
    """Return the actions that can be taken from ``state``.

    An action is available when its entry in row ``state`` of the
    module-level reward matrix ``R`` is non-negative.

    Args:
        state: Row index into ``R``.

    Returns:
        Array of column indices of ``R`` whose entry in that row is >= 0.
    """
    is_allowed = R[state, ] >= 0
    # The row of a matrix is still 2-D, so np.where yields (rows, columns);
    # only the column indices identify the actions.
    _, action_columns = np.where(is_allowed)
    return action_columns


In [173]:
# Actions that can be taken from the demonstration state.
available_act = available_actions(initial_state)

In [174]:
def sample_next_action(available_act):
    """Pick one action at random from ``available_act``.

    Args:
        available_act: Sequence of candidate action indices.

    Returns:
        A one-element array holding the chosen action (not a bare scalar).
    """
    return np.random.choice(available_act, size=1)


In [175]:
# Sample one of the available actions; it is returned (and printed) as a
# one-element array.
action = sample_next_action(available_act)
print(action)

[5]


In [176]:
# Q learning formula
# Q(state, action) = R(state, action) + gamma * max(Q[next_state, all_actions])

In [177]:
def update(current_state, action, gamma):
    """Apply one Q-learning update to the module-level ``Q`` matrix in place.

    Implements ``Q[s, a] = R[s, a] + gamma * max(Q[a, :])``. The action
    index doubles as the index of the next state, so row ``action`` of ``Q``
    holds the values reachable after the move.

    Args:
        current_state: Row index of the state the action is taken from.
        action: Column index of the chosen action; either an int or the
            one-element array returned by ``sample_next_action``.
        gamma: Discount factor applied to the best next-state value.
    """
    # All indices that tie for the row maximum hold the same value, so the
    # maximum itself is all the formula needs. Taking it directly avoids
    # picking an index first and converting a 1-D array with int(), which
    # NumPy has deprecated.
    max_value = np.max(Q[action, ])

    # Q learning formula
    Q[current_state, action] = R[current_state, action] + gamma * max_value

In [178]:
# Apply a single update for the sampled (state, action) pair before the
# training loop below.
update(initial_state, action, gamma)

In [179]:
# Train
# Each iteration starts from a randomly drawn state, samples one of the
# actions available there and applies a single Q update for that pair.
for episode in range(10000):
    current_state = np.random.randint(Q.shape[0])
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)

In [180]:
# Display the Q matrix after training.
Q

matrix([[  0.,   0.,   0.,   0., 400.,   0.],
        [  0.,   0.,   0., 320.,   0., 500.],
        [  0.,   0.,   0., 320.,   0.,   0.],
        [  0., 400., 256.,   0., 400.,   0.],
        [  0., 400., 256.,   0.,   0., 500.],
        [  0., 400.,   0.,   0., 400., 500.]])

In [181]:
# Normalize trained q matrix
# Rescale so that the largest entry becomes 100.
print(Q / Q.max() * 100)

[[  0.    0.    0.    0.   80.    0. ]
 [  0.    0.    0.   64.    0.  100. ]
 [  0.    0.    0.   64.    0.    0. ]
 [  0.   80.   51.2   0.   80.    0. ]
 [  0.   80.   51.2   0.    0.  100. ]
 [  0.   80.    0.    0.   80.  100. ]]


In [182]:
# Testing
# from 2: 2 -> 3 -> 4 -> 5 or 2 -> 3 -> 1 -> 5
# Starting at state 2, repeatedly move to the action with the highest Q value
# in the current state's row until state 5 is reached, recording each state.
current_state = 2
steps = [current_state]

while current_state != 5:
    next_step_index = np.where(Q[current_state, ] == np.max(Q[current_state, ]))[1]
    if next_step_index.shape[0] > 1:
        # Several actions tie for the best value: pick one of them at random.
        next_step_index = int(np.random.choice(next_step_index, 1)[0])
    else:
        # Take the single element explicitly; calling int() on a 1-D array
        # is deprecated in NumPy.
        next_step_index = int(next_step_index[0])

    steps.append(next_step_index)
    current_state = next_step_index

In [183]:
# Show the sequence of states visited by the greedy walk above.
print(steps)

[2, 3, 1, 5]
