#### Import required libraries

In [1]:
import numpy as np

#### R Matrix

In [2]:
# Reward matrix: R[s, a] is the immediate reward for moving from state s to
# state a. -1 marks a move that is not allowed (available_actions keeps only
# entries >= 0), 0 an allowed move, and 100 an allowed move into state 5.
R = np.matrix(
    [
        [-1, -1, -1, -1, 0, -1],    # from state 0
        [-1, -1, -1, 0, -1, 100],   # from state 1
        [-1, -1, -1, 0, -1, -1],    # from state 2
        [-1, 0, 0, -1, 0, -1],      # from state 3
        [-1, 0, 0, -1, -1, 100],    # from state 4
        [-1, 0, -1, -1, 0, 100],    # from state 5
    ]
)

In [3]:
# Echo the reward matrix to check that it was entered as intended.
R

matrix([[ -1,  -1,  -1,  -1,   0,  -1],
        [ -1,  -1,  -1,   0,  -1, 100],
        [ -1,  -1,  -1,   0,  -1,  -1],
        [ -1,   0,   0,  -1,   0,  -1],
        [ -1,   0,   0,  -1,  -1, 100],
        [ -1,   0,  -1,  -1,   0, 100]])

#### Q Matrix

In [4]:
# Q-matrix: one row per state and one column per action, all entries 0.0
# before any update has been applied.
Q = np.matrix(np.zeros(shape=(6, 6)))

In [5]:
# Echo the freshly created Q-matrix (all zeros at this point).
Q

matrix([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [6]:
# State used for the single-step walkthrough in the following cells.
initial_state: int = 1
# Discount factor: weight given to the best Q-value of the next state when a
# Q entry is updated.
gamma: float = 0.8

#### This function returns all available actions in the state given as an argument

In [7]:
def available_actions(state):
    """Return the actions that are allowed in ``state``.

    An action is allowed when its entry in row ``state`` of the global
    reward matrix ``R`` is greater than or equal to 0.

    Parameters
    ----------
    state : int
        Row index into ``R``.

    Returns
    -------
    numpy.ndarray
        Column indices of the non-negative entries of that row.
    """
    row = R[state, ]
    # Indexing a row of an np.matrix keeps it 2-D, so np.where yields a
    # (row_indices, column_indices) pair; only the columns are of interest.
    _, columns = np.where(row >= 0)
    return columns

#### Get available actions in the current state

In [8]:
# Actions allowed from the walkthrough's starting state.
available_action = available_actions(state=initial_state)

In [9]:
# Show the actions allowed from initial_state (non-negative entries of its row in R).
available_action

array([3, 5], dtype=int64)

#### This function randomly chooses which action to perform from the available actions

In [10]:
def sample_next_action(available_actions_range):
    """Pick one action uniformly at random from ``available_actions_range``.

    The choice is made from the argument that is passed in, so the result
    does not depend on any notebook-level variable.

    Parameters
    ----------
    available_actions_range : array-like of int
        Candidate actions, e.g. the array returned by ``available_actions``.

    Returns
    -------
    int
        One element of ``available_actions_range`` as a plain Python int.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when ``available_actions_range`` is
        empty.
    """
    # Without a `size` argument np.random.choice returns a scalar, so int()
    # is never applied to a 1-element array (deprecated since NumPy 1.25).
    next_action = int(np.random.choice(available_actions_range))
    return next_action

#### Sample next action to be performed

In [11]:
# Draw one of the currently available actions at random.
action = sample_next_action(available_actions_range=available_action)

In [12]:
# Show the sampled action; it is drawn at random, so it can differ between runs.
action

3

#### This function updates the Q-Matrix according to the path selected and the Q-Learning Algorithm

In [13]:
def update(current_state, action, gamma):
    """Apply one Q-learning update to the global Q-matrix.

    Sets ``Q[current_state, action]`` to
    ``R[current_state, action] + gamma * max(Q[action, :])``. The action
    index is also used as the row index of the next state.

    Parameters
    ----------
    current_state : int
        Row of ``Q`` and ``R`` for the state the move starts from.
    action : int
        Column of ``Q`` and ``R`` for the move, and the row of ``Q`` that is
        read for the next state.
    gamma : float
        Discount factor multiplied with the best Q-value of the next state.

    Returns
    -------
    None
        ``Q`` is modified in place.
    """
    # Best Q-value reachable from the next state. Every column tied for the
    # row maximum holds this same value, so no tie-breaking (and no
    # array-to-int conversion) is needed to obtain it.
    max_value = np.max(Q[action, ])

    # Q Learning Formula
    Q[current_state, action] = R[current_state, action] + (gamma * max_value)
    
# Apply a single Q-learning update for the sampled move out of initial_state
update(current_state=initial_state, action=action, gamma=gamma)

In [14]:
# Inspect Q after the single update. It is still all zeros when the sampled
# action was 3, because R[1, 3] is 0 and every Q entry started at 0.
Q

matrix([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

#### Training

In [15]:
# Train over 10000 iterations: each one starts in a random state, takes one
# random allowed action from it and applies the Q-learning update for that move.
n_states = int(Q.shape[0])
for i in range(10000):
    current_state = np.random.randint(0, n_states)
    available_action = available_actions(state=current_state)
    action = sample_next_action(available_actions_range=available_action)
    update(current_state=current_state, action=action, gamma=gamma)

# Rescale the trained Q-Matrix so that its largest entry becomes 100
print("Trained Q Matrix")
print(Q / Q.max() * 100)

Trained Q Matrix
[[  0.    0.    0.    0.   80.    0. ]
 [  0.    0.    0.   64.    0.  100. ]
 [  0.    0.    0.   64.    0.    0. ]
 [  0.   80.   51.2   0.   80.    0. ]
 [  0.   80.   51.2   0.    0.  100. ]
 [  0.   80.    0.    0.   80.  100. ]]


In [17]:
# Greedy walk over the Q-matrix: start in state 1 and keep moving to the
# column with the largest Q-value in the current row until state 5 is reached.
current_state = 1
steps = [current_state]

while current_state != 5:
    q_row = Q[current_state, ]
    # Column indices of every entry tied for the row maximum
    best_next_states = np.where(q_row == np.max(q_row))[1]
    # Ties are broken at random. Without a `size` argument np.random.choice
    # returns a scalar, so int() is never applied to a 1-element array
    # (deprecated since NumPy 1.25); a single candidate is simply returned.
    next_step_index = int(np.random.choice(best_next_states))

    steps.append(next_step_index)
    current_state = next_step_index

# Print selected step sequence
print("Selected Path: ", steps)

Selected Path:  [1, 5]
