In [5]:
import numpy as np

# R Matrix

In [6]:
# Reward matrix, indexed as R[state, action].
# available_actions() treats every entry >= 0 as a permitted action, so the -1
# entries are never selected. 100 appears only in column 5 -- presumably the
# goal reward (the path-finding loop later in this notebook stops at state 5).
# NOTE(review): row 4 and column 4 disagree (R[0, 4] and R[3, 4] are 0 while
# R[4, 0] and R[4, 3] are -1) -- confirm the asymmetry is intended.
R = np.matrix([
    [-1, -1, -1, -1,  0,  -1],  # state 0
    [-1, -1, -1,  0, -1, 100],  # state 1
    [-1, -1, -1,  0, -1,  -1],  # state 2
    [-1,  0,  0, -1,  0,  -1],  # state 3
    [-1,  0,  0, -1, -1, 100],  # state 4
    [-1,  0, -1, -1,  0, 100],  # state 5
])

In [7]:
# Echo R so the cell renders the reward matrix as its output.
R

matrix([[ -1,  -1,  -1,  -1,   0,  -1],
        [ -1,  -1,  -1,   0,  -1, 100],
        [ -1,  -1,  -1,   0,  -1,  -1],
        [ -1,   0,   0,  -1,   0,  -1],
        [ -1,   0,   0,  -1,  -1, 100],
        [ -1,   0,  -1,  -1,   0, 100]])

# Q Matrix

In [8]:
# Q-table, indexed as Q[state, action] like R. It starts as a 6x6 matrix of
# float zeros and is filled in by update().
Q = np.matrix(np.zeros((6, 6), dtype=float))

In [9]:
# Echo Q as created in the previous cell (all zeros until update() has run).
Q

matrix([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [10]:
# Discount factor: the weight update() applies to the best Q value of the next state.
gamma: float = 0.8
# State used for the single worked example (one sampled action, one update)
# that precedes the training loop.
initial_state: int = 1

#### This function returns all available actions in the state given as an argument

In [11]:
def available_actions(state):
    """Return the action indices that are allowed from ``state``.

    An action is allowed when its entry in row ``state`` of the module-level
    reward matrix ``R`` is non-negative.

    Parameters
    ----------
    state : int
        Row index into ``R``.

    Returns
    -------
    numpy.ndarray
        1-D integer array of the column indices ``j`` with ``R[state, j] >= 0``,
        in ascending order (empty when no entry qualifies).
    """
    # Flatten the row first: indexing an np.matrix yields a 1xN matrix while a
    # plain ndarray yields a 1-D row. After ravel() both look the same, so the
    # function no longer depends on R being an np.matrix.
    current_state_row = np.asarray(R[state]).ravel()
    av_act = np.flatnonzero(current_state_row >= 0)
    return av_act

#### Get available actions in the current state

In [12]:
# Actions allowed from initial_state, i.e. the columns of R[initial_state] that are >= 0.
available_action = available_actions(initial_state)

In [13]:
# Echo the actions available from initial_state.
available_action

array([3, 5])

#### This function randomly chooses which action to perform from among the available actions

In [14]:
def sample_next_action(available_actions_range):
    """Pick one action uniformly at random from ``available_actions_range``.

    Parameters
    ----------
    available_actions_range : array-like of int
        Candidate action indices, e.g. the result of ``available_actions``.

    Returns
    -------
    int
        One element of ``available_actions_range``.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when ``available_actions_range`` is empty.
    """
    # Sample from the argument itself, not from a module-level variable, so the
    # result always reflects what the caller passed in.
    # Omitting `size` makes np.random.choice return a scalar, so int() never has
    # to convert a one-element array (deprecated in recent NumPy releases).
    next_action = int(np.random.choice(available_actions_range))
    return next_action

#### Sample next action to be performed 

In [15]:
# Draw one of the actions available from initial_state at random.
action = sample_next_action(available_action)

In [16]:
# Echo the sampled action; it is drawn at random, so it can differ between runs.
action

5

#### This function updates the Q matrix according to the selected path and the Q-learning algorithm

In [17]:
def update(current_state,action,gamma):
    """Apply one Q-learning update to the module-level matrix ``Q`` in place.

    Sets::

        Q[current_state, action] = R[current_state, action] + gamma * max(Q[action, :])

    The row of ``Q`` that is maximised is indexed by ``action``: the action
    index is used as the index of the next state.

    Parameters
    ----------
    current_state : int
        Row of ``Q`` / ``R`` being updated.
    action : int
        Column of ``Q`` / ``R`` being updated, also used as the next-state row.
    gamma : float
        Factor applied to the best Q value of the next state.

    Returns
    -------
    None
        The result is written into ``Q``.
    """
    # Only the maximum value is needed. When several entries tie for the
    # maximum they all hold that same value, so no index has to be picked, and
    # no one-element array is converted with int() (deprecated in NumPy >= 1.25).
    max_value = np.max(Q[action,])

    # Q-learning formula
    Q[current_state,action] = R[current_state,action] + (gamma*max_value)
        
# Worked example: apply one Q-learning update for the (initial_state, action)
# pair sampled above. update() writes the result into Q in place.
update(initial_state,action,gamma)

In [18]:
# Echo Q after the single update above.
Q

matrix([[  0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0., 100.],
        [  0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.,   0.]])

#### Training

In [19]:
# Train over 10000 iterations. Each iteration picks a state uniformly at random,
# samples one of the actions available from it, and applies one Q-learning update.
for i in range(10000):
    current_state = np.random.randint(Q.shape[0])
    available_action = available_actions(current_state)
    action = sample_next_action(available_action)
    update(current_state, action, gamma)

# Normalize the trained Q matrix so that its largest entry becomes 100.
print("Trained Q matrix")
print(Q / Q.max() * 100)

Trained Q matrix
[[  0.    0.    0.    0.   80.    0. ]
 [  0.    0.    0.   64.    0.  100. ]
 [  0.    0.    0.   64.    0.    0. ]
 [  0.   80.   51.2   0.   80.    0. ]
 [  0.   80.   51.2   0.    0.  100. ]
 [  0.   80.    0.    0.   80.  100. ]]


In [21]:
# Follow the trained Q matrix greedily from state 1 until state 5 is reached,
# recording every state visited along the way.
current_state = 1
steps = [current_state]

while current_state != 5:
    # Columns holding the largest Q value in the current state's row.
    best_next = np.where(Q[current_state,] == np.max(Q[current_state,]))[1]
    # np.random.choice without `size` returns a scalar, so a single candidate
    # and a tie are handled the same way (ties are broken uniformly at random).
    # int() therefore never has to convert a one-element array, which
    # NumPy >= 1.25 deprecates.
    next_step_index = int(np.random.choice(best_next))

    steps.append(next_step_index)
    current_state = next_step_index

# Print the selected step sequence
print("selected path: ")
print(steps)

selected path: 
[1, 5]
