# Efficient path study with reinforcement learning

In [1]:
import numpy as np

#### Creating list of all possible paths

In [3]:
# Each tuple is one edge between two grid cells. Edges are listed once;
# the reward-matrix construction treats them as traversable both ways.
path_list = [
    (0, 1),
    (1, 2),
    (2, 3),
    (0, 4),
    (2, 6),
    (4, 8),
    (6, 10),
    (8, 9),
    (9, 10),
    (9, 13),
    (10, 14),
    (13, 14),
    (14, 15),
]

#### Creating R-Matrix from the path list

In [5]:
goal = 15
MATRIX_SIZE = 16

# Every transition starts out as -1 (not allowed); entries for real edges
# are overwritten below.
R = np.matrix(-np.ones((MATRIX_SIZE, MATRIX_SIZE)))

for src, dst in path_list:
    # Forward direction src -> dst: reward 100 when it lands on the goal,
    # otherwise 0 (allowed, but no reward).
    R[src, dst] = 100 if dst == goal else 0
    # Reverse direction dst -> src, scored by the same rule.
    R[dst, src] = 100 if src == goal else 0

# Staying on the goal once it is reached is rewarded as well.
R[goal, goal] = 100

R

matrix([[ -1.,   0.,  -1.,  -1.,   0.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.],
        [  0.,  -1.,   0.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.],
        [ -1.,   0.,  -1.,   0.,  -1.,  -1.,   0.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.],
        [ -1.,  -1.,   0.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.],
        [  0.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,   0.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.],
        [ -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.],
        [ -1.,  -1.,   0.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
           0.,  -1.,  -1.,  -1.,  -1.,  -1.],
        [ -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.],
        [ -1.,  -1.,  -1.,  -1.,   0.,  -1.,  -1.,  -1.,  -1.,   0.,
          -

#### Creating a Q Matrix

In [7]:
# Q-table: one row per state, one column per action, all zeros to begin with.
Q = np.matrix(np.zeros((MATRIX_SIZE, MATRIX_SIZE)))

# Gamma: the factor applied to the best future Q-value in the update rule
# (the discount factor).
gamma = 0.8

# Initial state (usually chosen at random).
initial_state = 0


def available_actions(state):
    """Return the actions that can be taken from ``state``.

    An action is available when its entry in row ``state`` of the global
    reward matrix ``R`` is non-negative.

    Args:
        state: Row index into ``R``.

    Returns:
        Array of column indices whose reward in that row is >= 0.
    """
    # R[state,] is a 1 x N matrix, so np.where yields (row_idx, col_idx);
    # only the column indices identify the actions.
    _, col_idx = np.where(R[state,] >= 0)
    return col_idx
	
# Get the available actions in the initial state.
available_act = available_actions(initial_state)

def sample_next_action(available_actions_range):
    """Pick one action uniformly at random from the given candidates.

    Args:
        available_actions_range: Non-empty 1-D sequence of action indices
            to choose from.

    Returns:
        The chosen action as a plain Python ``int``.
    """
    # Sample from the argument rather than from a module-level variable, so
    # the result depends only on what the caller passed in.
    # Calling choice() without a size returns a scalar, which converts to
    # int cleanly (int() on a 1-element array is deprecated in NumPy >= 1.25).
    return int(np.random.choice(available_actions_range))
	
# Draw one action from the actions available in the initial state.
action = sample_next_action(available_act)

def update(current_state, action, gamma):
    """Apply one Q-learning update for taking ``action`` in ``current_state``.

    Sets ``Q[current_state, action]`` to the immediate reward
    ``R[current_state, action]`` plus ``gamma`` times the largest Q-value in
    row ``action`` (the action index doubles as the next state). Modifies the
    global matrix ``Q`` in place and returns nothing.

    Args:
        current_state: Row index of the state the move starts from.
        action: Column index of the move; also the row of the next state.
        gamma: Multiplier applied to the best Q-value of the next state.
    """
    # Only the maximum value is needed. Every index tied for the maximum
    # holds that same value, so no tie-breaking (and no array -> int
    # conversion, deprecated in NumPy >= 1.25) is required.
    max_value = np.max(Q[action,])
    Q[current_state, action] = R[current_state, action] + gamma * max_value


# States that appear in at least one edge. States outside this list have no
# available action, so they must never be drawn as a starting state.
# Derived from path_list so the two cannot drift apart.
av_index = sorted({node for edge in path_list for node in edge})

# Number of random (state, action) updates applied to Q.
N_TRAINING_STEPS = 10000

for i in range(N_TRAINING_STEPS):
    # choice() without a size returns a scalar; int() on a 1-element array
    # is deprecated in NumPy >= 1.25.
    current_state = int(np.random.choice(av_index))
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)

print(Q)


[[  0.    163.84    0.      0.    163.84    0.      0.      0.      0.
    0.      0.      0.      0.      0.      0.      0.   ]
 [131.072   0.    204.8     0.      0.      0.      0.      0.      0.
    0.      0.      0.      0.      0.      0.      0.   ]
 [  0.    163.84    0.    163.84    0.      0.    256.      0.      0.
    0.      0.      0.      0.      0.      0.      0.   ]
 [  0.      0.    204.8     0.      0.      0.      0.      0.      0.
    0.      0.      0.      0.      0.      0.      0.   ]
 [131.072   0.      0.      0.      0.      0.      0.      0.    204.8
    0.      0.      0.      0.      0.      0.      0.   ]
 [  0.      0.      0.      0.      0.      0.      0.      0.      0.
    0.      0.      0.      0.      0.      0.      0.   ]
 [  0.      0.    204.8     0.      0.      0.      0.      0.      0.
    0.    320.      0.      0.      0.      0.      0.   ]
 [  0.      0.      0.      0.      0.      0.      0.      0.      0.
    0.      0.    

In [10]:
# Follow the greedy policy from state 0: at each state move to the action
# with the highest Q-value until the goal is reached.
current_state = 0
steps = [current_state]

# A path that never revisits a state has at most one entry per state, so a
# longer path means the greedy policy is cycling (e.g. Q is not trained).
max_steps = Q.shape[0]

while current_state != goal:
    if len(steps) > max_steps:
        raise RuntimeError(
            "Greedy walk did not reach the goal; Q may not have converged."
        )
    best_actions = np.where(Q[current_state,] == np.max(Q[current_state,]))[1]
    # Ties are broken uniformly at random. choice() without a size returns a
    # scalar, avoiding the array -> int conversion deprecated in NumPy >= 1.25.
    next_step_index = int(np.random.choice(best_actions))
    steps.append(next_step_index)
    current_state = next_step_index

print(steps)

[0, 4, 8, 9, 10, 14, 15]
