# Q-Learning Lab

- 6 states (rows)
- 4 actions (columns)

In [7]:
import numpy as np
import matplotlib.pyplot as plt

In [8]:
# Set Up Env.
grid_rows = 2
grid_cols = 3
num_actions = 4

actions = ['up', 'down','right', 'left']

# Define the reward matrix
rewards = np.array([
    [-10, 1, 0],
    [0, -5, 10]
])

# Define the Q-Table
q_table = np.zeros((6,num_actions,4))

# Set hyper-parameters
alpha = 0.1  # learning rate
gamma = 0.9  # discount factor
epsilon = 0.1  # exploration rate


In [9]:
print(q_table[3])

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [11]:
def is_terminal_state(cur_row_index, cur_col_index):
  if rewards[cur_row_index, cur_col_index ]== 10:
    return True
  else:
    return False


In [12]:
def get_start_loc():
  cur_row_index = np.random.randint(grid_rows)
  cur_col_index = np.random.randint(grid_cols)

  while is_terminal_state(cur_row_index, cur_col_index):
      cur_row_index = np.random.randint(grid_rows)
      cur_col_index = np.random.randint(grid_cols)

  return cur_row_index, cur_col_index


In [13]:
def get_next_act(cur_row_index, cur_col_index, epsilon):
  if np.random.random() < epsilon:
    return np.argmax(q_table[cur_row_index, cur_col_index])
  else:
    return np.random.randint(4)

In [14]:
def get_next_loc(cur_row_index, cur_col_index, action_index):
  new_row_index = cur_row_index
  new_col_index =cur_col_index

  if actions[action_index] == 'up' and cur_row_index == 1:
    new_row_index -= 1

  elif actions[action_index] == 'right' and cur_col_index < 2:
    new_col_index += 1

  elif actions[action_index] == 'down' and cur_row_index == 0:
    new_row_index += 1

  elif actions[action_index] == 'left' and cur_col_index > 0:
    new_col_index -= 1

  return new_row_index, new_col_index

In [15]:
def get_shortest_path(start_row, start_col):
  if is_terminal_state (start_row, start_col):
    return "meh"

  else:
    cur_row_index, cur_col_index = start_row, start_col
    path =[]

    path.append([cur_row_index,cur_col_index])

    while not is_terminal_state(cur_row_index,cur_col_index):
      action_index = get_next_act(cur_row_index,cur_col_index, 1)

      cur_row_index,cur_col_index = get_next_loc(cur_row_index,cur_col_index,action_index)

      path.append([cur_row_index,cur_col_index])
    return path


In [16]:
for episode in range(20000):
    row_index, col_index = get_start_loc()

    while not is_terminal_state(row_index, col_index):

        action_index = get_next_act(row_index, col_index, epsilon)

        old_row_index, old_col_index = row_index, col_index

        row_index, col_index = get_next_loc(row_index, col_index, action_index)

        reward = rewards[row_index, col_index]

        old_q_value = q_table[row_index, col_index, action_index]

        td = reward + (gamma * np.max(q_table[row_index, col_index])) - old_q_value

        new_q_value = old_q_value + (alpha * td)

        q_table[old_row_index, old_col_index, action_index] = new_q_value

print("training complete")


training complete


In [17]:
get_shortest_path(0,0)

KeyboardInterrupt: 