# Q-learning Algorithm

Muhammad Taqui
01-136221-021

In [None]:

import numpy as np
import random
import matplotlib.pyplot as plt

# Grid dimensions of the warehouse floor (11 rows by 11 columns)
environment_rows = 11
environment_columns = 11

# Action names; an action's position in this list is its index along the last axis of q_values
actions = ['up', 'right', 'down', 'left']
num_actions = len(actions)

# Reward map: start with every cell at -100 (obstacle), then mark the goal with 100.
# Aisle cells are lowered to -1 further down.
rewards = np.full((environment_rows, environment_columns), -100.)
rewards[0, 5] = 100.  # Goal location (item packaging area)

# Aisle cells, keyed by row index; each value lists that row's walkable columns
aisles = {
    1: list(range(1, 10)),
    2: [1, 7, 9],
    3: list(range(1, 8)) + [9],
    4: [3, 7],
    5: list(range(11)),
    6: [5],
    7: list(range(1, 10)),
    8: [3, 7],
    9: list(range(11)),
}

# Entering an aisle cell yields a reward of -1
for row_index in aisles:
    for column_index in aisles[row_index]:
        rewards[row_index, column_index] = -1.

# Q-table: one value per (row, column, action), all starting at zero
q_values = np.zeros((environment_rows, environment_columns, num_actions))

# Training hyperparameters
epsilon = 0.9  # Probability of taking the greedy action; a random action is taken otherwise
discount_factor = 0.9  # Weight applied to the best Q-value of the next cell
learning_rate = 0.9  # Fraction of the temporal-difference error applied per update
episodes = 1000  # Number of training episodes

def is_terminal_state(row_index, column_index):
    """Return whether the cell at (row_index, column_index) is the goal.

    A cell counts as terminal only when its reward is exactly 100; obstacle
    cells (-100) and aisle cells (-1) are both non-terminal.
    """
    cell_reward = rewards[row_index, column_index]
    return cell_reward == 100.

def get_next_action(row_index, column_index, epsilon):
    """Pick an action index for the given cell.

    With probability ``epsilon`` the action with the highest Q-value for the
    cell is returned; otherwise an action index is drawn uniformly at random.
    Note that ``epsilon`` is therefore the chance of acting greedily, not the
    chance of exploring.
    """
    act_greedily = random.random() < epsilon
    if not act_greedily:
        return random.randint(0, num_actions - 1)
    return np.argmax(q_values[row_index, column_index])

def get_next_location(row_index, column_index, action_index):
    """Return the (row, column) reached by applying an action to a cell.

    ``action_index`` selects an entry of ``actions``. A move that would leave
    the grid is ignored and the current location is returned unchanged.
    Rewards are not consulted here, so moving onto an obstacle cell is allowed.
    """
    move = actions[action_index]
    if move == 'up':
        if row_index > 0:
            return row_index - 1, column_index
    elif move == 'down':
        if row_index < environment_rows - 1:
            return row_index + 1, column_index
    elif move == 'left':
        if column_index > 0:
            return row_index, column_index - 1
    elif move == 'right':
        if column_index < environment_columns - 1:
            return row_index, column_index + 1
    # Blocked by the grid edge (or unrecognised action name): stay in place
    return row_index, column_index

def run_episode():
    """Play one training episode and update ``q_values`` in place.

    The start cell is drawn uniformly from rows 1..environment_rows-1 and all
    columns (so it may be an obstacle cell). The agent then keeps choosing
    actions via ``get_next_action`` until it reaches the goal, applying the
    Q-learning update after every step.
    """
    row = np.random.randint(1, environment_rows)
    col = np.random.randint(0, environment_columns)
    while not is_terminal_state(row, col):
        action = get_next_action(row, col, epsilon)
        next_row, next_col = get_next_location(row, col, action)

        step_reward = rewards[next_row, next_col]
        current_estimate = q_values[row, col, action]
        best_future_value = np.max(q_values[next_row, next_col])

        # TD error: (reward + discounted best next value) minus the current estimate
        td_error = step_reward + (discount_factor * best_future_value) - current_estimate
        q_values[row, col, action] = current_estimate + (learning_rate * td_error)

        row, col = next_row, next_col

# Train the agent: run `episodes` episodes, each of which updates q_values in place
for episode in range(episodes):
    run_episode()

def get_shortest_path(start_row_index, start_column_index):
    """Follow the greedy policy in ``q_values`` from a start cell to the goal.

    Args:
        start_row_index: Row of the starting cell.
        start_column_index: Column of the starting cell.

    Returns:
        List of (row, column) tuples, beginning with the start cell and ending
        with the goal cell.

    Raises:
        RuntimeError: If the greedy policy returns to a cell it has already
            visited before reaching the goal. The policy is deterministic, so
            a revisit means the walk would otherwise never terminate (e.g. the
            Q-values for that region have not been learned yet).
    """
    position = (start_row_index, start_column_index)
    path = [position]
    visited = {position}
    while not is_terminal_state(*position):
        action_index = np.argmax(q_values[position])
        position = get_next_location(*position, action_index)
        if position in visited:
            # Same cell -> same greedy action -> infinite loop; fail loudly instead
            raise RuntimeError(
                f"Greedy policy loops at {position} without reaching the goal "
                f"(start was {path[0]}); the Q-values may need more training."
            )
        visited.add(position)
        path.append(position)
    return path

def plot_path(path, environment_rows, environment_columns):
    """Draw the obstacle map and overlay ``path`` on it as red arrows.

    Args:
        path: Sequence of (row, column) cells in visiting order, start first.
            An empty sequence draws only the map and the goal marker.
        environment_rows: Number of grid rows.
        environment_columns: Number of grid columns.
    """
    # Obstacles (-100 in the reward map) get a distinct value; all other cells stay 0
    grid = np.zeros((environment_rows, environment_columns))
    grid[rewards == -100.] = -100

    plt.figure(figsize=(8, 8))
    # With the default extent, cell (r, c) is centred on the data point
    # (x=c, y=r), and origin="upper" already puts row 0 at the top. The path's
    # (row, column) coordinates therefore line up with the grid cells without
    # a custom extent or an extra y-axis inversion.
    plt.imshow(grid, cmap="Blues", origin="upper")

    # One arrow per step, from each cell to its successor
    for (y1, x1), (y2, x2) in zip(path, path[1:]):
        plt.arrow(x1, y1, x2 - x1, y2 - y1, head_width=0.2, head_length=0.3,
                  fc='red', ec='red', length_includes_head=True)

    # Mark every cell whose reward is 100 as the goal rather than hard-coding it
    goal_rows, goal_columns = np.nonzero(rewards == 100.)
    plt.scatter(goal_columns, goal_rows, color="green", label="Goal", s=100)
    if len(path) > 0:
        start_row, start_column = path[0]
        plt.scatter(start_column, start_row, color="yellow", label="Start", s=100)

    plt.title("Shortest Path for Warehouse Robot")
    plt.xlabel("Columns")
    plt.ylabel("Rows")
    plt.legend()
    plt.xticks(range(environment_columns))
    plt.yticks(range(environment_rows))
    plt.grid(True)
    plt.show()

# Example: follow the learned greedy policy from row 3, column 9 and plot the resulting route
path_example = get_shortest_path(3, 9)
plot_path(path_example, environment_rows, environment_columns)
