In [1]:
import numpy as np
import matplotlib.pyplot as plt

# Grid size of the maze environment (rows x columns)
environment_row_count = 11
environment_column_count = 11

# Q-table: one entry per (row, column, action), 4 actions per cell, all starting at zero
q_values = np.zeros(shape=(environment_row_count, environment_column_count, 4))

# Reward grid: every cell starts at -100; the target cell (0, 5) is then raised to +100
rewards = np.full(shape=(environment_row_count, environment_column_count), fill_value=-100.)
rewards[0, 5] = 100.

In [2]:
# Passage layout: maps a row index (1..9) to the list of column indices
# that are walkable in that row.
passages = {
    1: list(range(1, 10)),
    2: [1, 7, 9],
    3: list(range(1, 8)) + [9],
    4: [3, 7],
    5: list(range(11)),
    6: [5],
    7: list(range(1, 10)),
    8: [3, 7],
    9: list(range(11)),
}

labirent:  [[-100 -100 -100 -100 -100  100 -100 -100 -100 -100]
 [-100   -1   -1   -1   -1   -1   -1   -1   -1 -100]
 [-100   -1 -100 -100 -100   -1 -100 -100   -1 -100]
 [-100   -1   -1   -1 -100   -1   -1   -1   -1   -1]
 [-100 -100 -100   -1 -100   -1 -100   -1 -100   -1]
 [-100   -1   -1   -1   -1   -1 -100   -1 -100 -100]
 [-100 -100 -100 -100   -1 -100 -100   -1   -1   -1]
 [-100 -100   -1   -1   -1   -1   -1   -1 -100   -1]
 [-100   -1 -100   -1 -100   -1 -100   -1   -1   -1]
 [-100   -1   -1   -1   -1   -1   -1   -1   -1   -1]
 [-100 -100 -100 -100 -100 -100 -100 -100 -100 -100]]


In [3]:
# Give every passage cell a reward of -1: for each row 1..9, all of that
# row's passage columns are written in one indexed assignment.
for row_index in range(1, 10):
    rewards[row_index, passages[row_index]] = -1.

In [7]:
# Function to check if a cell is an obstacle
def is_obstacle(current_row_index, current_column_index):
    """Return whether the cell's reward equals -100, the obstacle marker.

    Reads the module-level ``rewards`` grid at
    ``[current_row_index, current_column_index]``.
    """
    cell_reward = rewards[current_row_index, current_column_index]
    return cell_reward == -100.

In [8]:
# Function to determine the starting point
def set_starting_point():
    """Draw a random passage cell to start an episode from.

    Row and column are sampled with ``np.random.randint`` and re-drawn until
    the cell is neither an obstacle nor the target. Rejecting only obstacles
    would let the +100 target cell be picked as a start, which gives an
    episode that begins on the goal.

    Returns:
        tuple: ``(row_index, column_index)`` of the chosen cell.
    """
    while True:
        current_row_index = np.random.randint(environment_row_count)
        current_column_index = np.random.randint(environment_column_count)
        # Same "< 100" target test that shortest_distance uses for its loop.
        if (not is_obstacle(current_row_index, current_column_index)
                and rewards[current_row_index, current_column_index] < 100):
            return current_row_index, current_column_index


In [8]:
def shortest_distance(start_row_index, start_column_index):
    """Trace the greedy (argmax-Q) route from a start cell to the target.

    At every cell the action with the highest value in ``q_values`` is taken
    via ``move_to_next_point`` until a cell whose reward is >= 100 is reached.

    Args:
        start_row_index: Row of the start cell.
        start_column_index: Column of the start cell.

    Returns:
        list: ``(row, column)`` tuples from the start to the target, both
        included. An empty list is returned when the start cell is an
        obstacle, or when the greedy policy cannot reach the target (it
        steps into an obstacle or comes back to a cell it already visited).
    """
    if is_obstacle(start_row_index, start_column_index):
        return []

    path = [(start_row_index, start_column_index)]
    visited = {(start_row_index, start_column_index)}
    current_row, current_column = start_row_index, start_column_index
    # Choose the best move until reaching the target
    while rewards[current_row, current_column] < 100:
        move = np.argmax(q_values[current_row, current_column])
        current_row, current_column = move_to_next_point(current_row, current_column, move)
        cell = (current_row, current_column)
        # The policy is deterministic, so revisiting a cell (including staying
        # put against a border) means the walk would repeat forever.
        # A step into an obstacle is not a valid route either.
        if cell in visited or is_obstacle(current_row, current_column):
            return []
        visited.add(cell)
        path.append(cell)
    return path

In [25]:
# Epsilon-greedy strategy for selecting a move
def determine_next_move(current_row_index, current_column_index, epsilon):
    """Choose an action index for the given cell.

    A uniform random number is drawn first. If it is below ``epsilon`` the
    action with the highest Q-value for the cell is returned; otherwise a
    random action in ``0..3`` is returned. Note that ``epsilon`` is therefore
    the probability of the *greedy* choice, not of exploration.
    """
    take_greedy_action = np.random.random() < epsilon
    if not take_greedy_action:
        return np.random.randint(4)
    return np.argmax(q_values[current_row_index, current_column_index])

In [26]:
# Move to the new point based on the selected move
# Action index -> direction name
moves = ['up', 'right', 'down', 'left']
def move_to_next_point(current_row_index, current_column_index, move_index):
    """Apply one move and return the resulting ``(row, column)``.

    ``move_index`` selects a direction from ``moves``. A move that would
    leave the grid is ignored and the current position is returned unchanged.
    """
    direction = moves[move_index]
    row, column = current_row_index, current_column_index
    if direction == 'up':
        if row > 0:
            row -= 1
    elif direction == 'right':
        if column < environment_column_count - 1:
            column += 1
    elif direction == 'down':
        if row < environment_row_count - 1:
            row += 1
    elif direction == 'left':
        if column > 0:
            column -= 1
    return row, column

In [30]:
# Q-learning training loop
epsilon = 0.9        # probability that determine_next_move takes the greedy action
decay_rate = 0.9     # weight applied to the best Q-value of the next cell
learning_rate = 0.9  # fraction of the temporal difference added to the old Q-value
for step in range(1000):
    row_index, column_index = set_starting_point()
    # An episode ends on an obstacle OR on the target. Checking only for
    # obstacles would let the agent keep acting from the target cell and
    # repeatedly collect the +100 reward, inflating the Q-values around it.
    while not is_obstacle(row_index, column_index) and rewards[row_index, column_index] < 100:
        move_index = determine_next_move(row_index, column_index, epsilon)
        old_row_index, old_column_index = row_index, column_index
        row_index, column_index = move_to_next_point(row_index, column_index, move_index)
        reward = rewards[row_index, column_index]
        old_q_value = q_values[old_row_index, old_column_index, move_index]
        # Temporal difference: reward + discounted best next value - current estimate
        difference = reward + (decay_rate * np.max(q_values[row_index, column_index])) - old_q_value
        new_q_value = old_q_value + (learning_rate * difference)
        q_values[old_row_index, old_column_index, move_index] = new_q_value
    if (step + 1) % 100 == 0:
        print(f"Step {step + 1} completed.")

print('Training completed.')


In [31]:
# Ask for a start cell (row first, then column), trace the learned route and print it
post_training_row, post_training_column = (
    int(input(prompt))
    for prompt in (
        'Enter the row index where the robot will start moving: ',
        'Enter the column index where the robot will start moving: ',
    )
)
shortest_path = shortest_distance(post_training_row, post_training_column)
print('Route to the target:', shortest_path)

In [32]:
# Create the maze grid after training: -1 = obstacle, 0 = passage, 1 = target.
# The grid starts out as all obstacles and passages are carved into it.
# Starting from zeros and converting zeros to -1 afterwards would also turn
# every passage (stored as 0) into an obstacle.
maze = np.full((environment_row_count, environment_column_count), -1.)

for row_index in range(1, 10):
    for column_index in passages[row_index]:
        maze[row_index, column_index] = 0  # Passages
maze[0, 5] = 1  # Target point

Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is completed!
Training is

In [None]:
# Visualization grid: integer array where -1 = obstacle, 0 = passage, 10 = target
maze = np.full(shape=(environment_row_count, environment_column_count), fill_value=-1)

# Carve the passages row by row, then mark the target cell
for row_index in range(1, 10):
    maze[row_index, passages[row_index]] = 0
maze[0, 5] = 10

In [None]:
# Visualization
fig, ax = plt.subplots(figsize=(12, 12))

# Create maze ground: -1 = obstacle, 0 = passage, 0.5 = target.
# The fill value must be a float: np.full with an integer fill value creates an
# integer array, and assigning 0.5 into it would truncate to 0, making the
# target indistinguishable from a passage.
maze_ground = np.full((environment_row_count, environment_column_count), -1.0)  # Obstacles are set as default
for row_index in range(1, 10):
    for column_index in passages[row_index]:
        maze_ground[row_index, column_index] = 0  # Passages
maze_ground[0, 5] = 0.5  # Target point

In [None]:
# Draw maze ground with a three-colour map; the boundaries -1, 0, 0.5, 1
# split the values into the bins black / white / red.
cmap = plt.cm.colors.ListedColormap(['black', 'white', 'red'])
norm = plt.cm.colors.BoundaryNorm(boundaries=[-1, 0, 0.5, 1], ncolors=cmap.N)
ax.imshow(maze_ground, norm=norm, cmap=cmap)

# One filled red circle per path cell; x is the column and y is the row
for (row, column) in shortest_path:
    ax.add_patch(plt.Circle(xy=(column, row), radius=0.3, color='red', fill=True))


In [None]:
# Mark the first and last cells of the route (x = column, y = row)
(starting_row, starting_column), (ending_row, ending_column) = shortest_path[0], shortest_path[-1]
ax.plot(starting_column, starting_row, 'go', ms=10)  # green marker on the start cell
ax.plot(ending_column, ending_row, 'yo', ms=10)  # yellow marker on the end cell

# Hide axes
ax.set_xticks