# In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt

# Maze representation (grid). Cell codes:
#   0 = empty, 1 = wall, -1 = fire, 2 = flag
maze = [
    [0, 0, 0, 1, 0, 0],
    [0, 1, 0, 1, 0, 0],
    [0, 1, 0, -1, 0, 0],
    [0, 0, 0, 1, 0, 2],
    [0, 1, 0, 0, 1, 0],
    [0, 0, 0, 0, 0, 0],
]

# Dimensions of the maze (assumes a non-empty, rectangular grid)
rows, cols = len(maze), len(maze[0])

# Actions as (row, col) deltas. The index order is left, right, up, down:
# a column delta of -1/+1 moves left/right and a row delta of -1/+1 moves
# up/down, with row 0 being the first (top) row written in `maze`.
actions = [(0, -1), (0, 1), (-1, 0), (1, 0)]

# Human-readable label for each action index, parallel to `actions`.
action_names = ["left", "right", "up", "down"]

# Q-table indexed as q_table[row, col, action_idx]; all values start at zero.
q_table = np.zeros((rows, cols, len(actions)))

# Learning parameters
alpha = 0.1    # Learning rate
gamma = 0.9    # Discount factor
epsilon = 0.2  # Exploration rate (probability of picking a random action)
episodes = 1000

# Reward system
def get_reward(state):
    """Return the reward associated with the maze cell at `state`.

    `state` is a (row, col) pair indexing the module-level `maze`.
    Fire (-1) yields -100, the flag (2) yields 100, a wall (1) yields -1,
    and any other cell value yields 1.
    """
    r, c = state
    cell = maze[r][c]
    # Special cells by code; everything else is treated as empty space.
    reward_by_cell = {-1: -100, 2: 100, 1: -1}
    return reward_by_cell.get(cell, 1)

# Check if the next state is valid (within maze bounds and not a wall)
def is_valid_state(state):
    """Return True when `state` lies inside the maze and is not a wall.

    `state` is a (row, col) pair; bounds come from the module-level
    `rows` and `cols`, and a wall is a `maze` cell equal to 1.
    """
    r, c = state
    if r < 0 or r >= rows:
        return False
    if c < 0 or c >= cols:
        return False
    # In bounds: the cell is usable unless it is a wall.
    if maze[r][c] == 1:
        return False
    return True

# Q-learning Algorithm
for episode in range(episodes):
    # Draw random cells until an empty one (code 0) comes up; the episode
    # starts there.
    while True:
        start_state = (random.randint(0, rows-1), random.randint(0, cols-1))
        if maze[start_state[0]][start_state[1]] == 0:
            break

    state = start_state
    done = False

    while not done:
        # Epsilon-greedy selection: a random action with probability epsilon,
        # otherwise the action with the highest Q-value in the current state.
        explore = random.uniform(0, 1) < epsilon
        action_idx = (
            random.randint(0, len(actions)-1)
            if explore
            else np.argmax(q_table[state[0], state[1]])
        )

        # Apply the action; an out-of-bounds or wall target leaves the agent
        # where it is.
        action = actions[action_idx]
        candidate = (state[0] + action[0], state[1] + action[1])
        next_state = candidate if is_valid_state(candidate) else state

        # Reward is determined by the cell the agent ends up in.
        reward = get_reward(next_state)

        # Q-learning update:
        #   Q(s, a) += alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))
        best_next_action = np.argmax(q_table[next_state[0], next_state[1]])
        next_q = q_table[next_state[0], next_state[1], best_next_action]
        current_q = q_table[state[0], state[1], action_idx]
        q_table[state[0], state[1], action_idx] += alpha * (reward + gamma * next_q - current_q)

        state = next_state

        # The flag (reward 100) and fire (reward -100) end the episode.
        done = reward in (100, -100)

# After training, visualize part of the learned Q-table: the Q-values of
# every action at one chosen position, drawn as a 1 x len(actions) heatmap.

start_pos = (2, 2)  # For example, inspect the cell at row 2, column 2

# Label each column from its (row, col) delta so the tick labels stay
# correct even if the order of `actions` changes.
delta_labels = {(0, -1): "left", (0, 1): "right", (-1, 0): "up", (1, 0): "down"}
tick_labels = [delta_labels.get(tuple(a), str(a)) for a in actions]

# reshape(1, -1) turns the 1-D vector of action values into a single-row
# image, which is the 2-D input imshow expects for a heatmap.
start_q_values = q_table[start_pos[0], start_pos[1], :].reshape(1, -1)

plt.imshow(start_q_values, cmap="viridis", aspect="auto")
plt.colorbar(label="Q-value")
plt.xticks(range(len(actions)), tick_labels)
plt.yticks([])
plt.title(f"Q-values at position {start_pos}")
plt.show()