In [1]:
import numpy as np

# Define the grid world environment
grid_size = (5, 5)  # Size of the grid world
start_state = (0, 0)  # Starting state
goal_state = (4, 4)  # Goal state
obstacle_states = [(1, 1), (2, 2), (3, 3)]  # Obstacle states

# Define the action space
actions = ['up', 'down', 'left', 'right']

# Initialize the Q-table
q_table = np.zeros((grid_size[0], grid_size[1], len(actions)))

# Set hyperparameters
learning_rate = 0.1
discount_factor = 0.9
num_episodes = 1000
max_steps_per_episode = 100

# Q-learning algorithm
for episode in range(num_episodes):
    state = start_state
    for step in range(max_steps_per_episode):
        # Choose an action using epsilon-greedy policy
        if np.random.uniform() < 0.1:
            action = np.random.choice(actions)
        else:
            action = actions[np.argmax(q_table[state])]
        
        # Perform the action and observe the next state and reward
        if action == 'up' and state[0] > 0:
            next_state = (state[0] - 1, state[1])
        elif action == 'down' and state[0] < grid_size[0] - 1:
            next_state = (state[0] + 1, state[1])
        elif action == 'left' and state[1] > 0:
            next_state = (state[0], state[1] - 1)
        elif action == 'right' and state[1] < grid_size[1] - 1:
            next_state = (state[0], state[1] + 1)
        else:
            next_state = state
        
        if next_state in obstacle_states:
            reward = -10
        elif next_state == goal_state:
            reward = 10
        else:
            reward = 0
        
        # Update the Q-value for the current state-action pair
        q_table[state][actions.index(action)] += learning_rate * (
            reward + discount_factor * np.max(q_table[next_state]) - q_table[state][actions.index(action)]
        )
        
        state = next_state
        
        if state == goal_state or state in obstacle_states:
            break

# Evaluate the learned policy
state = start_state
steps = 0
while state != goal_state and steps < max_steps_per_episode:
    action = actions[np.argmax(q_table[state])]
    
    if action == 'up' and state[0] > 0:
        next_state = (state[0] - 1, state[1])
    elif action == 'down' and state[0] < grid_size[0] - 1:
        next_state = (state[0] + 1, state[1])
    elif action == 'left' and state[1] > 0:
        next_state = (state[0], state[1] - 1)
    elif action == 'right' and state[1] < grid_size[1] - 1:
        next_state = (state[0], state[1] + 1)
    else:
        next_state = state
    
    state = next_state
    steps += 1

# Print the final path taken by the agent
if state == goal_state:
    print("Goal reached!")
else:
    print("Agent did not reach the goal.")

# Print the final Q-table
print("Final Q-table:")
print(q_table)


Agent did not reach the goal.
Final Q-table:
[[[  0.           0.           0.           0.        ]
  [  0.         -10.           0.           0.        ]
  [  0.           0.           0.           0.        ]
  [  0.           0.           0.           0.        ]
  [  0.           0.           0.           0.        ]]

 [[  0.           0.           0.          -9.41850263]
  [  0.           0.           0.           0.        ]
  [  0.          -5.6953279   -1.9          0.        ]
  [  0.           0.           0.           0.        ]
  [  0.           0.           0.           0.        ]]

 [[  0.           0.           0.           0.        ]
  [ -1.           0.           0.           0.        ]
  [  0.           0.           0.           0.        ]
  [  0.           0.           0.           0.        ]
  [  0.           0.           0.           0.        ]]

 [[  0.           0.           0.           0.        ]
  [  0.           0.           0.           0.       