In [3]:
import numpy as np
import random

# Define the Grid World Environment
class GridWorld:
    """Deterministic rectangular grid environment with obstacles and a goal.

    States are ``(row, col)`` tuples. Moves that would leave the grid are
    clamped to the border, moves into an obstacle are rejected with a
    penalty, and entering the goal cell ends the episode.

    Args:
        size: ``(rows, cols)`` dimensions of the grid.
        start: Cell the agent occupies after ``reset()``.
        goal: Cell that terminates the episode with reward ``1``.
        obstacles: Collection of blocked ``(row, col)`` cells. Defaults to
            no obstacles.
    """

    def __init__(self, size=(5, 5), start=(0, 0), goal=(4, 4), obstacles=None):
        self.size = size
        self.start = start
        self.goal = goal
        # Create a fresh list per instance when none is supplied; a literal
        # ``[]`` default would be one object shared by every such instance.
        self.obstacles = [] if obstacles is None else obstacles
        self.actions = ['up', 'down', 'left', 'right']
        self.state = start

    def reset(self):
        """Move the agent back to the start cell and return that cell."""
        self.state = self.start
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        Args:
            action: One of ``'up'``, ``'down'``, ``'left'``, ``'right'``.
                Any other value leaves the position unchanged.

        Returns:
            A tuple ``(state, reward, done)``:
            * target cell is an obstacle: the agent stays put, reward ``-1``,
              ``done`` is ``False``;
            * target cell is the goal: the agent moves there, reward ``1``,
              ``done`` is ``True``;
            * otherwise: the agent moves there, reward ``0``, ``done`` is
              ``False``.
        """
        row, col = self.state
        # Clamp each move so the agent can never leave the grid.
        if action == 'up':
            row = max(row - 1, 0)
        elif action == 'down':
            row = min(row + 1, self.size[0] - 1)
        elif action == 'left':
            col = max(col - 1, 0)
        elif action == 'right':
            col = min(col + 1, self.size[1] - 1)

        target = (row, col)

        # The obstacle check runs first, so a goal listed as an obstacle is
        # treated as blocked.
        if target in self.obstacles:
            return self.state, -1, False  # Hit an obstacle

        # Record the new position for every successful move, including the
        # terminal one, so ``self.state`` always matches the returned state.
        self.state = target
        if target == self.goal:
            return self.state, 1, True  # Reached the goal
        return self.state, 0, False  # Normal move with no reward

# Q-Learning algorithm
def q_learning(env, episodes=1000, alpha=0.1, gamma=0.9, epsilon=0.1):
    """Learn a tabular action-value function with epsilon-greedy Q-learning.

    Args:
        env: Environment object providing ``size`` (a tuple used as the
            leading dimensions of the table), ``actions`` (a sequence of
            action labels), ``reset()`` returning the initial state, and
            ``step(label)`` returning ``(next_state, reward, done)``.
            States are used directly as indices into the table.
        episodes: Number of episodes to run; each runs until ``done``.
        alpha: Step size applied to the temporal-difference error.
        gamma: Discount applied to the best value of the next state.
        epsilon: Probability of picking a uniformly random action instead
            of the currently greedy one.

    Returns:
        NumPy array of shape ``env.size + (len(env.actions),)`` holding the
        learned value of each action in each state.
    """
    n_actions = len(env.actions)
    action_ids = range(n_actions)
    values = np.zeros(env.size + (n_actions,))

    for _ in range(episodes):
        current = env.reset()
        finished = False

        while not finished:
            # Epsilon-greedy selection: random action with probability
            # epsilon, otherwise the best-valued action seen so far.
            explore = random.uniform(0, 1) < epsilon
            if explore:
                chosen = random.choice(action_ids)
            else:
                chosen = np.argmax(values[current])

            successor, reward, finished = env.step(env.actions[chosen])

            # Bootstrap from the greedy value of the successor state; it is
            # read before the update in case successor == current.
            td_target = reward + gamma * values[successor].max()

            # ``values[current]`` is a view, so the in-place update below
            # writes straight into the table.
            current_row = values[current]
            current_row[chosen] += alpha * (td_target - current_row[chosen])

            current = successor

    return values

# Build the environment: the default 5x5 grid with three blocked cells on
# the main diagonal.
grid_world = GridWorld(obstacles=[(k, k) for k in (1, 2, 3)])

# Run Q-learning with its default hyperparameters.
q_table = q_learning(env=grid_world)

# Show the learned action values (header line, then the array).
print("Final Q-table:", q_table, sep="\n")


Final Q-table:
[[[ 0.39308177  0.4782969   0.41502086  0.23360863]
  [ 0.         -0.99323415  0.3766807   0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]]

 [[ 0.40867507  0.531441    0.39204919 -0.53865084]
  [ 0.          0.          0.          0.        ]
  [ 0.         -1.         -1.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]]

 [[ 0.45657814  0.4433358   0.47478688  0.59049   ]
  [-0.40621767  0.6561      0.49717982 -0.42215985]
  [ 0.          0.          0.          0.        ]
  [ 0.         -0.468559   -0.74581342  0.        ]
  [ 0.          0.          0.          0.        ]]

 [[ 0.52978326  0.          0.0474831   0.        ]
  [ 0.53320045  0.68266147  0.38708063  0.729     ]
  [-0.26634638  0.81        0.59183934 -0.25701489]
  [ 0.          0.          0.          0. 