In [1]:
import numpy as np
import random

# Define the gridworld environment
class GridWorld:
    """Deterministic grid world whose cells hold the reward for entering them.

    A cell with value 1 (the goal) or -1 (the penalty cell) is terminal.
    The -1 cell does not block movement: the agent can step onto it, receives
    the -1 reward, and the episode ends.

    Actions are encoded as 0 = up, 1 = right, 2 = down, 3 = left. A move that
    would leave the grid keeps the agent in place. Any other action value
    leaves the state unchanged.

    Args:
        grid: Optional 2-D array-like of per-cell rewards. Defaults to the
            4x4 layout with the goal at (0, 3) and the penalty at (1, 1).
        start_state: (row, col) where every episode begins. Defaults to (3, 0).
    """

    def __init__(self, grid=None, start_state=(3, 0)):
        if grid is None:
            grid = [
                [0, 0, 0, 1],   # Goal at (0, 3)
                [0, -1, 0, 0],  # Terminal penalty cell at (1, 1), reward -1
                [0, 0, 0, 0],
                [0, 0, 0, 0],   # Default start at (3, 0)
            ]
        self.grid = np.array(grid)
        self.start_state = tuple(start_state)
        self.state = self.start_state

    def reset(self):
        """Put the agent back on the start cell and return that state."""
        self.state = self.start_state
        return self.state

    def is_terminal(self, state):
        """Return True when ``state`` is the goal (1) or the penalty cell (-1)."""
        cell = self.grid[tuple(state)]
        # bool() so callers get a plain Python bool rather than a NumPy scalar.
        return bool(cell == 1 or cell == -1)

    def get_next_state(self, state, action):
        """Return the (row, col) reached from ``state`` by ``action``.

        Pure function of its arguments: it does not move the agent. The result
        is clamped to the grid, whose bounds come from ``self.grid.shape``.
        """
        last_row = self.grid.shape[0] - 1
        last_col = self.grid.shape[1] - 1
        next_state = list(state)
        if action == 0:  # Move up
            next_state[0] = max(0, state[0] - 1)
        elif action == 1:  # Move right
            next_state[1] = min(last_col, state[1] + 1)
        elif action == 2:  # Move down
            next_state[0] = min(last_row, state[0] + 1)
        elif action == 3:  # Move left
            next_state[1] = max(0, state[1] - 1)
        return tuple(next_state)

    def step(self, action):
        """Apply ``action`` to the current state.

        Returns:
            (next_state, reward, done): the new (row, col), the value of the
            grid cell that was entered, and whether that cell is terminal.
        """
        next_state = self.get_next_state(self.state, action)
        # .item() converts the NumPy scalar into the matching Python number.
        reward = self.grid[next_state].item()
        self.state = next_state
        done = self.is_terminal(next_state)
        return next_state, reward, done

In [2]:
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The Q-table is indexed as ``q_table[state][action]`` where ``state`` is a
    tuple of grid coordinates.

    Args:
        learning_rate: Step size applied to each temporal-difference update.
        discount_factor: Weight given to the best next-state value.
        exploration_rate: Probability of picking a uniformly random action.
        state_shape: Dimensions of the state space. Defaults to a 4x4 grid.
        n_actions: Number of discrete actions. Defaults to 4.
    """

    def __init__(self, learning_rate=0.1, discount_factor=0.9, exploration_rate=0.1,
                 state_shape=(4, 4), n_actions=4):
        # One Q-value per (state, action) pair, all starting at zero.
        self.q_table = np.zeros((*state_shape, n_actions))
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate

    def choose_action(self, state):
        """Return an action index for ``state`` using epsilon-greedy selection.

        With probability ``exploration_rate`` a uniformly random action is
        drawn from the stdlib ``random`` module. Otherwise the action with the
        highest Q-value is returned; ``np.argmax`` resolves ties in favour of
        the lowest index.
        """
        # Read the action count from the table so the two cannot disagree.
        n_actions = self.q_table.shape[-1]
        if random.random() < self.exploration_rate:
            return random.randint(0, n_actions - 1)  # Explore
        return int(np.argmax(self.q_table[state]))  # Exploit

    def update_q_value(self, state, action, reward, next_state, done=False):
        """Apply one Q-learning update for the observed transition.

        Args:
            state: State the action was taken in.
            action: Index of the action taken.
            reward: Reward observed for the transition.
            next_state: State reached after the action.
            done: True when ``next_state`` is terminal. The update then uses
                the reward alone instead of bootstrapping from ``next_state``.
                Defaults to False, which always bootstraps.
        """
        if done:
            max_future_q = 0.0  # No future return after a terminal state
        else:
            max_future_q = np.max(self.q_table[next_state])  # Best Q-value for next state
        current_q = self.q_table[state][action]
        td_target = reward + self.discount_factor * max_future_q
        # Q-learning formula: move the estimate a fraction of the way to the target
        self.q_table[state][action] = current_q + self.learning_rate * (td_target - current_q)

In [3]:
# The agent explores through the stdlib `random` module, so seed it before
# training; a fresh Restart & Run All then reproduces the same Q-table.
SEED = 42
random.seed(SEED)

env = GridWorld()
agent = QLearningAgent()

episodes = 1000  # Number of training episodes

for episode in range(episodes):
    state = env.reset()  # Every episode begins from the environment's start state
    done = False

    # Play until the environment reports a terminal cell.
    while not done:
        action = agent.choose_action(state)  # Epsilon-greedy action for the current state
        next_state, reward, done = env.step(action)  # Apply it and observe the outcome
        agent.update_q_value(state, action, reward, next_state)  # Learn from the transition
        state = next_state  # Continue from where the agent landed

In [4]:
# Emit a blank line, the heading, and then the learned Q-table on the following lines.
print("\nFinal Q-Table after training:", agent.q_table, sep="\n")



Final Q-Table after training:
[[[ 7.10394511e-01  8.10000000e-01  6.17742392e-01  6.70627290e-01]
  [ 6.59905364e-01  9.00000000e-01 -8.78423345e-01  6.11958250e-01]
  [ 8.01522770e-01  1.00000000e+00  7.27334078e-01  7.36863483e-01]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]]

 [[ 7.29000000e-01 -7.94108868e-01  5.35717755e-01  6.03478213e-01]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 8.99580361e-01  3.09510000e-02  4.98611247e-02 -1.00000000e-01]
  [ 4.68559000e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00]]

 [[ 6.56100000e-01  4.34590991e-01  4.96902229e-01  5.49572012e-01]
  [-1.00000000e-01  6.87216977e-01  0.00000000e+00  0.00000000e+00]
  [ 7.97462914e-01  6.93360000e-03  0.00000000e+00  1.34702198e-01]
  [ 7.92189000e-02  0.00000000e+00  0.00000000e+00  0.00000000e+00]]

 [[ 5.90490000e-01  2.38702818e-01  5.02647627e-01  4.29490451e-01]
  [ 4.67547365e-01  1.66970632e-04  0.00000000e+00  0.00000000e+00]
  [ 1.11076

In [6]:
# Upper bound on rollout length. The policy here is purely greedy, so if the
# best action in some state pushes into a border (the environment keeps the
# agent in place) or the policy cycles, the episode would never end by itself.
MAX_TEST_STEPS = 100

state = env.reset()
done = False
print("\nTesting agent after training...\n")

for _ in range(MAX_TEST_STEPS):
    action = np.argmax(agent.q_table[state])  # Choose best action based on learned Q-values
    next_state, reward, done = env.step(action)
    print(f"State: {state}, Action: {action}, Reward: {reward}")
    state = next_state
    if done:
        break

# Report what actually happened instead of always claiming success.
if done:
    print("\nAgent reached terminal state.")
else:
    print(f"\nStopped after {MAX_TEST_STEPS} steps without reaching a terminal state.")



Testing agent after training...

State: (3, 0), Action: 0, Reward: 0
State: (2, 0), Action: 0, Reward: 0
State: (1, 0), Action: 0, Reward: 0
State: (0, 0), Action: 1, Reward: 0
State: (0, 1), Action: 1, Reward: 0
State: (0, 2), Action: 1, Reward: 1

Agent reached terminal state.
