In [1]:
import numpy as np

# --- 1. THE ENVIRONMENT (Unknown to the agent initially) ---
class GridWorld:
    """Deterministic square grid world with a goal in two opposite corners.

    States are numbered row-major from 0 (top-left) up to
    ``grid_size ** 2 - 1`` (bottom-right). The first and last states are
    terminal. Every move made from a non-terminal state yields a reward
    of -1, including moves that bump into a wall and leave the agent in place.

    Args:
        grid_size: Number of rows (and columns) of the grid. Defaults to 4,
            which gives the 16-state world with terminal states 0 and 15.

    Raises:
        ValueError: If ``grid_size`` is smaller than 2 (there would be no
            non-terminal state to start an episode from).
    """

    def __init__(self, grid_size=4):
        if grid_size < 2:
            raise ValueError("grid_size must be at least 2")
        self.grid_size = grid_size
        self.n_states = grid_size * grid_size
        # Top-Left, Bottom-Right
        self.terminal_states = [0, self.n_states - 1]
        self.actions = [0, 1, 2, 3]  # 0:UP, 1:DOWN, 2:LEFT, 3:RIGHT

    def step(self, state, action):
        """Apply ``action`` in ``state`` and return the transition.

        Args:
            state: Current state index.
            action: 0 (UP), 1 (DOWN), 2 (LEFT) or 3 (RIGHT). Any other value
                leaves the position unchanged but still costs -1.

        Returns:
            Tuple ``(next_state, reward, done)``. From a terminal state the
            result is ``(state, 0, True)``: the agent stays put at no cost.
        """
        if state in self.terminal_states:
            return state, 0, True

        row, col = divmod(state, self.grid_size)
        last = self.grid_size - 1

        # Moves are clamped to the grid, so walking into a wall is a no-op.
        if action == 0:  # UP
            row = max(row - 1, 0)
        elif action == 1:  # DOWN
            row = min(row + 1, last)
        elif action == 2:  # LEFT
            col = max(col - 1, 0)
        elif action == 3:  # RIGHT
            col = min(col + 1, last)

        next_state = row * self.grid_size + col
        reward = -1  # Penalty for each step
        done = next_state in self.terminal_states

        return next_state, reward, done

    def reset(self):
        """Return a random non-terminal start state.

        Uses rejection sampling over all states: draw until the state is
        not one of the terminal states.
        """
        start_state = np.random.randint(0, self.n_states)
        while start_state in self.terminal_states:
            start_state = np.random.randint(0, self.n_states)
        return start_state

# --- 2. THE ALGORITHM (Q-Learning) ---
def q_learning(num_episodes=5000, alpha=0.1, gamma=0.99, epsilon=0.1, env=None):
    """Train a tabular Q-learning agent and return its Q-table.

    Args:
        num_episodes: Number of training episodes to run.
        alpha: Learning rate (how far each update moves the old estimate
            toward the new target).
        gamma: Discount factor applied to the best value of the next state.
        epsilon: Probability of taking a random action instead of the
            greedy one (epsilon-greedy exploration).
        env: Environment to train on. It must provide ``grid_size``,
            ``actions``, ``reset()`` and ``step(state, action)`` returning
            ``(next_state, reward, done)``. Defaults to a new ``GridWorld()``.

    Returns:
        NumPy array of shape ``(grid_size ** 2, len(actions))`` where
        ``Q[s, a]`` is the learned value of taking action ``a`` in state ``s``.
    """
    if env is None:
        env = GridWorld()

    # Size the table from the environment rather than hard-coding 16 x 4.
    n_states = env.grid_size ** 2
    Q = np.zeros((n_states, len(env.actions)))

    print(f"Training with Q-Learning ({num_episodes} Episodes)...")

    for _ in range(num_episodes):
        state = env.reset()
        done = False

        while not done:
            # A. Choose action (epsilon-greedy)
            if np.random.rand() < epsilon:
                action = np.random.choice(env.actions)  # Explore
            else:
                action = np.argmax(Q[state])  # Exploit

            # B. Take the action and observe the outcome
            next_state, reward, done = env.step(state, action)

            # C. Temporal-difference update:
            # Q(s,a) <- Q(s,a) + alpha * [r + gamma * max_a' Q(s',a') - Q(s,a)]
            # Terminal rows are never written, so their max stays 0 and the
            # target for a move into a goal is just the reward.
            td_target = reward + gamma * np.max(Q[next_state])
            Q[state, action] += alpha * (td_target - Q[state, action])

            state = next_state

    return Q

# --- 3. DISPLAY RESULTS ---
def print_results(Q, terminal_states=None):
    """Print the greedy policy encoded in a Q-table as a grid of arrows.

    The grid layout is derived from the table itself: the number of rows
    of ``Q`` must be a perfect square, and states are laid out row-major.

    Args:
        Q: Array of shape ``(n_states, 4)`` whose columns are the values of
            UP, DOWN, LEFT and RIGHT.
        terminal_states: States to render as ``T``. Defaults to the first
            and last state (top-left and bottom-right corners).

    Raises:
        ValueError: If the number of states is not a perfect square.
    """
    actions_map = {0: '↑', 1: '↓', 2: '←', 3: '→'}

    Q = np.asarray(Q)
    n_states = Q.shape[0]
    grid_size = int(round(n_states ** 0.5))
    if grid_size * grid_size != n_states:
        raise ValueError("Q must have a square number of states")
    if terminal_states is None:
        terminal_states = (0, n_states - 1)

    # Same rule width as the original 4x4 layout (17 dashes for 4 columns).
    rule = "-" * (4 * grid_size + 1)

    print("\nLearned Policy (from Q-Table):")
    print(rule)

    # The best action is the one with the highest Q-value for each state.
    grid_output = [
        " T " if s in terminal_states
        else f" {actions_map[int(np.argmax(Q[s]))]} "
        for s in range(n_states)
    ]

    # Print one grid row per line, separated by horizontal rules.
    for start in range(0, n_states, grid_size):
        print("|".join(grid_output[start:start + grid_size]))
        print(rule)

    print("\nExample Q-Values for State 1 (Next to Top-Left Goal):")
    print(f"UP: {Q[1,0]:.2f}, DOWN: {Q[1,1]:.2f}, LEFT: {Q[1,2]:.2f}, RIGHT: {Q[1,3]:.2f}")
    print("(Notice 'LEFT' should have the highest value because it leads to the goal)")

# Entry point: train the agent, then print the greedy policy it learned.
# final_Q is bound at module level, so it remains available after the run.
if __name__ == "__main__":
    final_Q = q_learning()
    print_results(final_Q)

Training with Q-Learning (5000 Episodes)...

Learned Policy (from Q-Table):
-----------------
 T | ← | ← | ↓ 
-----------------
 ↑ | ← | → | ↓ 
-----------------
 ↑ | → | → | ↓ 
-----------------
 ↑ | → | → | T 
-----------------

Example Q-Values for State 1 (Next to Top-Left Goal):
UP: -1.97, DOWN: -2.93, LEFT: -1.00, RIGHT: -2.94
(Notice 'LEFT' should have the highest value because it leads to the goal)


This code implements Q-learning, a fundamental reinforcement learning algorithm, to train an agent to navigate a simple 4x4 grid world. Let's break it down into three main parts:

1.  **The `GridWorld` Environment**: This class defines the problem the agent needs to solve. It's a 4x4 grid where the agent can move Up, Down, Left, or Right. States 0 (top-left) and 15 (bottom-right) are terminal states (goals). Every step the agent takes incurs a reward of -1, encouraging it to find the shortest path to a goal. The `step` method calculates the next state, reward, and whether the episode is `done` based on the agent's action.

2.  **The `q_learning` Algorithm**: This is the core of the reinforcement learning process.
    *   It initializes `Q`, a 16x4 table (16 states, 4 actions), to store the estimated 'quality' (future reward) of taking a specific action in a specific state.
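    *   **Parameters**: `alpha` (learning rate, 0.1) determines how much each new observation overrides the existing estimate. `gamma` (discount factor, 0.99) controls how much future rewards count relative to immediate ones; values close to 1 make the agent far-sighted. `epsilon` (exploration rate, 0.1) is the probability of taking a random action instead of the best known one, which balances exploring new actions against exploiting known good actions.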
    *   **Training Loop**: Over 5000 episodes, the agent interacts with the environment. In each step:
        *   **Action Selection**: It uses an "epsilon-greedy" strategy: with `epsilon` probability, it picks a random action (exploration); otherwise, it chooses the action with the highest Q-value for the current state (exploitation).
        *   **Q-Value Update**: After observing the `reward` and `next_state` from its action, it updates the Q-value for the `(state, action)` pair using the Q-learning temporal-difference rule, which is derived from the Bellman optimality equation: `Q(s,a) = Q(s,a) + alpha * [Reward + gamma * max(Q(s')) - Q(s,a)]`. Repeating this update over many steps iteratively improves the agent's knowledge of which actions are best in each state.

3.  **`print_results`**: This function visualizes the learned policy. For each non-terminal state, it determines the best action by finding the action with the highest Q-value and prints an arrow indicating that direction. Terminal states are marked with 'T'. It also shows an example of Q-values for a specific state to illustrate the learned values.