In [1]:
import numpy as np

# --- 1. ENVIRONMENT (4x4 Grid World) ---
class GridWorld:
    """Deterministic square grid world with absorbing terminal states.

    States are integers numbered row-major from ``0`` to
    ``grid_size ** 2 - 1``. Every move out of a non-terminal state yields a
    reward of -1; moves that would leave the grid keep the agent in place.

    Args:
        grid_size: Side length of the square grid (default 4).
        terminal_states: Iterable of terminal state indices. Defaults to the
            top-left and bottom-right corners, ``[0, grid_size ** 2 - 1]``.

    Raises:
        ValueError: If the configuration leaves no non-terminal state to
            start an episode from.
    """

    def __init__(self, grid_size=4, terminal_states=None):
        self.grid_size = grid_size
        num_states = grid_size * grid_size
        if terminal_states is None:
            terminal_states = [0, num_states - 1]  # Top-Left, Bottom-Right
        self.terminal_states = list(terminal_states)
        self.actions = [0, 1, 2, 3]  # UP, DOWN, LEFT, RIGHT

        # reset() samples until it hits a non-terminal state, so at least one
        # such state must exist or it would never return.
        if all(s in self.terminal_states for s in range(num_states)):
            raise ValueError(
                "GridWorld needs at least one non-terminal state"
            )

    def step(self, state, action):
        """Apply ``action`` in ``state``.

        Returns:
            A ``(next_state, reward, done)`` tuple. From a terminal state the
            agent stays put with reward 0 and ``done=True``. An action value
            outside ``self.actions`` leaves the position unchanged but still
            costs -1.
        """
        if state in self.terminal_states:
            return state, 0, True

        row, col = divmod(state, self.grid_size)
        last = self.grid_size - 1

        # Move logic: clamp to the grid so bumping a wall is a no-op move.
        if action == 0:    # UP
            row = max(row - 1, 0)
        elif action == 1:  # DOWN
            row = min(row + 1, last)
        elif action == 2:  # LEFT
            col = max(col - 1, 0)
        elif action == 3:  # RIGHT
            col = min(col + 1, last)

        next_state = row * self.grid_size + col
        reward = -1
        done = next_state in self.terminal_states
        return next_state, reward, done

    def reset(self):
        """Return a uniformly random non-terminal start state."""
        num_states = self.grid_size * self.grid_size
        start_state = np.random.randint(0, num_states)
        while start_state in self.terminal_states:
            start_state = np.random.randint(0, num_states)
        return start_state

# --- 2. THE ALGORITHM: Q-Learning ---
def q_learning(num_episodes=5000, alpha=0.1, gamma=1.0, epsilon=0.1):
    """Train a tabular Q-learning agent on ``GridWorld``.

    Args:
        num_episodes: Number of training episodes to run.
        alpha: Learning rate applied to each temporal-difference error.
        gamma: Discount factor (1.0 because we want the shortest path).
        epsilon: Probability of taking a uniformly random action instead of
            the greedy one.

    Returns:
        The learned Q-table as a NumPy array with one row per state and one
        column per action.
    """
    env = GridWorld()

    # Size the Q-table from the environment rather than hard-coding 16 x 4.
    num_states = env.grid_size ** 2
    Q = np.zeros((num_states, len(env.actions)))

    print(f"Training with Q-Learning ({num_episodes} Episodes)...")

    for _ in range(num_episodes):
        state = env.reset()
        done = False

        while not done:
            # 1. Choose action (epsilon-greedy).
            if np.random.rand() < epsilon:
                action = np.random.choice(env.actions)  # Explore
            else:
                action = np.argmax(Q[state])  # Exploit

            # 2. Take action.
            next_state, reward, done = env.step(state, action)

            # 3. Update Q-value (off-policy): bootstrap from max(Q[next_state])
            # regardless of which action is actually taken next. A terminal
            # next state contributes no future value.
            best_next_action_val = 0 if done else np.max(Q[next_state])

            # Q(S,A) = Q(S,A) + alpha * [ R + gamma * max_a Q(S',a) - Q(S,A) ]
            td_target = reward + gamma * best_next_action_val
            Q[state, action] += alpha * (td_target - Q[state, action])

            state = next_state

    return Q

# --- 3. EXECUTION & RESULTS ---
def print_policy(Q):
    """Print the greedy policy encoded in ``Q`` as a 4x4 grid of arrows.

    States 0 and 15 are rendered as ``T``; every other state shows the arrow
    of its highest-valued action (``np.argmax`` over that state's row of Q).
    """
    arrow_for_action = {0: '↑', 1: '↓', 2: '←', 3: '→'}
    rule = "-" * 17

    print("\nFinal Optimal Policy (Q-Learning):")
    print(rule)

    # One three-character cell per state, in row-major order.
    cells = [
        " T " if s in (0, 15)
        else f" {arrow_for_action[int(np.argmax(Q[s]))]} "
        for s in range(16)
    ]

    for row_start in range(0, 16, 4):
        row_cells = cells[row_start:row_start + 4]
        print("|".join(row_cells))
        print(rule)

# Script entry point: train the agent, then print the greedy policy read
# off the learned Q-table.
if __name__ == "__main__":
    q_table = q_learning()
    print_policy(q_table)

Training with Q-Learning (5000 Episodes)...

Final Optimal Policy (Q-Learning):
-----------------
 T | ← | ← | ↓ 
-----------------
 ↑ | ← | ↓ | ↓ 
-----------------
 ↑ | → | → | ↓ 
-----------------
 ↑ | → | → | T 
-----------------


### Code Explanation

This code implements a Q-learning algorithm to find an optimal policy for navigating a 4x4 grid world. Let's break it down:

1.  **`GridWorld` Class**: This defines the environment. It's a 4x4 grid where `0` and `15` are terminal states (goals). It defines possible actions (UP, DOWN, LEFT, RIGHT) and a `step` function that determines the `next_state`, `reward` (-1 for each step, encouraging shortest paths), and whether the episode is `done`.

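2.  **`q_learning` Function**: This is the core reinforcement learning algorithm. It initializes a Q-table (state-action value table, 16 states × 4 actions) to zeros. It then runs for a fixed number of episodes (`num_episodes`), each starting from a random non-terminal state and ending when a terminal state is reached. At every step of an episode it performs: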
    *   **Epsilon-Greedy Action Selection**: In each step, it either explores (picks a random action with probability `epsilon`) or exploits (picks the action with the highest Q-value for the current state with probability `1 - epsilon`).
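    *   **Q-Value Update**: After taking an action and observing the `next_state` and `reward`, it updates the Q-value for the current `(state, action)` pair using the Q-learning temporal-difference rule, which is derived from the Bellman optimality equation: $Q(s,a) \leftarrow Q(s,a) + \alpha\,[\,r + \gamma \max_{a'} Q(s',a') - Q(s,a)\,]$. Here `alpha` is the learning rate and `gamma` is the discount factor; the $\max$ term is taken as 0 when `next_state` is terminal. Because the target uses the best next action rather than the action the agent actually takes next, the update is off-policy.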

3.  **`print_policy` Function**: After Q-learning is complete, this function takes the learned Q-table and visualizes the optimal policy. For each non-terminal state, it determines the action with the highest Q-value and prints it as an arrow, showing the agent's preferred move from that state.

In essence, the code trains an agent to find the shortest path from any non-terminal starting state to the nearest terminal state in the 4x4 grid.