In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

# --- 1. ENVIRONMENT ---
class GridWorld:
    """Deterministic square grid world with two terminal corner cells.

    States are the integers ``0 .. grid_size**2 - 1`` laid out in row-major
    order. The first and last state (top-left and bottom-right corners) are
    terminal. Every move made from a non-terminal state yields a reward of -1.

    Args:
        grid_size: Number of rows (and columns) of the grid. Defaults to 4.

    Raises:
        ValueError: If ``grid_size`` is smaller than 2, which would leave no
            non-terminal state to start an episode from.
    """

    def __init__(self, grid_size=4):
        if grid_size < 2:
            raise ValueError("grid_size must be at least 2")
        self.grid_size = grid_size
        self.num_states = grid_size * grid_size
        self.terminal_states = [0, self.num_states - 1]
        self.actions = [0, 1, 2, 3]  # UP, DOWN, LEFT, RIGHT

    def step(self, state, action):
        """Apply ``action`` in ``state`` and return the resulting transition.

        Moves that would leave the grid keep the agent on the border cell.
        An action outside ``self.actions`` leaves the position unchanged but
        is still charged the -1 step reward.

        Args:
            state: Current state index.
            action: One of 0 (up), 1 (down), 2 (left), 3 (right).

        Returns:
            Tuple ``(next_state, reward, done)``. Stepping from a terminal
            state returns ``(state, 0, True)``.
        """
        if state in self.terminal_states:
            return state, 0, True

        row, col = divmod(state, self.grid_size)
        last = self.grid_size - 1

        if action == 0:
            row = max(row - 1, 0)
        elif action == 1:
            row = min(row + 1, last)
        elif action == 2:
            col = max(col - 1, 0)
        elif action == 3:
            col = min(col + 1, last)

        next_state = row * self.grid_size + col
        reward = -1
        done = next_state in self.terminal_states
        return next_state, reward, done

    def reset(self):
        """Return a uniformly random non-terminal start state."""
        # Rejection sampling: redraw until the draw is not a terminal cell.
        start_state = np.random.randint(0, self.num_states)
        while start_state in self.terminal_states:
            start_state = np.random.randint(0, self.num_states)
        return start_state

# --- 2. THE NEURAL NETWORK ---
class QNetwork(nn.Module):
    """Fully connected network mapping a one-hot state to per-action Q-values.

    Architecture: ``state_dim -> hidden_sizes[0] -> hidden_sizes[1] ->
    n_actions`` with a ReLU after each of the two hidden layers and a linear
    output layer.

    Args:
        state_dim: Length of the input vector (16 for the 4x4 one-hot state).
        n_actions: Number of Q-values produced (4: up, down, left, right).
        hidden_sizes: Widths of the two hidden layers. Defaults to (128, 64).
    """

    def __init__(self, state_dim=16, n_actions=4, hidden_sizes=(128, 64)):
        super().__init__()
        first_hidden, second_hidden = hidden_sizes
        # Layers are created in fc1, fc2, fc3 order so that seeded weight
        # initialisation matches the fixed-size version of this network.
        self.fc1 = nn.Linear(state_dim, first_hidden)
        self.fc2 = nn.Linear(first_hidden, second_hidden)
        self.fc3 = nn.Linear(second_hidden, n_actions)

    def forward(self, x):
        """Return Q-values of shape ``(batch, n_actions)`` for input ``x``.

        Args:
            x: Float tensor whose last dimension is ``state_dim``.
        """
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        # No activation on the output: Q-values are unbounded real numbers.
        return self.fc3(x)

# --- 3. HELPER: ONE-HOT ENCODING ---
def state_to_tensor(state, num_states=16):
    """Encode an integer state as a one-hot row vector.

    Args:
        state: State index in ``[0, num_states)``.
        num_states: Length of the one-hot vector. Defaults to 16 (4x4 grid).

    Returns:
        Float tensor of shape ``(1, num_states)`` with a single 1.0 at
        position ``state`` (e.g. state 5 -> ``[0,0,0,0,0,1,0,...]``).

    Raises:
        IndexError: If ``state`` is outside ``[0, num_states)``.
    """
    # Reject negatives explicitly: tensor indexing would otherwise wrap
    # around and silently mark the wrong cell.
    if not 0 <= state < num_states:
        raise IndexError(
            f"state {state} is out of range for {num_states} states"
        )
    v = torch.zeros(num_states)
    v[state] = 1.0
    return v.unsqueeze(0)  # Add batch dimension

# --- 4. THE ALGORITHM: DQN Training ---
def _replay_update(policy_net, optimizer, criterion, minibatch, gamma):
    """Run one gradient step on a minibatch of stored transitions.

    Args:
        policy_net: Network being trained; also used to bootstrap the targets
            (there is no separate target network).
        optimizer: Optimizer bound to ``policy_net``'s parameters.
        criterion: Loss comparing predicted and target Q-value matrices.
        minibatch: Sequence of ``(state, action, reward, next_state, done)``.
        gamma: Discount factor applied to the bootstrapped next-state value.
    """
    states, actions, rewards, next_states, dones = zip(*minibatch)

    states_b = torch.cat([state_to_tensor(s) for s in states])
    next_states_b = torch.cat([state_to_tensor(s) for s in next_states])
    actions_b = torch.tensor(actions, dtype=torch.long)
    rewards_b = torch.tensor(rewards, dtype=torch.float32)
    # 1.0 where the episode continues, 0.0 where the transition was terminal.
    not_done_b = torch.tensor([not d for d in dones], dtype=torch.float32)

    # Current Q-value estimates for every action in each sampled state.
    q_preds = policy_net(states_b)

    # Targets are constants with respect to the loss, so build them without
    # tracking gradients.
    with torch.no_grad():
        max_next_q = policy_net(next_states_b).max(dim=1).values
        # Bellman target: R + gamma * max_a' Q(s', a'); just R when terminal.
        td_targets = rewards_b + gamma * max_next_q * not_done_b
        # Start from the predictions so that only the taken action's entry
        # differs and contributes to the loss.
        target_q_values = q_preds.detach().clone()
        rows = torch.arange(len(minibatch))
        target_q_values[rows, actions_b] = td_targets

    loss = criterion(q_preds, target_q_values)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


def train_dqn(
    *,
    episodes=1000,
    gamma=0.99,
    epsilon=1.0,
    epsilon_decay=0.995,
    epsilon_min=0.1,
    learning_rate=0.001,
    batch_size=32,
    memory_size=2000,
):
    """Train a Q-network on ``GridWorld`` with epsilon-greedy DQN.

    Each environment step stores the transition in a bounded replay buffer
    and, once the buffer holds more than ``batch_size`` transitions, performs
    one gradient step on a uniformly sampled minibatch.

    Args:
        episodes: Number of training episodes.
        gamma: Discount factor for future rewards.
        epsilon: Initial probability of taking a random action.
        epsilon_decay: Multiplicative decay applied to epsilon per episode.
        epsilon_min: Lower bound that epsilon is decayed towards.
        learning_rate: Adam learning rate.
        batch_size: Number of transitions per replay minibatch.
        memory_size: Maximum number of transitions kept in the replay buffer.

    Returns:
        The trained ``QNetwork``.

    Raises:
        ValueError: If ``batch_size`` is not smaller than ``memory_size``;
            the buffer could then never exceed the batch size and no
            training step would ever run.
    """
    if batch_size >= memory_size:
        raise ValueError("batch_size must be smaller than memory_size")

    env = GridWorld()

    # Initialize network, optimizer and loss
    policy_net = QNetwork()
    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    # Replay buffer: oldest transitions are dropped once it is full.
    memory = deque(maxlen=memory_size)

    print("Training DQN (this may take a moment)...")

    for episode in range(episodes):
        state = env.reset()
        done = False

        while not done:
            # A. Select action (epsilon-greedy)
            if random.random() < epsilon:
                action = random.choice(env.actions)
            else:
                with torch.no_grad():
                    q_values = policy_net(state_to_tensor(state))
                    action = torch.argmax(q_values).item()

            # B. Step the environment
            next_state, reward, done = env.step(state, action)

            # C. Store the transition
            memory.append((state, action, reward, next_state, done))
            state = next_state

            # D. Train on a random minibatch (experience replay)
            if len(memory) > batch_size:
                minibatch = random.sample(memory, batch_size)
                _replay_update(policy_net, optimizer, criterion, minibatch, gamma)

        # Decay epsilon, clamping so it never drops below the floor.
        if epsilon > epsilon_min:
            epsilon = max(epsilon_min, epsilon * epsilon_decay)

        if (episode + 1) % 200 == 0:
            print(f"Episode {episode+1}/{episodes} completed.")

    return policy_net

# --- 5. TEST THE TRAINED MODEL ---
if __name__ == "__main__":
    # Train the agent, then print the greedy action of the learned Q-network
    # for every non-terminal cell as an ASCII grid ('T' marks terminal cells).
    trained_model = train_dqn()

    env = GridWorld()
    side = env.grid_size
    # Each cell is 3 characters wide and cells are joined by '|'; the rule is
    # 17 characters for the 4x4 grid.
    rule = "-" * (4 * side + 1)
    actions_map = {0: '↑', 1: '↓', 2: '←', 3: '→'}

    print("\nVisualizing DQN Policy:")
    print(rule)

    output_grid = []
    for s in range(side * side):
        # Terminal cells have no action to display.
        if s in env.terminal_states:
            output_grid.append(" T ")
            continue

        with torch.no_grad():
            q = trained_model(state_to_tensor(s))
            best_a = torch.argmax(q).item()
        output_grid.append(f" {actions_map[best_a]} ")

    # Print one grid row per line, followed by a horizontal rule.
    for i in range(0, side * side, side):
        print("|".join(output_grid[i:i + side]))
        print(rule)

Training DQN (this may take a moment)...
Episode 200/1000 completed.
Episode 400/1000 completed.
Episode 600/1000 completed.
Episode 800/1000 completed.
Episode 1000/1000 completed.

Visualizing DQN Policy:
-----------------
 T | ← | ← | ← 
-----------------
 ↑ | ← | ↓ | ↓ 
-----------------
 ↑ | ↓ | ↓ | ↓ 
-----------------
 ↑ | → | → | T 
-----------------


This code implements a Deep Q-Network (DQN) to solve a simple GridWorld environment. Let's break it down section by section:

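1.  **GridWorld Environment (`GridWorld` class):** This class defines the environment: a 4x4 grid whose 16 cells are numbered 0–15 in row-major order. States `0` and `15` (the top-left and bottom-right corners) are terminal (end states). Actions `0, 1, 2, 3` correspond to Up, Down, Left, Right; a move that would leave the grid keeps the agent in place. The `step` method takes a state and an action and returns the next state, a reward, and whether the episode is done. The reward is -1 for every move made from a non-terminal state, which encourages shorter paths; calling `step` on a terminal state returns the same state with reward 0. The `reset` method puts the agent in a random non-terminal starting state.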

2.  **The Neural Network (`QNetwork` class):** This is a simple feedforward neural network built with PyTorch. It takes a one-hot encoded state (16 inputs for a 4x4 grid), passes it through two hidden layers (128 and 64 units, each followed by a ReLU), and outputs 4 Q-values, one for each possible action (Up, Down, Left, Right). Each Q-value estimates the expected discounted return (the discounted sum of future rewards) for taking that action in the given state.

3.  **One-Hot Encoding Helper (`state_to_tensor` function):** This function converts an integer state (e.g., state 5) into a one-hot vector (a tensor with a 1 at index 5 and 0s everywhere else). This format is suitable for input to the neural network.

4.  **DQN Training Algorithm (`train_dqn` function):** This is the core of the reinforcement learning agent:
    *   **Hyperparameters:** Defines settings like `episodes` (how many training runs), `gamma` (discount factor for future rewards), `epsilon` (for exploration-exploitation trade-off), `learning_rate`, and `batch_size` (for experience replay).
    *   **Network Initialization:** Creates the `policy_net` (the Q-network), an `optimizer` (Adam) to update its weights, and a `criterion` (MSELoss) to measure prediction error.
    *   **Replay Buffer (`deque`):** A memory where the agent stores its experiences (state, action, reward, next_state, done). This allows the agent to learn from past interactions in a more stable way by sampling random batches.
    *   **Training Loop:** Iterates through many episodes:
        *   **Epsilon-Greedy Action Selection:** The agent chooses an action. With probability `epsilon`, it takes a random action (exploration). Otherwise, it chooses the action with the highest Q-value predicted by the `policy_net` (exploitation).
        *   **Environment Step:** The agent takes the chosen action in the environment, getting a `next_state`, `reward`, and `done` flag.
        *   **Store in Memory:** The experience tuple is added to the `memory` buffer.
        *   **Experience Replay (Training):** Once enough experiences are in memory, a random `minibatch` is sampled. The network is then trained:
            *   It predicts Q-values for the states in the `minibatch` (`q_preds`).
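            *   It calculates `target_q_values` using the Bellman equation: reward + `gamma` × the maximum Q-value of the next state, or just the reward when the transition ends the episode. Only the entry for the action that was actually taken is replaced; the other entries keep the network's own predictions, so they contribute nothing to the loss. This target is the value the network is trained to predict. Note that the next-state Q-values come from the same `policy_net` (there is no separate target network).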
            *   The `MSELoss` between `q_preds` and `target_q_values` is calculated, and the network's weights are updated via backpropagation and the optimizer.
        *   **Epsilon Decay:** `epsilon` gradually decreases over episodes, making the agent explore less and exploit more as it learns.

5.  **Test the Trained Model (`if __name__ == "__main__":` block):** After training, this section displays the learned policy. For each non-terminal state in the grid, it uses the `trained_model` to pick the greedy action (the one with the highest predicted Q-value) and then prints a grid showing that action for each state (e.g., '↑', '↓', '←', '→'). 'T' denotes a terminal state. Because the policy is derived from an approximate Q-function, it reflects what the network has learned and is not guaranteed to be optimal in every cell.

In essence, this code demonstrates how a neural network can learn to navigate an environment by approximating the optimal action-value function through trial and error, guided by a replay buffer and an epsilon-greedy exploration strategy.