In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# --- 1. ENVIRONMENT (4x4 Grid World) ---
class GridWorld:
    """Deterministic square grid world with a -1 reward on every move.

    States are integers numbered row-major from 0 to ``grid_size**2 - 1``.
    Actions are 0 (up), 1 (down), 2 (left) and 3 (right); a move that would
    leave the grid keeps the agent on the border cell.

    Args:
        grid_size: Side length of the square grid. Defaults to 4.
        terminal_states: Iterable of terminal state indices. Defaults to the
            first and last state (the top-left and bottom-right corners).

    Raises:
        ValueError: If ``grid_size`` is smaller than 1 or if every state is
            terminal (``reset`` could then never find a start state).
    """

    def __init__(self, grid_size=4, terminal_states=None):
        if grid_size < 1:
            raise ValueError("grid_size must be at least 1")

        self.grid_size = grid_size
        num_states = grid_size * grid_size

        if terminal_states is None:
            terminal_states = [0, num_states - 1]
        self.terminal_states = list(terminal_states)

        # reset() uses rejection sampling, so it needs at least one
        # non-terminal state to terminate.
        if all(s in self.terminal_states for s in range(num_states)):
            raise ValueError("the grid must contain a non-terminal state")

        self.actions = [0, 1, 2, 3]  # UP, DOWN, LEFT, RIGHT

    def step(self, state, action):
        """Apply ``action`` in ``state``.

        Args:
            state: Current state index.
            action: One of 0 (up), 1 (down), 2 (left), 3 (right). Any other
                value leaves the position unchanged but still costs -1.

        Returns:
            Tuple ``(next_state, reward, done)``. Stepping from a terminal
            state returns ``(state, 0, True)``.
        """
        if state in self.terminal_states:
            return state, 0, True

        last = self.grid_size - 1
        row, col = divmod(state, self.grid_size)
        if action == 0:
            row = max(row - 1, 0)
        elif action == 1:
            row = min(row + 1, last)
        elif action == 2:
            col = max(col - 1, 0)
        elif action == 3:
            col = min(col + 1, last)

        next_state = row * self.grid_size + col
        reward = -1
        done = next_state in self.terminal_states
        return next_state, reward, done

    def reset(self):
        """Return a uniformly random non-terminal start state."""
        # Derive the state count from grid_size instead of hard-coding 16 so
        # the environment stays consistent for any grid size.
        num_states = self.grid_size * self.grid_size
        start_state = np.random.randint(0, num_states)
        while start_state in self.terminal_states:
            start_state = np.random.randint(0, num_states)
        return start_state

# --- 2. NEURAL NETWORKS ---

class PolicyNetwork(nn.Module):
    """Two-layer MLP mapping a 16-dim state vector to 4 action probabilities."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(16, 128)
        self.fc2 = nn.Linear(128, 4)

    def forward(self, x):
        """Return softmax action probabilities over the last dimension of ``x``."""
        hidden = torch.relu(self.fc1(x))
        logits = self.fc2(hidden)
        # Softmax turns the logits into a probability distribution over actions.
        return torch.softmax(logits, dim=-1)

class ValueNetwork(nn.Module):
    """Two-layer MLP mapping a 16-dim state vector to one scalar value estimate."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(16, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        """Return the unbounded value estimate (no output activation)."""
        hidden = torch.relu(self.fc1(x))
        estimate = self.fc2(hidden)
        return estimate

def state_to_tensor(state):
    """Encode an integer state as a one-hot float tensor of shape (1, 16)."""
    one_hot = torch.zeros(1, 16)
    # Single batch row; the column selected by `state` marks the active cell.
    one_hot[0, state] = 1.0
    return one_hot

# --- 3. REINFORCE ALGORITHM ---
def _discounted_returns(rewards, gamma):
    """Return the discounted return G_t for every step of one episode."""
    returns = []
    running = 0.0
    # Accumulate backwards, then flip once: linear time instead of the
    # quadratic cost of inserting at the front of a list on every step.
    for reward in reversed(rewards):
        running = reward + gamma * running
        returns.append(running)
    returns.reverse()
    return returns


def train_reinforce_baseline(num_episodes=2000, gamma=0.99, lr=0.0005,
                             max_steps=100):
    """Train a policy on GridWorld with REINFORCE and a learned baseline.

    Each episode is rolled out with the current policy, its discounted
    returns are normalized, and then one gradient step is taken on the
    policy network (log-probability weighted by the advantage) and one on
    the value network (squared error against the normalized returns).

    Args:
        num_episodes: Number of training episodes.
        gamma: Discount factor used for the returns.
        lr: Learning rate of both Adam optimizers.
        max_steps: Safety cap; an episode is cut off once it has run for
            more than this many steps.

    Returns:
        The trained ``PolicyNetwork``.
    """
    env = GridWorld()

    policy_net = PolicyNetwork()
    value_net = ValueNetwork()

    policy_optimizer = optim.Adam(policy_net.parameters(), lr=lr)
    value_optimizer = optim.Adam(value_net.parameters(), lr=lr)

    print("Training REINFORCE with Baseline")

    for episode in range(num_episodes):
        state = env.reset()
        done = False

        log_probs = []
        values = []
        rewards = []

        # A. Generate one episode with the current stochastic policy.
        while not done:
            state_t = state_to_tensor(state)

            dist = torch.distributions.Categorical(policy_net(state_t))
            action = dist.sample()

            next_state, reward, done = env.step(state, action.item())

            log_probs.append(dist.log_prob(action))
            values.append(value_net(state_t))
            rewards.append(reward)

            state = next_state

            # Safety break if the agent wanders without terminating. The
            # returns below are then computed as if the episode ended here.
            if len(rewards) > max_steps:
                break

        # B. Discounted returns G_t for every step.
        returns = torch.tensor(_discounted_returns(rewards, gamma),
                               dtype=torch.float32)

        # C. Normalize returns. The std of a single element is NaN, so a
        # one-step episode is only mean-centered.
        if len(returns) > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-9)
        else:
            returns = returns - returns.mean()

        policy_optimizer.zero_grad()
        value_optimizer.zero_grad()

        # D. Losses, summed over the steps of the episode.
        if log_probs:
            step_log_probs = torch.stack(log_probs).view(-1)
            step_values = torch.stack(values).view(-1)

            # Detach the baseline so the policy loss does not backpropagate
            # into the value network.
            advantages = returns - step_values.detach()
            loss_p = -(step_log_probs * advantages).sum()

            # NOTE(review): the value net is regressed onto the per-episode
            # normalized returns, so it estimates normalized rather than raw
            # returns. Kept as in the original training recipe.
            loss_v = F.mse_loss(step_values, returns, reduction="sum")

            loss_p.backward()
            loss_v.backward()

            # Clip gradient norms to guard against exploding gradients.
            torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 1.0)
            torch.nn.utils.clip_grad_norm_(value_net.parameters(), 1.0)

            policy_optimizer.step()
            value_optimizer.step()

        if (episode + 1) % 500 == 0:
            print(f"Episode {episode + 1}/{num_episodes} completed.")

    return policy_net

# --- 4. VISUALIZE ---
if __name__ == "__main__":
    # Train, then print the greedy action of the learned policy per cell.
    trained_policy = train_reinforce_baseline()

    print("\nFinal Policy (REINFORCE):")
    actions_map = {0: '↑', 1: '↓', 2: '←', 3: '→'}

    output_grid = []
    with torch.no_grad():
        for cell in range(16):
            if cell in (0, 15):
                # Terminal cells have no action to show.
                label = " T "
            else:
                action_probs = trained_policy(state_to_tensor(cell))
                greedy = action_probs.argmax().item()
                label = f" {actions_map[greedy]} "
            output_grid.append(label)

    rule = "-" * 17
    print(rule)
    for row_start in range(0, 16, 4):
        row_cells = output_grid[row_start:row_start + 4]
        print("|".join(row_cells))
        print(rule)

Training REINFORCE with Baseline
Episode 500/2000 completed.
Episode 1000/2000 completed.
Episode 1500/2000 completed.
Episode 2000/2000 completed.

Final Policy (REINFORCE):
-----------------
 T | ← | ← | ↓ 
-----------------
 ↑ | ↑ | → | ↓ 
-----------------
 ↑ | → | → | ↓ 
-----------------
 → | → | → | T 
-----------------


This code implements the REINFORCE reinforcement learning algorithm, enhanced with a baseline, to train an agent to navigate a 4x4 GridWorld environment. Let's break down its key components:

1.  **Environment (GridWorld)**: This class defines a 4x4 grid. The agent can move Up, Down, Left, or Right. States 0 and 15 are terminal states. Each step incurs a reward of -1, encouraging the agent to reach a terminal state as quickly as possible. The `step` method calculates the next state, reward, and whether the episode is finished, while `reset` places the agent in a random non-terminal starting state.

2.  **Neural Networks**: Two separate neural networks are defined using PyTorch:
    *   **`PolicyNetwork`**: This network takes a state as input (represented as a 16-element one-hot encoded vector) and outputs a probability distribution over the four possible actions (Up, Down, Left, Right). It uses a ReLU activation in its hidden layer and a Softmax activation in its output layer to ensure the outputs are valid probabilities.
    *   **`ValueNetwork`**: This network also takes a state as input and outputs a single value, representing the estimated value (expected cumulative reward) of being in that state. It uses a ReLU activation in its hidden layer.
    *   **`state_to_tensor`**: A helper function to convert an integer state representation into a one-hot encoded PyTorch tensor suitable for network input.

3.  **REINFORCE Algorithm (`train_reinforce_baseline`)**:
    *   **Initialization**: Sets up the `GridWorld` environment, initializes both the `PolicyNetwork` and `ValueNetwork`, and configures their respective Adam optimizers with a learning rate of 0.0005. It also defines `num_episodes` (2000) and the discount factor `gamma` (0.99).
    *   **Episode Generation (A)**: For each episode, the agent interacts with the environment. It uses the `PolicyNetwork` to sample an action, takes a step in the environment, and records the log-probability of the chosen action (`dist.log_prob(action)`), the `value` predicted by the `ValueNetwork`, and the `reward` received. This continues until a terminal state is reached or a safety limit of 100 steps is exceeded.
    *   **Return Calculation (B)**: After an episode ends, the code calculates the discounted cumulative rewards (returns, `G`) for each step in the episode. This means `G_t` is the sum of future discounted rewards starting from time `t`.
    *   **Return Normalization (C)**: The calculated returns are then normalized (mean-subtracted and divided by standard deviation). For a single-step episode, where the standard deviation is undefined, only the mean is subtracted. This often helps stabilize training by making the target values for the networks more consistent.
    *   **Loss Calculation (D)**:
        *   **Advantage**: The `advantage` is calculated as the normalized return (`G_t`) minus the estimated value of the state (`value.item()`). Using `.item()` turns the estimate into a plain number, so the policy loss does not backpropagate into the value network. This term quantifies how much better or worse the outcome was compared to the network's prediction.
        *   **Policy Loss**: The policy network is updated using the REINFORCE objective with a baseline. The loss is calculated as `-log_prob * advantage`. If the advantage is positive (meaning the action led to a better-than-expected outcome), the probability of that action is increased. If the advantage is negative, the probability is decreased.
        *   **Value Loss**: The value network is updated using a Mean Squared Error (MSE) loss, comparing its predicted value for a state to the normalized return (`G_t`) for that state. This makes the value network a better predictor of the (normalized) return that follows each state, which is what the advantage is measured against.
    *   **Optimization**: After calculating losses for all steps in the episode, both policy and value losses are backpropagated, and their respective optimizers update the network weights. Gradient clipping is applied to prevent exploding gradients, which can be a common issue in reinforcement learning.

4.  **Visualization**: After training, the `if __name__ == "__main__":` block executes. It takes the `trained_policy` network and, for each non-terminal state in the grid, uses the policy to determine the action with the highest probability. Terminal states are shown as `T`. The best action for each state is then printed in a grid format, showing the greedy action the learned policy takes in every cell.