In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# --- 1. ENVIRONMENT (4x4 Grid World) ---
class GridWorld:
    """Deterministic square grid world with two terminal corner cells.

    Cells are numbered row-major from 0 (top-left) to ``grid_size**2 - 1``
    (bottom-right). The first and the last cell are terminal. Every move
    made from a non-terminal cell costs a reward of -1.

    Args:
        grid_size: Side length of the square grid. Defaults to 4, giving
            the 16-state world with terminal states 0 and 15.

    Raises:
        ValueError: If ``grid_size`` is smaller than 2 (there would be no
            non-terminal cell to start an episode from).
    """

    def __init__(self, grid_size=4):
        if grid_size < 2:
            raise ValueError("grid_size must be at least 2")
        self.grid_size = grid_size
        # Total number of cells; derived so reset() never disagrees with
        # grid_size.
        self.num_states = grid_size * grid_size
        self.terminal_states = [0, self.num_states - 1]
        self.actions = [0, 1, 2, 3]  # UP, DOWN, LEFT, RIGHT

    def step(self, state, action):
        """Apply ``action`` in ``state`` and return the transition.

        Moves that would leave the grid keep the agent in place. An action
        outside 0..3 also leaves the position unchanged but still costs -1.

        Args:
            state: Current cell index.
            action: 0 = UP, 1 = DOWN, 2 = LEFT, 3 = RIGHT.

        Returns:
            A ``(next_state, reward, done)`` tuple. Stepping from a terminal
            state is a no-op that returns ``(state, 0, True)``.
        """
        if state in self.terminal_states:
            return state, 0, True

        row, col = divmod(state, self.grid_size)
        last = self.grid_size - 1
        if action == 0:
            row = max(row - 1, 0)
        elif action == 1:
            row = min(row + 1, last)
        elif action == 2:
            col = max(col - 1, 0)
        elif action == 3:
            col = min(col + 1, last)

        next_state = row * self.grid_size + col
        reward = -1
        done = next_state in self.terminal_states
        return next_state, reward, done

    def reset(self):
        """Return a uniformly random non-terminal start state."""
        # Rejection sampling: redraw until the cell is not terminal.
        while True:
            start_state = np.random.randint(0, self.num_states)
            if start_state not in self.terminal_states:
                return start_state

# --- 2. NEURAL NETWORKS ---
class PolicyNetwork(nn.Module):
    """Actor: maps a 16-dim state vector to probabilities over 4 actions."""

    def __init__(self):
        super().__init__()
        # One hidden layer of 128 units between the state and action sizes.
        self.fc1 = nn.Linear(in_features=16, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=4)

    def forward(self, x):
        """Return action probabilities (softmax over the last dimension)."""
        hidden = F.relu(self.fc1(x))
        logits = self.fc2(hidden)
        return F.softmax(logits, dim=-1)

class ValueNetwork(nn.Module):
    """Critic: maps a 16-dim state vector to a single scalar estimate."""

    def __init__(self):
        super().__init__()
        # One hidden layer of 128 units feeding a single linear output.
        self.fc1 = nn.Linear(in_features=16, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=1)

    def forward(self, x):
        """Return the unbounded scalar output for each input row."""
        hidden = F.relu(self.fc1(x))
        estimate = self.fc2(hidden)
        return estimate

def state_to_tensor(state):
    """Encode an integer state as a one-hot float tensor of shape (1, 16)."""
    # Allocate the batch dimension up front and set the single hot entry.
    one_hot = torch.zeros(1, 16)
    one_hot[0, state] = 1.0
    return one_hot

# --- 3. REINFORCE WITH ADVANTAGE ALGORITHM ---
def _discounted_returns(rewards, gamma):
    """Return a float32 tensor holding the discounted return G_t per step."""
    running = 0.0
    collected = []
    # Walk backwards so each G_t reuses G_{t+1}; append + reverse is O(n).
    for reward in reversed(rewards):
        running = reward + gamma * running
        collected.append(running)
    collected.reverse()
    return torch.tensor(collected, dtype=torch.float32)


def train_reinforce_advantage(*, num_episodes=2000, gamma=0.99, lr=0.0005,
                              max_steps=100):
    """Train a policy on GridWorld with REINFORCE and a learned baseline.

    Each episode is rolled out with the current stochastic policy, the
    Monte Carlo returns G_t are computed, and two independent updates are
    applied:

    * Critic: regress V(s_t) onto the raw discounted return G_t.
    * Actor: minimise ``-log pi(a_t | s_t) * A_t`` with
      ``A_t = G_t - V(s_t)``, where V is treated as a constant.

    Args:
        num_episodes: Number of training episodes.
        gamma: Discount factor applied to future rewards.
        lr: Learning rate used by both Adam optimizers.
        max_steps: Maximum number of environment steps per episode; an
            episode that has not terminated by then is cut off.

    Returns:
        The trained ``PolicyNetwork``.

    Raises:
        ValueError: If ``max_steps`` is smaller than 1.
    """
    if max_steps < 1:
        raise ValueError("max_steps must be at least 1")

    env = GridWorld()

    # Actor (policy) and critic (value) are separate networks with
    # separate optimizers, so their losses never share parameters.
    policy_net = PolicyNetwork()
    value_net = ValueNetwork()
    policy_optimizer = optim.Adam(policy_net.parameters(), lr=lr)
    value_optimizer = optim.Adam(value_net.parameters(), lr=lr)

    print("Training REINFORCE using Advantage Function...")

    for episode in range(num_episodes):
        state = env.reset()
        done = False

        log_probs = []
        values = []
        rewards = []

        # --- A. Collect one trajectory (Monte Carlo rollout) ---
        while not done and len(rewards) < max_steps:
            state_t = state_to_tensor(state)

            probs = policy_net(state_t)
            value = value_net(state_t)

            dist = torch.distributions.Categorical(probs)
            action = dist.sample()

            state, reward, done = env.step(state, action.item())

            log_probs.append(dist.log_prob(action))
            values.append(value)
            rewards.append(reward)

        # --- B. Discounted returns G_t ---
        # A cut-off episode is treated as if nothing followed the last step.
        returns = _discounted_returns(rewards, gamma)

        # Flatten the per-step tensors into vectors aligned with `returns`.
        log_probs_t = torch.cat(log_probs).view(-1)
        values_t = torch.cat(values).view(-1)

        # --- C. Advantage and losses ---
        # detach(): the baseline is a constant for the actor, so the policy
        # loss must not push gradients into the value network.
        advantages = returns - values_t.detach()

        # Standardize the advantages (not the returns) so the critic keeps
        # learning the true return scale while the actor sees a bounded
        # signal. std() is undefined for a single sample, so skip it there.
        if advantages.numel() > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        loss_p = -(log_probs_t * advantages).sum()
        loss_v = F.mse_loss(values_t, returns, reduction="sum")

        # --- D. Backpropagation ---
        policy_optimizer.zero_grad()
        value_optimizer.zero_grad()

        loss_p.backward()
        loss_v.backward()

        # Gradient clipping guards against exploding gradients / NaNs.
        torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 1.0)
        torch.nn.utils.clip_grad_norm_(value_net.parameters(), 1.0)

        policy_optimizer.step()
        value_optimizer.step()

        if (episode + 1) % 500 == 0:
            print(f"Episode {episode + 1}/{num_episodes} completed.")

    return policy_net

# --- 4. VISUALIZE ---
if __name__ == "__main__":
    # Train, then render the greedy action of every cell as a 4x4 grid.
    trained_policy = train_reinforce_advantage()

    print("\nFinal Policy (Advantage Method):")
    actions_map = {0: '↑', 1: '↓', 2: '←', 3: '→'}

    cells = []
    # Inference only: no gradients are needed while reading the policy.
    with torch.no_grad():
        for cell in range(16):
            if cell in (0, 15):
                # Terminal corners have no action to show.
                cells.append(" T ")
            else:
                action_probs = trained_policy(state_to_tensor(cell))
                greedy = action_probs.argmax().item()
                cells.append(f" {actions_map[greedy]} ")

    rule = "-" * 17
    print(rule)
    for row_start in range(0, 16, 4):
        print("|".join(cells[row_start:row_start + 4]))
        print(rule)

Training REINFORCE using Advantage Function...
Episode 500/2000 completed.
Episode 1000/2000 completed.
Episode 1500/2000 completed.
Episode 2000/2000 completed.

Final Policy (Advantage Method):
-----------------
 T | ← | ← | ← 
-----------------
 ↑ | ↑ | ← | ↓ 
-----------------
 ↑ | → | → | ↓ 
-----------------
 ↑ | → | → | T 
-----------------


### Explanation of the REINFORCE with Advantage Algorithm

This notebook implements the REINFORCE algorithm with an Advantage Function, a type of policy gradient method in Reinforcement Learning. It aims to train an agent to navigate a simple 4x4 GridWorld environment. The agent learns an optimal policy (how to act) and a value function (how good a state is) using neural networks.

#### 1. Environment: GridWorld

The `GridWorld` class defines the environment where our agent will operate. It's a 4x4 grid.

-   **States:** There are 16 possible states (0 to 15), representing each cell in the grid.
-   **Terminal States:** States `0` (top-left) and `15` (bottom-right) are terminal states. Reaching them ends an episode.
-   **Actions:** The agent can take four actions: UP (0), DOWN (1), LEFT (2), RIGHT (3).
-   **`step(state, action)`:** This method simulates taking an action from a given state, returning the `next_state`, `reward` (-1 for each step, encouraging shortest paths), and `done` status.
-   **`reset()`:** Starts a new episode from a random non-terminal state.

#### 2. Neural Networks

Two neural networks are defined, representing the **Actor** (Policy) and the **Critic** (Value) components of the algorithm.

-   **`PolicyNetwork(nn.Module)` (Actor):**
    -   Takes a 16-dimensional one-hot encoded state vector as input.
    -   Outputs a probability distribution over the 4 possible actions using a `softmax` activation in the final layer.
    -   The agent uses this network to decide which action to take in a given state.

-   **`ValueNetwork(nn.Module)` (Critic):**
    -   Also takes a 16-dimensional one-hot encoded state vector as input.
    -   Outputs a single scalar value, which is its estimate of the *expected return* (total future reward) from that state.
    -   This network helps to evaluate the quality of actions taken by the policy network.

-   **`state_to_tensor(state)`:** A utility function to convert an integer state representation into a one-hot encoded PyTorch tensor, suitable for feeding into the neural networks.

#### 3. REINFORCE with Advantage Algorithm

The `train_reinforce_advantage()` function orchestrates the training process:

-   **Initialization:**
    -   Creates instances of `GridWorld`, `PolicyNetwork`, and `ValueNetwork`.
    -   Sets up `Adam` optimizers for both networks with a small learning rate (`0.0005`) for stability.
    -   Defines `num_episodes` (how many training runs) and `gamma` (discount factor for future rewards).

-   **Episode Loop:** The training runs for a specified number of `num_episodes`.
    -   For each episode, the environment is `reset()` to a starting state.
    -   **A. Collect Trajectory (Monte Carlo):**
        -   The agent interacts with the environment until a terminal state is reached or a safety break is triggered.
        -   In each step:
            1.  The current `state` is converted to a tensor.
            2.  The `policy_net` predicts action probabilities, and the `value_net` estimates the state's value.
            3.  An `action` is **sampled** from the policy's probability distribution (stochastic policy).
            4.  The `env.step()` method is called to get the `next_state`, `reward`, and `done` status.
            5.  The `log_prob` of the taken action, the `value` estimate, and the `reward` are stored.

    -   **B. Calculate Returns (G_t):**
        -   After an episode ends, the total discounted return (`G`) is calculated for each step in the collected trajectory.
        -   `G_t` is the sum of discounted future rewards from time step `t` onwards. This is a Monte Carlo approach as it waits until the end of the episode to calculate returns.
        -   The `returns` are then **normalized** to improve training stability.

    -   **C. Calculate Advantage & Update:**
        -   For each step in the trajectory, the **Advantage Function** `A(s,a) = G_t - V(s)` is calculated.
            -   `G_t`: The actual return observed from that state.
            -   `V(s)`: The value estimated by the `ValueNetwork` for that state.
            -   The advantage tells us how much *better* or *worse* the observed return `G_t` was compared to what the critic *predicted* `V(s)`.
        -   **Policy Update (Actor):**
            -   The policy loss is `-log_prob * advantage`. If the advantage is positive (meaning the action led to better-than-expected returns), the `log_prob` (and thus the probability) of that action is increased.
            -   The critic's estimate `V(s)` enters this loss as a plain constant: it is taken out of the autograd graph, so the policy loss never sends gradients into the value network.
        -   **Value Update (Critic):**
            -   The value loss is calculated using Mean Squared Error (`F.mse_loss`) between the `value` predicted by the `ValueNetwork` and the actual observed `G_t` (target).
            -   This teaches the value network to more accurately predict the expected future returns.

    -   **Backpropagation:**
        -   Gradients are cleared for both optimizers.
        -   The `policy_loss` and `value_loss` are summed and `backward()` is called to compute gradients.
        -   **Gradient Clipping:** `torch.nn.utils.clip_grad_norm_` is applied to prevent exploding gradients.
        -   Optimizers take a `step()` to update the network weights.

-   Training progress is printed every 500 episodes.

#### 4. Visualization

After training, the `if __name__ == "__main__":` block executes:

-   It calls `train_reinforce_advantage()` to get the `trained_policy` network.
-   It then iterates through all 16 states of the grid.
-   For each non-terminal state, it feeds the state to the `trained_policy` network (using `torch.no_grad()` to disable gradient calculations as we are only inferring).
-   It identifies the action with the highest probability (`torch.argmax`) as the `best_a`.
-   Finally, it prints a formatted 4x4 grid where 'T' denotes terminal states and arrows (↑, ↓, ←, →) indicate the greedy (highest-probability) action the trained policy would take in that state.