In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal

# --- 1. CONTINUOUS ENVIRONMENT (Simple 1D Target Seeking) ---
class ContinuousTargetEnv:
    def __init__(self):
        # State: Position on a line (Start at -2.0)
        # Goal: Reach 0.0
        self.state = np.array([-2.0], dtype=np.float32)
        self.max_steps = 200
        self.current_step = 0

    def reset(self):
        # Start at random position between -2 and -1
        self.state = np.array([np.random.uniform(-2, -1)], dtype=np.float32)
        self.current_step = 0
        return self.state

    def step(self, action):
        # Action is a continuous force [-1, 1]
        force = np.clip(action, -1.0, 1.0)

        # Dynamics: Position += Force * speed
        self.state[0] += force * 0.1

        # Calculate Reward (Negative distance to goal 0.0)
        dist = abs(self.state[0] - 0.0)
        reward = -dist

        # Check Done
        self.current_step += 1
        done = dist < 0.1 or self.current_step >= self.max_steps

        # Bonus reward for finishing
        if dist < 0.1:
            reward += 10.0

        return self.state, reward, done

# --- 2. ACTOR-CRITIC NETWORK ---
class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        # Common layer
        self.fc1 = nn.Linear(1, 128)

        # ACTOR HEAD (Outputs Mean `mu` and Std Dev `sigma`)
        # Used to create a Normal Distribution (Gaussian)
        self.mu_head = nn.Linear(128, 1)
        self.sigma_head = nn.Linear(128, 1)

        # CRITIC HEAD (Outputs Value V(s))
        self.value_head = nn.Linear(128, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))

        # Actor outputs
        mu = torch.tanh(self.mu_head(x)) # Output between -1 and 1
        sigma = F.softplus(self.sigma_head(x)) + 1e-5 # Always positive

        # Critic output
        value = self.value_head(x)

        return mu, sigma, value

# --- 3. A2C ALGORITHM (Continuous) ---
def train_a2c_continuous():
    env = ContinuousTargetEnv()
    model = ActorCritic()

    # We update both heads with one optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    episodes = 1000
    gamma = 0.99

    print("Training A2C for Continuous Control...")

    for episode in range(episodes):
        state = env.reset()
        done = False
        total_reward = 0

        while not done:
            state_t = torch.FloatTensor(state)

            # 1. Forward pass
            mu, sigma, value = model(state_t)

            # 2. Sample Continuous Action from Normal Distribution
            dist = Normal(mu, sigma)
            action = dist.sample()

            # Clip action to valid range for environment
            action_numpy = action.detach().numpy()[0]

            # 3. Take Step
            next_state, reward, done = env.step(action_numpy)
            total_reward += reward

            # 4. Calculate Target (TD Target)
            next_state_t = torch.FloatTensor(next_state)

            with torch.no_grad():
                _, _, next_value = model(next_state_t)
                # If done, next value is 0
                target_value = reward + (0 if done else gamma * next_value.item())

            # 5. Calculate Advantage
            # Advantage = Target - Current_Prediction
            advantage = target_value - value

            # 6. Calculate Losses

            # Critic Loss: MSE(Target, Predicted)
            critic_loss = advantage.pow(2)

            # Actor Loss: -log_prob * advantage
            # (We detach advantage so we don't backprop through critic here)
            log_prob = dist.log_prob(action)
            actor_loss = -log_prob * advantage.detach()

            # Total Loss
            loss = actor_loss + critic_loss

            # 7. Update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            state = next_state

        if (episode + 1) % 100 == 0:
            print(f"Episode {episode + 1}/episodes: Total Reward = {total_reward:.2f}")

    return model

# --- 4. TEST ---
if __name__ == "__main__":
    trained_model = train_a2c_continuous()

    print("\nTesting Trained Policy (Moving from -2.0 to 0.0):")
    env = ContinuousTargetEnv()
    state = env.reset()

    for i in range(10):
        state_t = torch.FloatTensor(state)
        with torch.no_grad():
            mu, sigma, _ = trained_model(state_t)
            # In testing, we just use the Mean (mu) - no randomness
            action = mu.item()

        next_state, _, done = env.step(action)
        print(f"Step {i+1}: Pos {state[0]:.2f} -> Action {action:.2f} -> New Pos {next_state[0]:.2f}")
        state = next_state
        if done: break

Training A2C for Continuous Control...
Episode 100/episodes: Total Reward = -7.67
Episode 200/episodes: Total Reward = 4.42
Episode 300/episodes: Total Reward = 1.45
Episode 400/episodes: Total Reward = 5.27
Episode 500/episodes: Total Reward = 1.33
Episode 600/episodes: Total Reward = 0.10
Episode 700/episodes: Total Reward = -6.03
Episode 800/episodes: Total Reward = -1.79
Episode 900/episodes: Total Reward = -0.53
Episode 1000/episodes: Total Reward = 3.38

Testing Trained Policy (Moving from -2.0 to 0.0):
Step 1: Pos -1.82 -> Action 1.00 -> New Pos -1.82
Step 2: Pos -1.72 -> Action 1.00 -> New Pos -1.72
Step 3: Pos -1.62 -> Action 1.00 -> New Pos -1.62
Step 4: Pos -1.52 -> Action 1.00 -> New Pos -1.52
Step 5: Pos -1.42 -> Action 1.00 -> New Pos -1.42
Step 6: Pos -1.32 -> Action 1.00 -> New Pos -1.32
Step 7: Pos -1.22 -> Action 1.00 -> New Pos -1.22
Step 8: Pos -1.12 -> Action 1.00 -> New Pos -1.12
Step 9: Pos -1.02 -> Action 1.00 -> New Pos -1.02
Step 10: Pos -0.92 -> Action 1.00 -

## Explanation of the A2C for Continuous Control Implementation

This notebook demonstrates an Actor-Critic (A2C) algorithm for a simple continuous control problem: an agent learning to move towards a target position on a 1D line.

### 1. ContinuousTargetEnv: The Environment

This class defines a simple 1D environment where an agent needs to reach a target position (0.0). It simulates the agent's movement based on continuous actions.

*   **State:** A single float representing the agent's position on a line.
*   **Action:** A continuous value between -1.0 and 1.0, representing a force applied to the agent.
*   **Reward:** The negative absolute distance to the target (0.0), with a bonus for reaching close to the target.
*   **Reset:** Initializes the agent's position randomly between -2 and -1.
*   **Step:** Takes an action, updates the agent's position, calculates the reward, and determines if the episode is `done`.

### 2. ActorCritic: The Neural Network Model

This `nn.Module` implements the Actor-Critic architecture. It shares a common initial layer and then branches into two heads:

*   **Actor Head:** Responsible for outputting the parameters of a continuous probability distribution (specifically, a Normal distribution) from which actions are sampled. It outputs:
    *   `mu` (mean): The average action the agent believes is optimal.
    *   `sigma` (standard deviation): The uncertainty or exploration radius around `mu`.
*   **Critic Head:** Responsible for estimating the **value function** V(s), which represents the expected future reward from a given state `s`.

### 3. train_a2c_continuous: The A2C Training Algorithm

This function implements the Asynchronous Advantage Actor-Critic (A2C) algorithm for continuous action spaces. A2C is a policy gradient method that aims to optimize the agent's policy (how it chooses actions) by using a critic to estimate the value of states.

**Key Concepts:**

*   **Policy Gradient:** Methods that directly optimize the policy by estimating the gradient of the expected return.
*   **Actor-Critic:** Combines two components:
    *   **Actor:** The policy network that selects actions.
    *   **Critic:** The value network that estimates the value function.
*   **Advantage:** The advantage function `A(s, a) = Q(s, a) - V(s)` measures how much better a particular action `a` is than the average action from state `s`. In A2C, it's often approximated as `R + gamma * V(s') - V(s)` (TD error).
*   **Continuous Actions:** Instead of outputting discrete probabilities for each action, the actor outputs parameters (mean and standard deviation) of a continuous probability distribution (e.g., Normal distribution), and actions are sampled from this distribution.

**Training Steps:**

1.  **Forward Pass:** The current state is fed into the `ActorCritic` model to get `mu`, `sigma` (from actor), and `value` (from critic).
2.  **Sample Action:** An action is sampled from the Normal distribution defined by `mu` and `sigma`.
3.  **Take Step:** The sampled action is executed in the environment, yielding `next_state`, `reward`, and `done` flag.
4.  **Calculate TD Target:** The target value for the critic is calculated using the Bellman equation: `Reward + gamma * Value(next_state)`. If the episode is done, `Value(next_state)` is 0.
5.  **Calculate Advantage:** The advantage is the difference between the TD target and the current critic's prediction (`target_value - value`).
6.  **Calculate Losses:**
    *   **Critic Loss:** Typically Mean Squared Error between the predicted `value` and the `target_value` (`advantage.pow(2)`).
    *   **Actor Loss:** Calculated as `-log_prob * advantage`. The `log_prob` encourages actions that lead to higher advantage, and the `advantage.detach()` ensures that the critic's gradient doesn't flow through the actor.
7.  **Update:** The total loss (actor loss + critic loss) is backpropagated, and the optimizer updates the model's weights.

### 4. Testing the Trained Policy

After training, the `if __name__ == '__main__':` block tests the performance of the learned policy.

*   During testing, actions are typically not sampled randomly but are deterministically chosen as the mean (`mu`) of the distribution predicted by the actor. This is because the agent should exploit what it has learned.
*   The agent starts at a random negative position and attempts to reach 0.0. The output shows the agent's position, the chosen action, and the new position over several steps.