In [None]:

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.normal import Normal

# Seed both RNG sources used in this notebook (PyTorch sampling, NumPy shuffling)
torch.manual_seed(0)
np.random.seed(0)

# Run on the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- PPO hyperparameters ---
GAMMA = 0.99               # discount factor used by compute_gae
LAMBDA = 0.95              # GAE decay used by compute_gae
CLIP_EPSILON = 0.1         # half-width of the probability-ratio clip in ppo_update
EPOCHS = 10                # optimisation passes over each collected rollout
BATCH_SIZE = 128           # minibatch size inside ppo_update
LEARNING_RATE = 3e-4       # Adam step size for both networks
MAX_TIMESTEPS = 1_000_000  # hard cap on environment steps in train()
REWARD_THRESHOLD = -200    # training stops once the recent average return exceeds this
N_EVAL_EPISODES = 10       # number of recent episodes averaged for the stopping check


# Running statistics for reward normalization
class RunningStat:
    """Running mean / standard deviation accumulated batch by batch.

    Attributes:
        mean:  running mean of every value seen so far (0 before any update).
        std:   running population standard deviation (1 before any update),
               floored at ``_EPS`` so ``normalize`` never divides by zero.
        count: number of values seen so far.
    """

    _EPS = 1e-8  # lower bound for ``std``

    def __init__(self):
        self.mean = 0
        self.std = 1
        self.count = 0

    def update(self, x):
        """Merge a batch of values into the running statistics.

        Uses the standard pairwise merge of two (count, mean, M2) summaries:
            M2 = M2_a + M2_b + delta^2 * n_a * n_b / (n_a + n_b)
        where M2 is the sum of squared deviations from the mean.

        Args:
            x: sequence (or array) of numbers; an empty batch is ignored.
        """
        batch = np.asarray(x, dtype=np.float64).ravel()
        batch_count = batch.size
        if batch_count == 0:
            # np.mean/np.std of an empty array are NaN and would poison the stats.
            return

        batch_mean = batch.mean()
        batch_var = batch.var()

        total = self.count + batch_count
        delta = batch_mean - self.mean

        # With count == 0 the placeholder std contributes nothing to M2.
        m2 = (self.std ** 2) * self.count \
            + batch_var * batch_count \
            + delta ** 2 * self.count * batch_count / total

        self.mean = self.mean + delta * batch_count / total
        self.std = max(float(np.sqrt(m2 / total)), self._EPS)
        self.count = total

    def normalize(self, x):
        """Return ``x`` shifted by the running mean and scaled by the running std."""
        return (x - self.mean) / self.std


# Policy Network
class PolicyNetwork(nn.Module):
    """Diagonal-Gaussian policy.

    A two-hidden-layer tanh MLP produces the action mean; a separate
    state-independent parameter ``log_std`` supplies the spread.
    """

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        hidden = 64
        layers = [
            nn.Linear(obs_dim, hidden),
            nn.Tanh(),
            nn.Linear(hidden, hidden),
            nn.Tanh(),
            nn.Linear(hidden, act_dim),
        ]
        self.net = nn.Sequential(*layers)
        # One learnable log-std per action dimension, initialised to 0.5
        self.log_std = nn.Parameter(torch.full((act_dim,), 0.5))

    def forward(self, state):
        """Return the ``Normal`` action distribution for ``state``."""
        loc = self.net(state)
        # The standard deviation is never allowed below 0.1
        scale = self.log_std.exp().clamp(min=0.1)
        return Normal(loc, scale)


# Value Network
class ValueNetwork(nn.Module):
    """State-value estimator: a two-hidden-layer tanh MLP with one output unit."""

    def __init__(self, obs_dim):
        super().__init__()
        hidden = 64
        self.net = nn.Sequential(*[
            nn.Linear(obs_dim, hidden),
            nn.Tanh(),
            nn.Linear(hidden, hidden),
            nn.Tanh(),
            nn.Linear(hidden, 1),
        ])

    def forward(self, state):
        """Return the network output for ``state`` (last dimension has size 1)."""
        estimate = self.net(state)
        return estimate


# Compute Generalized Advantage Estimation (GAE)
def compute_gae(rewards, values, next_value, dones, gamma=None, lam=None):
    """Compute Generalized Advantage Estimation over one rollout.

    Walks the rollout backwards, accumulating
        delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
        A_t     = delta_t + gamma * lam * (1 - done_t) * A_{t+1}

    Args:
        rewards:    per-step rewards.
        values:     per-step value estimates V(s_t), same length as ``rewards``.
        next_value: value estimate for the state following the last step.
        dones:      per-step flags; a truthy entry cuts both the bootstrap and
                    the advantage recursion at that step.
        gamma:      discount factor; defaults to the module-level ``GAMMA``.
        lam:        GAE decay; defaults to the module-level ``LAMBDA``.

    Returns:
        List of advantages, one per step, in time order.
    """
    gamma = GAMMA if gamma is None else gamma
    lam = LAMBDA if lam is None else lam

    advantages = []
    gae = 0
    for t in reversed(range(len(rewards))):
        # Explicit bool -> float mask so Python and NumPy booleans behave alike.
        not_done = 0.0 if dones[t] else 1.0
        delta = rewards[t] + gamma * next_value * not_done - values[t]
        gae = delta + gamma * lam * not_done * gae
        advantages.append(gae)
        next_value = values[t]

    # Built back-to-front with O(1) appends; flip once instead of insert(0, ...).
    advantages.reverse()
    return advantages


# PPO Update
def ppo_update(policy, value, optimizer_policy, optimizer_value, states, actions, old_log_probs, returns, advantages):
    """Run EPOCHS passes of clipped-surrogate PPO over one rollout.

    Args:
        policy:           network returning an action distribution for a batch of states.
        value:            network returning a value estimate for a batch of states.
        optimizer_policy: optimizer stepping ``policy``'s parameters.
        optimizer_value:  optimizer stepping ``value``'s parameters.
        states:           per-step observations.
        actions:          per-step actions whose log-probabilities are ``old_log_probs``.
        old_log_probs:    per-step log-probabilities recorded at collection time.
        returns:          per-step regression targets for the value network.
        advantages:       per-step advantage estimates.

    Both networks are updated in place; nothing is returned.
    """
    # Convert lists to NumPy arrays
    states = np.array(states)
    actions = np.array(actions)
    old_log_probs = np.array(old_log_probs)
    returns = np.array(returns)
    advantages = np.array(advantages)

    n_samples = len(states)
    if n_samples == 0:
        # Nothing to learn from; also avoids NaN statistics on empty arrays.
        return

    # Print statistics for debugging
    print("Advantages mean:", np.mean(advantages), "std:", np.std(advantages))
    print("Returns mean:", np.mean(returns), "std:", np.std(returns))
    print("Policy log_std:", policy.log_std.detach().cpu().numpy())

    # Move the whole rollout to the device once instead of once per minibatch.
    states_t = torch.as_tensor(states, dtype=torch.float32, device=device)
    actions_t = torch.as_tensor(actions, dtype=torch.float32, device=device)
    old_log_probs_t = torch.as_tensor(old_log_probs, dtype=torch.float32, device=device)
    returns_t = torch.as_tensor(returns, dtype=torch.float32, device=device)
    advantages_t = torch.as_tensor(advantages, dtype=torch.float32, device=device)

    for _ in range(EPOCHS):
        indices = np.random.permutation(n_samples)
        for start in range(0, n_samples, BATCH_SIZE):
            batch_idx = torch.as_tensor(indices[start:start + BATCH_SIZE], dtype=torch.long, device=device)
            batch_states = states_t[batch_idx]
            batch_actions = actions_t[batch_idx]
            batch_old_log_probs = old_log_probs_t[batch_idx]
            batch_returns = returns_t[batch_idx]
            batch_advantages = advantages_t[batch_idx]

            # Normalize advantages per minibatch. Tensor.std() of a single
            # element is NaN, so a one-sample batch is left unnormalized.
            if batch_advantages.numel() > 1:
                batch_advantages = (batch_advantages - batch_advantages.mean()) / (batch_advantages.std() + 1e-8)

            # Policy update (clipped surrogate objective)
            dist = policy(batch_states)
            new_log_probs = dist.log_prob(batch_actions).sum(dim=-1)
            ratio = torch.exp(new_log_probs - batch_old_log_probs)
            surr1 = ratio * batch_advantages
            surr2 = torch.clamp(ratio, 1 - CLIP_EPSILON, 1 + CLIP_EPSILON) * batch_advantages
            policy_loss = -torch.min(surr1, surr2).mean()

            optimizer_policy.zero_grad()
            policy_loss.backward()
            torch.nn.utils.clip_grad_norm_(policy.parameters(), max_norm=0.5)  # Gradient clipping
            optimizer_policy.step()

            # Value update (mean squared error against the returns).
            # squeeze(-1) drops only the trailing unit dimension, so a batch
            # of one keeps its batch axis.
            value_pred = value(batch_states).squeeze(-1)
            value_loss = (value_pred - batch_returns).pow(2).mean()

            optimizer_value.zero_grad()
            value_loss.backward()
            torch.nn.utils.clip_grad_norm_(value.parameters(), max_norm=0.5)  # Gradient clipping
            optimizer_value.step()


# Evaluate the policy
def evaluate_policy(env, policy, n_episodes, deterministic=False):
    """Roll out ``policy`` in ``env`` and return the mean undiscounted episode return.

    Args:
        env:           environment exposing the 5-tuple ``step`` / 2-tuple ``reset`` API.
        policy:        network returning an action distribution for a batch of states.
        n_episodes:    number of complete episodes to run.
        deterministic: when True act with the distribution mean; when False
                       (default, the original behaviour) sample an action.

    Returns:
        ``np.mean`` of the per-episode reward sums.
    """
    total_rewards = []
    for _ in range(n_episodes):
        state, _ = env.reset()
        done = False
        episode_reward = 0
        while not done:
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
            # Evaluation never backpropagates, so skip building the autograd graph.
            with torch.no_grad():
                dist = policy(state_tensor)
                action = dist.mean if deterministic else dist.sample()
                # Bounds are hard-coded to [-2, 2] (presumably the Pendulum-v1 torque range).
                action_clamped = torch.clamp(action, -2.0, 2.0)
            state, reward, terminated, truncated, _ = env.step(action_clamped.cpu().numpy()[0])
            done = terminated or truncated
            episode_reward += reward
        total_rewards.append(episode_reward)
    return np.mean(total_rewards)


# Main training loop
def train(seed=0):
    """Train PPO on Pendulum-v1 and save both networks to disk.

    Each outer iteration collects one rollout of at most ``rollout_steps``
    transitions (stopping early when the episode ends), computes GAE
    advantages and returns, and calls ``ppo_update``. Training stops when
    ``MAX_TIMESTEPS`` is reached or the mean return of the last
    ``N_EVAL_EPISODES`` training episodes exceeds ``REWARD_THRESHOLD``.

    Args:
        seed: seed passed to the first ``env.reset`` so the environment's
              randomness is reproducible alongside the torch/numpy seeds.

    Side effects:
        Writes ``ppo_policy.pth`` and ``ppo_value.pth`` in the working directory.
    """
    rollout_steps = 200  # Max transitions collected per update

    env = gym.make("Pendulum-v1")
    try:
        obs_dim = env.observation_space.shape[0]
        act_dim = env.action_space.shape[0]

        policy = PolicyNetwork(obs_dim, act_dim).to(device)
        value = ValueNetwork(obs_dim).to(device)
        optimizer_policy = optim.Adam(policy.parameters(), lr=LEARNING_RATE)
        optimizer_value = optim.Adam(value.parameters(), lr=LEARNING_RATE)

        reward_stat = RunningStat()
        state, _ = env.reset(seed=seed)
        total_steps = 0
        episode_rewards = []
        # Kept across rollouts so an episode spanning several rollouts is
        # still credited with its full return.
        episode_reward = 0

        while total_steps < MAX_TIMESTEPS:
            states, actions, rewards, values, old_log_probs, dones = [], [], [], [], [], []
            raw_rewards = []  # For running stat update
            bootstrap_state = state  # state whose value closes the GAE recursion

            # Collect trajectory
            for _ in range(rollout_steps):
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
                # Collection never backpropagates; gradients are rebuilt in ppo_update.
                with torch.no_grad():
                    dist = policy(state_tensor)
                    action = dist.sample()
                    log_prob = dist.log_prob(action).sum(dim=-1)
                    value_pred = value(state_tensor)
                # Only the environment sees the clamped action.
                action_clamped = torch.clamp(action, -2.0, 2.0)

                next_state, reward, terminated, truncated, _ = env.step(action_clamped.cpu().numpy()[0])
                done = terminated or truncated
                episode_reward += reward
                raw_rewards.append(reward)

                states.append(state)
                # Store the *unclamped* sample: old_log_probs was computed for it,
                # so the probability ratio in ppo_update must use the same action.
                actions.append(action.cpu().numpy()[0])
                rewards.append(reward_stat.normalize(reward))  # Normalize reward
                values.append(value_pred.item())
                old_log_probs.append(log_prob.item())
                # Only a true terminal state zeroes the bootstrap. A time-limit
                # truncation still has a successor state with non-zero value.
                dones.append(terminated)

                bootstrap_state = next_state
                state = next_state
                total_steps += 1

                if done:
                    state, _ = env.reset()
                    episode_rewards.append(episode_reward)
                    episode_reward = 0
                    break

            # Fold this rollout's raw rewards into the running statistics
            reward_stat.update(raw_rewards)

            # Compute returns and advantages, bootstrapping from the state that
            # actually followed the last transition (not the post-reset state).
            bootstrap_tensor = torch.tensor(bootstrap_state, dtype=torch.float32).unsqueeze(0).to(device)
            with torch.no_grad():
                next_value = value(bootstrap_tensor).item()
            advantages = compute_gae(rewards, values, next_value, dones)
            returns = [adv + val for adv, val in zip(advantages, values)]

            # Update policy and value networks
            ppo_update(policy, value, optimizer_policy, optimizer_value, states, actions, old_log_probs, returns,
                       advantages)

            # Evaluate and check stopping condition
            if len(episode_rewards) >= N_EVAL_EPISODES:
                avg_reward = np.mean(episode_rewards[-N_EVAL_EPISODES:])
                print(f"Steps: {total_steps}, Avg Reward: {avg_reward:.2f}")
                if avg_reward > REWARD_THRESHOLD:
                    print(f"Goal reached! Average reward {avg_reward:.2f} exceeds threshold {REWARD_THRESHOLD}")
                    break

        torch.save(policy.state_dict(), "ppo_policy.pth")
        torch.save(value.state_dict(), "ppo_value.pth")
        print("Models saved successfully!")
    finally:
        # Release the environment even if training raises.
        env.close()

if __name__ == "__main__":
    train()


In [None]:
import torch
import gym
import numpy as np

# Create environment first: the network dimensions are read from its spaces.
# (obs_dim / act_dim are locals of train(), so they do not exist at cell scope.)
env = gym.make("Pendulum-v1", render_mode="human")  # If you want to see
# env = gym.make("Pendulum-v1")  # No render if you just want numbers
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

# Load your policy network; map_location lets a GPU-saved checkpoint load on CPU
policy = PolicyNetwork(obs_dim, act_dim).to(device)
policy.load_state_dict(torch.load("ppo_policy.pth", map_location=device))
policy.eval()

# Set fixed seed for reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

N_EVAL_EPISODES = 10
episode_rewards = []

try:
    for episode in range(N_EVAL_EPISODES):
        state, _ = env.reset(seed=SEED + episode)  # different seed per episode for variety but still reproducible
        done = False
        total_reward = 0

        while not done:
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)

            with torch.no_grad():
                dist = policy(state_tensor)
                action = dist.mean  # <-- use mean action for deterministic behavior
                action_clamped = torch.clamp(action, -2.0, 2.0)

            next_state, reward, terminated, truncated, _ = env.step(action_clamped.squeeze(0).cpu().numpy())
            done = terminated or truncated
            total_reward += reward
            state = next_state

        episode_rewards.append(total_reward)
        print(f"Episode {episode + 1}: Reward = {total_reward:.2f}")
finally:
    # Close the render window even if an episode raises.
    env.close()

print("\n=== Final Evaluation ===")
print(f"Average Reward over {N_EVAL_EPISODES} episodes: {np.mean(episode_rewards):.2f}")
print(f"Reward Std Deviation: {np.std(episode_rewards):.2f}")
