In [6]:
import os
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

GAMMA = 0.99
LAMBDA = 0.95
CLIP_EPS = 0.2
EPOCHS = 30
BATCH_SIZE = 258
ACTOR_LR = 1e-4
CRITIC_LR = 1e-4
HIDDEN = 256
ENTROPY_COEF = 0.02
NUM_EPISODES = 2000
MAX_STEPS = 1000


# === Actor & Critic Networks ===
class Actor(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, HIDDEN), nn.ReLU(),
            nn.Linear(HIDDEN, HIDDEN), nn.ReLU(),
            nn.Linear(HIDDEN, act_dim), nn.Softmax(dim=-1)
        )

    def forward(self, x):
        return self.net(x)


class Critic(nn.Module):
    def __init__(self, obs_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, HIDDEN), nn.ReLU(),
            nn.Linear(HIDDEN, HIDDEN), nn.ReLU(),
            nn.Linear(HIDDEN, 1)
        )

    def forward(self, x):
        return self.net(x)


# === PPO Agent ===
class PPO:
    def __init__(self, obs_dim, act_dim):
        self.actor = Actor(obs_dim, act_dim)
        self.critic = Critic(obs_dim)
        self.opt_actor = optim.Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.opt_critic = optim.Adam(self.critic.parameters(), lr=CRITIC_LR)

    def get_action(self, state):
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        probs = self.actor(state)
        dist = Categorical(probs)
        a = dist.sample()
        return a.item(), dist.log_prob(a).detach(), dist.entropy().detach()

    def compute_adv(self, rewards, values, dones):
        advs, rets = [], []
        gae, next_val = 0.0, 0.0
        for r, v, d in zip(reversed(rewards), reversed(values), reversed(dones)):
            delta = r + GAMMA * next_val * (1 - d) - v
            gae = delta + GAMMA * LAMBDA * gae * (1 - d)
            advs.insert(0, gae)
            rets.insert(0, gae + v)
            next_val = v
        advs = torch.tensor(advs, dtype=torch.float32)
        rets = torch.tensor(rets, dtype=torch.float32)
        return advs, rets

    def update(self, states, actions, old_logps, advs, rets):
        states = torch.tensor(np.array(states), dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.int64)
        old_logps = torch.tensor(old_logps, dtype=torch.float32)
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        for _ in range(EPOCHS):
            for start in range(0, len(states), BATCH_SIZE):
                idx = slice(start, start + BATCH_SIZE)

                # New log-probs & entropy
                probs = self.actor(states[idx])
                dist = Categorical(probs)
                logps = dist.log_prob(actions[idx])
                entropy = dist.entropy().mean()

                # Early stop if KL explodes
                kl = (old_logps[idx] - logps).mean()
                if kl > 1.5 * CLIP_EPS:
                    break

                # PPO surrogate
                ratio = torch.exp(logps - old_logps[idx])
                s1 = ratio * advs[idx]
                s2 = torch.clamp(ratio, 1 - CLIP_EPS, 1 + CLIP_EPS) * advs[idx]
                loss_actor = -(torch.min(s1, s2).mean() + ENTROPY_COEF * entropy)

                # Critic loss
                vals = self.critic(states[idx]).squeeze()
                loss_critic = nn.MSELoss()(vals, rets[idx])

                # Step actor
                self.opt_actor.zero_grad()
                loss_actor.backward()
                torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5)
                self.opt_actor.step()

                # Step critic
                self.opt_critic.zero_grad()
                loss_critic.backward()
                torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
                self.opt_critic.step()


# === Potential-Based Shaping ===
def phi(state, alpha=10., beta=5.):
    pos, vel = state
    pos_feat = (pos + 1.2) / 1.8  # normalize position to [0,1]
    vel_feat = abs(vel)
    return alpha * pos_feat + beta * vel_feat


def get_shaped_reward(state, next_state, base_reward):
    return base_reward + (GAMMA * phi(next_state) - phi(state))


# === State Normalization ===
def normalize_state(s):
    pos, vel = s
    p = (pos - (-1.2)) / (0.6 - (-1.2)) * 2 - 1
    v = (vel - (-0.07)) / (0.07 - (-0.07)) * 2 - 1
    return np.array([p, v], dtype=np.float32)


# === Training Loop ===
def train():
    os.makedirs("models", exist_ok=True)
    env = gym.make("MountainCar-v0")
    agent = PPO(env.observation_space.shape[0], env.action_space.n)

    scores = deque(maxlen=100)
    best_avg = -float('inf')

    for ep in range(NUM_EPISODES):
        raw_s, _ = env.reset()
        s = normalize_state(raw_s)
        done = False

        states, actions, logps, values, rewards, dones = [], [], [], [], [], []
        total_base = 0.0

        for t in range(MAX_STEPS):
            a, lp, ent = agent.get_action(s)
            v = agent.critic(torch.tensor(s).unsqueeze(0)).item()

            raw_s2, base_r, done, _, _ = env.step(a)
            shaped_r = get_shaped_reward(raw_s, raw_s2, base_r)

            states.append(s)
            actions.append(a)
            logps.append(lp)
            values.append(v)
            rewards.append(shaped_r)
            dones.append(done)

            raw_s, s = raw_s2, normalize_state(raw_s2)
            total_base += base_r

            if done:
                break

        advs, rets = agent.compute_adv(rewards, values, dones)
        agent.update(states, actions, logps, advs, rets)

        scores.append(total_base)
        avg_score = np.mean(scores)

        print(f'\rEpisode {ep:4d}\tAvg Score (base): {avg_score:7.2f}', end='')
        if ep % 100 == 0:
            print()

        # save best
        if len(scores) == 100 and avg_score > best_avg:
            best_avg = avg_score
            torch.save(agent.actor.state_dict(), 'models/ppo_actor_best.pth')
            torch.save(agent.critic.state_dict(), 'models/ppo_critic_best.pth')

        # solved?
        if len(scores) == 100 and avg_score >= -110:
            print(f"\nSolved at episode {ep - 100}! Avg Score: {avg_score:.2f}")
            torch.save(agent.actor.state_dict(), 'models/ppo_actor_solved.pth')
            torch.save(agent.critic.state_dict(), 'models/ppo_critic_solved.pth')
            break

    # final save
    torch.save(agent.actor.state_dict(), 'models/ppo_actor_final.pth')
    torch.save(agent.critic.state_dict(), 'models/ppo_critic_final.pth')
    env.close()


if __name__ == "__main__":
    train()


Episode    0	Avg Score (base): -1000.00
Episode   40	Avg Score (base): -982.200

  return F.mse_loss(input, target, reduction=self.reduction)


Episode  100	Avg Score (base): -559.72
Episode  200	Avg Score (base): -142.72
Episode  300	Avg Score (base): -134.19
Episode  400	Avg Score (base): -130.38
Episode  500	Avg Score (base): -133.18
Episode  600	Avg Score (base): -137.17
Episode  700	Avg Score (base): -129.43
Episode  800	Avg Score (base): -130.22
Episode  900	Avg Score (base): -134.97
Episode 1000	Avg Score (base): -132.81
Episode 1100	Avg Score (base): -126.12
Episode 1200	Avg Score (base): -134.07
Episode 1300	Avg Score (base): -130.31
Episode 1400	Avg Score (base): -126.36
Episode 1500	Avg Score (base): -124.77
Episode 1600	Avg Score (base): -126.01
Episode 1700	Avg Score (base): -127.56
Episode 1800	Avg Score (base): -126.84
Episode 1900	Avg Score (base): -131.53
Episode 1999	Avg Score (base): -129.30

In [16]:
import numpy as np
import torch
import gymnasium as gym
from collections import deque


# Load your trained models
def evaluate(model_path_actor, model_path_critic, num_episodes=10, render=False):
    env = gym.make("MountainCar-v0")
    if render:
        env = gym.make("MountainCar-v0", render_mode="human")

    # Initialize agent with same architecture
    agent = PPO(env.observation_space.shape[0], env.action_space.n)
    agent.actor.load_state_dict(torch.load(model_path_actor))
    agent.critic.load_state_dict(torch.load(model_path_critic))
    agent.actor.eval()
    agent.critic.eval()

    success_count = 0
    total_rewards = []
    steps_to_success = []

    for ep in range(num_episodes):
        raw_s, _ = env.reset()
        s = normalize_state(raw_s)
        done = False
        total_reward = 0
        step_count = 0

        for t in range(1000):  # Max steps per episode
            if render:
                env.render()

            with torch.no_grad():
                a, _, _ = agent.get_action(s)

            raw_s2, r, done, _, _ = env.step(a)
            s = normalize_state(raw_s2)
            total_reward += r
            step_count += 1

            if done:
                # Check if the car reached the goal (position >= 0.5)
                if raw_s2[0] >= 0.5:
                    success_count += 1
                    steps_to_success.append(step_count)
                    print(f"Episode {ep + 1}: Success! Reached in {step_count} steps")
                else:
                    print(f"Episode {ep + 1}: Failed (position: {raw_s2[0]:.2f})")
                break

        total_rewards.append(total_reward)

    env.close()

    success_rate = success_count / num_episodes * 100
    avg_steps = np.mean(steps_to_success) if steps_to_success else 0

    print("\n=== Evaluation Results ===")
    print(f"Success rate: {success_rate:.1f}%")
    print(f"Average steps when successful: {avg_steps:.1f}")
    print(f"Average total reward: {np.mean(total_rewards):.2f}")

    return success_rate, avg_steps


# State normalization (same as in training)
def normalize_state(s):
    pos, vel = s
    p = (pos - (-1.2)) / (0.6 - (-1.2)) * 2 - 1
    v = (vel - (-0.07)) / (0.07 - (-0.07)) * 2 - 1
    return np.array([p, v], dtype=np.float32)


# PPO class (same as in training)
class PPO:
    def __init__(self, obs_dim, act_dim):
        self.actor = Actor(obs_dim, act_dim)
        self.critic = Critic(obs_dim)

    def get_action(self, state):
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        probs = self.actor(state)
        dist = Categorical(probs)
        a = dist.sample()
        return a.item(), dist.log_prob(a).detach(), dist.entropy().detach()


class Actor(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, 256), nn.ReLU(),
            nn.Linear(256, 256), nn.ReLU(),
            nn.Linear(256, act_dim), nn.Softmax(dim=-1)
        )

    def forward(self, x):
        return self.net(x)


class Critic(nn.Module):
    def __init__(self, obs_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, 256), nn.ReLU(),
            nn.Linear(256, 256), nn.ReLU(),
            nn.Linear(256, 1)
        )

    def forward(self, x):
        return self.net(x)


if __name__ == "__main__":
    # Evaluate the best model
    print("Evaluating best model...")
    evaluate('models/ppo_actor_best.pth', 'models/ppo_critic_best.pth', num_episodes=10)

    # Evaluate the solved model (if it exists)
    try:
        print("\nEvaluating solved model...")
        evaluate('models/ppo_actor_solved.pth', 'models/ppo_critic_solved.pth', num_episodes=10)
    except FileNotFoundError:
        print("Solved model not found - skipping")

    # Evaluate with rendering to visualize one episode
    print("\nVisualizing one episode...")
    evaluate('models/ppo_actor_best.pth', 'models/ppo_critic_best.pth', num_episodes=1, render=True)

Evaluating best model...
Episode 1: Success! Reached in 142 steps
Episode 2: Success! Reached in 110 steps
Episode 3: Success! Reached in 108 steps
Episode 4: Success! Reached in 112 steps
Episode 5: Success! Reached in 174 steps
Episode 6: Success! Reached in 160 steps
Episode 7: Success! Reached in 141 steps
Episode 8: Success! Reached in 112 steps
Episode 9: Success! Reached in 111 steps
Episode 10: Success! Reached in 154 steps

=== Evaluation Results ===
Success rate: 100.0%
Average steps when successful: 132.4
Average total reward: -132.40

Evaluating solved model...
Solved model not found - skipping

Visualizing one episode...
Episode 1: Success! Reached in 118 steps

=== Evaluation Results ===
Success rate: 100.0%
Average steps when successful: 118.0
Average total reward: -118.00
