In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.distributions import Categorical

# ---- PPO hyperparameters -------------------------------------------------

# Network and optimiser settings.
HIDDEN = 128       # width of every hidden layer in Actor and Critic
ACTOR_LR = 3e-4    # Adam learning rate for the policy network
CRITIC_LR = 1e-3   # Adam learning rate for the value network

# Return / advantage estimation.
GAMMA = 0.99       # reward discount factor
LAMBDA = 0.95      # GAE smoothing factor applied together with GAMMA

# Policy update loop.
CLIP_EPS = 0.2     # probability ratio is clamped to [1 - eps, 1 + eps]
EPOCHS = 10        # passes over the collected rollout per update
BATCH_SIZE = 64    # minibatch size within each pass

class Actor(nn.Module):
    """Policy network: an MLP that maps an observation to action probabilities.

    The final layer is a softmax over the last dimension, so the output of
    `forward` sums to one along that dimension.
    """

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        # Two hidden Linear+ReLU stages followed by the softmax output head.
        # Layers are created in the same order as before so that parameter
        # names (net.0, net.2, net.4) and initialisation order are unchanged.
        widths = [obs_dim, HIDDEN, HIDDEN]
        stages = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))
        stages.extend((nn.Linear(HIDDEN, act_dim), nn.Softmax(dim=-1)))
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Return the action-probability tensor for observation(s) `x`."""
        action_probs = self.net(x)
        return action_probs

class Critic(nn.Module):
    """Value network: an MLP that maps an observation to one scalar estimate."""

    def __init__(self, obs_dim):
        super().__init__()
        # Same layer order as before (Linear, ReLU, Linear, ReLU, Linear) so
        # parameter names (net.0, net.2, net.4) and init order are unchanged.
        widths = [obs_dim, HIDDEN, HIDDEN]
        stages = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))
        stages.append(nn.Linear(HIDDEN, 1))
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Return the value estimate(s) for `x`; the last dimension has size 1."""
        value = self.net(x)
        return value

class PPO:
    """Clipped-surrogate PPO agent for a discrete action space.

    Owns a policy network (`actor`) and a value network (`critic`), each with
    its own Adam optimiser.
    """

    def __init__(self, obs_dim, act_dim):
        """Create the actor/critic networks and their optimisers.

        Args:
            obs_dim: size of one observation vector.
            act_dim: number of discrete actions.
        """
        self.actor = Actor(obs_dim, act_dim)
        self.critic = Critic(obs_dim)
        self.opt_actor = optim.Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.opt_critic = optim.Adam(self.critic.parameters(), lr=CRITIC_LR)

    def get_action(self, state):
        """Sample one action from the current policy for a single observation.

        Args:
            state: one observation (array-like of length obs_dim).

        Returns:
            (action, log_prob): `action` is a Python int; `log_prob` is a
            tensor of shape (1,) holding log pi(action | state). It is
            computed under `torch.no_grad()`, so it carries no autograd graph:
            the rollout log-probabilities act as constants in the PPO ratio.
        """
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            probs = self.actor(state)
            dist = Categorical(probs)
            action = dist.sample()
            log_prob = dist.log_prob(action)
        return action.item(), log_prob

    def compute_adv(self, rewards, values, dones):
        """Compute GAE advantages and the matching value targets.

        The rollout is walked backwards. The value after the final step is
        taken as 0, i.e. the end of the rollout is never bootstrapped.

        Args:
            rewards: sequence of per-step rewards.
            values: sequence of per-step value estimates (plain numbers).
            dones: sequence of per-step episode-end flags.

        Returns:
            (advantages, returns): two float32 tensors of shape (len(rewards),),
            where returns = advantages + values.
        """
        adv, ret = [], []
        gae, next_val = 0.0, 0.0
        for r, v, d in zip(reversed(rewards), reversed(values), reversed(dones)):
            not_done = 1.0 - float(d)
            delta = r + GAMMA * next_val * not_done - v
            gae = delta + GAMMA * LAMBDA * gae * not_done
            # Append and reverse once at the end instead of insert(0, ...),
            # which is quadratic in the rollout length.
            adv.append(gae)
            ret.append(gae + v)
            next_val = v
        adv.reverse()
        ret.reverse()
        return (
            torch.tensor(adv, dtype=torch.float32),
            torch.tensor(ret, dtype=torch.float32),
        )

    def update(self, states, actions, old_logps, advs, rets):
        """Run EPOCHS passes of minibatch PPO updates over one rollout.

        Args:
            states: sequence of observations.
            actions: sequence of integer actions taken.
            old_logps: per-step log-probabilities recorded at rollout time;
                each entry may be a float or a one-element tensor.
            advs: per-step advantages.
            rets: per-step value targets.
        """
        states = torch.as_tensor(np.array(states), dtype=torch.float32)
        actions = torch.as_tensor(actions, dtype=torch.long)
        # Flatten to shape (N,). Stacking the (1,)-shaped tensors returned by
        # get_action would give (N, 1), and `logps - old_logps` would then
        # broadcast to (B, B), pairing every sample with every other sample.
        old_logps = torch.tensor(
            [lp.detach().item() if torch.is_tensor(lp) else float(lp) for lp in old_logps],
            dtype=torch.float32,
        )
        advs = torch.as_tensor(advs, dtype=torch.float32)
        rets = torch.as_tensor(rets, dtype=torch.float32)
        value_loss_fn = nn.MSELoss()

        for _ in range(EPOCHS):
            for start in range(0, len(states), BATCH_SIZE):
                idx = slice(start, start + BATCH_SIZE)

                # The actor already outputs probabilities (softmax head).
                probs = self.actor(states[idx])
                dist = Categorical(probs)
                logps = dist.log_prob(actions[idx])
                ratio = torch.exp(logps - old_logps[idx])
                s1 = ratio * advs[idx]
                s2 = torch.clamp(ratio, 1 - CLIP_EPS, 1 + CLIP_EPS) * advs[idx]
                loss_actor = -torch.min(s1, s2).mean()

                # squeeze(-1) keeps the batch dimension even when the last
                # minibatch holds a single sample.
                vals = self.critic(states[idx]).squeeze(-1)
                loss_critic = value_loss_fn(vals, rets[idx])

                self.opt_actor.zero_grad()
                loss_actor.backward()
                self.opt_actor.step()

                self.opt_critic.zero_grad()
                loss_critic.backward()
                self.opt_critic.step()

In [None]:
import gym
import imageio
import torch

def train(max_episodes=1000, solved_reward=475):
    """Train a PPO agent on CartPole-v1, updating once per episode.

    Args:
        max_episodes: upper bound on the number of training episodes.
        solved_reward: training stops early as soon as a single episode's
            total reward reaches this value.

    Returns:
        The trained `PPO` agent.
    """
    env = gym.make("CartPole-v1")
    try:
        agent = PPO(env.observation_space.shape[0], env.action_space.n)

        for ep in range(max_episodes):
            state, _ = env.reset()
            done = False
            logps, vals, rewards, states, actions, dones = [], [], [], [], [], []
            total = 0

            while not done:
                action, logp = agent.get_action(state)
                # Value estimates are only bookkeeping here, so skip autograd.
                with torch.no_grad():
                    value = agent.critic(
                        torch.tensor(state, dtype=torch.float32).unsqueeze(0)
                    ).item()
                next_state, reward, terminated, truncated, _ = env.step(action)
                # The episode ends on either signal; ignoring `truncated`
                # would keep stepping the env past its time limit.
                done = terminated or truncated

                states.append(state)
                actions.append(action)
                rewards.append(reward)
                # Store a plain float: the rollout log-prob is a constant in
                # the PPO ratio and must not keep an autograd graph alive.
                logps.append(logp.detach().item())
                vals.append(value)
                dones.append(done)

                state = next_state
                total += reward

            advs, rets = agent.compute_adv(rewards, vals, dones)
            agent.update(states, actions, logps, advs, rets)

            if ep % 10 == 0:
                print(f"Episode {ep}, Reward: {total}")
            if total >= solved_reward:
                print(f"Solved at episode {ep}")
                break
    finally:
        # Release the environment even if training raises.
        env.close()
    return agent

def record_video(agent, path="ppo_cartpole.mp4", max_steps=1000):
    """Roll out `agent` for one CartPole-v1 episode and save it as a video.

    Args:
        agent: object exposing `get_action(state) -> (action, log_prob)`.
        path: output file path passed to `imageio.mimsave`.
        max_steps: hard cap on the number of recorded steps.
    """
    env = gym.make("CartPole-v1", render_mode="rgb_array")
    frames = []
    try:
        state, _ = env.reset()
        for _ in range(max_steps):
            frames.append(env.render())
            action, _ = agent.get_action(state)
            state, _, terminated, truncated, _ = env.step(action)
            # Stop on either end-of-episode signal, not only on termination.
            if terminated or truncated:
                break
    finally:
        # Always release the renderer, even if the rollout raises.
        env.close()
    imageio.mimsave(path, frames, fps=30)
    print(f"Saved video to {path}")

# Entry point: train an agent, then record one rollout of it to a video file.
# `agent` is bound at module level, so it stays available to later cells.
if __name__ == "__main__":
    agent = train()
    record_video(agent)


In [1]:
import gymnasium as gym
import numpy as np
import torch
from torch.distributions.normal import Normal
import pickle
import os

# Constants
ENV_NAME = "HumanoidStandup-v4"
# Checkpoint location. The default is the original machine-specific path; set
# the PPO_MODEL_PATH environment variable to run this cell on another machine
# without editing the notebook.
MODEL_PATH = os.environ.get(
    "PPO_MODEL_PATH",
    r"D:\Artificial\AI\ppo\human\ppo_humanoidstandup_final.pth",
)
# The pickled observation normalizer is read from a sibling ".norm" file.
NORMALIZER_PATH = MODEL_PATH + ".norm"
VIDEO_DIR = "success_videos"
MAX_EPISODE_STEPS = 1000
CONTROL_PENALTY = 0.01  # Same as training

# Make sure the directory exists
os.makedirs(VIDEO_DIR, exist_ok=True)

# Observation Normalizer Class (same as training)
class RunningStat:
    """Running per-feature mean and standard deviation for observations.

    `update` merges a batch into the running statistics; `normalize`
    standardises an input with them and clips the result to [-5, 5].
    """

    def __init__(self, shape):
        self.mean = np.zeros(shape, dtype=np.float32)
        self.std = np.ones(shape, dtype=np.float32)
        self.count = 0

    def update(self, x):
        """Merge a batch of observations into the running statistics.

        Args:
            x: array of shape (batch, *shape). A single observation whose
               shape equals the statistic's shape is treated as a batch of
               one. An empty batch leaves the statistics untouched.
        """
        x = np.asarray(x)
        if x.shape == self.mean.shape:
            # A lone observation: add the batch axis so axis=0 is the batch.
            x = x[np.newaxis, ...]
        batch_count = x.shape[0]
        if batch_count == 0:
            # Nothing to merge; avoids NaN means and division by zero.
            return

        batch_mean = np.mean(x, axis=0)
        batch_std = np.std(x, axis=0) + 1e-6
        prev_count = self.count
        total_count = prev_count + batch_count

        delta = batch_mean - self.mean
        # Sums of squared deviations of the old data and of the new batch.
        m_a = self.std * self.std * prev_count
        m_b = batch_std * batch_std * batch_count
        # Parallel-variance merge: the cross term is weighted by
        # prev_count * batch_count / total_count. Weighting it with the
        # already-updated total instead of prev_count inflates the std
        # (the very first update would yield the RMS rather than the std).
        cross = np.square(delta) * prev_count * batch_count / total_count

        self.mean += delta * batch_count / total_count
        self.std = np.sqrt((m_a + m_b + cross) / total_count)
        self.count = total_count

    def normalize(self, x):
        """Return (x - mean) / std, clipped element-wise to [-5, 5]."""
        return np.clip((x - self.mean) / self.std, -5, 5)

# Actor Network (must match training)
class Actor(torch.nn.Module):
    """Gaussian policy network with a tanh-squashed action output.

    The final linear layer emits `2 * act_dim` values that `forward` splits
    into a mean and a (clamped) log standard deviation.
    """

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        nn = torch.nn
        # Layer order is kept as-is so the parameter names (net.0 ... net.6)
        # still match the saved checkpoint.
        hidden = 256
        trunk = [
            nn.Linear(obs_dim, hidden),
            nn.LayerNorm(hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.LayerNorm(hidden),
            nn.ReLU(),
        ]
        head = nn.Linear(hidden, act_dim * 2)
        self.net = nn.Sequential(*trunk, head)
        # Small orthogonal weights and zero bias on the output head.
        nn.init.orthogonal_(self.net[-1].weight, gain=0.01)
        nn.init.zeros_(self.net[-1].bias)

    def forward(self, state):
        """Return `(mean, log_std)`, with `log_std` clamped to [-10, 2]."""
        mean, raw_log_std = self.net(state).chunk(2, dim=-1)
        return mean, torch.clamp(raw_log_std, -10, 2)

    def get_action(self, state, deterministic=False):
        """Return a tanh-squashed action.

        With `deterministic=True` the action is `tanh(mean)`; otherwise it is
        `tanh` of a sample drawn from `Normal(mean, exp(log_std))`.
        """
        mean, log_std = self.forward(state)
        if not deterministic:
            sampled = Normal(mean, log_std.exp()).sample()
            return torch.tanh(sampled)
        return torch.tanh(mean)

# Evaluation + Video Recording
def evaluate():
    """Run one recorded evaluation episode with the saved actor.

    Loads the actor weights from MODEL_PATH and the observation normalizer
    from NORMALIZER_PATH, rolls out at most MAX_EPISODE_STEPS steps using the
    deterministic action, and prints the total reward with the control
    penalty (CONTROL_PENALTY * sum(action ** 2)) subtracted at every step.
    The episode is written to VIDEO_DIR by the RecordVideo wrapper.
    """
    env = gym.make(ENV_NAME, render_mode="rgb_array")
    env = gym.wrappers.RecordVideo(
        env,
        video_folder=VIDEO_DIR,
        name_prefix="success_video",
        episode_trigger=lambda episode_id: True
    )

    # From here on the env must be closed on every path: closing is what
    # releases the recorder, so an exception must not skip it.
    try:
        obs_dim = env.observation_space.shape[0]
        act_dim = env.action_space.shape[0]
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load Actor model
        actor = Actor(obs_dim, act_dim).to(device)
        checkpoint = torch.load(MODEL_PATH, map_location=device)
        actor.load_state_dict(checkpoint['actor'])
        actor.eval()

        # Load the full normalizer object.
        # SECURITY: pickle.load can execute arbitrary code from the file;
        # only point NORMALIZER_PATH at files produced by a trusted run.
        with open(NORMALIZER_PATH, 'rb') as f:
            normalizer = pickle.load(f)

        state, _ = env.reset()
        episode_reward = 0

        for t in range(MAX_EPISODE_STEPS):
            norm_state = normalizer.normalize(state)
            state_tensor = torch.FloatTensor(norm_state).unsqueeze(0).to(device)
            with torch.no_grad():
                action = actor.get_action(state_tensor, deterministic=True)
            action_np = action.cpu().numpy()[0]

            next_state, reward, terminated, truncated, _ = env.step(action_np)
            # Apply the control penalty so the reported reward is shaped the
            # same way as CONTROL_PENALTY is applied here per step.
            reward -= CONTROL_PENALTY * np.sum(action_np ** 2)
            episode_reward += reward
            state = next_state

            if terminated or truncated:
                break
    finally:
        env.close()

    # Reported only after the env (and with it the recorder) has been closed.
    print(f"🎥 Success video saved — total reward: {episode_reward:.2f}")

# Entry point: run the recorded evaluation when executed as a script/cell.
if __name__ == "__main__":
    evaluate()


  logger.deprecation(
  logger.warn(


🎥 Success video saved — total reward: 28584.71
