<a href="https://colab.research.google.com/github/juhumkwon/DeepLearning/blob/main/RL_03_03_PPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# PPO (TensorFlow 2.x) for CartPole-v1
# - GAE(Generalized Advantage Estimation)
# - clipped surrogate objective (epsilon=0.2)
# - value loss + entropy bonus
# - multi-epoch updates with minibatches
# - works with either gymnasium or gym (auto-detected at import time)

import os, sys, math, random
import numpy as np
import tensorflow as tf

# ===== Gym / Gymnasium compatibility =====
try:
    import gymnasium as gym
    GYMN = True
except Exception:
    import gym
    GYMN = False

# Report which environment package the import fallback above selected.
print("Using:", "gymnasium" if GYMN else "gym")
ENV_ID = "CartPole-v1"

# ===== Hyperparameters =====
seed               = 42        # seeds numpy, random, TensorFlow and the environment
total_timesteps    = 200_000   # training stops once global_step reaches this value
steps_per_rollout  = 2048      # transitions collected before each PPO update (buffer size)
gamma              = 0.99      # discount factor passed to compute_gae
gae_lambda         = 0.95      # GAE lambda passed to compute_gae
learning_rate      = 3e-4      # Adam learning rate
clip_eps           = 0.2       # clipping range for the PPO probability ratio
value_coef         = 0.5       # weight of the value loss in the total loss
entropy_coef       = 0.01      # weight of the entropy bonus in the total loss
update_epochs      = 10        # passes over each rollout during the update phase
minibatch_size     = 64        # samples per gradient step
max_grad_norm      = 0.5       # global-norm threshold for gradient clipping
eval_interval      = 10_000   # evaluation interval (environment steps)
render_eval        = False     # forwarded to evaluate(..., render=...) during training

# ===== Fix random seeds =====
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

# ===== Environment creation =====
# The GYMN branch uses the reset(seed=...) -> (obs, info) API; the other branch
# uses env.seed(...) and a reset() that returns only the observation.
if GYMN:
    env = gym.make(ENV_ID)
    eval_env = gym.make(ENV_ID)
    obs, info = env.reset(seed=seed)
else:
    env = gym.make(ENV_ID)
    eval_env = gym.make(ENV_ID)
    env.seed(seed)
    eval_env.seed(seed)
    obs = env.reset()

# Observation vector length and number of discrete actions, read from the env spaces.
obs_dim = env.observation_space.shape[0]
n_actions = env.action_space.n

# ===== Network definition (shared trunk + policy/value heads) =====
class ActorCritic(tf.keras.Model):
    """Actor-critic network with a shared two-layer tanh trunk.

    The trunk feeds two linear heads: ``logits`` (one unnormalized score per
    action) and ``value`` (a single scalar per observation).
    """

    def __init__(self, n_actions):
        """Create the trunk and the two heads; ``n_actions`` sizes the logits head."""
        super().__init__()
        dense = tf.keras.layers.Dense
        # Shared trunk
        self.fc1 = dense(128, activation='tanh')
        self.fc2 = dense(128, activation='tanh')
        # Policy / value heads (linear outputs)
        self.logits = dense(n_actions)
        self.value = dense(1)

    def call(self, x):
        """Return ``(logits, value)`` for a batch of observations.

        ``x`` is converted to a float32 tensor first. The value head's trailing
        size-1 axis is squeezed away, so ``value`` has one entry per batch row.
        """
        features = tf.convert_to_tensor(x, dtype=tf.float32)
        for layer in (self.fc1, self.fc2):
            features = layer(features)
        action_logits = self.logits(features)
        state_value = tf.squeeze(self.value(features), axis=-1)
        return action_logits, state_value

# Single actor-critic instance used for rollout collection, updates and evaluation.
policy = ActorCritic(n_actions)
# Adam optimizer that ppo_update applies to the policy's trainable variables.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# ===== Rollout buffer =====
class RolloutBuffer:
    """Fixed-capacity storage for one rollout of transitions.

    Per-step fields are preallocated numpy arrays of length ``size``;
    ``advantages`` and ``returns`` are placeholders filled in after the
    rollout. ``ptr`` is the index of the next free slot.
    """

    def __init__(self, size, obs_dim):
        """Allocate zeroed arrays for ``size`` steps of ``obs_dim``-dimensional observations."""
        per_step = (size,)
        self.size = size
        self.obs = np.zeros((size, obs_dim), dtype=np.float32)
        self.actions = np.zeros(per_step, dtype=np.int32)
        # Per-step float32 fields, followed by the two post-computed ones.
        for field in ("logprobs", "rewards", "dones", "values",
                      "advantages", "returns"):
            setattr(self, field, np.zeros(per_step, dtype=np.float32))
        self.ptr = 0

    def add(self, o, a, lp, r, d, v):
        """Write one transition (obs, action, log-prob, reward, done, value) at ``ptr`` and advance it."""
        slot = self.ptr
        columns = (self.obs, self.actions, self.logprobs,
                   self.rewards, self.dones, self.values)
        for column, item in zip(columns, (o, a, lp, r, d, v)):
            column[slot] = item
        self.ptr = slot + 1

    def full(self):
        """Return True once ``ptr`` has reached the buffer capacity."""
        return self.size <= self.ptr

    def reset(self):
        """Rewind the write position; stored data is left in place and overwritten later."""
        self.ptr = 0

# Holds exactly one rollout (steps_per_rollout transitions); the training loop
# resets it at the start of every iteration.
buffer = RolloutBuffer(steps_per_rollout, obs_dim)

# ===== Sample actions / log-probabilities / values from the policy =====
@tf.function
def policy_step(obs_batch):
    """Sample one action per row of ``obs_batch`` from the current policy.

    Returns ``(action, logprob, value)``: the sampled action indices, the
    log-probability of each sampled action under the policy logits, and the
    value-head output.
    """
    logits, value = policy(obs_batch)
    sampled = tf.random.categorical(logits, num_samples=1)  # [B, 1]
    action = tf.squeeze(sampled, axis=1)
    # Sparse softmax cross-entropy is the negative log-probability of the label.
    neg_log_likelihood = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=action, logits=logits
    )
    return action, -neg_log_likelihood, value

def get_action_and_logprob_value(obs):
    """Sample an action for a single observation (eager, non-batched helper).

    ``obs`` is given a leading batch axis before the forward pass. Returns
    ``(action, logprob, value)`` as plain Python ``int``, ``float``, ``float``.
    """
    logits, value = policy(obs[None, :])
    sampled = tf.random.categorical(logits, 1)
    action = int(sampled.numpy().squeeze())
    # Cross-entropy against the sampled label equals -log pi(action | obs).
    nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=[action], logits=logits
    )
    logprob = float(-nll.numpy().squeeze())
    state_value = float(value.numpy().squeeze())
    return action, logprob, state_value

# ===== GAE computation =====
def compute_gae(rewards, dones, values, last_value, gamma=0.99, lam=0.95):
    """Compute Generalized Advantage Estimation over one rollout.

    Walks the rollout backwards. ``dones[t] == 1`` cuts both the bootstrap
    from the next value and the accumulated advantage at step ``t``.
    ``last_value`` is the value estimate used as the successor of the final
    step.

    Returns ``(advantages, returns)`` where ``advantages`` is float32 and
    ``returns = advantages + values``.
    """
    advantages = np.zeros_like(rewards, dtype=np.float32)
    running_gae = 0.0
    bootstrap = last_value  # value of the state following step t
    for t in range(len(rewards) - 1, -1, -1):
        not_done = 1.0 - dones[t]
        td_error = rewards[t] + gamma * not_done * bootstrap - values[t]
        running_gae = td_error + gamma * lam * not_done * running_gae
        advantages[t] = running_gae
        bootstrap = values[t]
    return advantages, advantages + values

# ===== PPO update step =====
@tf.function
def ppo_update(obs, actions, old_logprobs, returns, advantages):
    """Run one clipped-surrogate PPO gradient step on a minibatch.

    Advantages are standardized within the minibatch. The total loss is
    policy loss + ``value_coef`` * value MSE - ``entropy_coef`` * entropy.
    Gradients are clipped to ``max_grad_norm`` by global norm before the
    optimizer step.

    Returns ``(policy_loss, value_loss, entropy, mean_ratio)``. The returned
    ``value_loss`` is already multiplied by ``value_coef``.
    """
    # Standardize advantages (computed outside the tape; they are inputs, not variables).
    adv_center = tf.reduce_mean(advantages)
    adv_scale = tf.math.reduce_std(advantages) + 1e-8
    norm_adv = (advantages - adv_center) / adv_scale

    with tf.GradientTape() as tape:
        logits, values = policy(obs)

        # Log-probability of the taken actions under the current policy.
        action_ids = tf.cast(actions, tf.int32)
        new_logprobs = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=action_ids, logits=logits
        )
        # ratio = pi_new(a|s) / pi_old(a|s)
        ratio = tf.exp(new_logprobs - old_logprobs)

        # Clipped surrogate objective: take the pessimistic of the two terms.
        surrogate_raw = ratio * norm_adv
        surrogate_clipped = tf.clip_by_value(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * norm_adv
        policy_loss = -tf.reduce_mean(tf.minimum(surrogate_raw, surrogate_clipped))

        # Value loss: MSE against the returns, scaled by its coefficient.
        value_loss = tf.reduce_mean(tf.square(returns - values)) * value_coef

        # Categorical entropy, -sum p*log p; probabilities are clipped before the log.
        probs = tf.nn.softmax(logits)
        safe_log_probs = tf.math.log(tf.clip_by_value(probs, 1e-8, 1.0))
        entropy = -tf.reduce_mean(tf.reduce_sum(probs * safe_log_probs, axis=1))
        # Negative sign: entropy is a bonus, but the total loss is minimized.
        entropy_loss = -entropy_coef * entropy

        total_loss = policy_loss + value_loss + entropy_loss

    params = policy.trainable_variables
    grads = tape.gradient(total_loss, params)
    # Clip gradients by global norm before applying them.
    grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
    optimizer.apply_gradients(zip(grads, params))
    return policy_loss, value_loss, entropy, tf.reduce_mean(ratio)

# ===== Evaluation function =====
def evaluate(env, policy, episodes=5, render=False):
    """Run greedy (argmax) episodes and return ``(mean, std)`` of episode returns.

    With gymnasium each episode is reset with a fresh seed drawn from the
    global numpy RNG; with gym the environment is reset without a seed.
    When ``render`` is true, ``env.render()`` is called after every step.
    """
    scores = []
    for _ in range(episodes):
        if GYMN:
            obs, info = env.reset(seed=np.random.randint(1e9))
        else:
            obs = env.reset()

        ep_ret = 0
        while True:
            # Deterministic action: pick the highest-logit action.
            logits, _ = policy(np.array(obs)[None, :])
            action = int(tf.argmax(logits, axis=-1).numpy().squeeze())

            outcome = env.step(action)
            if GYMN:
                obs, reward, term, trunc, info = outcome
                done = bool(term or trunc)
            else:
                obs, reward, done, info = outcome

            ep_ret += reward
            if render:
                env.render()
            if done:
                break
        scores.append(ep_ret)
    return np.mean(scores), np.std(scores)

# ===== Training loop =====
# Each iteration collects steps_per_rollout transitions with the current
# policy, computes GAE advantages/returns, then runs update_epochs passes of
# minibatch PPO updates over the rollout.
global_step = 0
best_score = -np.inf

if GYMN:
    obs, info = env.reset(seed=seed)
else:
    obs = env.reset()

# Running statistics of the episode currently in progress.
episode_return = 0
episode_len = 0

print("Start training PPO on", ENV_ID)
while global_step < total_timesteps:
    buffer.reset()
    # ---- Rollout collection ----
    for t in range(steps_per_rollout):
        action, logprob, value = get_action_and_logprob_value(np.array(obs, dtype=np.float32))
        if GYMN:
            next_obs, reward, term, trunc, info = env.step(action)
            done = bool(term or trunc)
            # Time-limit cut-off: the episode was stopped, not failed.
            timed_out = bool(trunc) and not bool(term)
        else:
            next_obs, reward, done, info = env.step(action)
            # Older gym reports a time-limit cut-off through this info key.
            timed_out = bool(info.get("TimeLimit.truncated", False))

        # A truncated episode did not reach a real terminal state, so its
        # remaining value is not zero. The done flag still has to cut the GAE
        # recursion at the episode boundary, so the bootstrap term
        # gamma * V(next_obs) is folded into the stored reward instead.
        stored_reward = reward
        if timed_out:
            _, trunc_v = policy(np.array(next_obs, dtype=np.float32)[None, :])
            stored_reward = reward + gamma * float(trunc_v.numpy().squeeze())

        buffer.add(np.array(obs, dtype=np.float32), action, logprob, stored_reward, float(done), value)

        obs = next_obs
        episode_return += reward  # raw environment reward, without the bootstrap term
        episode_len += 1
        global_step += 1

        if done:
            if GYMN:
                obs, info = env.reset()
            else:
                obs = env.reset()
            episode_return = 0
            episode_len = 0

        if global_step % eval_interval == 0:
            mean_score, std_score = evaluate(eval_env, policy, episodes=5, render=render_eval)
            print(f"[Step {global_step}] Eval: {mean_score:.1f} ± {std_score:.1f}")

    # ---- Bootstrap value ----
    with tf.device("/CPU:0"):
        # Value of the observation that follows the last stored transition.
        logits, last_v = policy(np.array(obs, dtype=np.float32)[None, :])
        last_value = float(last_v.numpy().squeeze())

    # ---- GAE / return computation ----
    adv, ret = compute_gae(buffer.rewards, buffer.dones, buffer.values, last_value,
                           gamma=gamma, lam=gae_lambda)
    buffer.advantages = adv
    buffer.returns = ret

    # ---- Minibatch training ----
    idxs = np.arange(steps_per_rollout)
    for epoch in range(update_epochs):
        # Fresh random minibatch partition every epoch.
        np.random.shuffle(idxs)
        for start in range(0, steps_per_rollout, minibatch_size):
            end = start + minibatch_size
            mb_idx = idxs[start:end]

            mb_obs = tf.convert_to_tensor(buffer.obs[mb_idx], dtype=tf.float32)
            mb_actions = tf.convert_to_tensor(buffer.actions[mb_idx], dtype=tf.int32)
            mb_old_logprobs = tf.convert_to_tensor(buffer.logprobs[mb_idx], dtype=tf.float32)
            mb_returns = tf.convert_to_tensor(buffer.returns[mb_idx], dtype=tf.float32)
            mb_advs = tf.convert_to_tensor(buffer.advantages[mb_idx], dtype=tf.float32)

            pl, vl, ent, rat = ppo_update(mb_obs, mb_actions, mb_old_logprobs, mb_returns, mb_advs)

    # ---- Brief training log ----
    # Loss/entropy/ratio come from the last minibatch of the last epoch only;
    # the advantage statistics cover the whole rollout.
    mean_adv = float(np.mean(buffer.advantages))
    std_adv = float(np.std(buffer.advantages) + 1e-8)
    print(f"Step {global_step:7d} | loss_pi: {pl.numpy():.4f}  loss_v: {vl.numpy():.4f}  "
          f"entropy: {ent.numpy():.4f}  ratio: {rat.numpy():.3f}  adv μ/σ: {mean_adv:.3f}/{std_adv:.3f}")

# Final evaluation
final_mean, final_std = evaluate(eval_env, policy, episodes=10, render=False)
print(f"Training finished. Final Eval: {final_mean:.1f} ± {final_std:.1f}")

# Save the trained weights. Keras 3 (the default Keras from TF 2.16 on) rejects
# any save_weights path that does not end in ".weights.h5"; older tf.keras
# writes HDF5 for a ".h5" suffix, so this file name works on both.
# To restore: build an ActorCritic, call it once on a dummy observation, then
# policy.load_weights(save_path). (Whole-model alternative:
# tf.keras.models.save_model / tf.keras.models.load_model.)
save_path = "ppo_cartpole_actor_critic.weights.h5"
policy.save_weights(save_path)
print("Saved weights to:", save_path)

Using: gymnasium
Start training PPO on CartPole-v1
Step    2048 | loss_pi: -0.0175  loss_v: 6.7264  entropy: 0.6587  ratio: 1.007  adv μ/σ: 10.196/4.549
Step    4096 | loss_pi: -0.0408  loss_v: 21.2399  entropy: 0.6525  ratio: 0.981  adv μ/σ: 7.045/6.874
Step    6144 | loss_pi: -0.0014  loss_v: 17.7267  entropy: 0.6026  ratio: 0.998  adv μ/σ: 6.855/7.602
Step    8192 | loss_pi: -0.0246  loss_v: 28.3640  entropy: 0.6322  ratio: 0.968  adv μ/σ: 7.766/7.083
[Step 10000] Eval: 196.4 ± 13.6
Step   10240 | loss_pi: 0.0176  loss_v: 26.7332  entropy: 0.5668  ratio: 1.050  adv μ/σ: 4.616/9.070
Step   12288 | loss_pi: -0.0036  loss_v: 48.9482  entropy: 0.5636  ratio: 0.972  adv μ/σ: 3.252/9.517
Step   14336 | loss_pi: 0.0433  loss_v: 1.9091  entropy: 0.6174  ratio: 0.936  adv μ/σ: 4.645/5.593
Step   16384 | loss_pi: 0.1060  loss_v: 91.7638  entropy: 0.4015  ratio: 1.181  adv μ/σ: -11.737/17.225
Step   18432 | loss_pi: 0.0509  loss_v: 1.2866  entropy: 0.6402  ratio: 1.014  adv μ/σ: 5.958/4.519
[S