<a href="https://colab.research.google.com/github/juhumkwon/DeepLearning/blob/main/RL_02_04_reinforce_video.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- Colab 환경 설정 (1회 실행) ---
!apt-get install -y xvfb > /dev/null 2>&1
!pip install -q pyvirtualdisplay gym
!pip install -q imageio

from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7c200df0b690>

In [None]:
import gymnasium as gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import imageio
import os
from IPython.display import Video

# ==========================
# Configuration
# ==========================
# One seed for every random source so Restart & Run All repeats the same run
# (bit-exact results may still vary across hardware/backends).
SEED = 42
# Seeds Python's `random`, NumPy and TensorFlow in a single call; this covers
# both weight initialisation and the `np.random.choice` action sampling.
tf.keras.utils.set_random_seed(SEED)

# Directory that receives the rendered episode video.
video_dir = './video'
os.makedirs(video_dir, exist_ok=True)

# `rgb_array` makes env.render() return image frames instead of opening a window.
env = gym.make('CartPole-v1', render_mode='rgb_array')
# Seed the environment's RNG once; later `env.reset()` calls made without a
# seed keep drawing from this generator.
env.reset(seed=SEED)
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# ==========================
# Policy network definition
# ==========================
class PolicyNetwork(tf.keras.Model):
    """Feed-forward policy that maps a batch of states to action probabilities.

    Two 24-unit ReLU layers are followed by a softmax layer with one output
    per action. The number of actions is read from the module-level
    `action_size` when the network is constructed.
    """

    def __init__(self):
        super().__init__()
        hidden_units = 24
        self.fc1 = layers.Dense(hidden_units, activation='relu')
        self.fc2 = layers.Dense(hidden_units, activation='relu')
        # Softmax so each output row is a probability distribution over actions.
        self.out = layers.Dense(action_size, activation='softmax')

    def call(self, x):
        """Run the forward pass and return the softmax output for `x`."""
        hidden = self.fc2(self.fc1(x))
        return self.out(hidden)

# Discount factor applied to future rewards when computing returns.
gamma = 0.99
# The policy being trained and the Adam optimizer that updates its weights.
policy = PolicyNetwork()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

# ==========================
# Return (discounted cumulative reward) computation
# ==========================
def compute_returns(rewards, gamma):
    """Compute standardised discounted returns for one episode.

    Walks the rewards backwards accumulating ``G_t = r_t + gamma * G_{t+1}``,
    then subtracts the mean and divides by the standard deviation so the
    values can be used as scaled weights in the policy-gradient loss.

    Args:
        rewards: Sequence of per-step rewards in time order.
        gamma: Discount factor applied to future rewards.

    Returns:
        NumPy array holding one normalised return per step, in time order.
        An empty array is returned when `rewards` is empty.
    """
    if len(rewards) == 0:
        # Mean/std of an empty array would only produce NaN plus RuntimeWarnings.
        return np.zeros(0)

    G = 0
    discounted = []
    for r in reversed(rewards):
        G = r + gamma * G
        discounted.append(G)
    # Values were collected last-step-first with O(1) appends; flip once into
    # time order instead of paying for insert(0, ...) on every step.
    discounted.reverse()

    returns = np.array(discounted)
    # The 1e-9 term prevents division by zero when all returns are identical.
    return (returns - np.mean(returns)) / (np.std(returns) + 1e-9)

# ==========================
# Training
# ==========================
episodes = 300
reward_list = []

for episode in range(episodes):
    # Per-episode trajectory buffers and running score.
    states, actions, rewards = [], [], []
    total_reward = 0
    done = False
    state, _ = env.reset()

    # ---- 1. Collect one complete episode ----
    while not done:
        # Wrap the state in a list so the policy receives a single-row batch.
        state_tensor = tf.convert_to_tensor([state], dtype=tf.float32)
        action_probs = policy(state_tensor).numpy()[0]
        # Sample (rather than argmax) so the policy keeps exploring.
        action = np.random.choice(action_size, p=action_probs)

        next_state, reward, terminated, truncated, _ = env.step(action)

        # Store the transition taken from the current state.
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        total_reward += reward

        done = terminated or truncated
        state = next_state

    # ---- 2. Compute the (normalised) returns of the episode ----
    returns = compute_returns(rewards, gamma)

    # ---- 3. Policy-gradient loss over the whole episode ----
    # Inputs that do not depend on the weights are built before recording.
    state_tensor = tf.convert_to_tensor(np.vstack(states), dtype=tf.float32)
    action_masks = tf.one_hot(actions, action_size)
    with tf.GradientTape() as tape:
        action_probs = policy(state_tensor)
        # The one-hot mask keeps only the log-probability of the action taken;
        # 1e-9 keeps the log finite if a probability reaches zero.
        log_all_actions = tf.math.log(action_probs + 1e-9)
        log_probs = tf.reduce_sum(action_masks * log_all_actions, axis=1)
        loss = -tf.reduce_mean(log_probs * returns)

    weights = policy.trainable_variables
    grads = tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    reward_list.append(total_reward)
    # Report the moving average once every 50 episodes.
    if (episode + 1) % 50 != 0:
        continue
    print(f"Episode {episode + 1}, Avg Reward (last 50): {np.mean(reward_list[-50:]):.2f}")

# ==========================
# Record a video with the trained policy
# ==========================
frames = []
state, _ = env.reset()
done = False

try:
    while not done:
        # Capture the frame for the current state before acting on it.
        frames.append(env.render())
        state_tensor = tf.convert_to_tensor([state], dtype=tf.float32)
        # Evaluation acts greedily: pick the most probable action.
        action = np.argmax(policy(state_tensor).numpy()[0])
        state, _, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
finally:
    # Release whatever the environment holds, even if the rollout raised.
    env.close()

video_path = os.path.join(video_dir, 'reinforce_cartpole.mp4')
imageio.mimsave(video_path, frames, fps=30)

# Last expression of the cell: renders the embedded player as its output.
Video(video_path, embed=True)


Episode 50, Avg Reward (last 50): 22.14
Episode 100, Avg Reward (last 50): 29.12
Episode 150, Avg Reward (last 50): 44.34
Episode 200, Avg Reward (last 50): 45.06
Episode 250, Avg Reward (last 50): 53.70
Episode 300, Avg Reward (last 50): 66.68


