<a href="https://colab.research.google.com/github/juhumkwon/DeepLearning/blob/main/Actor_Critic_(CartPole_%EC%98%88%EC%A0%9C).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# --- Colab 환경 설정 (1회 실행) ---
!apt-get install -y xvfb > /dev/null 2>&1
!pip install -q pyvirtualdisplay gym
!pip install -q imageio
!pip install -U tensorflow
!pip install gymnasium
!pip install "gymnasium[classic-control]"

from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()


Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3 (from tensorflow)
  Downloading protobuf-5.29.5-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting tensorboard~=2.19.0 (from tensorflow)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1 (from tensorf

<pyvirtualdisplay.display.Display at 0x78931fe59890>

In [None]:
import tensorflow as tf
import numpy as np
import gymnasium as gym  # 최신 gym 버전 (gymnasium) 사용 권장
import matplotlib.pyplot as plt
import imageio
import os
from IPython.display import Video

# Fixed seed so that weight initialisation, action sampling and environment
# resets are repeatable across "Restart & Run All" (bit-exact results may
# still vary between hardware / TensorFlow builds).
SEED = 42
tf.keras.utils.set_random_seed(SEED)  # seeds Python `random`, NumPy and TensorFlow

# Directory where the evaluation video is written.
video_dir = './video'
os.makedirs(video_dir, exist_ok=True)

# Environment setup; render_mode='rgb_array' makes env.render() return image frames.
env = gym.make("CartPole-v1", render_mode='rgb_array')
# Seed the environment's RNG once; later reset() calls without a seed keep
# drawing from this same generator instead of re-seeding from entropy.
env.reset(seed=SEED)
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Policy network (Actor)
class Actor(tf.keras.Model):
    """Policy network: one hidden ReLU layer followed by a softmax over actions.

    The number of outputs is taken from the module-level ``action_size``.
    """

    def __init__(self):
        super().__init__()
        # Hidden representation of the observation (24 ReLU units).
        self.d1 = tf.keras.layers.Dense(units=24, activation='relu')
        # Linear layer producing one unnormalised score per action.
        self.out = tf.keras.layers.Dense(units=action_size)

    def call(self, x):
        """Return action probabilities for a batch of states ``x``."""
        hidden = self.d1(x)
        logits = self.out(hidden)
        return tf.nn.softmax(logits)

# Value network (Critic)
class Critic(tf.keras.Model):
    """State-value network: one hidden ReLU layer followed by a single linear output."""

    def __init__(self):
        super().__init__()
        # Hidden representation of the observation (24 ReLU units).
        self.d1 = tf.keras.layers.Dense(units=24, activation='relu')
        # Single linear unit: the scalar value estimate.
        self.out = tf.keras.layers.Dense(units=1)

    def call(self, x):
        """Return the value estimate for a batch of states ``x`` (one column per row)."""
        hidden = self.d1(x)
        value = self.out(hidden)
        return value

# One network of each kind, each with its own Adam optimizer; the critic
# uses a learning rate twice as large as the actor's.
actor = Actor()
actor_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
critic = Critic()
critic_optimizer = tf.keras.optimizers.Adam(learning_rate=0.002)

def get_action(state):
    """Sample an action for a single state from the actor's current policy.

    Args:
        state: One environment observation (not batched).

    Returns:
        A tuple ``(action, probs)`` where ``action`` is a Python ``int`` and
        ``probs`` is the actor's probability vector for this state.
    """
    # Wrap the observation in a list to give it a leading batch dimension of 1.
    batch = tf.convert_to_tensor([state], dtype=tf.float32)
    policy = actor(batch)
    # tf.random.categorical takes log-probabilities (logits), hence the log.
    log_policy = tf.math.log(policy)
    sampled = tf.random.categorical(log_policy, num_samples=1)
    return int(sampled[0, 0]), policy[0]

def train_step(state, action, reward, next_state, terminated, truncated, gamma=0.99):
    """Run one online (single-transition) actor-critic update.

    The critic is trained with a semi-gradient TD(0) loss and the actor with
    the policy-gradient loss ``-log pi(a|s) * td_error``.

    Args:
        state: Observation before the step (not batched).
        action: Index of the action that was taken.
        reward: Scalar reward received for the transition.
        next_state: Observation after the step (not batched).
        terminated: True if the environment reached a terminal state. Only
            this flag disables bootstrapping from ``next_state``.
        truncated: True if the episode was cut off (e.g. by a time limit).
            Accepted for call compatibility; a truncated ``next_state`` is
            not terminal, so its value is still used in the TD target.
        gamma: Discount factor applied to the next-state value.

    Returns:
        None. The module-level ``actor`` and ``critic`` are updated in place
        through ``actor_optimizer`` and ``critic_optimizer``.
    """
    state = tf.convert_to_tensor([state], dtype=tf.float32)
    next_state = tf.convert_to_tensor([next_state], dtype=tf.float32)
    # Cast explicitly so NumPy float64 rewards mix cleanly with float32 tensors.
    reward = tf.cast(reward, tf.float32)

    # Bootstrap from V(s') unless the episode genuinely terminated. Treating a
    # time-limit truncation as terminal would bias the value of late states to 0.
    bootstrap = 0.0 if terminated else 1.0

    # persistent=True because two separate gradients are taken from the same tape.
    with tf.GradientTape(persistent=True) as tape:
        v = critic(state)[0, 0]
        v_next = critic(next_state)[0, 0]
        # The TD target is treated as a constant (semi-gradient TD): without
        # stop_gradient the critic loss would also push V(s') toward V(s).
        target = tf.stop_gradient(reward + bootstrap * gamma * v_next)
        td_error = target - v

        critic_loss = tf.square(td_error)

        probs = actor(state)
        # Small epsilon guards against log(0) for a vanishing probability.
        log_prob = tf.math.log(probs[0, action] + 1e-8)
        # The TD error acts as a fixed advantage weight for the policy gradient.
        actor_loss = -log_prob * tf.stop_gradient(td_error)

    actor_grads = tape.gradient(actor_loss, actor.trainable_variables)
    critic_grads = tape.gradient(critic_loss, critic.trainable_variables)
    actor_optimizer.apply_gradients(zip(actor_grads, actor.trainable_variables))
    critic_optimizer.apply_gradients(zip(critic_grads, critic.trainable_variables))
    # A persistent tape holds its resources until it is explicitly released.
    del tape

# Training loop
episodes = 300
scores = []  # total reward of every finished episode, in order

for ep in range(episodes):
    state, _ = env.reset()
    total_reward, done = 0, False
    while not done:
        # Sample from the current policy; the returned probabilities are unused here.
        action, _ = get_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # One actor-critic update per environment transition.
        train_step(state, action, reward, next_state, terminated, truncated)
        total_reward += reward
        done = terminated or truncated
        state = next_state
    scores.append(total_reward)
    # Progress report every 50 episodes.
    episode_number = ep + 1
    if episode_number % 50 == 0:
        print(f"Episode {ep+1}: Total Reward = {total_reward}")

# Visualise the training result: total reward per episode.
fig, ax = plt.subplots()
ax.plot(scores)
ax.set_xlabel("Episode")
ax.set_ylabel("Total Reward")
ax.set_title("Actor-Critic: CartPole-v1")
ax.grid()
plt.show()

# ===========================
# Run the trained policy once (greedy) and save the episode as a video
# ===========================
frames = []
state, _ = env.reset()
done = False

try:
    while not done:
        frames.append(env.render())
        state_tensor = tf.convert_to_tensor([state], dtype=tf.float32)
        action_probs = actor(state_tensor).numpy()[0]
        # Greedy action; cast to a plain int, matching what get_action() returns.
        action = int(np.argmax(action_probs))
        state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
    # Also capture the state reached by the last step so the video shows how
    # the episode ended.
    frames.append(env.render())
finally:
    # Release the renderer / window resources even if the rollout fails.
    env.close()

video_path = os.path.join(video_dir, 'actor_critic_cartpole.mp4')

# Use imageio.get_writer to save the video; the context manager finalises the file.
with imageio.get_writer(video_path, fps=30) as writer:
    for frame in frames:
        writer.append_data(frame)

# Show the video inline in the notebook (must stay the last expression of the cell).
Video(video_path, embed=True)

Episode 50: Total Reward = 30.0
Episode 100: Total Reward = 55.0
Episode 150: Total Reward = 65.0
Episode 200: Total Reward = 110.0
Episode 250: Total Reward = 73.0


In [9]:
!pip install "gymnasium[classic-control]"

Collecting pygame>=2.1.3 (from gymnasium[classic-control])
  Downloading pygame-2.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading pygame-2.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m104.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pygame
Successfully installed pygame-2.6.1
