<a href="https://colab.research.google.com/github/juhumkwon/Defense_Cloud/blob/main/%EC%A0%95%EC%B1%85%EA%B8%B0%EB%B0%98_%EC%98%88%EC%A0%9C(CartPole).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --- Colab 환경 설정 (1회 실행) ---
!apt-get install -y xvfb > /dev/null 2>&1
!pip install -q pyvirtualdisplay gym
!pip install -q imageio

from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()


<pyvirtualdisplay.display.Display at 0x7f09b2ec1590>

In [4]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from IPython.display import HTML
import imageio
import os

# Create the environment.
# NOTE: this import rebinds the name `gym` to the gymnasium package, whose
# reset() returns (observation, info) and whose step() returns a 5-tuple.
import gymnasium as gym
env = gym.make('CartPole-v1', render_mode='rgb_array')
action_size = env.action_space.n
state_size = env.observation_space.shape[0]

# Directory that will receive the rendered video
video_dir = './video'
os.makedirs(video_dir, exist_ok=True)

# Policy network definition
class PolicyNetwork(tf.keras.Model):
    """Policy model: two 24-unit ReLU layers followed by a softmax layer.

    The softmax layer has ``action_size`` outputs, where ``action_size`` is
    read from the enclosing module scope when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = layers.Dense(24, activation='relu')
        self.fc2 = layers.Dense(24, activation='relu')
        self.out = layers.Dense(action_size, activation='softmax')

    def call(self, state):
        """Pass ``state`` through fc1 -> fc2 -> out and return the softmax output."""
        hidden = self.fc2(self.fc1(state))
        return self.out(hidden)

# Discount factor passed to compute_returns
gamma = 0.99
# Adam optimizer that applies the policy-gradient updates
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
# The policy network being trained
policy = PolicyNetwork()

def compute_returns(rewards, gamma):
    """Compute standardized discounted returns for one episode.

    For each step t the return is ``G_t = r_t + gamma * G_{t+1}``, with the
    return after the last step taken as 0. The vector of returns is then
    shifted to zero mean and divided by its standard deviation (plus 1e-9 to
    avoid dividing by zero).

    Args:
        rewards: Sequence of per-step rewards in chronological order.
        gamma: Discount factor.

    Returns:
        1-D numpy array with one standardized return per reward. An empty
        ``rewards`` sequence yields an empty array.
    """
    # Guard: mean/std of an empty array would produce NaN and RuntimeWarnings.
    if len(rewards) == 0:
        return np.zeros(0)

    # Accumulate back-to-front with append (O(n)) and flip once at the end,
    # instead of inserting at index 0 on every step (O(n^2)).
    discounted = []
    running = 0
    for r in reversed(rewards):
        running = r + gamma * running
        discounted.append(running)
    discounted.reverse()

    returns = np.array(discounted)
    return (returns - np.mean(returns)) / (np.std(returns) + 1e-9)

# Training (REINFORCE): one policy-gradient update per completed episode
episodes = 300
reward_list = []

for episode in range(episodes):
    state, info = env.reset()
    done = False
    states, actions, rewards = [], [], []
    total_reward = 0

    # Roll out one episode, sampling actions from the current policy.
    while not done:
        state_tensor = tf.convert_to_tensor([state], dtype=tf.float32)
        action_probs = policy(state_tensor)
        action = np.random.choice(action_size, p=np.squeeze(action_probs))

        # step() returns 5 values; the episode ends on termination or truncation.
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        total_reward += reward
        state = next_state

    returns = compute_returns(rewards, gamma)

    # Batch the whole episode so the loss needs a single forward pass instead
    # of one model call per timestep.
    states_batch = tf.convert_to_tensor(np.asarray(states), dtype=tf.float32)
    returns_batch = tf.convert_to_tensor(returns, dtype=tf.float32)
    action_mask = tf.one_hot(actions, int(action_size), dtype=tf.float32)

    with tf.GradientTape() as tape:
        probs = policy(states_batch)
        # Probability assigned to the action actually taken at each step.
        chosen_probs = tf.reduce_sum(probs * action_mask, axis=1)
        log_probs = tf.math.log(chosen_probs + 1e-9)
        # Mean over the episode of -log pi(a_t | s_t) * G_t.
        loss = -tf.reduce_mean(log_probs * returns_batch)

    grads = tape.gradient(loss, policy.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy.trainable_variables))

    reward_list.append(total_reward)
    # Report the mean episode reward over the most recent 50 episodes.
    if (episode + 1) % 50 == 0:
        print(f"Episode {episode + 1}, Avg Reward: {np.mean(reward_list[-50:]):.2f}")

# ✅ Post-training visualization (record one episode and save it as a video)
frames = []
state, info = env.reset()
done = False

while not done:
    # Capture the current frame before acting.
    frames.append(env.render())
    state_tensor = tf.convert_to_tensor([state], dtype=tf.float32)
    action_probs = policy(state_tensor)
    # Act greedily: take the most probable action instead of sampling.
    action = np.argmax(np.squeeze(action_probs))
    state, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated

# The rollout is finished; release the environment's rendering resources.
env.close()

# Save as mp4
video_path = os.path.join(video_dir, 'reinforce_cartpole.mp4')
imageio.mimsave(video_path, frames, fps=30)

# Play back in Colab (kept as the last expression so the cell displays it)
from IPython.display import Video
Video(video_path, embed=True)

Episode 50, Avg Reward: 18.80
Episode 100, Avg Reward: 24.18
Episode 150, Avg Reward: 30.08
Episode 200, Avg Reward: 42.40
Episode 250, Avg Reward: 54.00
Episode 300, Avg Reward: 49.64


