<a href="https://colab.research.google.com/github/juhumkwon/DeepLearning/blob/main/RL_02_(12)_%EC%A0%95%EC%B1%85%EA%B8%B0%EB%B0%98_vs_%EA%B0%80%EC%B9%98%EA%B8%B0%EB%B0%98.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 📌 라이브러리 설치 (코랩에서 처음 실행 시 필요)
!pip install gymnasium gym-notebook-wrapper tensorflow matplotlib --quiet

from collections import deque

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

# -----------------------
# Policy-based (REINFORCE)
# -----------------------
class PolicyNet(tf.keras.Model):
    """Small MLP policy: two 24-unit ReLU layers followed by a softmax head.

    The output has ``action_size`` entries per input row; the softmax makes
    each row a probability distribution over actions.
    """

    def __init__(self, action_size):
        super().__init__()
        dense = tf.keras.layers.Dense
        # Layers are created in the same order they are applied in call().
        self.d1 = dense(24, activation='relu')
        self.d2 = dense(24, activation='relu')
        self.out = dense(action_size, activation='softmax')

    def call(self, x):
        """Return action probabilities for the batch of states ``x``."""
        hidden = self.d2(self.d1(x))
        return self.out(hidden)

class REINFORCEAgent:
    """Monte-Carlo policy-gradient (REINFORCE) agent.

    Transitions of the current episode are buffered with ``store`` and
    consumed by ``train``, which performs one gradient step on the policy
    network and then clears the buffers.

    Args:
        action_size: Number of discrete actions.
        lr: Learning rate passed to the Adam optimizer.
        gamma: Discount factor used when accumulating returns.
    """

    def __init__(self, action_size, lr=0.01, gamma=0.99):
        self.action_size = action_size
        self.gamma = gamma
        self.model = PolicyNet(action_size)
        self.optimizer = tf.keras.optimizers.Adam(lr)
        # Per-episode buffers; reset at the end of every train() call.
        self.states, self.actions, self.rewards = [], [], []

    def get_action(self, state):
        """Sample an action from the policy's distribution for ``state``."""
        state = state.reshape([1, -1])
        probs = self.model(state)
        return np.random.choice(self.action_size, p=probs.numpy()[0])

    def store(self, state, action, reward):
        """Append one step of the current episode to the buffers."""
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)

    def _normalized_returns(self):
        """Return the standardized discounted return of every stored step."""
        step_count = len(self.rewards)
        returns = np.empty(step_count, dtype=np.float64)
        running = 0.0
        # Fill back-to-front so each return reuses the one after it; this
        # avoids the quadratic cost of inserting at the head of a list.
        for t in range(step_count - 1, -1, -1):
            running = self.rewards[t] + self.gamma * running
            returns[t] = running
        # Standardizing keeps the gradient scale comparable across episodes;
        # the epsilon guards against a zero standard deviation.
        return (returns - returns.mean()) / (returns.std() + 1e-8)

    def train(self):
        """Run one policy-gradient update on the buffered episode.

        Does nothing when no step has been stored, so an empty episode
        cannot produce NaN returns or a meaningless optimizer step.
        """
        if not self.rewards:
            return

        returns = self._normalized_returns().astype(np.float32)
        step_count = len(self.states)
        states = np.asarray(self.states, dtype=np.float32).reshape(step_count, -1)
        actions = np.asarray(self.actions, dtype=np.int32)
        # One-hot mask that selects, per row, the probability of the action
        # that was actually taken.
        action_mask = tf.one_hot(actions, int(self.action_size))

        with tf.GradientTape() as tape:
            # A single batched forward pass replaces one pass per step.
            probs = self.model(states)
            action_probs = tf.reduce_sum(probs * action_mask, axis=1)
            # REINFORCE loss: sum over steps of -log pi(a_t | s_t) * G_t.
            loss = -tf.reduce_sum(tf.math.log(action_probs) * returns)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        self.states, self.actions, self.rewards = [], [], []

# -----------------------
# Value-based (DQN)
# -----------------------
class QNet(tf.keras.Model):
    """Small MLP Q-function: two 24-unit ReLU layers and a linear output.

    The output has ``action_size`` entries per input row, one value per
    action, with no activation applied.
    """

    def __init__(self, action_size):
        super().__init__()
        layers = tf.keras.layers
        hidden_units = 24
        self.d1 = layers.Dense(hidden_units, activation='relu')
        self.d2 = layers.Dense(hidden_units, activation='relu')
        self.out = layers.Dense(action_size)

    def call(self, x):
        """Return the per-action values for the batch of states ``x``."""
        features = self.d1(x)
        features = self.d2(features)
        q_values = self.out(features)
        return q_values

class DQNAgent:
    """Epsilon-greedy Q-learning agent with an experience-replay buffer.

    Args:
        action_size: Number of discrete actions.
        lr: Learning rate passed to the Adam optimizer.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial probability of taking a random action.
        epsilon_decay: Factor applied to ``epsilon`` after each training step.
        epsilon_min: Lower bound for ``epsilon``.
        memory_size: Maximum number of transitions kept in the replay buffer;
            the oldest transition is dropped when the buffer is full.
    """

    def __init__(self, action_size, lr=0.001, gamma=0.99, epsilon=1.0,
                 epsilon_decay=0.995, epsilon_min=0.01, memory_size=2000):
        self.action_size = action_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.model = QNet(action_size)
        self.optimizer = tf.keras.optimizers.Adam(lr)
        # A bounded deque evicts the oldest entry in O(1) once it is full.
        self.memory = deque(maxlen=memory_size)

    def get_action(self, state):
        """Return a random action with probability epsilon, else the greedy one."""
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.action_size)
        q_values = self.model(state.reshape([1, -1]))
        return np.argmax(q_values.numpy()[0])

    def store(self, state, action, reward, next_state, done):
        """Add one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def train(self, batch_size=32):
        """Run one gradient step on a random mini-batch from the buffer.

        Returns without doing anything until the buffer holds at least
        ``batch_size`` transitions. Epsilon is decayed after each step that
        actually trains.
        """
        if len(self.memory) < batch_size:
            return

        picks = np.random.choice(len(self.memory), batch_size, replace=False)
        batch = [self.memory[int(i)] for i in picks]
        states, actions, rewards, next_states, dones = zip(*batch)

        states = np.asarray(states, dtype=np.float32).reshape(batch_size, -1)
        next_states = np.asarray(next_states, dtype=np.float32).reshape(batch_size, -1)
        actions = np.asarray(actions, dtype=np.int64)
        rewards = np.asarray(rewards, dtype=np.float32)
        # 1.0 where the episode continued, 0.0 where it ended: ended
        # transitions must not bootstrap from the next state.
        not_done = 1.0 - np.asarray(dones, dtype=np.float32)

        # Two batched forward passes replace two passes per sampled transition.
        next_q = self.model(next_states).numpy()
        targets = self.model(states).numpy()
        # Only the entry of the action that was taken is moved to the
        # target; the other entries keep the current prediction so they
        # contribute no error.
        targets[np.arange(batch_size), actions] = (
            rewards + self.gamma * not_done * next_q.max(axis=1)
        )

        with tf.GradientTape() as tape:
            q_preds = self.model(states)
            # MSE yields one value per sample; the tape differentiates
            # their sum.
            loss = tf.keras.losses.MSE(targets, q_preds)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

        # Clamp so the decay can never push epsilon below its floor.
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

# -----------------------
# Training run
# -----------------------
EPISODES = 300
env = gym.make("CartPole-v1")

reinforce_agent = REINFORCEAgent(action_size=env.action_space.n)
dqn_agent = DQNAgent(action_size=env.action_space.n)

reinforce_scores = []
dqn_scores = []

for ep in range(EPISODES):
    # REINFORCE: play one full episode, then do a single update on it.
    state, _ = env.reset()
    done, score = False, 0
    while not done:
        action = reinforce_agent.get_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # Gymnasium reports failure (terminated) and the time limit
        # (truncated) separately; the episode is over when either is set.
        done = terminated or truncated
        reinforce_agent.store(state, action, reward)
        state = next_state
        score += reward
    reinforce_agent.train()
    reinforce_scores.append(score)

    # DQN: learn from replayed transitions after every environment step.
    state, _ = env.reset()
    done, score = False, 0
    while not done:
        action = dqn_agent.get_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        # Store only true termination as the "done" flag: a time-limit
        # cut-off is not a terminal state, so its target should still
        # bootstrap from next_state.
        dqn_agent.store(state, action, reward, next_state, terminated)
        # One gradient step per environment step; a single step per episode
        # gives the Q-network far too few updates to learn.
        dqn_agent.train()
        state = next_state
        score += reward
    dqn_scores.append(score)

    if (ep+1) % 20 == 0:
        print(f"Episode {ep+1}: REINFORCE Avg={np.mean(reinforce_scores[-20:]):.1f}, "
              f"DQN Avg={np.mean(dqn_scores[-20:]):.1f}")

# Release the environment's resources once training is finished.
env.close()

# -----------------------
# Plotting
# -----------------------
def smooth(data, window=10):
    """Return the moving average of ``data`` over ``window`` consecutive points.

    Args:
        data: One-dimensional sequence of numbers.
        window: Number of consecutive points averaged together. When it is
            larger than ``len(data)`` it is reduced to ``len(data)``, so the
            result is the plain mean instead of a mis-scaled sum.

    Returns:
        A NumPy array of length ``len(data) - window + 1`` (after clamping
        ``window``), or an empty array when ``data`` is empty.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return values
    # np.convolve(..., mode='valid') silently swaps its operands when the
    # kernel is longer than the data, so clamp the window first.
    window = min(window, values.size)
    return np.convolve(values, np.ones(window) / window, mode='valid')

# Draw both smoothed learning curves on one figure for a side-by-side comparison.
plt.figure(figsize=(10, 5))
curves = (
    (reinforce_scores, "Policy-based (REINFORCE)"),
    (dqn_scores, "Value-based (DQN)"),
)
for episode_scores, curve_label in curves:
    plt.plot(smooth(episode_scores), label=curve_label)
plt.title("Policy-based vs Value-based Performance on CartPole")
plt.xlabel("Episode")
plt.ylabel("Score")
plt.legend()
plt.show()

Episode 20: REINFORCE Avg=19.9, DQN Avg=20.7
Episode 40: REINFORCE Avg=66.0, DQN Avg=19.8
Episode 60: REINFORCE Avg=116.8, DQN Avg=18.4
Episode 80: REINFORCE Avg=328.8, DQN Avg=19.1
Episode 100: REINFORCE Avg=197.8, DQN Avg=15.9
