<a href="https://colab.research.google.com/github/juhumkwon/DeepLearning/blob/main/RL_03_02_critic_TD_%EC%98%A4%EC%B0%A8%ED%95%99%EC%8A%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np

# Minimal critic (state-value) model definition
class Critic(tf.keras.Model):
    """Small value network: Dense(16, ReLU) followed by one linear unit.

    The single output unit is used by this script as the state-value
    estimate V(s).
    """

    def __init__(self):
        super().__init__()
        # Hidden layer with 16 ReLU units.
        self.d1 = tf.keras.layers.Dense(16, activation='relu')
        # Linear head with one unit: the V(s) output.
        self.out = tf.keras.layers.Dense(1)

    def call(self, x):
        """Run ``x`` through the hidden layer and the value head."""
        hidden = self.d1(x)
        value = self.out(hidden)
        return value

# Hyperparameters
gamma = 0.99  # discount factor applied to V(s') in the TD target
critic = Critic()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

# One hand-written sample transition (s, r, s', done).
# The states are 4-dimensional (e.g. a CartPole observation), each stored as a
# single-row float32 batch.
state = np.asarray([[0.0, 0.1, 0.0, 0.05]], dtype=np.float32)
next_state = np.asarray([[0.0, 0.2, 0.0, -0.03]], dtype=np.float32)
reward, done = 1.0, False  # done: whether the episode has terminated

# Training step: a single semi-gradient TD(0) update of the critic.
with tf.GradientTape() as tape:
    v = critic(state)[0, 0]            # V(s): value of the current state
    v_next = critic(next_state)[0, 0]  # V(s'): value of the next state

    # TD target: r + gamma * V(s'), with the bootstrap term zeroed when the
    # episode has ended. Using zeros_like keeps `target` a tensor in both
    # branches, so `target.numpy()` below also works when done is True.
    bootstrap = tf.zeros_like(v_next) if done else gamma * v_next

    # The target is treated as a constant: stop_gradient prevents the loss
    # gradient from flowing back through V(s'), so only V(s) is moved toward
    # the target (semi-gradient TD rather than residual gradient).
    target = tf.stop_gradient(reward + bootstrap)
    td_error = target - v              # TD error (delta)

    # Critic loss = squared TD error
    critic_loss = td_error**2

# Update the critic with one gradient-descent step on the loss.
grads = tape.gradient(critic_loss, critic.trainable_variables)
optimizer.apply_gradients(zip(grads, critic.trainable_variables))

# Values reported here come from the forward pass above, i.e. before the update.
print(f"V(s)        = {v.numpy():.4f}")
print(f"V(s')       = {v_next.numpy():.4f}")
print(f"Target      = {target.numpy():.4f}")
print(f"TD Error δ  = {td_error.numpy():.4f}")
print(f"Loss (δ²)   = {critic_loss.numpy():.6f}")


V(s)        = -0.0279
V(s')       = -0.0236
Target      = 0.9766
TD Error δ  = 1.0045
Loss (δ²)   = 1.009023
