<a href="https://colab.research.google.com/github/juhumkwon/DeepLearning/blob/main/RL_02_03_reinforce_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import gym # Deprecated
import gymnasium as gym # Use gymnasium instead of gym
import numpy as np
import tensorflow as tf

# ----------------------------
# Environment setup
# ----------------------------
# CartPole-v1 created through Gymnasium (imported above under the name `gym`).
env = gym.make("CartPole-v1")
# Length of the observation vector and number of discrete actions,
# both read from the environment's declared spaces.
state_dim, action_dim = env.observation_space.shape[0], env.action_space.n

# ----------------------------
# Policy network definition
# ----------------------------
class PolicyNet(tf.keras.Model):
    """Small MLP policy: two ReLU hidden layers followed by a logits layer.

    The final layer has no activation, so the model returns unnormalized
    logits; callers apply softmax (or a logits-based op) themselves.

    Args:
        num_actions: Number of units in the output (logits) layer. When
            omitted, the module-level ``action_dim`` is used, so the
            zero-argument construction ``PolicyNet()`` keeps working.
        hidden_units: Number of units in each of the two hidden layers.
    """

    def __init__(self, *, num_actions=None, hidden_units=24):
        super().__init__()
        if num_actions is None:
            # Fall back to the action count read from the environment.
            num_actions = action_dim
        self.d1 = tf.keras.layers.Dense(hidden_units, activation='relu')
        self.d2 = tf.keras.layers.Dense(hidden_units, activation='relu')
        self.logits = tf.keras.layers.Dense(num_actions)  # logits output

    def call(self, x):
        """Run the forward pass and return one row of logits per input row."""
        x = self.d1(x)
        x = self.d2(x)
        return self.logits(x)

# Instantiate the policy network and the Adam optimizer (learning rate 0.01)
# that applies the gradient updates computed after each episode.
policy = PolicyNet()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

# ----------------------------
# Action selection
# ----------------------------
def select_action(state):
    """Sample an action from the current policy for a single state.

    Args:
        state: One observation without a batch axis; a leading batch axis
            is added before it is fed to the global ``policy`` network.

    Returns:
        A tuple ``(action, action_prob)``: ``action`` is the sampled action
        index as a NumPy integer scalar, and ``action_prob`` is the tensor
        of softmax probabilities over the actions for this state.
    """
    state = tf.expand_dims(state, 0)  # add batch dimension
    logits = policy(state)
    action_prob = tf.nn.softmax(logits)
    # tf.random.categorical takes unnormalized log-probabilities, so the raw
    # logits are passed directly. log(softmax(logits)) defines the same
    # distribution but becomes -inf when a probability underflows to 0.
    action = tf.random.categorical(logits, 1)[0, 0].numpy()
    return action, action_prob[0]

# ----------------------------
# Run episodes and train
# ----------------------------
gamma = 0.99  # discount factor

for episode in range(3):  # kept short for demonstration
    state, info = env.reset()  # Gymnasium reset returns (observation, info)
    done = False
    states, actions, rewards = [], [], []

    while not done:
        # Only the sampled action is needed here; the probabilities are
        # recomputed under the gradient tape during the update.
        action, _ = select_action(state)
        # Gymnasium step returns 5 values; the episode ends on either flag.
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Record the transition
        states.append(state)
        actions.append(action)
        rewards.append(reward)

        state = next_state

    # ----------------------------
    # REINFORCE update after the episode ends
    # ----------------------------
    # Discounted returns G_t = r_t + gamma * G_{t+1}, filled back-to-front
    # into a preallocated array (repeated list.insert(0, ...) is O(n^2)).
    returns = np.empty(len(rewards), dtype=np.float32)
    G = 0.0
    for t in reversed(range(len(rewards))):
        G = rewards[t] + gamma * G
        returns[t] = G
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)  # normalize

    # Stack the episode into batches so the policy runs once, not per step.
    state_batch = tf.constant(np.asarray(states, dtype=np.float32))
    action_batch = tf.constant(np.asarray(actions, dtype=np.int32))
    return_batch = tf.constant(returns)

    with tf.GradientTape() as tape:
        logits = policy(state_batch)
        # Sparse softmax cross-entropy equals -log pi(a_t | s_t), computed
        # from logits in a numerically stable way.
        neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=action_batch, logits=logits
        )
        # Gradient ascent on sum(log_prob * G_t) == minimizing this sum.
        loss = tf.reduce_sum(neg_log_prob * return_batch)

    grads = tape.gradient(loss, policy.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy.trainable_variables))

    print(f"Episode {episode+1}: total reward = {sum(rewards)}")

Episode 1: total reward = 22.0
Episode 2: total reward = 15.0
Episode 3: total reward = 96.0
