<a href="https://colab.research.google.com/github/juhumkwon/Defense_Cloud/blob/main/DQN_%EA%B8%B0%EB%B0%98_FrozenLake_%EC%98%88%EC%A0%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import numpy as np
import tensorflow as tf
import random
from collections import deque

# Environment setup: FrozenLake created with is_slippery=False (the non-slippery variant).
env = gym.make("FrozenLake-v1", is_slippery=False)
# `.n` of the observation space; used below as the width of the one-hot state encoding.
num_states = env.observation_space.n
# `.n` of the action space; used below as the number of Q-value outputs.
num_actions = env.action_space.n

def one_hot(state, size):
    """Return a float32 vector of length `size` that is 1.0 at index `state`.

    Every other entry is 0.0.
    """
    encoded = np.zeros((size,), dtype=np.float32)
    encoded[state] = 1.0
    return encoded

# Q-network model
def build_model():
    """Build a fresh Q-network.

    The network takes an input of width `num_states`, passes it through one
    hidden Dense layer of 64 ReLU units, and emits `num_actions` linear
    outputs (one Q-value per action).
    """
    keras_layers = tf.keras.layers
    stack = [
        keras_layers.Input(shape=(num_states,)),
        keras_layers.Dense(64, activation='relu'),
        keras_layers.Dense(num_actions),
    ]
    return tf.keras.Sequential(stack)

# Online network: the one updated by the optimizer in the training loop.
model = build_model()
# Target network: used in the training loop to evaluate next-state Q-values.
target_model = build_model()
# Start the target network with the same weights as the online network.
target_model.set_weights(model.get_weights())

# Hyperparameters
lr = 0.01  # learning rate passed to the Adam optimizer
gamma = 0.99  # discount factor applied to the bootstrapped next-state value
epsilon = 1.0  # initial exploration probability for ε-greedy action selection
epsilon_min = 0.1  # threshold below which epsilon is no longer decayed
epsilon_decay = 0.995  # multiplicative factor applied to epsilon once per episode
episodes = 3000  # number of training episodes
batch_size = 32  # number of transitions sampled from the replay buffer per update
buffer = deque(maxlen=10000)  # replay buffer; the deque drops the oldest items past maxlen
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
loss_fn = tf.keras.losses.MeanSquaredError()  # loss between target and predicted Q-values

# ε-greedy action selection
def epsilon_greedy(q_vals, eps):
    """Choose an action index with the ε-greedy rule.

    Args:
        q_vals: Q-values for the current state, one entry per action.
        eps: Probability of picking a uniformly random action.

    Returns:
        A plain Python ``int``: a random action in ``[0, num_actions)`` with
        probability ``eps``, otherwise the index of the largest Q-value.
    """
    if random.random() < eps:
        return random.randrange(num_actions)
    # np.argmax yields a NumPy integer; convert so both branches return the
    # same type.
    return int(np.argmax(q_vals))

# Training loop
for ep in range(episodes):
    # reset() returns a tuple; the first element is the initial observation.
    state = env.reset()[0]
    done = False
    total_reward = 0

    while not done:
        # Pick an action ε-greedily from the online network's Q-values.
        s_vec = one_hot(state, num_states).reshape(1, -1)
        q_vals = model(s_vec, training=False).numpy()[0]
        action = epsilon_greedy(q_vals, epsilon)

        next_state, reward, terminated, truncated, _ = env.step(action)
        # The episode loop stops on either signal ...
        done = terminated or truncated
        # ... but only `terminated` is stored: a truncation (e.g. a step
        # limit) is not a terminal state, so its target must still bootstrap
        # from next_state.
        buffer.append((state, action, reward, next_state, terminated))
        state = next_state
        total_reward += reward

        # One gradient step per environment step once a full batch is available.
        if len(buffer) >= batch_size:
            minibatch = random.sample(buffer, batch_size)
            states = np.array([one_hot(s, num_states) for s, _, _, _, _ in minibatch])
            next_states = np.array([one_hot(s_, num_states) for _, _, _, s_, _ in minibatch])
            actions = np.array([a for _, a, _, _, _ in minibatch])
            rewards = np.array([r for _, _, r, _, _ in minibatch], dtype=np.float32)
            terminals = np.array([d for _, _, _, _, d in minibatch], dtype=np.float32)

            # Start from the online network's own predictions so that only the
            # taken action's entry contributes to the loss.
            q_targets = model(states).numpy()
            next_qs = target_model(next_states).numpy()

            # Bellman target: r for terminal transitions, otherwise
            # r + gamma * max_a' Q_target(s', a').
            q_targets[np.arange(batch_size), actions] = (
                rewards + gamma * (1.0 - terminals) * next_qs.max(axis=1)
            )

            with tf.GradientTape() as tape:
                q_preds = model(states, training=True)
                loss = loss_fn(q_targets, q_preds)
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Sync the target network with the online network every 20 episodes.
    if ep % 20 == 0:
        target_model.set_weights(model.get_weights())

    # Decay exploration once per episode, never dropping below epsilon_min.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    if (ep + 1) % 500 == 0:
        print(f"Episode {ep + 1}, epsilon: {epsilon:.3f}, reward: {total_reward}")


Episode 500, epsilon: 0.100, reward: 0.0
Episode 1000, epsilon: 0.100, reward: 0.0
Episode 1500, epsilon: 0.100, reward: 0.0
Episode 2000, epsilon: 0.100, reward: 0.0
