<a href="https://colab.research.google.com/github/juhumkwon/source_code/blob/main/SimpleDQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import random

# Simple 1D environment definition
class Simple1DWorld:
    """One-dimensional corridor environment with the goal at the right end.

    The agent occupies one of ``state_size`` cells. Each episode starts in
    the middle cell and finishes when the agent stands on the last cell
    (index ``state_size - 1``), which pays a reward of 10. Every other step
    pays 0.

    Args:
        state_size: Number of cells in the corridor. Defaults to 5, which
            places the start at index 2.

    Raises:
        ValueError: If ``state_size`` is smaller than 1.
    """

    def __init__(self, state_size=5):
        if state_size < 1:
            raise ValueError(f"state_size must be >= 1, got {state_size}")
        self.state_size = state_size
        self.reset()

    def reset(self):
        """Put the agent back on the middle cell and return that state."""
        # Derived from the corridor length so the start stays in range for
        # any size; evaluates to 2 for the default five-cell corridor.
        self.pos = self.state_size // 2
        return self._get_state()

    def _get_state(self):
        """Return the position as a one-hot array of shape (1, state_size)."""
        state = np.zeros(self.state_size)
        state[self.pos] = 1
        # Return a 2D array (a single row) rather than a flat vector.
        return state.reshape(1, -1)

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        Args:
            action: 0 moves one cell to the left, 1 moves one cell to the
                right. A move that would leave the corridor, or any other
                action value, leaves the position unchanged.

        Returns:
            A tuple of the new one-hot state (shape ``(1, state_size)``),
            the reward (10 on the last cell, otherwise 0) and a flag that
            is true once the agent is on the last cell.
        """
        if action == 0 and self.pos > 0:
            self.pos -= 1   # move left
        elif action == 1 and self.pos < self.state_size - 1:
            self.pos += 1   # move right

        done = self.pos == self.state_size - 1
        reward = 10 if done else 0
        return self._get_state(), reward, done

# Q-Network model definition
def build_model(input_dim, output_dim):
    """Build and compile the fully connected Q-network.

    The network feeds a state vector through two hidden ReLU layers of 32
    units each and a final linear layer with ``output_dim`` units.

    Args:
        input_dim: Length of the input state vector.
        output_dim: Number of output units (one Q-value per action).

    Returns:
        A Keras ``Sequential`` model compiled with the Adam optimizer and
        mean-squared-error loss.
    """
    model = models.Sequential([
        # Declare the input shape with an explicit Input layer: passing
        # ``input_shape`` to the first Dense layer is deprecated in Keras 3
        # and emits a UserWarning on every model construction.
        layers.Input(shape=(input_dim,)),
        layers.Dense(32, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(output_dim),
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

# DQN training function
def train_dqn(episodes=50, max_steps=200):
    """Train a DQN agent on ``Simple1DWorld`` and return the Q-network.

    Runs epsilon-greedy episodes, stores every transition in a bounded
    replay buffer, and fits the online network on a random minibatch after
    each step once the buffer holds at least one batch. Bootstrapped
    targets come from a separate target network that is synchronised with
    the online network every 10 episodes.

    Args:
        episodes: Number of training episodes. Defaults to 50.
        max_steps: Upper bound on steps per episode. Guards against an
            endless episode when the (mostly greedy) policy never reaches
            the goal. Defaults to 200.

    Returns:
        The trained online Q-network (a compiled Keras model).
    """
    # Local import: only this function needs it.
    from collections import deque

    env = Simple1DWorld()
    state_size = env.state_size
    action_size = 2
    q_model = build_model(state_size, action_size)
    target_model = build_model(state_size, action_size)
    target_model.set_weights(q_model.get_weights())

    # Bounded replay buffer: once full, the oldest transitions are dropped
    # so memory use stays constant however long training runs.
    memory = deque(maxlen=2000)
    gamma = 0.95
    epsilon = 1.0
    epsilon_min = 0.01
    epsilon_decay = 0.99
    batch_size = 32
    target_update_interval = 10
    batch_rows = np.arange(batch_size)  # row index for vectorised updates

    for ep in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0

        while not done and steps < max_steps:
            steps += 1

            # Action selection (epsilon-greedy).
            if np.random.rand() < epsilon:
                action = np.random.choice(action_size)
            else:
                q_values = q_model.predict(state, verbose=0)
                action = np.argmax(q_values[0])

            # Log the agent's position and the chosen action.
            print(f"[EP {ep+1}] pos: {env.pos}, action: {action}")

            # Environment response. A step-limit truncation is deliberately
            # not stored as ``done`` so the target still bootstraps from
            # the next state.
            next_state, reward, done = env.step(action)
            memory.append((state, action, reward, next_state, done))
            state = next_state
            total_reward += reward

            # Learning step on a random minibatch of stored transitions.
            if len(memory) >= batch_size:
                minibatch = random.sample(memory, batch_size)
                states = np.vstack([t[0] for t in minibatch])
                actions = np.array([t[1] for t in minibatch])
                rewards = np.array([t[2] for t in minibatch], dtype=np.float32)
                next_states = np.vstack([t[3] for t in minibatch])
                dones = np.array([t[4] for t in minibatch], dtype=bool)

                # Start from the current predictions so only the Q-value of
                # the action actually taken contributes to the loss.
                targets = q_model.predict(states, verbose=0)
                next_qs = target_model.predict(next_states, verbose=0)

                # Bellman target: r for terminal transitions, otherwise
                # r + gamma * max_a' Q_target(s', a').
                bootstrapped = rewards + gamma * next_qs.max(axis=1)
                targets[batch_rows, actions] = np.where(
                    dones, rewards, bootstrapped
                )

                q_model.fit(states, targets, epochs=1, verbose=0)

        # Periodically synchronise the target network.
        if ep % target_update_interval == 0:
            target_model.set_weights(q_model.get_weights())

        # Decay epsilon, clamped so it never drops below the floor.
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

        print(f"Episode {ep+1}: Total reward = {total_reward}, Epsilon = {epsilon:.3f}")

    return q_model

# Entry point: run the training loop when this file is executed directly.
if __name__ == '__main__':
    train_dqn()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[EP 1] pos: 2, action: 0
[EP 1] pos: 1, action: 0
[EP 1] pos: 0, action: 1
[EP 1] pos: 1, action: 1
[EP 1] pos: 2, action: 0
[EP 1] pos: 1, action: 1
[EP 1] pos: 2, action: 0
[EP 1] pos: 1, action: 0
[EP 1] pos: 0, action: 0
[EP 1] pos: 0, action: 0
[EP 1] pos: 0, action: 1
[EP 1] pos: 1, action: 1
[EP 1] pos: 2, action: 1
[EP 1] pos: 3, action: 1
Episode 1: Total reward = 10, Epsilon = 0.990
[EP 2] pos: 2, action: 0
[EP 2] pos: 1, action: 1
[EP 2] pos: 2, action: 0
[EP 2] pos: 1, action: 0
[EP 2] pos: 0, action: 0
[EP 2] pos: 0, action: 0
[EP 2] pos: 0, action: 0
[EP 2] pos: 0, action: 0
[EP 2] pos: 0, action: 1
[EP 2] pos: 1, action: 0
[EP 2] pos: 0, action: 0
[EP 2] pos: 0, action: 0
[EP 2] pos: 0, action: 1
[EP 2] pos: 1, action: 1
[EP 2] pos: 2, action: 1
[EP 2] pos: 3, action: 0
[EP 2] pos: 2, action: 1
[EP 2] pos: 3, action: 1
Episode 2: Total reward = 10, Epsilon = 0.980
[EP 3] pos: 2, action: 1
[EP 3] pos: 3, action: 0
[EP 3] pos: 2, action: 1
[EP 3] pos: 3, action: 0
[EP 3] p