<a href="https://colab.research.google.com/github/juhumkwon/source_code/blob/main/Deep_Q_Network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random

# Environment: a 1-D grid world. The agent starts in cell 0 and the goal is
# the right-most cell (cell 4 for the default length of 5).
class Environment:
    """One-dimensional grid world with a single rewarding goal cell.

    The agent moves left or right along ``state_space`` cells. Reaching the
    right-most cell yields a reward of 1 and ends the episode; every other
    transition yields a reward of 0.
    """

    def __init__(self, state_space=5):
        """Create the grid.

        Args:
            state_space: Number of cells in the grid. Must be at least 2 so
                that the start cell (0) differs from the goal cell.

        Raises:
            ValueError: If ``state_space`` is smaller than 2.
        """
        if state_space < 2:
            raise ValueError("state_space must be at least 2")
        self.state_space = state_space  # size of the state space
        self.action_space = 2  # size of the action space (0: left, 1: right)
        # Derive the goal from the grid length instead of hard-coding cell 4.
        self.goal_state = state_space - 1
        # Initialise the position here so step() is usable before reset().
        self.state = 0

    def reset(self):
        """Move the agent back to the start cell and return that state."""
        self.state = 0  # start state
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        Action 0 moves one cell left; any other value moves one cell right.
        Moves are clamped to the grid boundaries.
        """
        if action == 0:  # move left
            next_state = max(0, self.state - 1)
        else:  # move right
            next_state = min(self.state_space - 1, self.state + 1)

        done = next_state == self.goal_state
        reward = 1 if done else 0
        self.state = next_state
        return next_state, reward, done

# Build the DQN model
def build_model(state_space, action_space):
    """Build and compile the Q-network.

    The network takes a vector of length ``state_space`` (the callers in this
    file feed one-hot encoded states) and outputs one linear Q-value per
    action.

    Args:
        state_space: Length of the input vector.
        action_space: Number of actions, i.e. number of output units.

    Returns:
        A compiled Keras ``Sequential`` model (Adam, lr=0.001, MSE loss).
    """
    model = Sequential([
        # Declare the input explicitly; passing input_dim to the first Dense
        # layer is deprecated for Sequential models and emits a UserWarning.
        tf.keras.Input(shape=(state_space,)),
        Dense(24, activation='relu'),
        Dense(24, activation='relu'),
        Dense(action_space, activation='linear'),
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model

# DQN agent
class DQNAgent:
    """Epsilon-greedy agent that learns Q-values with a small neural network.

    States are integer indices that are one-hot encoded before being fed to
    the model. Transitions are stored in ``memory`` and sampled uniformly at
    random for training in ``replay``.
    """

    def __init__(self, state_space, action_space):
        """Create the agent and its Q-network.

        Args:
            state_space: Number of discrete states (one-hot input length).
            action_space: Number of discrete actions (network outputs).
        """
        self.state_space = state_space
        self.action_space = action_space
        self.model = build_model(state_space, action_space)
        self.epsilon = 1.0  # current exploration rate
        self.epsilon_decay = 0.995  # multiplicative decay applied per replay
        self.epsilon_min = 0.01  # decay stops once epsilon is at or below this
        self.discount_factor = 0.95
        self.memory = []  # list of (state, action, reward, next_state, done)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for ``state`` using an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.action_space)
        q_values = self.model.predict(self._one_hot([state]), verbose=0)
        return np.argmax(q_values[0])

    def replay(self, batch_size):
        """Train on a random minibatch of stored transitions.

        Does nothing until at least ``batch_size`` transitions are stored.
        The whole minibatch is evaluated and fitted in batched model calls
        (two predicts and one fit) rather than three calls per transition.
        After training, ``epsilon`` is decayed if it is still above
        ``epsilon_min``.

        Args:
            batch_size: Number of transitions to sample.
        """
        if len(self.memory) < batch_size:
            return
        batch = random.sample(self.memory, batch_size)
        if batch:  # batch_size == 0 yields nothing to train on
            states, actions, rewards, next_states, dones = zip(*batch)
            state_inputs = self._one_hot(states)
            next_inputs = self._one_hot(next_states)

            next_q = self.model.predict(next_inputs, verbose=0)
            targets = self.model.predict(state_inputs, verbose=0)

            rewards = np.asarray(rewards, dtype=float)
            dones = np.asarray(dones, dtype=bool)
            # Bellman target: terminal transitions use the reward alone.
            bootstrapped = rewards + self.discount_factor * next_q.max(axis=1)
            updated = np.where(dones, rewards, bootstrapped)

            # Only the Q-value of the action actually taken is changed; the
            # other outputs keep their predicted values (zero error).
            rows = np.arange(len(batch))
            targets[rows, np.asarray(actions, dtype=int)] = updated

            # One gradient step over the whole sampled minibatch.
            self.model.fit(state_inputs, targets, batch_size=len(batch),
                           epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def _one_hot(self, states):
        """Return a (len(states), state_space) one-hot array for the indices."""
        return np.identity(self.state_space)[np.asarray(states, dtype=int)]

# Run training
env = Environment()
# Take the dimensions from the environment so the agent cannot drift out of
# sync with it.
agent = DQNAgent(state_space=env.state_space, action_space=env.action_space)

episodes = 100
batch_size = 32
# Safety cap: without it, a greedy policy that keeps pushing against the left
# wall could keep an episode running indefinitely.
max_steps_per_episode = 1000
for episode in range(episodes):
    state = env.reset()
    total_reward = 0

    for _ in range(max_steps_per_episode):
        action = agent.act(state)
        next_state, reward, done = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        if done:
            break

    # Train once per episode on a random sample of stored transitions.
    agent.replay(batch_size)
    print(f"Episode: {episode+1}, Total reward: {total_reward}, Epsilon: {agent.epsilon:.2f}")

print("\nTraining finished.")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Episode: 1, Total reward: 1, Epsilon: 1.00
Episode: 2, Total reward: 1, Epsilon: 1.00
Episode: 3, Total reward: 1, Epsilon: 0.99
Episode: 4, Total reward: 1, Epsilon: 0.99
Episode: 5, Total reward: 1, Epsilon: 0.99
Episode: 6, Total reward: 1, Epsilon: 0.98
Episode: 7, Total reward: 1, Epsilon: 0.98
Episode: 8, Total reward: 1, Epsilon: 0.97
Episode: 9, Total reward: 1, Epsilon: 0.97
Episode: 10, Total reward: 1, Epsilon: 0.96
Episode: 11, Total reward: 1, Epsilon: 0.96
Episode: 12, Total reward: 1, Epsilon: 0.95
Episode: 13, Total reward: 1, Epsilon: 0.95
Episode: 14, Total reward: 1, Epsilon: 0.94
Episode: 15, Total reward: 1, Epsilon: 0.94
Episode: 16, Total reward: 1, Epsilon: 0.93
Episode: 17, Total reward: 1, Epsilon: 0.93
Episode: 18, Total reward: 1, Epsilon: 0.92
Episode: 19, Total reward: 1, Epsilon: 0.92
Episode: 20, Total reward: 1, Epsilon: 0.91
Episode: 21, Total reward: 1, Epsilon: 0.91
Episode: 22, Total reward: 1, Epsilon: 0.90
Episode: 23, Total reward: 1, Epsilon: 0.