<a href="https://colab.research.google.com/github/juhumkwon/source_code/blob/main/A(14_1%EA%B0%95)DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import random
from collections import deque

# Environment settings (toy example: 1-D state vector, 3 discrete actions)
state_size = 4  # e.g. length of the state vector for a grid world
action_size = 3  # number of discrete actions
# NOTE(review): the original comment listed four directions (up, down, left,
# right) while the value is 3 -- confirm which one is intended.

# DQN hyperparameters
gamma = 0.95  # discount factor
learning_rate = 0.001  # learning rate passed to the Adam optimizer
epsilon = 1.0  # exploration rate (probability of taking a random action)
epsilon_min = 0.01  # decay stops once epsilon is no longer above this value
epsilon_decay = 0.995  # multiplicative factor applied to epsilon on each decay
batch_size = 32  # number of transitions sampled per training step
memory_size = 2000  # capacity of the replay memory
train_start = 1000  # training starts only once this many transitions are stored

# Replay memory (bounded deque: the oldest transitions are dropped when full)
memory = deque(maxlen=memory_size)

# DQN model definition
def build_model():
    """Build and compile the Q-network.

    The network maps a state vector of length ``state_size`` to one Q-value
    per action (``action_size`` linear outputs) through two hidden ReLU
    layers of 24 units each. It is compiled with mean-squared-error loss and
    an Adam optimizer using the module-level ``learning_rate``.

    Returns:
        A compiled ``tf.keras.Sequential`` model.
    """
    # Declare the input with an explicit Input object instead of passing
    # `input_dim` to the first Dense layer; the latter is deprecated for
    # Sequential models and emits a UserWarning on recent Keras versions.
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(state_size,)),
        layers.Dense(24, activation='relu'),
        layers.Dense(24, activation='relu'),
        layers.Dense(action_size, activation='linear'),
    ])
    model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate))
    return model

# Q-network: module-level instance used by select_action() and replay()
model = build_model()

# Store one experience (state, action, reward, next state, done flag)
def store_experience(state, action, reward, next_state, done):
    """Append a single transition to the module-level replay ``memory``.

    The transition is stored as the tuple
    ``(state, action, reward, next_state, done)``.
    """
    transition = (state, action, reward, next_state, done)
    memory.append(transition)

# Action selection (epsilon-greedy)
def select_action(state):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    With probability ``epsilon`` a uniformly random action index is returned
    (exploration); otherwise the action with the highest predicted Q-value
    is returned (exploitation).

    Args:
        state: Model input for a single state; the first row of the
            prediction is used.

    Returns:
        The chosen action index as a plain ``int``.
    """
    if np.random.rand() <= epsilon:
        return random.randrange(action_size)  # random action (exploration)
    # verbose=0 keeps Keras from printing a progress bar on every step.
    q_values = model.predict(state, verbose=0)
    # Cast so both branches return a built-in int rather than numpy.int64.
    return int(np.argmax(q_values[0]))

# DQN training step
def replay():
    """Run one training step on a minibatch sampled from replay memory.

    Does nothing until ``memory`` holds at least ``train_start`` transitions.
    Otherwise it samples up to ``batch_size`` transitions, builds the
    Q-learning targets, fits ``model`` on them, and then multiplies the
    global ``epsilon`` by ``epsilon_decay`` while it is still above
    ``epsilon_min``.

    The minibatch is evaluated with two batched ``predict`` calls and trained
    with a single ``fit`` call (one gradient step on the whole minibatch),
    instead of a predict/predict/fit round-trip per sampled transition.
    """
    global epsilon
    if len(memory) < train_start:
        return

    # Sample a random minibatch from the replay memory.
    minibatch = random.sample(memory, min(batch_size, len(memory)))
    states, actions, rewards, next_states, dones = zip(*minibatch)

    # Each stored state is expected to be a (1, state_size) row, so stacking
    # yields a (batch, state_size) array.
    states = np.vstack(states)
    next_states = np.vstack(next_states)
    actions = np.asarray(actions, dtype=np.intp)
    rewards = np.asarray(rewards, dtype=np.float32)
    dones = np.asarray(dones, dtype=bool)

    # Start from the current predictions so that only the Q-value of the
    # action actually taken contributes to the loss.
    targets = model.predict(states, verbose=0)
    next_q = model.predict(next_states, verbose=0)

    # Terminal transitions:     Q(s, a) = r
    # Non-terminal transitions: Q(s, a) = r + gamma * max_a' Q(s', a')
    bootstrapped = rewards + gamma * np.amax(next_q, axis=1)
    rows = np.arange(len(minibatch))
    targets[rows, actions] = np.where(dones, rewards, bootstrapped)

    # Train the network on the whole minibatch in a single update.
    model.fit(states, targets, batch_size=len(minibatch), epochs=1, verbose=0)

    # Decay the exploration rate.
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

# Example environment (arbitrary toy setup)
def get_initial_state():
    """Return the all-zero initial state with shape ``(1, state_size)``.

    The array is built from ``state_size`` rather than a hard-coded list of
    four zeros, so it stays valid if ``state_size`` is changed. It is a float
    array, matching the float states produced after the first transition.
    """
    return np.zeros((1, state_size))

def get_next_state(state, action):
    """Simulate one step of the toy environment.

    The next state is the current state plus Gaussian noise scaled by 0.1
    (a placeholder for real environment dynamics). Action 1 earns a reward
    of 1 and every other action earns -1. Each step ends the episode with
    probability 0.1.

    Returns:
        A ``(next_state, reward, done)`` tuple.
    """
    # Random drift of the state; a real environment would define this differently.
    noise = 0.1 * np.random.randn(1, state_size)
    successor = state + noise

    # Only one specific action is rewarded.
    if action == 1:
        reward = 1
    else:
        reward = -1

    # 10% chance that the episode terminates on this step.
    finished = np.random.rand() < 0.1
    return successor, reward, finished

# DQN training loop
episodes = 1000

for e in range(episodes):
    state = get_initial_state()  # reset to the initial state for each episode
    done = False
    while not done:
        action = select_action(state)  # choose an action (epsilon-greedy)
        next_state, reward, done = get_next_state(state, action)  # step the environment: next state, reward, done flag
        store_experience(state, action, reward, next_state, done)  # save the transition to replay memory
        state = next_state  # advance to the next state
        replay()  # training step; returns immediately until train_start transitions are stored

    # Progress report: episode index and current exploration rate.
    print(f"Episode: {e}/{episodes}, epsilon: {epsilon:.2f}")

# Final message (Korean for "DQN training complete.").
print("DQN 학습 완료.")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Episode: 0/1000, epsilon: 1.00
Episode: 1/1000, epsilon: 1.00
Episode: 2/1000, epsilon: 1.00
Episode: 3/1000, epsilon: 1.00
Episode: 4/1000, epsilon: 1.00
Episode: 5/1000, epsilon: 1.00
Episode: 6/1000, epsilon: 1.00
Episode: 7/1000, epsilon: 1.00
Episode: 8/1000, epsilon: 1.00
Episode: 9/1000, epsilon: 1.00
Episode: 10/1000, epsilon: 1.00
Episode: 11/1000, epsilon: 1.00
Episode: 12/1000, epsilon: 1.00
Episode: 13/1000, epsilon: 1.00
Episode: 14/1000, epsilon: 1.00
Episode: 15/1000, epsilon: 1.00
Episode: 16/1000, epsilon: 1.00
Episode: 17/1000, epsilon: 1.00
Episode: 18/1000, epsilon: 1.00
Episode: 19/1000, epsilon: 1.00
Episode: 20/1000, epsilon: 1.00
Episode: 21/1000, epsilon: 1.00
Episode: 22/1000, epsilon: 1.00
Episode: 23/1000, epsilon: 1.00
Episode: 24/1000, epsilon: 1.00
Episode: 25/1000, epsilon: 1.00
Episode: 26/1000, epsilon: 1.00
Episode: 27/1000, epsilon: 1.00
Episode: 28/1000, epsilon: 1.00
Episode: 29/1000, epsilon: 1.00
Episode: 30/1000, epsilon: 1.00
Episode: 31/1000, 