<a href="https://colab.research.google.com/github/juhumkwon/Data/blob/main/DQN_%EC%98%88%EC%A0%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from collections import deque
import random

# Hyperparameter settings
ENV_NAME = "CartPole-v1"  # environment id passed to gym.make
GAMMA = 0.99  # discount factor applied to the target network's max next-state Q-value
LEARNING_RATE = 0.001  # learning rate given to the Adam optimizer
EPSILON = 1.0  # initial exploration rate of the agent
EPSILON_DECAY = 0.995  # factor epsilon is multiplied by at the end of each training call
MIN_EPSILON = 0.01  # epsilon is only decayed while it is still above this value
BATCH_SIZE = 64  # transitions sampled per training call; training is skipped until memory holds this many
MEMORY_SIZE = 2000  # capacity of the replay memory
TARGET_UPDATE_FREQ = 10  # the target network is synced on episodes whose index is a multiple of this
EPISODES = 500  # number of episodes run by the training loop

# Replay memory
class ReplayMemory:
    """Bounded FIFO store of environment transitions."""

    def __init__(self, capacity):
        # A deque with a maximum length discards its oldest entry on overflow.
        self.memory = deque([], capacity)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

    def add(self, state, action, reward, next_state, done):
        """Store one transition as a ``(state, action, reward, next_state, done)`` tuple."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions drawn without replacement.

        ``random.sample`` raises ``ValueError`` when ``batch_size`` exceeds
        the number of stored transitions.
        """
        stored = self.memory
        return random.sample(stored, batch_size)

# Q-network model construction
def create_q_network(state_size, action_size):
    """Build and compile the fully connected Q-value network.

    Args:
        state_size: Number of features in one state vector.
        action_size: Number of discrete actions; the network has one linear
            output unit per action.

    Returns:
        A compiled ``Sequential`` model using the Adam optimizer with
        ``LEARNING_RATE`` and mean-squared-error loss.
    """
    model = Sequential([
        # Declare the input explicitly: passing `input_dim` to the first
        # Dense layer is deprecated in current Keras and emits a UserWarning.
        tf.keras.Input(shape=(state_size,)),
        Dense(24, activation='relu'),
        Dense(24, activation='relu'),
        Dense(action_size, activation='linear'),
    ])
    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
    model.compile(optimizer=optimizer, loss='mse')
    return model

# DQN agent
class DQNAgent:
    """Epsilon-greedy DQN agent with a replay memory and a target network."""

    def __init__(self, state_size, action_size):
        """Create the online/target networks and an empty replay memory.

        Args:
            state_size: Number of features in one state vector.
            action_size: Number of discrete actions available.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = ReplayMemory(MEMORY_SIZE)
        self.epsilon = EPSILON  # current exploration rate
        self.model = create_q_network(state_size, action_size)
        self.target_model = create_q_network(state_size, action_size)
        # Start with both networks holding identical weights.
        self.update_target_network()

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Batch holding a single state, in the shape accepted by
                ``self.model.predict``.

        Returns:
            The chosen action index as a plain ``int``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(state, verbose=0)
        # Cast so both branches return the same built-in type.
        return int(np.argmax(q_values[0]))

    def train(self):
        """Run one minibatch update of the online network and decay epsilon.

        Does nothing until the replay memory holds at least ``BATCH_SIZE``
        transitions. The whole minibatch is processed with one ``predict``
        call per network and a single ``fit`` call, rather than one
        predict/fit round-trip per transition, which made every environment
        step cost dozens of Keras calls.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        minibatch = self.memory.sample(BATCH_SIZE)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Stack the per-transition state rows into one batch per network.
        states = np.vstack(states)
        next_states = np.vstack(next_states)
        actions = np.asarray(actions, dtype=np.int64)
        rewards = np.asarray(rewards, dtype=np.float32)
        # 1.0 where the episode continued, 0.0 where it ended, so terminal
        # transitions get `reward` alone as their target.
        not_done = 1.0 - np.asarray(dones, dtype=np.float32)

        batch_len = len(minibatch)
        targets = self.model.predict(states, batch_size=batch_len, verbose=0)
        next_q = self.target_model.predict(
            next_states, batch_size=batch_len, verbose=0
        )

        # Only the Q-value of the action actually taken is replaced; the
        # other outputs keep the network's own prediction (zero error).
        rows = np.arange(batch_len)
        targets[rows, actions] = rewards + GAMMA * not_done * np.max(next_q, axis=1)

        self.model.fit(
            states, targets, batch_size=batch_len, epochs=1, verbose=0
        )

        if self.epsilon > MIN_EPSILON:
            self.epsilon *= EPSILON_DECAY

# Environment setup and training run
env = gym.make(ENV_NAME)
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = DQNAgent(state_size, action_size)

try:
    for e in range(EPISODES):
        # Newer gym releases return (observation, info) from reset(); older
        # ones return only the observation. Accept both.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        state = np.reshape(state, [1, state_size])
        total_reward = 0

        # At most 500 steps per episode.
        for step in range(500):
            action = agent.act(state)

            # Newer gym releases return a 5-tuple with separate
            # terminated/truncated flags; older ones return a 4-tuple with a
            # single done flag. Accept both.
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result

            total_reward += reward
            next_state = np.reshape(next_state, [1, state_size])
            agent.memory.add(state, action, reward, next_state, done)
            state = next_state

            if done:
                print(f"Episode: {e+1}/{EPISODES}, Score: {total_reward}, Epsilon: {agent.epsilon:.2f}")
                break

            agent.train()

        # Sync the target network on every TARGET_UPDATE_FREQ-th episode
        # (including the first one, index 0).
        if e % TARGET_UPDATE_FREQ == 0:
            agent.update_target_network()
finally:
    # Release the environment even when training is interrupted
    # (e.g. KeyboardInterrupt in a notebook).
    env.close()

  from jax import xla_computation as _xla_computation
  deprecation(
  deprecation(
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  if not isinstance(terminated, (bool, np.bool8)):


Episode: 1/500, Score: 30.0, Epsilon: 1.00
Episode: 2/500, Score: 11.0, Epsilon: 1.00
Episode: 3/500, Score: 15.0, Epsilon: 1.00
Episode: 4/500, Score: 14.0, Epsilon: 0.97
Episode: 5/500, Score: 16.0, Epsilon: 0.90
Episode: 6/500, Score: 20.0, Epsilon: 0.82


KeyboardInterrupt: 