# In [None]:
import random
from collections import deque

import gym
import numpy as np
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

# Hyperparameters
learning_rate = 0.001
discount_factor = 0.99
epsilon = 1.0  # initial probability of taking a random action
epsilon_decay = 0.999  # multiplicative decay applied to epsilon once per episode
epsilon_min = 0.01  # exploration floor
num_episodes = 2000
batch_size = 32
target_update_frequency = 5  # episodes between target-network syncs
memory_capacity = 100000  # maximum number of transitions kept for replay

# Create the OpenAI Gym environment
env = gym.make('Taxi-v3')

# Sizes of the (discrete) state and action spaces
state_space_size = env.observation_space.n
action_space_size = env.action_space.n


def build_q_network():
    """Build and compile one Q-network.

    The network takes a single scalar feature per sample (``input_shape=(1,)``)
    and outputs one value per action (``action_space_size`` linear units).
    The online and target networks share this architecture, so both are
    created through this helper instead of duplicating the layer stack.

    Returns:
        A compiled Keras ``Sequential`` model (MSE loss, Adam optimizer).
    """
    network = Sequential()
    network.add(Dense(24, input_shape=(1,), activation='relu'))
    network.add(Dense(24, activation='relu'))
    network.add(Dense(action_space_size, activation='linear'))
    # `learning_rate` is the current argument name; `lr` is the deprecated alias.
    network.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))
    return network


# Online DQN model (the one being trained)
model = build_q_network()

# Target network: same architecture, starts as an exact copy of the online model
target_model = build_q_network()
target_model.set_weights(model.get_weights())

# Experience replay memory; bounded so that the oldest transitions are
# discarded automatically instead of growing for the whole training run.
memory = deque(maxlen=memory_capacity)

# DQN model update function
def update_model():
    """Train the online network on one minibatch sampled from replay memory.

    Samples ``batch_size`` transitions ``(state, action, reward, next_state,
    done)`` from ``memory`` and fits ``model`` for one epoch on the one-step
    TD targets::

        reward + discount_factor * max_a' Q_target(next_state, a') * (1 - done)

    Only the Q-value of the action actually taken is replaced; the other
    outputs keep the model's own predictions so they contribute zero error.
    Returns without doing anything while ``memory`` holds fewer than
    ``batch_size`` transitions.
    """
    if len(memory) < batch_size:
        return

    batch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    # Each stored state is a (1, 1) array, so naive stacking would produce a
    # (batch, 1, 1) tensor. Flatten to (batch, features) so that the network
    # returns (batch, n_actions) and the per-action indexing below is valid.
    states = np.asarray(states).reshape(batch_size, -1)
    next_states = np.asarray(next_states).reshape(batch_size, -1)
    actions = np.asarray(actions, dtype=int)
    rewards = np.asarray(rewards, dtype=float)
    dones = np.asarray(dones, dtype=float)

    # verbose=0 suppresses the per-call progress bar.
    q_values = model.predict(states, verbose=0)
    next_q_values = target_model.predict(next_states, verbose=0)

    # Bootstrap from the target network, except where the episode ended.
    max_next_q_values = np.amax(next_q_values, axis=1)
    td_targets = rewards + discount_factor * max_next_q_values * (1.0 - dones)

    target_q_values = q_values.copy()
    target_q_values[np.arange(batch_size), actions] = td_targets

    model.fit(states, target_q_values, epochs=1, verbose=0)

# Target network update function
def update_target_model():
    """Copy the online model's current weights into the target network."""
    online_weights = model.get_weights()
    target_model.set_weights(online_weights)

# Training loop: epsilon-greedy interaction with the environment, storing
# every transition in replay memory and training the online model after each
# non-final step. The environment is closed even if training is interrupted.
try:
    for episode in range(num_episodes):
        # Older gym returns the observation alone; newer releases return
        # (observation, info). The observation space here is discrete, so a
        # tuple can only be the newer form.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        state = np.reshape(state, [1, 1])
        total_reward = 0

        while True:
            if np.random.rand() <= epsilon:
                action = env.action_space.sample()  # explore: random action
            else:
                # exploit: greedy action under the online DQN
                action = int(np.argmax(model.predict(state, verbose=0)[0]))

            step_result = env.step(action)
            if len(step_result) == 5:
                # Newer API: termination and truncation are reported separately.
                next_state, reward, terminated, truncated, _ = step_result
            else:
                # Older API: a single `done` flag; a time-limit cutoff is
                # signalled through info["TimeLimit.truncated"] when present.
                next_state, reward, done, info = step_result
                truncated = bool(done and info.get("TimeLimit.truncated", False))
                terminated = bool(done and not truncated)
            next_state = np.reshape(next_state, [1, 1])
            total_reward += reward

            # Store `terminated`, not the raw end-of-episode flag: a time-limit
            # cutoff is not a true terminal state, so its TD target must still
            # bootstrap from the next state's value.
            memory.append((state, action, reward, next_state, terminated))
            state = next_state

            if terminated or truncated:
                break

            # update_model() returns immediately until enough transitions exist.
            update_model()

        # Decay exploration once per episode, never dropping below the floor.
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

        if episode % target_update_frequency == 0:
            update_target_model()

        print("Episode {}: Total Reward = {}, Epsilon = {:.2f}".format(episode + 1, total_reward, epsilon))

        # Periodically print the predicted Q-values for the episode's last state
        if episode % 100 == 0:
            q_values = model.predict(state, verbose=0)[0]
            print("Q-Values:", q_values)
finally:
    env.close()