https://github.com/chrisrryan/AI-Gym_DQN/blob/a9bca0831040be6ab7f41ed003c140f9ca81f173/FrozenLake.py

In [None]:
import gym
import random
import numpy as np
from collections import deque
import time

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, optimizers

# Hyper-parameters for the DQN training run.
EPISODES = 1000         # Number of training episodes to play.
GAMMA = 0.95            # γ (gamma): the reward discount factor, normally between 0.90 and 0.99. Lower values favour shorter-term rewards.
LEARNING_RATE = 0.005   # α (alpha): the learning rate handed to the Adam optimizer (the Keras default is 0.001).
MEMORY_SIZE = 100000    # Capacity of the replay memory, in transitions (1000000 was tried previously).
BATCH_SIZE = 20         # Number of transitions sampled from the replay memory per replay pass.
EXPLORATION_MAX = 1.0   # Initial ε for the ε-greedy policy (fully random actions).
EXPLORATION_MIN = 0.01  # ε stops decaying once it is no longer above this value.
EXPLORATION_DECAY = 0.995  # Multiplicative ε decay applied by DQNSolver.epsilon_decay().

In [None]:
# Deterministic (non-slippery) FrozenLake; `env` is a module-level global that
# one_hot(), DQNSolver and the training loop all read.
env = gym.make('FrozenLake-v0', is_slippery=False)
    
# Currently, memory growth needs to be the same across GPUs
# Enable on-demand GPU memory allocation for every visible GPU. The loop is a
# no-op when no GPU is present (the list is empty).
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

def one_hot(x):
    """Encode the discrete state index ``x`` as a float32 one-hot row.

    For a valid index the result has shape ``(1, n_states)``, i.e. a batch of
    one sample that can be fed straight to the Keras networks.
    """
    n_states = env.observation_space.n
    # Slice (rather than index) so the leading batch axis is preserved.
    return np.eye(n_states, dtype=np.float32)[x:x + 1]

class DQNSolver:
    """DQN agent with a replay memory and a separate target network.

    The agent reads the module-level ``env`` for the sizes of the discrete
    observation and action spaces, and the module-level hyper-parameter
    constants (``GAMMA``, ``LEARNING_RATE``, ``MEMORY_SIZE``, ``BATCH_SIZE``,
    ``EXPLORATION_*``).

    Attributes:
        observation_space: number of discrete states.
        action_space: number of discrete actions.
        epsilon: current exploration rate of the ε-greedy policy.
        memory: bounded deque of ``(state, action, reward, new_state, done)``.
        q_network: online network that is trained and used to pick actions.
        target_network: periodically synced copy used for bootstrap targets.
    """

    def __init__(self):
        self.observation_space = env.observation_space.n
        self.action_space = env.action_space.n
        self.epsilon = EXPLORATION_MAX
        # maxlen makes the deque drop its oldest entry automatically when full.
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.q_network = self.build_compile()       # online Q-network
        self.target_network = self.build_compile()  # target Q-network

        # Start with both networks holding identical weights.
        self.target_model()

    def build_compile(self):
        """Build and compile one Q-network (one-hot state in, one Q-value per action out)."""
        model = keras.Sequential()
        model.add(layers.Dense(24, input_shape=(self.observation_space,), activation="relu"))
        model.add(layers.Dense(self.action_space, activation="linear"))
        # `learning_rate` replaces the deprecated `lr` alias, which emits a
        # warning on TF 2.x and is rejected by newer Keras releases.
        model.compile(loss="mse", optimizer=optimizers.Adam(learning_rate=LEARNING_RATE))
        return model

    def target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_network.set_weights(self.q_network.get_weights())

    def get_target_weights(self):
        """Return the target network's current weights."""
        return self.target_network.get_weights()

    def epsilon_decay(self):
        """Multiply ε by ``EXPLORATION_DECAY`` while it is above ``EXPLORATION_MIN``."""
        if self.epsilon > EXPLORATION_MIN:
            self.epsilon *= EXPLORATION_DECAY

    def remember(self, state, action, reward, new_state, done):
        """Store one transition; the bounded deque evicts the oldest when full."""
        self.memory.append((state, action, reward, new_state, done))

    def act(self, state):
        """Pick an action for ``state`` with an ε-greedy policy.

        With probability ε a uniformly random action is returned; otherwise
        the action with the highest Q-value predicted by the online network.
        """
        if np.random.random() < self.epsilon:
            return np.random.randint(0, self.action_space)
        # Greedy actions come from the online network: the target network only
        # changes when target_model() is called, so acting on it would ignore
        # everything learned since the last sync.
        return np.argmax(self.q_network.predict(one_hot(state), verbose=0))

    def q_update(self, state, action, reward, new_state, done):
        """Fit the online network on a single transition.

        The regression target for ``action`` is ``reward`` when the episode
        ended, otherwise ``reward + GAMMA * max_a Q_target(new_state, a)``.
        The targets for all other actions are the network's own current
        predictions, so only the taken action contributes to the loss.
        """
        if done:
            target = reward
        else:
            next_q = self.target_network.predict(one_hot(new_state), verbose=0)
            target = reward + GAMMA * np.max(next_q)

        encoded_state = one_hot(state)
        target_vector = self.q_network.predict(encoded_state, verbose=0)[0]
        target_vector[action] = target
        self.q_network.fit(
            encoded_state,
            target_vector.reshape(-1, self.action_space),
            epochs=1,
            verbose=0,
        )

    def experience_replay(self):
        """Train on ``BATCH_SIZE`` randomly sampled transitions, one at a time.

        Does nothing until the memory holds at least ``BATCH_SIZE`` entries.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        for state, action, reward, new_state, done in random.sample(self.memory, BATCH_SIZE):
            self.q_update(state, action, reward, new_state, done)

    def report(self, episode, steps, reward, action, new_state):
        """Print a one-line, ANSI-coloured summary of a finished episode.

        The reward onwards is shown in green when positive and red otherwise.
        """
        colour = '\033[92m' if reward > 0 else '\033[91m'
        print(
            f"Episode: {str(episode).rjust(4)}  ε: {self.epsilon:.3f}"
            f"  Steps: {str(steps).rjust(3)}  Reward: {colour}{reward:+.1f}"
            f"  action: {str(action).rjust(3)}  new_state: {str(new_state).rjust(3)}"
            '\033[0m'
        )


In [None]:
dqn_solver = DQNSolver()

print("\n\n" + '\033[92m' + "Begin training OpenAI Gym Frozen Lake" + '\033[0m' + "\n")

MAX_STEPS_PER_EPISODE = 100   # safety cap on the length of a single episode
TARGET_UPDATE_INTERVAL = 10   # sync the target network every N episodes

for episode in range(EPISODES):
    state = env.reset()
    steps = 0
    done = False

    while not done:
        # Count each environment step exactly once, so the reported step
        # count and the step cap below refer to real steps.
        steps += 1
        action = dqn_solver.act(state)
        new_state, reward, done, _ = env.step(action)

        # Reward shaping: any episode that ends without reaching the goal
        # (reward < 1) is penalised with -1.
        if done and reward < 1:
            reward = -1.0
        dqn_solver.remember(state, action, reward, new_state, done)

        state = new_state

        if done:
            # End of episode: log it, decay exploration, then learn from memory.
            dqn_solver.report(episode, steps, reward, action, new_state)
            dqn_solver.epsilon_decay()
            dqn_solver.experience_replay()

        if steps > MAX_STEPS_PER_EPISODE:
            break

    if episode % TARGET_UPDATE_INTERVAL == 0:
        print('-------- target network update -------')
        dqn_solver.target_model()
        

  super(Adam, self).__init__(name, **kwargs)




[92mBegin training OpenAI Gym Frozen Lake[0m

Episode:    0  ε: 1.000  Steps:  11  Reward: [91m-1.0  action:   2  new_state:   7[0m
-------- target network update -------
Episode:    1  ε: 0.995  Steps:   7  Reward: [91m-1.0  action:   2  new_state:   5[0m
Episode:    2  ε: 0.990  Steps:  13  Reward: [91m-1.0  action:   2  new_state:   5[0m
Episode:    3  ε: 0.985  Steps:   7  Reward: [91m-1.0  action:   1  new_state:   5[0m
Episode:    4  ε: 0.980  Steps:   3  Reward: [91m-1.0  action:   2  new_state:   5[0m
Episode:    5  ε: 0.975  Steps:   5  Reward: [91m-1.0  action:   1  new_state:   5[0m
Episode:    6  ε: 0.970  Steps:   7  Reward: [91m-1.0  action:   1  new_state:   5[0m
Episode:    7  ε: 0.966  Steps:   5  Reward: [91m-1.0  action:   1  new_state:   5[0m
Episode:    8  ε: 0.961  Steps:   5  Reward: [91m-1.0  action:   2  new_state:   5[0m
Episode:    9  ε: 0.956  Steps:  11  Reward: [91m-1.0  action:   3  new_state:   5[0m
Episode:   10  ε: 0.951  Steps:

In [None]:
# DQNSolver exposes `q_network` and `target_network` (same architecture);
# it has no `model` attribute, so summarise the online network.
dqn_solver.q_network.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 24)                408       
                                                                 
 dense_1 (Dense)             (None, 4)                 100       
                                                                 
Total params: 508
Trainable params: 508
Non-trainable params: 0
_________________________________________________________________
