In [None]:
import numpy as np
import tensorflow as tf
import os
import matplotlib.pyplot as plt
import random
import gym
import time
from collections import deque

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
class DQN(Agent):
    """Q-learning agent backed by a small two-layer ReLU network.

    States are one-hot encoded (``observation_space.n`` inputs), passed through
    a hidden layer of ``__h_size`` units and mapped to one Q-value per action
    (``action_space.n`` outputs).  Transitions are stored in a replay buffer
    that persists across the episodes of one ``run`` call, and the network is
    periodically fitted on random minibatches drawn from that buffer.

    NOTE(review): relies on the ``Agent`` base class for ``env``, ``_log_epi``,
    ``_load_map``, ``_print_progress``, ``_check_early_stopping``,
    ``_get_log_sum_reward``, ``_get_setting`` and ``_get_action_noise``; their
    exact contracts are not visible from this class.
    """

    def __init__(self, env):
        """Store network/replay hyper-parameters for the given environment.

        Args:
            env: environment whose ``observation_space`` and ``action_space``
                both expose ``n`` (i.e. discrete spaces).
        """
        super().__init__(env)
        self._name_arg = ['greedy', 'noise', 'learning_rate', 'discount']
        self._init_setting = [False, False, 0, 1.0]
        self.__optimizer = None

        # Network dimensions: one-hot state -> hidden layer -> Q-value per action.
        self.__input_size = self.env.observation_space.n
        self.__h_size = 10
        self.__output_size = self.env.action_space.n

        # Weights are (re)created at the start of every run().
        self.__weight1 = None
        self.__weight2 = None

        # Replay settings.
        self.__deque_maxlen = 2000   # capacity of the replay buffer
        self.__random_sample = 50    # train on episodes where idx % this == 1
        self.__num_sample = 50       # minibatches per training round
        self.__num_choice = 10       # transitions per minibatch
        self.__buffer = deque(maxlen=self.__deque_maxlen)

    def get_weight(self):
        """Return the second-layer weight matrix as a nested Python list."""
        return self.__weight2.numpy().tolist()

    def run(self, num_episodes, q_map=None, early_stopping=False, **kwargs):
        """Train from scratch for up to ``num_episodes`` episodes.

        Args:
            num_episodes: maximum number of episodes to play.
            q_map: forwarded to ``self._load_map``; the loaded value is passed
                to every episode and returned unchanged by this method.
            early_stopping: falsy to disable; otherwise an object with a
                ``clear()`` method that is also handed to
                ``self._check_early_stopping``.
            **kwargs: per-episode settings, decoded by ``self._get_setting``.

        Returns:
            Tuple ``(q_map, sum_reward_by_epi)`` where the second item is the
            result of ``self._get_log_sum_reward()``.
        """
        self._log_epi = []
        if early_stopping:
            early_stopping.clear()

        # NOTE(review): the optimizer step size is fixed here; the
        # 'learning_rate' entry of the settings is not used by the network.
        learning_rate = 0.1

        # Fresh weights, optimizer and replay memory for every run.
        self.__weight1 = tf.Variable(
            tf.random.uniform([self.__input_size, self.__h_size], 0, 0.01), dtype=tf.float32)
        self.__weight2 = tf.Variable(
            tf.random.uniform([self.__h_size, self.__output_size], 0, 0.01), dtype=tf.float32)
        self.__optimizer = tf.optimizers.SGD(learning_rate=learning_rate)
        self.__buffer = deque(maxlen=self.__deque_maxlen)

        start_time = time.time()
        q_map = self._load_map(q_map)
        for idx in range(num_episodes):
            # The episode index drives both the action-selection helper and
            # the training schedule, so it has to be passed through.
            self._run_episodes(q_map, idx=idx, setting=kwargs)

            num_ = self._print_progress(idx, num_episodes)
            if self._check_early_stopping(early_stopping):
                print(f'progress = {num_} %  --> {idx}/{num_episodes} Early Stopping')
                break
        sum_reward_by_epi = self._get_log_sum_reward()
        print(f'{(time.time() - start_time)} seconds')
        return q_map, sum_reward_by_epi

    def _run_episodes(self, q_map, idx=0, setting=None):
        """Play one episode, record its transitions and maybe train.

        Args:
            q_map: unused by this agent; accepted to keep the signature.
            idx: index of the current episode.
            setting: settings mapping decoded by ``self._get_setting`` into
                ``(greedy, noise, learning_rate, discount)``.

        Returns:
            Always ``True``.

        NOTE(review): nothing is appended to ``self._log_epi`` here, so the
        reward log read by ``run`` stays empty unless the base class fills it
        elsewhere -- confirm against ``Agent``.
        """
        greedy, noise, _learning_rate, discount = self._get_setting(setting)

        # Set the start state.
        state = self.env.reset()
        done = False
        while not done:
            # Q-values of the *current* state; copied so helpers may modify them.
            hidden = self.__dense_activation1(state)
            q_value = np.array(self.__dense_activation2(hidden).numpy())

            # Action selection (e-greedy / noise) is delegated to the base class.
            act = self._get_action_noise(q_value[0], idx=idx, greedy=greedy, noise=noise)
            state_next, reward, done, _ = self.env.step(act)

            self.__buffer.append((state, act, reward, state_next, done))
            state = state_next

        # Train only on scheduled episodes and only once a full minibatch can
        # be drawn (random.sample raises ValueError on a too-small population).
        scheduled = idx % self.__random_sample == 1
        if scheduled and len(self.__buffer) >= self.__num_choice:
            for _ in range(self.__num_sample):
                batch = random.sample(self.__buffer, self.__num_choice)
                self.__train_on_batch(batch, discount)

        return True

    def __train_on_batch(self, batch, discount):
        """Take one gradient step on a minibatch of stored transitions.

        Args:
            batch: sequence of ``(state, action, reward, next_state, done)``.
            discount: discount factor applied to the bootstrapped next value.
        """
        states, actions, rewards, next_states, dones = zip(*batch)
        inputs = self.__one_hot_batch(states)

        # Targets start as the current predictions so that only the entry of
        # the action actually taken contributes to the loss.
        targets = np.array(self.__forward(inputs).numpy())
        q_next = self.__forward(self.__one_hot_batch(next_states)).numpy()
        bootstrap = discount * q_next.max(axis=1)
        # Terminal transitions have no future value: the target is the reward.
        bootstrap[np.asarray(dones, dtype=bool)] = 0.0
        rows = np.arange(len(batch))
        taken = np.asarray(actions, dtype=np.intp)
        targets[rows, taken] = np.asarray(rewards, dtype=np.float32) + bootstrap

        # The forward pass runs inside the tape so both layers get gradients.
        variables = [self.__weight1, self.__weight2]
        with tf.GradientTape() as tape:
            loss = tf.reduce_sum(tf.square(targets - self.__forward(inputs)))
        gradients = tape.gradient(loss, variables)
        self.__optimizer.apply_gradients(zip(gradients, variables))

    def __one_hot(self, x):
        """Return state index ``x`` as a ``(1, input_size)`` float32 one-hot row."""
        return self.__one_hot_batch([x])

    def __one_hot_batch(self, states):
        """Return state indices as a ``(len(states), input_size)`` float32 one-hot matrix."""
        indices = np.asarray(states, dtype=np.intp)
        encoded = np.zeros((indices.size, self.__input_size), dtype=np.float32)
        encoded[np.arange(indices.size), indices] = 1.0
        return encoded

    def __forward(self, inputs):
        """Map a batch of one-hot states to Q-values through both layers."""
        hidden = tf.nn.relu(tf.matmul(inputs, self.__weight1))
        return self.__dense_activation2(hidden)

    # dense_layer_1 + activation_func_1
    def __dense_activation1(self, state):
        """Return the ReLU hidden activation for a single state index."""
        dense_layer_1 = tf.matmul(self.__one_hot(state), self.__weight1)
        return tf.nn.relu(dense_layer_1)

    # dense_layer_2 + activation_func_2
    def __dense_activation2(self, state):
        """Return ReLU Q-values; ``state`` here is the hidden-layer activation."""
        dense_layer_2 = tf.matmul(state, self.__weight2)
        return tf.nn.relu(dense_layer_2)