**13-1-DQN_Cartpole.ipynb**

Original author: Rowel Atienza, "Advanced Deep Learning with Keras" book

Code modified by In-Kwon Lee, 2020


In [1]:
# Mount my Google Drive on the Colab server; it is mounted at /content/drive/My Drive

from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
# Training된 weight 저장을 위해 temp directory를 생성해 둠
# 이미 만들어진 경우는 server가 만들 수 없다고 할 것이나, 신경쓸 필요 없음

!mkdir '/content/drive/My Drive/Colab Notebooks/temp'

mkdir: cannot create directory ‘/content/drive/My Drive/Colab Notebooks/temp’: File exists


In [0]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from collections import deque            # deque는 처음과 끝 부분에서 모두 insert/delete가 가능한 data structure
import numpy as np
import random
#import argparse                         # console에서 실행할 때는 argparse를 쓰는 것이 편함 (Atienza book original source 참조)
import gym                               # openAI gym package를 로딩
from gym import wrappers, logger         # gym package 중 wrappers와 logger 사용

In [0]:
# class DQNAgent

class DQNAgent:
    """DQN agent with an experience replay buffer and a target Q-network.

    The online network (`q_model`) is trained by `replay`. The target
    network (`target_q_model`) supplies the bootstrap value used in the
    training targets and is refreshed from the online network every
    `TARGET_SYNC_INTERVAL` calls to `replay`.
    """

    # number of replay() training updates between target network refreshes
    TARGET_SYNC_INTERVAL = 10

    def __init__(self,
                 state_space,
                 action_space,
                 episodes=500):
        """DQN Agent on CartPole-v0 environment

        Arguments:
            state_space: observation space; only `shape[0]` is read
                (the input dimension of the Q-network)
            action_space: action space; `n` gives the number of
                Q-network outputs and `sample()` is used to explore
            episodes (int): number of epsilon-decay steps over which
                epsilon falls from 1.0 to `epsilon_min` (one decay
                step is applied per call to `replay`)
        """
        self.action_space = action_space

        # experience buffer
        self.memory = []

        # discount rate
        self.gamma = 0.9

        # initially 100% exploration (epsilon = 1.0)
        self.epsilon = 1.0
        # iteratively applying decay til
        # 10% exploration/90% exploitation
        self.epsilon_min = 0.1
        # per-step multiplicative factor chosen so that
        # epsilon * decay**episodes == epsilon_min
        self.epsilon_decay = self.epsilon_min / self.epsilon
        self.epsilon_decay = self.epsilon_decay ** \
                             (1. / float(episodes))

        # Q Network weights filename
        self.weights_file = 'dqn_cartpole.h5'
        # Q Network for training
        n_inputs = state_space.shape[0]
        n_outputs = action_space.n
        self.q_model = self.build_model(n_inputs, n_outputs)
        self.q_model.compile(loss='mse', optimizer=Adam())
        # target Q Network
        self.target_q_model = self.build_model(n_inputs, n_outputs)
        # copy Q Network params to target Q Network
        self.update_weights()

        # number of completed replay() training updates
        self.replay_counter = 0


    def build_model(self, n_inputs, n_outputs):
        """Q Network is 256-256-256 MLP

        Prints the model summary as a side effect.

        Arguments:
            n_inputs (int): input dim
            n_outputs (int): output dim

        Return:
            q_model (Model): DQN
        """
        inputs = Input(shape=(n_inputs, ), name='state')
        x = Dense(256, activation='relu')(inputs)
        x = Dense(256, activation='relu')(x)
        x = Dense(256, activation='relu')(x)
        # linear output: one unbounded Q-value estimate per action
        x = Dense(n_outputs,
                  activation='linear',
                  name='action')(x)
        q_model = Model(inputs, x)
        q_model.summary()
        return q_model


    def save_weights(self):
        """save Q Network params to `self.weights_file`"""
        self.q_model.save_weights(self.weights_file)


    def update_weights(self):
        """copy trained Q Network params to target Q Network"""
        self.target_q_model.set_weights(self.q_model.get_weights())


    def act(self, state):
        """eps-greedy policy

        Arguments:
            state: batch holding a single state; the first row of the
                Q-network prediction is used
        Return:
            action: a random action from `action_space.sample()` with
                probability epsilon, otherwise the index of the
                largest predicted Q-value
        """
        if np.random.rand() < self.epsilon:
            # explore - do random action
            return self.action_space.sample()

        # exploit
        q_values = self.q_model.predict(state)
        # select the action with max Q-value
        action = np.argmax(q_values[0])
        return action


    def remember(self, state, action, reward, next_state, done):
        """store experiences in the replay buffer
        Arguments:
            state: env state (batch of one; `replay` reads `state[0]`)
            action: agent action, used as an index into the Q-values
            reward (float): reward received after executing
                action on state
            next_state: next state (batch of one)
            done (bool): True when the transition ended the episode
        """
        item = (state, action, reward, next_state, done)
        self.memory.append(item)


    def get_target_q_value(self, next_state, reward):
        """compute Q_max
           Use of target Q Network solves the
            non-stationarity problem
        Arguments:
            next_state: next state (batch of one)
            reward (float): reward received after executing
                action on state
        Return:
            q_value (float): reward + gamma * max_a' Q_target(s', a')
        """
        # max Q value among next state's actions
        # DQN chooses the max Q value among next actions
        # selection and evaluation of action is
        # on the target Q Network
        # Q_max = max_a' Q_target(s', a')
        q_value = np.amax(\
                     self.target_q_model.predict(next_state)[0])

        # Q_max = reward + gamma * Q_max
        q_value *= self.gamma
        q_value += reward
        return q_value


    def replay(self, batch_size):
        """experience replay addresses the correlation issue
            between samples

        Samples `batch_size` stored transitions, builds the training
        targets, fits the Q-network for one epoch, decays epsilon and
        periodically refreshes the target network.

        Arguments:
            batch_size (int): replay buffer batch
                sample size
        Raises:
            ValueError: from `random.sample` when the buffer holds
                fewer than `batch_size` transitions
        """
        # sars = state, action, reward, state' (next_state)
        sars_batch = random.sample(self.memory, batch_size)

        # policy prediction for all sampled states in a single
        # forward pass instead of one predict() call per sample
        state_batch = np.array([state[0]
                                for state, _, _, _, _ in sars_batch])
        q_values_batch = self.q_model.predict(state_batch)

        for i, (_, action, reward, next_state, done) \
                in enumerate(sars_batch):
            if done:
                # terminal transition: no bootstrap value, so the
                # target network forward pass is skipped entirely
                target = reward
            else:
                # kept per-sample so subclasses overriding
                # get_target_q_value keep working unchanged
                target = self.get_target_q_value(next_state, reward)

            # correction on the Q value for the action used
            q_values_batch[i][action] = target

        # train the Q-network
        self.q_model.fit(state_batch,
                         q_values_batch,
                         batch_size=batch_size,
                         epochs=1,
                         verbose=0)

        # update exploration-exploitation probability
        self.update_epsilon()

        # copy new params on old target after every
        # TARGET_SYNC_INTERVAL training updates; the counter is
        # incremented first so the sync happens after the 10th, 20th, ...
        # update rather than right after the very first one
        self.replay_counter += 1
        if self.replay_counter % self.TARGET_SYNC_INTERVAL == 0:
            self.update_weights()


    def update_epsilon(self):
        """decrease the exploration, increase exploitation"""
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [0]:
# class DDQNAgent

class DDQNAgent(DQNAgent):
    """Double DQN agent.

    Identical to DQNAgent except for how the training target is built:
    the online Q-network selects the next action and the target
    Q-network evaluates it.
    """

    def __init__(self,
                 state_space,
                 action_space,
                 episodes=500):
        """DDQN Agent on CartPole-v0 environment

        Arguments:
            state_space: state space, forwarded to DQNAgent
            action_space: action space, forwarded to DQNAgent
            episodes (int): number of episodes to train,
                forwarded to DQNAgent
        """
        # the docstring must be the first statement of the method;
        # placed after this call it would be a no-op string expression
        super().__init__(state_space,
                         action_space,
                         episodes)

        # Q Network weights filename
        self.weights_file = 'ddqn_cartpole.h5'
        print("-------------DDQN------------")

    def get_target_q_value(self, next_state, reward):
        """compute Q_max
           Use of target Q Network solves the
            non-stationarity problem
        Arguments:
            next_state: next state (batch of one)
            reward (float): reward received after executing
                action on state
        Returns:
            q_value (float): reward + gamma * Q_target(s', a'_max)
        """
        # max Q value among next state's actions
        # DDQN
        # current Q Network selects the action
        # a'_max = argmax_a' Q(s', a')
        action = np.argmax(self.q_model.predict(next_state)[0])
        # target Q Network evaluates the action
        # Q_max = Q_target(s', a'_max)
        q_value = self.target_q_model.predict(\
                                      next_state)[0][action]

        # Q_max = reward + gamma * Q_max
        q_value *= self.gamma
        q_value += reward
        return q_value

In [0]:
# Main program begins here

In [5]:
# Output path for saving training results
# NOTE(review): in the visible cells `outdir` is only referenced by the
# commented-out Monitor wrapper; confirm whether anything else uses it.
outdir = "/content/drive/My Drive/Colab Notebooks/temp/dqn-CartPole-v0"

# Use the server's ls command to check that the output path is ready
!ls "/content/drive/My Drive/Colab Notebooks/temp"

dqn-CartPole-v0  q-learning-FrozenLake-v0


In [11]:
#-----------------------------------------------------------------------
# Edit the run options here
#-----------------------------------------------------------------------
opt_ddqn = False          # set True for DDQN (Double DQN) mode
opt_norender = False      # set True to disable video recording (only read by the commented-out Monitor code below)
opt_id = 'CartPole-v0'    # gym environment id; also the key into win_reward

# the number of trials without falling over
win_trials = 100

# the CartPole-v0 is considered solved if for 100 consecutive trials,
# the cart pole has not fallen over and it has achieved an average
# reward of 195.0
# a reward of +1 is provided for every timestep the pole remains
# upright
win_reward = { 'CartPole-v0' : 195.0 }

# stores the reward per episode
scores = deque(maxlen=win_trials)

# Set OpenAI Gym's log output to the ERROR level.
# Available levels: DEBUG, INFO, WARN, ERROR, DISABLED.
# https://github.com/openai/gym/blob/master/gym/logger.py
logger.setLevel(logger.ERROR)

# instantiate a gym environment (CartPole-v0)
# Build the environment from opt_id so that the environment and the
# win_reward[opt_id] threshold always refer to the same task.
env = gym.make(opt_id)

# Extend the plain environment with a wrapper environment, which provides
# extra features such as buffering the N previous observations.
# Monitor: needed when saving data
#if opt_norender:
#    env = wrappers.Monitor(env, directory=outdir, video_callable=False, force=True)
#else:
#    env = wrappers.Monitor(env, directory=outdir, force=True)

env.seed(0)
print(env.observation_space.shape)
print(env.action_space)

(4,)
Discrete(2)


In [0]:
# choose the agent class from the run option (Double DQN when opt_ddqn
# is set, plain DQN otherwise) and build it on the environment's spaces
agent = (DDQNAgent if opt_ddqn else DQNAgent)(env.observation_space,
                                               env.action_space)


Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
state (InputLayer)           [(None, 4)]               0         
_________________________________________________________________
dense_6 (Dense)              (None, 256)               1280      
_________________________________________________________________
dense_7 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_8 (Dense)              (None, 256)               65792     
_________________________________________________________________
action (Dense)               (None, 2)                 514       
Total params: 133,378
Trainable params: 133,378
Non-trainable params: 0
_________________________________________________________________
Model: "model_3"
_________________________________________________________________
Layer (type)                 Output 

In [0]:
# number of transitions sampled from the replay buffer per training update
batch_size = 64
# length of one observation vector, used to reshape states into a batch of one
state_size = env.observation_space.shape[0]
# upper bound on training episodes; the task should be solved within this budget
episode_count = 3000

In [0]:
# Q-Learning sampling and fitting
for episode in range(episode_count):
    state = env.reset()
    # the agent expects a batch of one state: shape (1, state_size)
    state = np.reshape(state, [1, state_size])
    done = False
    total_reward = 0
    while not done:
        # in CartPole-v0, action=0 is left and action=1 is right
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        # in CartPole-v0:
        # state = [pos, vel, theta, angular speed]
        next_state = np.reshape(next_state, [1, state_size])
        # store every experience unit in replay buffer
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward


    # call experience replay (one training update per episode, once the
    # buffer holds at least one full batch)
    if len(agent.memory) >= batch_size:
        agent.replay(batch_size)

    scores.append(total_reward)
    mean_score = np.mean(scores)
    if mean_score >= win_reward[opt_id] \
            and episode >= win_trials:
        # Adjacent string literals are used instead of a backslash
        # continuation inside the literal, which embedded the next line's
        # indentation in the printed message.
        print("Solved in episode %d: "
              "Mean survival = %0.2f in %d episodes"
              % (episode, mean_score, win_trials))
        print("Epsilon: ", agent.epsilon)
        # NOTE(review): agent.weights_file is a bare filename, so this
        # presumably writes to the current working directory rather than
        # to `outdir` on the mounted Drive - confirm this is intended.
        agent.save_weights()
        break
    if (episode + 1) % win_trials == 0:
        print("Episode %d: Mean survival = "
              "%0.2f in %d episodes" %
              ((episode + 1), mean_score, win_trials))

# close the env and write monitor result info to disk
env.close()

Episode 100: Mean survival =                18.17 in 100 episodes
Episode 200: Mean survival =                43.28 in 100 episodes
Episode 300: Mean survival =                120.49 in 100 episodes
Episode 400: Mean survival =                177.19 in 100 episodes
Episode 500: Mean survival =                155.40 in 100 episodes
Episode 600: Mean survival =                178.83 in 100 episodes
Episode 700: Mean survival =                184.98 in 100 episodes
Episode 800: Mean survival =                188.66 in 100 episodes
Episode 900: Mean survival =                191.02 in 100 episodes
Episode 1000: Mean survival =                190.85 in 100 episodes
Episode 1100: Mean survival =                188.37 in 100 episodes
Episode 1200: Mean survival =                185.59 in 100 episodes
Episode 1300: Mean survival =                181.93 in 100 episodes
Episode 1400: Mean survival =                184.81 in 100 episodes
Episode 1500: Mean survival =                177.02 in 100 