In [1]:
import gym
import random
import numpy as np
import tensorflow as tf
import tensorflow.keras.layers as kl
import tensorflow.keras.optimizers as ko
from collections import deque
from tqdm import tqdm, notebook  # 학습 과정을 더 깔끔하게 보여주는 library 입니다.

In [14]:
env = gym.make('LunarLander-v2')

print("Observation space:", env.observation_space)
print("Action space:", env.action_space)

# reset은 매 episode가 시작할 때마다 호출해야 합니다.
obs = env.reset()

# random한 action을 뽑아 환경에 적용합니다..
action = env.action_space.sample()
print("Sampled action: {}\n".format(action))
obs, reward, done, info = env.step(action)

# info는 현재의 경우 비어있는 dict지만 debugging과 관련된 정보를 포함할 수 있습니다.
# reward는 scalar 값 입니다.
print("obs : {}\nreward : {}\ndone : {}\ninfo : {}\n".format(obs, reward, done, info))

# 한 episode 에 대한 testing
obs, done, ep_reward = env.reset(), False, 0
print(obs.shape)
print(obs[None].shape)
print(obs.reshape(-1,8).shape)

# 대부분의 gym 환경은 다음과 같은 흐름으로 진행됩니다.
while True: 
    action = env.action_space.sample() # action 선택
    obs, reward, done, info = env.step(action)  # 환경에 action 적용
    ep_reward += reward
    if done:  # episode 종료 여부 체크
        break
        
env.close()  
# Cartpole에서 reward = episode 동안 지속된 step 을 뜻합니다.
print("episode reward : ", ep_reward) 


Observation space: Box(-inf, inf, (8,), float32)
Action space: Discrete(4)
Sampled action: 1

obs : [ 0.01462584  1.4113343   0.73322177 -0.00383807 -0.01468339 -0.1233385
  0.          0.        ]
reward : 0.5731387702425945
done : False
info : {}

(8,)
(1, 8)
(1, 8)
episode reward :  -91.73367128294416


In [20]:
units=[64, 64]         # network의 구조. [32, 32]로 설정시 두개의 hidden layer에 32개의 node로 구성된 network가 생성
epsilon=1.0            # epsilon의 초기 값
min_epsilon=.01        # epsilon의 최솟값
epsilon_decay=0.9995    # 매 step마다 epsilon이 줄어드는 비율 
train_nums=100000        # train이 진행되는 총 step
gamma=0.99             # discount factor
start_learning = 100
train_iter = 4

buffer_size=10000        # Replay buffer의 size
batch_size=65        # Repaly buffer로 부터 가져오는 transition minbatch의 크기

target_update_iter=200 # Target network가 update 되는 주기 (step 기준)

In [21]:
# Neural Network Model 
class Model(tf.keras.Model):
    def __init__(self, num_actions, units=[64, 64]):
        super().__init__()
        self.fc1 = kl.Dense(units[0], activation='relu', kernel_initializer='he_uniform')
        self.fc2 = kl.Dense(units[1], activation='relu', kernel_initializer='he_uniform')
        self.logits = kl.Dense(num_actions, name='q_values')

    # forward propagation
    def call(self, inputs):
        x = self.fc1(inputs)
        x = self.fc2(x)
        x = self.logits(x)
        return x

    # return best action that maximize action-value (Q) from network
    # a* = argmax_a' Q(s, a')
    def action_value(self, obs):
        q_values = self.predict(obs)
        best_action = np.argmax(q_values, axis=-1)
        return best_action[0]

In [22]:
class ReplayBuffer:
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.count = 0
        self.buffer = deque(maxlen=buffer_size) 

    # store transition of each step in replay buffer
    def store(self, s, a, r, next_s, d):
        experience = (s, a, r, d, next_s)
        self.buffer.append(experience)
        self.count += 1

    # Sample random minibatch of transtion
    def sample(self, batch_size):
        batch = []
        if self.count < batch_size:
            batch = random.sample(self.buffer, self.count)
        else:
            batch = random.sample(self.buffer, batch_size)

        s_batch, a_batch, r_batch, d_batch, s2_batch = map(np.array, list(zip(*batch)))
        return s_batch, a_batch, r_batch, s2_batch, d_batch
    
    def clear(self):
        self.buffer.clear()
        self.count = 0

In [23]:
# test before train
epi_rewards = []
n_episodes = 5
for i in range(n_episodes):
    obs, done, epi_reward = env.reset(), False, 0.0 
    while not done:
        #action = 0
        action = np.random.randint(4)
        next_obs, reward, done, _ = env.step(action)
        epi_reward += reward
        obs = next_obs
    
    print("{} episode reward : {}".format(i, epi_reward))
    epi_rewards.append(epi_reward)

mean_reward = np.mean(epi_rewards)
std_reward = np.std(epi_rewards)

print(f"mean_reward : {mean_reward:.2f} +/- {std_reward:.2f}")

0 episode reward : -262.51413139563044
1 episode reward : -157.8138769723583
2 episode reward : -96.32884837926943
3 episode reward : -97.30311951324151
4 episode reward : -401.6219973851469
mean_reward : -203.12 +/- 116.26


In [None]:
replay_buffer = ReplayBuffer(buffer_size)
network = Model(4)
target_network = Model(4)
target_network.set_weights(network.get_weights()) # initialize target network weight 
opt = ko.Adam(learning_rate=.0005, clipvalue=10.0)  # do gradient clip
network.compile(optimizer=opt, loss='mse')

obs = env.reset()
epi_reward = 0.0
epi = 0 # number of episode taken
epsilon=1.0
avg_reward = deque(maxlen=10)

for t in notebook.tqdm(range(1, train_nums+1), desc='train with DQN'):
    # epsilon update
    if epsilon > min_epsilon:
        epsilon = max(epsilon * epsilon_decay, min_epsilon)

    #######################  step 1  ####################### 
    ####        Select action using episolon-greedy      ### 
    ########################################################   

    # select action that maximize Q value f
    
    best_action = network.action_value(obs[None])  # input the obs to the network model // obs : (4, ) -> obs[None] : (1, 4)
    
    # e-greedy
    if np.random.rand() < epsilon:
        action = env.action_space.sample()
    else:
        action = best_action   # with prob. epsilon, select a random action
    
    #######################  step 2  ####################### 
    #### Take step and store transition to replay buffer ### 
    ########################################################
    
    next_obs, reward, done, _ = env.step(action)    # Excute action in the env to return s'(next state), r, done
    epi_reward += reward
    replay_buffer.store(obs, action, reward, next_obs, done)
    
    #######################  step 3  ####################### 
    ####     Train network (perform gradient descent)    ### 
    ########################################################
    if t > start_learning and t % train_iter == 0 :
        # target value 계산
        # np.amax -> list 에서 가장 큰 값 반환
        s_batch, a_batch, r_batch, ns_batch, done_batch = replay_buffer.sample(batch_size)
        best_action_idxes = network.action_value(ns_batch)
        target_q = target_network.predict(ns_batch)
        target_q = r_batch + gamma * target_q[np.arange(target_q.shape[0]), best_action_idxes] *  (1- done_batch)  
        q_values = network.predict(s_batch) 
        for i, action in enumerate(a_batch):
            q_values[i][action] = target_q[i]

        network.train_on_batch(s_batch, q_values)
    
    #######################  step 3  ####################### 
    ####             Update target network               ### 
    ########################################################
      
    if t % target_update_iter == 0:
        target_network.set_weights(network.get_weights()) # assign the current network parameters to target network
 
    obs = next_obs  # s <- s'
    # if episode ends (done)
    if done:
        epi += 1 # num of episode 
        avg_reward.append(epi_reward)
        if epi % 5 == 0:
            print("[Episode {:>5}] avg reward: {:>7.2f}  --eps : {:>4.2f} --steps : {:>5}".format(epi, np.mean(avg_reward), epsilon, t))
        obs, done, epi_reward = env.reset(), False, 0.0  # Environmnet reset
            

HBox(children=(FloatProgress(value=0.0, description='train with DQN', max=100000.0, style=ProgressStyle(descri…

[Episode     5] avg reward:  -77.84  --eps : 0.80 --steps :   454
[Episode    10] avg reward: -205.03  --eps : 0.61 --steps :   995
[Episode    15] avg reward: -258.41  --eps : 0.44 --steps :  1625
[Episode    20] avg reward: -235.55  --eps : 0.32 --steps :  2275
[Episode    25] avg reward: -234.35  --eps : 0.15 --steps :  3831
[Episode    30] avg reward: -214.38  --eps : 0.06 --steps :  5676
[Episode    35] avg reward: -207.39  --eps : 0.03 --steps :  6954
[Episode    40] avg reward: -162.87  --eps : 0.01 --steps : 11565
[Episode    45] avg reward: -127.46  --eps : 0.01 --steps : 16565
[Episode    50] avg reward: -101.01  --eps : 0.01 --steps : 21565
[Episode    55] avg reward:  -98.50  --eps : 0.01 --steps : 26565
[Episode    60] avg reward: -110.21  --eps : 0.01 --steps : 31565
[Episode    65] avg reward: -115.45  --eps : 0.01 --steps : 36326
[Episode    70] avg reward: -119.19  --eps : 0.01 --steps : 41326
[Episode    75] avg reward: -100.28  --eps : 0.01 --steps : 46326
[Episode  

In [None]:
# test after train
epi_rewards = []
n_episodes = 5
for i in range(n_episodes):
    obs, done, epi_reward = env.reset(), False, 0.0 
    while not done:
        action = network.action_value(obs[None]) # Using [None] to extend its dimension (4,) -> (1, 4)
        next_obs, reward, done, _ = env.step(action)
        epi_reward += reward
        obs = next_obs
    
    print("{} episode reward : {}".format(i, epi_reward))
    epi_rewards.append(epi_reward)

mean_reward = np.mean(epi_rewards)
std_reward = np.std(epi_rewards)

print(f"mean_reward : {mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML

env = gym.make('LunarLander-v2')
env = Monitor(env, './video', force=True)
epi_reward = 0
obs = env.reset()
while True:
    action = network.action_value(obs[None])
    obs, reward, done, _ = env.step(action)
    epi_reward += reward
    if done:
        print("episode reward : {}".format(epi_reward))
        break
env.close()

In [None]:
video = io.open('./video/openaigym.video.%s.video000000.mp4' % env.file_infix, 'r+b').read()
encoded = base64.b64encode(video)
HTML(data='''
    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
.format(encoded.decode('ascii')))