In [1]:
import gym
import random
import numpy as np
import tensorflow as tf
import tensorflow.keras.layers as kl
import tensorflow.keras.optimizers as ko
from collections import deque
from tqdm import tqdm, notebook  # 학습 과정을 더 깔끔하게 보여주는 library 입니다.

In [2]:
# Build the environment and show the shape of its observations and actions.
env = gym.make('MountainCar-v0')
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)

# reset() has to be called at the start of every episode.
obs = env.reset()

# Draw one random action and apply it to the environment.
action = env.action_space.sample()
print("Sampled action: {}\n".format(action))
obs, reward, done, info = env.step(action)

# `info` may carry debugging information (it is an empty dict in the recorded
# output); `reward` is a scalar.
print("obs : {}\nreward : {}\ndone : {}\ninfo : {}\n".format(obs, reward, done, info))

# Roll out one complete episode with random actions.
# Most gym environments are driven with this same select / step / check loop.
obs = env.reset()
done = False
ep_reward = 0
while not done:
    action = env.action_space.sample()          # choose an action
    obs, reward, done, info = env.step(action)  # apply it to the environment
    ep_reward += reward                         # accumulate the episode return

env.close()
# The single step printed above returned a reward of -1.0, so the episode
# return reflects how many steps the episode lasted (sign flipped).
print("episode reward : ", ep_reward)

Observation space: Box(-1.2000000476837158, 0.6000000238418579, (2,), float32)
Action space: Discrete(3)
Sampled action: 0

obs : [-0.45583772 -0.00151584]
reward : -1.0
done : False
info : {}

episode reward :  -200.0


In [3]:
# Neural Network Model
class Model(tf.keras.Model):
    """Q-network: one 64-unit ReLU hidden layer followed by a linear output layer.

    The output layer has one unit per action, so a forward pass yields the
    estimated action-values Q(s, a) for every action.

    Args:
        num_actions: number of discrete actions (size of the output layer).
    """

    def __init__(self, num_actions):
        super().__init__()
        self.fc1 = kl.Dense(64, activation='relu', kernel_initializer='he_uniform')
        self.logits = kl.Dense(num_actions, name='q_values')

    def call(self, inputs):
        """Forward pass: map a batch of observations to a batch of Q-value rows."""
        hidden = self.fc1(inputs)
        return self.logits(hidden)

    def action_value(self, obs):
        """Return the greedy action(s) a* = argmax_a' Q(s, a').

        Args:
            obs: 2-D batch of observations, one row per state.

        Returns:
            An array of action indices when the batch has more than one row,
            otherwise a single scalar action index (so that it can be passed
            straight to ``env.step``).
        """
        # Call the model directly instead of going through predict(): predict()
        # sets up a batched inference pipeline on every call and is documented
        # as unsuitable for small inputs evaluated inside a loop, which is
        # exactly how this method is used (once per environment step).
        q_values = self(obs).numpy()
        best_action = np.argmax(q_values, axis=-1)
        return best_action if best_action.shape[0] > 1 else best_action[0]

class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions for experience replay.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, buffer_size):
        """Create an empty buffer that keeps at most ``buffer_size`` transitions."""
        self.buffer_size = buffer_size
        # Total number of transitions ever stored; keeps growing after the
        # deque is full, so it can exceed len(self.buffer).
        self.count = 0
        # Oldest transitions are dropped automatically once maxlen is reached.
        self.buffer = deque(maxlen=buffer_size)

    def __len__(self):
        """Number of transitions currently held."""
        return len(self.buffer)

    def store(self, s, a, r, next_s, d):
        """Append one transition (state, action, reward, next state, done flag)."""
        # Stored internally as (s, a, r, d, next_s); sample() undoes this order.
        self.buffer.append((s, a, r, d, next_s))
        self.count += 1

    def sample(self, batch_size):
        """Draw a random minibatch without replacement.

        Args:
            batch_size: requested number of transitions. If fewer are currently
                held, all of them are returned (in random order).

        Returns:
            Tuple ``(s_batch, a_batch, r_batch, s2_batch, d_batch)`` of NumPy
            arrays, each with one entry per sampled transition.

        Raises:
            ValueError: if the buffer is empty or ``batch_size`` is negative.
        """
        # Clamp against what is actually held, not against `count`: `count`
        # keeps increasing past the deque's capacity, so comparing with it let
        # random.sample() be asked for more items than exist whenever
        # batch_size > buffer_size.
        n_draw = min(batch_size, len(self.buffer))
        if n_draw == 0:
            raise ValueError("cannot sample from an empty replay buffer")
        batch = random.sample(self.buffer, n_draw)

        s_batch, a_batch, r_batch, d_batch, s2_batch = map(np.array, zip(*batch))
        return s_batch, a_batch, r_batch, s2_batch, d_batch

In [4]:
# Baseline before training: run a few episodes with uniformly random actions
# and report the mean / standard deviation of the episode returns.
n_episodes = 5
epi_rewards = []
for i in range(n_episodes):
    obs = env.reset()
    done = False
    epi_reward = 0.0
    while True:
        action = np.random.randint(3)  # one of the 3 discrete actions
        next_obs, reward, done, _ = env.step(action)
        epi_reward += reward
        obs = next_obs
        if done:
            break

    print("{} episode reward : {}".format(i, epi_reward))
    epi_rewards.append(epi_reward)

mean_reward = np.mean(epi_rewards)
std_reward = np.std(epi_rewards)

print(f"mean_reward : {mean_reward:.2f} +/- {std_reward:.2f}")

0 episode reward : -200.0
1 episode reward : -200.0
2 episode reward : -200.0
3 episode reward : -200.0
4 episode reward : -200.0
mean_reward : -200.00 +/- 0.00


In [5]:
# --- Exploration schedule ---
epsilon = 1.0           # initial value of epsilon
min_epsilon = .02       # lower bound for epsilon
epsilon_decay = 0.9997  # multiplicative factor applied at each epsilon update

# --- Training length and return discounting ---
train_nums = 50_000     # total number of environment steps used for training
gamma = 0.95            # discount factor
start_learning = 100    # gradient updates only start after this many steps

# --- Experience replay ---
buffer_size = 2000      # capacity of the replay buffer
batch_size = 64         # size of the transition minibatch drawn from the replay buffer

# --- Target network ---
target_update_iter = 200  # target-network update period, in steps

In [6]:
# DQN training from scratch, using a shaped reward for the replay buffer.
replay_buffer = ReplayBuffer(buffer_size)
network = Model(3)
target_network = Model(3)

obs = env.reset()

# A subclassed Keras model only creates its variables on the first forward
# pass. Before that, get_weights() is an empty list and the copy below would
# silently copy nothing, leaving the target network with its own random
# initialisation until the first periodic update. Build both networks first.
network.predict(np.atleast_2d(obs))
target_network.predict(np.atleast_2d(obs))
target_network.set_weights(network.get_weights())  # initialize target network weights

opt = ko.Adam(learning_rate=.0015)
network.compile(optimizer=opt, loss='mse')

epi_reward = 0.0
epi = 0  # number of episodes finished so far
epsilon = 1.0
avg_reward = deque(maxlen=10)  # returns of the 10 most recent episodes

for t in notebook.tqdm(range(1, train_nums + 1), desc='train with DQN'):
    # Epsilon update: decayed on every second step, never below min_epsilon.
    if epsilon > min_epsilon and t % 2 == 0:
        epsilon = max(epsilon * epsilon_decay, min_epsilon)

    #######################  step 1  #######################
    ####        Select action using epsilon-greedy       ###
    ########################################################
    if np.random.rand() < epsilon:
        # With probability epsilon, explore with a random action.
        action = env.action_space.sample()
    else:
        # Otherwise act greedily w.r.t. the online network. The forward pass
        # is only run when its result is actually used.
        # np.atleast_2d turns the single observation into a batch of one row.
        action = network.action_value(np.atleast_2d(obs))

    #######################  step 2  #######################
    #### Take step and store transition to replay buffer ###
    ########################################################
    # Execute the action in the env to get s' (next state), r and done.
    next_obs, reward, done, info = env.step(action)
    epi_reward += reward  # logged return uses the unshaped environment reward

    # Reward shaping (training signal only): add a bonus once the car's
    # position next_obs[0] exceeds -0.2, larger the further right it gets.
    if next_obs[0] > -0.2:
        if next_obs[0] >= 0.5:
            reward += 10
        elif next_obs[0] > 0.3:
            reward += 3
        else:
            reward += (next_obs[0] + 0.5)

    # An episode cut off by the TimeLimit wrapper is not a true terminal
    # state, so it must still bootstrap from s'. Older gym versions that do
    # not set the key fall back to the plain `done` flag.
    terminal = bool(done) and not info.get('TimeLimit.truncated', False)
    replay_buffer.store(obs, action, reward, next_obs, terminal)

    #######################  step 3  #######################
    ####     Train network (perform gradient descent)    ###
    ########################################################
    if t > start_learning and t % 5 == 0:
        s_batch, a_batch, r_batch, ns_batch, done_batch = replay_buffer.sample(batch_size)

        # Target value: r + gamma * max_a' Q_target(s', a'), with the
        # bootstrap term dropped for terminal transitions.
        next_q = np.amax(target_network.predict(ns_batch), axis=1)
        not_terminal = 1.0 - done_batch.astype(np.float32)
        target_q = r_batch + gamma * next_q * not_terminal

        # Regress only the Q-value of the action actually taken; the other
        # outputs keep their current predictions and so contribute zero loss.
        q_values = network.predict(s_batch)
        q_values[np.arange(len(a_batch)), a_batch] = target_q

        network.train_on_batch(s_batch, q_values)

    #######################  step 4  #######################
    ####             Update target network               ###
    ########################################################
    if t % target_update_iter == 0:
        # Assign the current network parameters to the target network.
        target_network.set_weights(network.get_weights())

    obs = next_obs  # s <- s'

    # If the episode ended, log progress and start a new one.
    if done:
        epi += 1  # number of episodes
        avg_reward.append(epi_reward)
        if epi % 10 == 0:
            print("[Episode {:>5}] avg reward: {:>7.2f}  --eps : {:>4.2f} --steps : {:>5}".format(epi, np.mean(avg_reward), epsilon, t))
        obs, done, epi_reward = env.reset(), False, 0.0  # environment reset
            

HBox(children=(FloatProgress(value=0.0, description='train with DQN', max=50000.0, style=ProgressStyle(descrip…




KeyboardInterrupt: 

In [12]:
# Restore previously saved weights into the current network and start from an
# empty replay buffer.
network.load_weights("MountainCar_first_touch")
replay_buffer = ReplayBuffer(buffer_size)

# Test after training: act greedily for a few episodes. Every transition is
# also written to the fresh replay buffer.
n_episodes = 10
epi_rewards = []
for i in range(n_episodes):
    obs = env.reset()
    done = False
    epi_reward = 0.0
    while True:
        action = network.action_value(np.atleast_2d(obs))  # greedy action
        next_obs, reward, done, _ = env.step(action)
        epi_reward += reward
        replay_buffer.store(obs, action, reward, next_obs, done)
        obs = next_obs
        if done:
            break

    print("{} episode reward : {}".format(i, epi_reward))
    epi_rewards.append(epi_reward)

mean_reward = np.mean(epi_rewards)
std_reward = np.std(epi_rewards)

print(f"mean_reward : {mean_reward:.2f} +/- {std_reward:.2f}")

0 episode reward : -153.0
1 episode reward : -143.0
2 episode reward : -165.0
3 episode reward : -171.0
4 episode reward : -143.0
5 episode reward : -200.0
6 episode reward : -200.0
7 episode reward : -200.0
8 episode reward : -200.0
9 episode reward : -142.0
mean_reward : -171.70 +/- 24.75


In [None]:
# Continue training the existing `network` with a small initial epsilon and
# the raw environment reward (no reward shaping in this run).
obs = env.reset()

# Make sure both models have created their variables before copying weights:
# a subclassed Keras model has none until its first forward pass.
network.predict(np.atleast_2d(obs))
target_network.predict(np.atleast_2d(obs))
target_network.set_weights(network.get_weights())  # initialize target network weights

epi_reward = 0.0
epi = 0  # number of episodes finished so far
epsilon = 0.1
avg_reward = deque(maxlen=10)  # returns of the 10 most recent episodes

for t in notebook.tqdm(range(1, train_nums + 1), desc='train with DQN'):
    # Epsilon update: decayed on every step, never below min_epsilon.
    if epsilon > min_epsilon:
        epsilon = max(epsilon * epsilon_decay, min_epsilon)

    #######################  step 1  #######################
    ####        Select action using epsilon-greedy       ###
    ########################################################
    if np.random.rand() < epsilon:
        # With probability epsilon, explore with a random action.
        action = env.action_space.sample()
    else:
        # Otherwise act greedily w.r.t. the online network. The forward pass
        # is only run when its result is actually used.
        # np.atleast_2d turns the single observation into a batch of one row.
        action = network.action_value(np.atleast_2d(obs))

    #######################  step 2  #######################
    #### Take step and store transition to replay buffer ###
    ########################################################
    # Execute the action in the env to get s' (next state), r and done.
    next_obs, reward, done, info = env.step(action)
    epi_reward += reward

    # An episode cut off by the TimeLimit wrapper is not a true terminal
    # state, so it must still bootstrap from s'. Older gym versions that do
    # not set the key fall back to the plain `done` flag.
    terminal = bool(done) and not info.get('TimeLimit.truncated', False)
    replay_buffer.store(obs, action, reward, next_obs, terminal)

    #######################  step 3  #######################
    ####     Train network (perform gradient descent)    ###
    ########################################################
    if t > start_learning and t % 5 == 0:
        s_batch, a_batch, r_batch, ns_batch, done_batch = replay_buffer.sample(batch_size)

        # Target value: r + gamma * max_a' Q_target(s', a'), with the
        # bootstrap term dropped for terminal transitions.
        next_q = np.amax(target_network.predict(ns_batch), axis=1)
        not_terminal = 1.0 - done_batch.astype(np.float32)
        target_q = r_batch + gamma * next_q * not_terminal

        # Regress only the Q-value of the action actually taken; the other
        # outputs keep their current predictions and so contribute zero loss.
        q_values = network.predict(s_batch)
        q_values[np.arange(len(a_batch)), a_batch] = target_q

        network.train_on_batch(s_batch, q_values)

    #######################  step 4  #######################
    ####             Update target network               ###
    ########################################################
    if t % target_update_iter == 0:
        # Assign the current network parameters to the target network.
        target_network.set_weights(network.get_weights())

    obs = next_obs  # s <- s'

    # If the episode ended, log progress and start a new one.
    if done:
        epi += 1  # number of episodes
        avg_reward.append(epi_reward)
        if epi % 10 == 0:
            print("[Episode {:>5}] avg reward: {:>7.2f}  --eps : {:>4.2f} --steps : {:>5}".format(epi, np.mean(avg_reward), epsilon, t))
        obs, done, epi_reward = env.reset(), False, 0.0  # environment reset
            

HBox(children=(FloatProgress(value=0.0, description='train with DQN', max=50000.0, style=ProgressStyle(descrip…

[Episode    10] avg reward: -200.00  --eps : 0.05 --steps :  2000
[Episode    20] avg reward: -199.50  --eps : 0.03 --steps :  3995
[Episode    30] avg reward: -189.20  --eps : 0.02 --steps :  5887
[Episode    40] avg reward: -192.50  --eps : 0.02 --steps :  7812


In [11]:
# Fresh network with the same optimizer / loss configuration as in training.
network = Model(3)
opt = ko.Adam(learning_rate=.0015)
network.compile(optimizer=opt, loss='mse')

# Run one greedy-action query, one prediction and one training step on a
# single observation before loading the checkpoint.
# NOTE(review): presumably this is done so the model's variables exist before
# the HDF5 weights are loaded - confirm.
obs = env.reset()
obs_batch = np.atleast_2d(obs)
action = network.action_value(obs_batch)
predicted = network.predict(obs_batch)
network.train_on_batch(obs_batch, predicted)

network.load_weights('MountainCar-trained-dqn.h5')

# Test after training: greedy policy over several episodes.
n_episodes = 10
epi_rewards = []
for i in range(n_episodes):
    obs = env.reset()
    done = False
    epi_reward = 0.0
    while True:
        action = network.action_value(np.atleast_2d(obs))
        next_obs, reward, done, _ = env.step(action)
        epi_reward += reward
        obs = next_obs
        if done:
            break

    print("{} episode reward : {}".format(i, epi_reward))
    epi_rewards.append(epi_reward)

mean_reward = np.mean(epi_rewards)
std_reward = np.std(epi_rewards)

print(f"mean_reward : {mean_reward:.2f} +/- {std_reward:.2f}")

0 episode reward : -159.0
1 episode reward : -181.0
2 episode reward : -157.0
3 episode reward : -181.0
4 episode reward : -159.0
5 episode reward : -97.0
6 episode reward : -92.0
7 episode reward : -157.0
8 episode reward : -100.0
9 episode reward : -92.0
mean_reward : -137.50 +/- 35.58


In [None]:
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML

# Record one episode to ./video using gym's Monitor wrapper.
env = gym.make('MountainCar-v0')
env = Monitor(env, './video', force=True)  # force=True clears earlier recordings
epi_reward = 0
obs = env.reset()
try:
    done = False
    while not done:
        # To record the trained policy instead of a random one, use:
        #   action = network.action_value(np.atleast_2d(obs))
        # Sample over the environment's full action set; a hard-coded
        # randint(2) could never pick the third action of this 3-action env.
        action = np.random.randint(env.action_space.n)
        obs, reward, done, _ = env.step(action)
        epi_reward += reward
    print("episode reward : {}".format(epi_reward))
finally:
    # Always close the environment, even if stepping raised, so the Monitor
    # wrapper can shut down its recording.
    env.close()

In [None]:
# Embed the recorded episode in the notebook as a base64 data URI.
video_path = './video/openaigym.video.%s.video000000.mp4' % env.file_infix
# Read-only binary mode is sufficient here, and the context manager closes the
# file handle instead of leaving it open.
with open(video_path, 'rb') as video_file:
    video = video_file.read()
encoded = base64.b64encode(video)
HTML(data='''
    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
.format(encoded.decode('ascii')))