In [8]:
import copy
import pylab
import numpy as np
import sys
if "../" not in sys.path:
    sys.path.append("../") 
from keras.layers import Dense, Input
from keras.optimizers import Adam
from keras.models import Sequential, Model
from keras import backend as K
import gym
import threading
import time

In [9]:
global episode
episode = 0
EPISODES = 500
env_name = "CartPole-v1"

In [50]:
class A3CGobal:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.value_size = 1
        self.actor_lr = 0.001
        self.critic_lr = 0.005
        self.discount_factor = 0.99
        
        # 스레드 갯수
        self.num_thread = 2
        
        # Global_network의 actor /  critic
        self.g_actor = self.build_actor()
        self.g_critic = self.build_critic()
        
        self.g_actor_opt = self.actor_optimizer()
        self.g_critic_opt = self.critic_optimizer()
        
    def build_actor(self):
        input_layer = Input(shape=(self.state_size,))
        x = Dense(24, activation='relu',
                 kernel_initializer='he_uniform')(input_layer)
        output = Dense(self.action_size, activation='softmax',
                      kernel_initializer='he_uniform')(x)
        
        model = Model(inputs=input_layer, outputs=output)
        
        model._make_predict_function()
        
        return model
    
    def build_critic(self):
        model = Sequential()
        model.add(Dense(24, input_shape=(self.state_size, ), activation='relu',
                       kernel_initializer='he_uniform'))
        model.add(Dense(12, activation='relu',
                       kernel_initializer='he_uniform'))
        model.add(Dense(self.value_size, activation='linear',
                       kernel_initializer='he_uniform'))
        
        model._make_predict_function()
        
        return model
        
    def actor_optimizer(self):
        action = K.placeholder(shape=[None, self.action_size])
        td_error = K.placeholder(shape=[None, ])
        #discounted_rewards = K.placeholder(shape=[None, ])
        
        # 크로스 엔트로피 오류함수 계산
        action_prob = K.sum(action * self.g_actor.output, axis=1)
        cross_entropy = K.log(action_prob) * td_error
        loss = -K.sum(cross_entropy)
        
        # 정책신경망을 업데이트하는 훈련함수 생성
        optimizer = Adam(lr=self.actor_lr)
        updates = optimizer.get_updates(self.g_actor.trainable_weights, [], loss)
        train = K.function([self.g_actor.input, action, td_error], [],
                           updates=updates)

        return train
    
    def critic_optimizer(self):
        target = K.placeholder(shape=[None, ])
        loss = K.mean(K.square(target - self.g_critic.output))
        optimizer = Adam(lr=self.critic_lr)
        updates = optimizer.get_updates(self.g_critic.trainable_weights, [], loss)
        train = K.function([self.g_critic.input, target], [],
                           updates=updates)
        
        return train
    
    def train(self):
        actor_learners = [ActorLearner(self.state_size, self.action_size, self.value_size,
                                      self.g_actor, self.g_critic,
                                      self.g_actor_opt, self.g_critic_opt,
                                      self.discount_factor)
                         for _ in range(self.num_thread)]
        
        # 각 스레드 시작
        for actor_learner in actor_learners:
            time.sleep(1)
            actor_learner.start()

In [51]:
class ActorLearner(threading.Thread):
    def __init__(self, state_size, action_size, value_size, global_actor, global_critic,
                g_actor_opt, g_critic_opt, discount_factor):
        threading.Thread.__init__(self)
        
        self.discount_factor = discount_factor
        self.state_size = state_size
        self.action_size = action_size
        self.value_size = value_size
        
        # Global_network의 actor /  critic
        self.g_actor = global_actor
        self.g_critic = global_critic
        
        self.g_actor_opt = g_actor_opt
        self.g_critic_opt = g_critic_opt
        
        # local model
        self.local_actor = self.build_local_actor()
        self.local_critic = self.build_local_critic()
        
        # 지정된 타임스텝 동안 샘플을 저장할 리스트
        self.states, self.actions, self.rewards = [], [], []
        
        # 모델 업데이트 주기
        self.t = 0
        self.t_max = 20
    
    def build_local_actor(self):
        input_layer = Input(shape=(self.state_size,))
        x = Dense(24, activation='relu',
                 kernel_initializer='he_uniform')(input_layer)
        output = Dense(self.action_size, activation='softmax',
                      kernel_initializer='he_uniform')(x)
        
        local_actor = Model(inputs=input_layer, outputs=output)
        
        local_actor._make_predict_function()
        local_actor.set_weights(self.g_actor.get_weights())
        
        return local_actor
    
    def build_local_critic(self):
        local_critic = Sequential()
        local_critic.add(Dense(24, input_shape=(self.state_size, ), activation='relu',
                       kernel_initializer='he_uniform'))
        local_critic.add(Dense(12, activation='relu',
                       kernel_initializer='he_uniform'))
        local_critic.add(Dense(self.value_size, activation='linear',
                       kernel_initializer='he_uniform'))
        
        local_critic._make_predict_function()
        local_critic.set_weights(self.g_critic.get_weights())
        
        return local_critic
    
    def train_global_model(self):
        
        states = np.array(self.states)
        actions = np.array(self.actions)
        values = self.local_critic.predict(states)[0]
        discounted_rewards = self.discount_rewards(self.rewards)
        advantages = discounted_rewards - value
        
        self.g_actor_opt([states, actions, advantages])
        self.g_critic_opt([states, discounted_rewards])
        
        self.states, self.actions, self.rewards = [], [], []
    
    def discount_rewards(self, rewards):
        discounted_rewards = np.zeros_like(rewards)
        running_add = 0
        for t in reversed(range(0, len(rewards))):
            running_add = running_add * self.discount_factor + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards
    
    
    def update_local_model(self):
        self.local_actor.set_weights(self.g_actor.get_weights())
        self.local_critic.set_weights(self.g_critic.get_weights())
    
    
    def get_action(self, state):
        policy = self.local_actor.predict(state)[0]
        return np.random.choice(self.action_size, 1, p=policy)[0]
    
    
    def append_sample(self, state, action, reward):
        self.states.append(state[0])
        self.rewards.append(reward)
        act = np.zeros(self.action_size)
        act[action] = 1
        self.actions.append(act)
    
    def run(self):
        global episode
        env = gym.make(env_name)
        
        step = 0
        
        while episode < EPISODES:
            done = False
            score = 0
            state = env.reset()
            state = np.reshape(state, [1,4])

            while not done:
                #env.render()
                print("!!!!")
                step += 1
                self.t += 1
                action = self.get_action(state)
                next_state, reward, done, _ = env.step(action)
                next_state = np.reshape(next_state, [1,4])
                reward = reward if not done or score == 499 else -100
                agent.append_sample(state, action, reward)

                score += reward
                state = next_state
                
                if self.t >= self.t_max or done:
                    self.train_global_model()
                    self.update_local_model()
                    self.t = 0
                    
                if done:
                    episode += 1
                    score = score if score == 500.0 else score + 100
                    print("Episode: ", episode, " Score: ", score, " step: ", step)
                    step = 0

In [52]:
if __name__ == "__main__":
    global_agent = A3CGobal(state_size=4, action_size=2)
    global_agent.train()

!!!!


Exception in thread Thread-56:
Traceback (most recent call last):
  File "C:\Users\gusgk\Anaconda3\lib\threading.py", line 916, in _bootstrap_inner
    self.run()
  File "<ipython-input-51-7d0aa3a18687>", line 113, in run
    action = self.get_action(state)
  File "<ipython-input-51-7d0aa3a18687>", line 85, in get_action
    policy = self.local_actor.predict(state)[0]
  File "C:\Users\gusgk\Anaconda3\lib\site-packages\keras\engine\training.py", line 1462, in predict
    callbacks=callbacks)
  File "C:\Users\gusgk\Anaconda3\lib\site-packages\keras\engine\training_arrays.py", line 324, in predict_loop
    batch_outs = f(ins_batch)
  File "C:\Users\gusgk\Anaconda3\lib\site-packages\tensorflow\python\keras\backend.py", line 3073, in __call__
    self._make_callable(feed_arrays, feed_symbols, symbol_vals, session)
  File "C:\Users\gusgk\Anaconda3\lib\site-packages\tensorflow\python\keras\backend.py", line 3019, in _make_callable
    callable_fn = session._make_callable_from_options(callable

!!!!


Exception in thread Thread-57:
Traceback (most recent call last):
  File "C:\Users\gusgk\Anaconda3\lib\threading.py", line 916, in _bootstrap_inner
    self.run()
  File "<ipython-input-51-7d0aa3a18687>", line 113, in run
    action = self.get_action(state)
  File "<ipython-input-51-7d0aa3a18687>", line 85, in get_action
    policy = self.local_actor.predict(state)[0]
  File "C:\Users\gusgk\Anaconda3\lib\site-packages\keras\engine\training.py", line 1462, in predict
    callbacks=callbacks)
  File "C:\Users\gusgk\Anaconda3\lib\site-packages\keras\engine\training_arrays.py", line 324, in predict_loop
    batch_outs = f(ins_batch)
  File "C:\Users\gusgk\Anaconda3\lib\site-packages\tensorflow\python\keras\backend.py", line 3073, in __call__
    self._make_callable(feed_arrays, feed_symbols, symbol_vals, session)
  File "C:\Users\gusgk\Anaconda3\lib\site-packages\tensorflow\python\keras\backend.py", line 3019, in _make_callable
    callable_fn = session._make_callable_from_options(callable

In [34]:
env = gym.make(env_name)
state1 = env.reset()
state1

array([ 0.00674005, -0.04818258, -0.01514307,  0.02286146])

In [35]:
input_layer = Input(shape=(4,))
x = Dense(24, activation='relu',
         kernel_initializer='he_uniform')(input_layer)
output = Dense(2, activation='softmax',
              kernel_initializer='he_uniform')(x)

model1 = Model(inputs=input_layer, outputs=output)

In [38]:
state1 = np.reshape(state1, [1,4])
a=model1.predict(state1)[0]

In [47]:
np.random.choice(2, 1, p=a)[0]

1