In [5]:
import copy
import pylab
import numpy as np
import sys
if "../" not in sys.path:
    sys.path.append("../") 
from keras.layers import Dense, Input
from keras.optimizers import Adam
from keras.models import Sequential, Model
from keras import backend as K
import gym
import threading
import time
import tensorflow as tf
import keras

In [6]:
global episode
episode = 0
global scores
scores = []
EPISODES = 1000
env_name = "CartPole-v1"

In [7]:
class A3CGobal:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.value_size = 1
        self.actor_lr = 0.001
        self.critic_lr = 0.005
        self.discount_factor = 0.99
        
        # 스레드 갯수
        self.num_thread = 2
        
        # Global_network의 actor /  critic
        self.g_actor = self.build_actor()
        self.g_critic = self.build_critic()
        
        self.g_actor_opt = self.actor_optimizer()
        self.g_critic_opt = self.critic_optimizer()
        
    def build_actor(self):
        input_layer = Input(shape=(self.state_size,))
        x = Dense(24, activation='relu',
                 kernel_initializer='he_uniform')(input_layer)
        output = Dense(self.action_size, activation='softmax',
                      kernel_initializer='he_uniform')(x)
        
        model = Model(inputs=input_layer, outputs=output)
        
        model._make_predict_function()
        
        return model
    
    def build_critic(self):
        model = Sequential()
        model.add(Dense(24, input_shape=(self.state_size, ), activation='relu',
                       kernel_initializer='he_uniform'))
        model.add(Dense(12, activation='relu',
                       kernel_initializer='he_uniform'))
        model.add(Dense(self.value_size, activation='linear',
                       kernel_initializer='he_uniform'))
        
        model._make_predict_function()
        
        return model
        
    def actor_optimizer(self):
        action = K.placeholder(shape=[None, self.action_size])
        advantage = K.placeholder(shape=[None, ])
        #discounted_rewards = K.placeholder(shape=[None, ])
        
        # 크로스 엔트로피 오류함수 계산
        action_prob = K.sum(action * self.g_actor.output, axis=1)
        cross_entropy = K.log(action_prob) * advantage
        loss = -K.sum(cross_entropy)
        
        # 정책신경망을 업데이트하는 훈련함수 생성
        optimizer = Adam(lr=self.actor_lr)
        updates = optimizer.get_updates(self.g_actor.trainable_weights, [], loss)
        train = K.function([self.g_actor.input, action, advantage], [],
                           updates=updates)

        return train
    
    def critic_optimizer(self):
        target = K.placeholder(shape=[None, ])
        loss = K.mean(K.square(target - self.g_critic.output))
        optimizer = Adam(lr=self.critic_lr)
        updates = optimizer.get_updates(self.g_critic.trainable_weights, [], loss)
        train = K.function([self.g_critic.input, target], [],
                           updates=updates)
        
        return train
    
    def train(self):
        actor_learners = [ActorLearner(self.state_size, self.action_size, self.value_size,
                                      self.g_actor, self.g_critic,
                                      self.g_actor_opt, self.g_critic_opt,
                                      self.discount_factor)
                         for _ in range(self.num_thread)]
        
        # 각 스레드 시작
        for actor_learner in actor_learners:
            time.sleep(1)
            actor_learner.start()

In [8]:
class ActorLearner(threading.Thread):
    def __init__(self, state_size, action_size, value_size, global_actor, global_critic,
                g_actor_opt, g_critic_opt, discount_factor):
        threading.Thread.__init__(self)
        
        self.discount_factor = discount_factor
        self.state_size = state_size
        self.action_size = action_size
        self.value_size = value_size
        
        # Global_network의 actor /  critic
        self.g_actor = global_actor
        self.g_critic = global_critic
        
        self.g_actor_opt = g_actor_opt
        self.g_critic_opt = g_critic_opt
        
        # local model
        self.local_actor = self.build_local_actor()
        self.local_critic = self.build_local_critic()
        
        # 지정된 타임스텝 동안 샘플을 저장할 리스트
        self.states, self.actions, self.rewards = [], [], []
        
        # 모델 업데이트 주기
        self.t = 0
        self.t_max = 20
    
    def build_local_actor(self):
        input_layer = Input(shape=(self.state_size,))
        x = Dense(24, activation='relu',
                 kernel_initializer='he_uniform')(input_layer)
        output = Dense(self.action_size, activation='softmax',
                      kernel_initializer='he_uniform')(x)
        
        local_actor = Model(inputs=input_layer, outputs=output)
        
        local_actor._make_predict_function()
        local_actor.set_weights(self.g_actor.get_weights())
        
        return local_actor
    
    def build_local_critic(self):
        local_critic = Sequential()
        local_critic.add(Dense(24, input_shape=(self.state_size, ), activation='relu',
                       kernel_initializer='he_uniform'))
        local_critic.add(Dense(12, activation='relu',
                       kernel_initializer='he_uniform'))
        local_critic.add(Dense(self.value_size, activation='linear',
                       kernel_initializer='he_uniform'))
        
        local_critic._make_predict_function()
        local_critic.set_weights(self.g_critic.get_weights())
        
        return local_critic
    
    def train_global_model(self):
        
        states = np.array(self.states)
        actions = np.array(self.actions)
        values = self.local_critic.predict(states)[0]
        discounted_rewards = self.discount_rewards(self.rewards)
        advantages = discounted_rewards - values
        
        self.g_actor_opt([states, actions, advantages])
        self.g_critic_opt([states, discounted_rewards])
        
        self.states, self.actions, self.rewards = [], [], []
    
    def discount_rewards(self, rewards):
        discounted_rewards = np.zeros_like(rewards)
        running_add = 0
        for t in reversed(range(0, len(rewards))):
            running_add = running_add * self.discount_factor + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards
    
    
    def update_local_model(self):
        self.local_actor.set_weights(self.g_actor.get_weights())
        self.local_critic.set_weights(self.g_critic.get_weights())
    
    
    def get_action(self, state):
        policy = self.local_actor.predict(state)[0]
        return np.random.choice(self.action_size, 1, p=policy)[0]
    
    
    def append_sample(self, state, action, reward):
        self.states.append(state[0])
        self.rewards.append(reward)
        act = np.zeros(self.action_size)
        act[action] = 1
        self.actions.append(act)
    
    def run(self):
        global episode
        env = gym.make(env_name)
        
        step = 0
        
        while episode < EPISODES:
            done = False
            score = 0
            state = env.reset()
            state = np.reshape(state, [1,4])

            while not done:
                #env.render()
                step += 1
                self.t += 1
                action = self.get_action(state)
                next_state, reward, done, _ = env.step(action)
                next_state = np.reshape(next_state, [1,4])
                reward = reward if not done or score == 499 else -100
                self.append_sample(state, action, reward)

                score += reward
                state = next_state
                
                if self.t >= self.t_max or done:
                    self.train_global_model()
                    self.update_local_model()
                    self.t = 0
                    
                if done:
                    episode += 1
                    score = score if score == 500.0 else score + 100
                    scores.append(score)
                    print("Episode: ", episode, " Score: ", score, " step: ", step)
                    step = 0
                    
                    if np.mean(scores[-min(10,len(scores)):]) > 490:
                        sys.exit()

In [9]:
if __name__ == "__main__":
    global_agent = A3CGobal(state_size=4, action_size=2)
    global_agent.train()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Episode:  1  Score:  12.0  step:  13
Episode:  2  Score:  25.0  step:  26
Episode:  3  Score:  8.0  step:  9
Episode:  4  Score:  14.0  step:  15
Episode:  5  Score:  8.0  step:  9
Episode:  6  Score:  17.0  step:  18
Episode:  7  Score:  13.0  step:  14
Episode:  8  Score:  8.0  step:  9
Episode:  9  Score:  22.0  step:  23
Episode:  10  Score:  14.0  step:  15
Episode:  11  Score:  11.0  step:  12
Episode:  12  Score:  16.0  step:  17
Episode:  13  Score:  9.0  step:  10
Episode:  14  Score:  22.0  step:  23
Episode:  15  Score:  33.0  step:  34
Episode:  16  Score:  17.0  step:  18
Episode:  17  Score:  9.0  step:  10
Episode:  18  Score:  19.0  step:  20
Episode:  19  Score:  32.0  step:  33
Episode:  20  Score:  9.0  step:  10
Episode:  21  Score:  7.0  step:  8
Episode:  22  Score:  10.0  step:  11
Episode:  23  Score:  27.0  step:  28
Episode:  24  Score:  19.0

Episode:  201  Score:  18.0  step:  19
Episode:  202  Score:  11.0  step:  12
Episode:  203  Score:  12.0  step:  13
Episode:  204  Score:  34.0  step:  35
Episode:  205  Score:  32.0  step:  33
Episode:  206  Score:  15.0  step:  16
Episode:  207  Score:  17.0  step:  18
Episode:  208  Score:  16.0  step:  17
Episode:  209  Score:  20.0  step:  21
Episode:  210  Score:  38.0  step:  39
Episode:  211  Score:  18.0  step:  19
Episode:  212  Score:  10.0  step:  11
Episode:  213  Score:  9.0  step:  10
Episode:  214  Score:  14.0  step:  15
Episode:  215  Score:  36.0  step:  37
Episode:  216  Score:  40.0  step:  41
Episode:  217  Score:  28.0  step:  29
Episode:  218  Score:  24.0  step:  25
Episode:  219  Score:  13.0  step:  14
Episode:  220  Score:  10.0  step:  11
Episode:  221  Score:  24.0  step:  25
Episode:  222  Score:  25.0  step:  26
Episode:  223  Score:  15.0  step:  16
Episode:  224  Score:  25.0  step:  26
Episode:  225  Score:  45.0  step:  46
Episode:  226  Score:  14.

Episode:  411  Score:  98.0  step:  99
Episode:  412  Score:  111.0  step:  112
Episode:  413  Score:  186.0  step:  187
Episode:  414  Score:  145.0  step:  146
Episode:  415  Score:  13.0  step:  14
Episode:  416  Score:  226.0  step:  227
Episode:  417  Score:  232.0  step:  233
Episode:  418  Score:  282.0  step:  283
Episode:  419  Score:  213.0  step:  214
Episode:  420  Score:  212.0  step:  213
Episode:  421  Score:  220.0  step:  221
Episode:  422  Score:  87.0  step:  88
Episode:  423  Score:  95.0  step:  96
Episode:  424  Score:  168.0  step:  169
Episode:  425  Score:  256.0  step:  257
Episode:  426  Score:  138.0  step:  139
Episode:  427  Score:  113.0  step:  114
Episode:  428  Score:  105.0  step:  106
Episode:  429  Score:  96.0  step:  97
Episode:  430  Score:  77.0  step:  78
Episode:  431  Score:  83.0  step:  84
Episode:  432  Score:  77.0  step:  78
Episode:  433  Score:  51.0  step:  52
Episode:  434  Score:  134.0  step:  135
Episode:  435  Score:  211.0  step

Episode:  613  Score:  251.0  step:  252
Episode:  614  Score:  231.0  step:  232
Episode:  615  Score:  184.0  step:  185
Episode:  616  Score:  246.0  step:  247
Episode:  617  Score:  212.0  step:  213
Episode:  618  Score:  209.0  step:  210
Episode:  619  Score:  312.0  step:  313
Episode:  620  Score:  279.0  step:  280
Episode:  621  Score:  254.0  step:  255
Episode:  622  Score:  191.0  step:  192
Episode:  623  Score:  190.0  step:  191
Episode:  624  Score:  136.0  step:  137
Episode:  625  Score:  210.0  step:  211
Episode:  626  Score:  148.0  step:  149
Episode:  627  Score:  241.0  step:  242
Episode:  628  Score:  141.0  step:  142
Episode:  629  Score:  143.0  step:  144
Episode:  630  Score:  367.0  step:  368
Episode:  631  Score:  179.0  step:  180
Episode:  632  Score:  142.0  step:  143
Episode:  633  Score:  187.0  step:  188
Episode:  634  Score:  296.0  step:  297
Episode:  635  Score:  178.0  step:  179
Episode:  636  Score:  185.0  step:  186
Episode:  637  S

Episode:  814  Score:  213.0  step:  214
Episode:  815  Score:  359.0  step:  360
Episode:  816  Score:  210.0  step:  211
Episode:  817  Score:  227.0  step:  228
Episode:  818  Score:  236.0  step:  237
Episode:  819  Score:  192.0  step:  193
Episode:  820  Score:  304.0  step:  305
Episode:  821  Score:  242.0  step:  243
Episode:  822  Score:  344.0  step:  345
Episode:  823  Score:  201.0  step:  202
Episode:  824  Score:  197.0  step:  198
Episode:  825  Score:  285.0  step:  286
Episode:  826  Score:  211.0  step:  212
Episode:  827  Score:  330.0  step:  331
Episode:  828  Score:  206.0  step:  207
Episode:  829  Score:  333.0  step:  334
Episode:  830  Score:  371.0  step:  372
Episode:  831  Score:  340.0  step:  341
Episode:  832  Score:  302.0  step:  303
Episode:  833  Score:  233.0  step:  234
Episode:  834  Score:  294.0  step:  295
Episode:  835  Score:  269.0  step:  270
Episode:  836  Score:  245.0  step:  246
Episode:  837  Score:  345.0  step:  346
Episode:  838  S