In [1]:
import copy
import pylab
import numpy as np
from environment import Env
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from keras import backend as K
import tensorflow as tf

Using TensorFlow backend.


In [2]:
EPISODES = 2500

In [3]:
class ReinforceAgent:
    def __init__(self):
        self.load_model = False
        # 가능한 모든 행동 정의
        self.action_space = [0, 1, 2, 3, 4]
        # 상태와 행동의 크기 정의
        self.action_size = len(self.action_space)
        self.state_size = 15
        self.discount_factor = 0.99 
        self.learning_rate = 0.001

        self.model = self.build_model()
        self.optimizer = self.optimizer()
        self.states=[]
        self.actions=[]
        self.rewards =[]

        if self.load_model:
            self.model.load_weights('./save_model/reinforce_trained.h5')
    
    # 상태가 입력, 각 행동의 확률이 출력인 인공신경망 생성
    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='softmax'))
        model.summary()
        return model
    
    # 정책신경망을 업데이트 하기 위한 오류함수와 훈련함수의 생성
    def optimizer(self):
        with tf.GradientTape() as tape:
        # 크로스 엔트로피 오류함수 계산
            action_prob = K.sum(self.actions * self.model.output, axis=1)
            cross_entropy = K.log(action_prob) * self.discount_reward(self.rewards)
            loss = -K.sum(cross_entropy)
            grads=tape.gradient(loss,self.model.trainable_variables)
        # 정책신경망을 업데이트하는 훈련함수 생성
        optimizer = keras.optimizers.Adam(lr=self.learning_rate)
        optimizer.apply_gradients(zip(grads,self.model.trainable_variables))

        #return train

    # 정책신경망으로 행동 선택
    def get_action(self, state):
        policy = self.model.predict(state)[0]
        return np.random.choice(self.action_size, 1, p=policy)[0]
    
    # 반환값 계산
    def discount_rewards(self, rewards):
        discounted_rewards = np.zeros_like(rewards)
        running_add = 0
        for t in reversed(range(0, len(rewards))):
            running_add = running_add * self.discount_factor + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards
    
    # 한 에피소드 동안의 상태, 행동, 보상을 저장
    def append_sample(self, state, action, reward):
        self.states.append(state[0])
        self.rewards.append(reward)
        act = np.zeros(self.action_size)
        act[action] = 1
        self.actions.append(act)

    # 정책신경망 업데이트
    def train_model(self):
        discounted_rewards = np.float32(self.discount_rewards(self.rewards))
        discounted_rewards -= np.mean(discounted_rewards)
        discounted_rewards /= np.std(discounted_rewards)

        self.optimizer([self.states, self.actions, discounted_rewards])
        self.states, self.actions, self.rewards = [], [], []

In [4]:
env = Env()
agent = ReinforceAgent()

global_step = 0
scores, episodes = [], []

for e in range(EPISODES):
    done = False
    score = 0
        # env 초기화
    state = env.reset()
    state = np.reshape(state, [1, 15])

    while not done:
        global_step += 1
            # 현재 상태에 대한 행동 선택
        action = agent.get_action(state)
            # 선택한 행동으로 환경에서 한 타임스탭 진행 후 샘플 수집
        next_state, reward, done = env.step(action)
        next_state = np.reshape(next_state, [1, 15])

        agent.append_sample(state, action, reward)
        score += reward
        state = copy.deepcopy(next_state)

        if done:
                # 에피소드마다 정책신경망 업데이트
            agent.train_model()
            scores.append(score)
            episodes.append(e)
            score = round(score,2)
            print("episode:", e, "  score:", score, "  time_step:",
                      global_step)

        # 100 에피소드마다 학습 결과 출력 및 모델 저장
    if e % 100 == 0:
        pylab.plot(episodes, scores, 'b')
        pylab.savefig("./save_graph/reinforce.png")

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 24)                384       
_________________________________________________________________
dense_2 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 125       
Total params: 1,109
Trainable params: 1,109
Non-trainable params: 0
_________________________________________________________________


AttributeError: 'ReinforceAgent' object has no attribute 'actions'