# A2C (Advantage Actor Critic)
---
## 환경
* 지속적인 보상
* 잘 안되는 듯 했지만 epsilon 조절을 하니 잘 된다.
* decaying epsilon 매우 좋다??
* 모험을 하는 것은 가끔 폭망할 수도 있지만 장기적으로 더 잘될 수 있음.
* 그러나 결과가 상황에 따라 너무 다르다.
* 잘될때는 꾸준하게 올라가고 안될때는 처음부터 망하거나 잘되다가 쭉 망함.
* 미친듯이 안되는거 같은데 쭉 놔두면 될때도 있다.
* 신경망 사이즈에 따라 학습할 수 있는 횟수 영향받는 듯
* 어쩔 수 없다. 확률적인 거라서 핸들링할 수 없다.

In [274]:
import numpy as np
import tensorflow as tf
import gym
from gym.envs.registration import register
from keras.layers import Dense
from keras.models import Sequential
from keras import backend as K
from keras.optimizers import Adam
import random

In [293]:
# Create the environment: Gym's 'CartPole-v1'.
env = gym.make('CartPole-v1')

In [314]:
# Run configuration shared by the cells below.
# NOTE(review): save_file is presumably a checkpoint path on the original
# author's Windows machine; nothing in the visible cells reads it.
save_file = 'C:\\Users\\김민수\\Documents\\GitHub\\RL\\vars\\cartpole_pg'
total_episode = 1000  # episode budget for the training loop
gamma = 0.95          # discount factor
lr = 0.01             # learning rate handed to both Adam optimizers
epsilon = 1           # initial probability of taking a random action

In [315]:
# Actor network: four bias-free matmul layers with no hidden activations,
# followed by a sigmoid, giving one output per discrete action.
size_in = env.observation_space.shape[0]   # length of the observation vector
size_out = env.action_space.n              # number of discrete actions
size_w1, size_w2, size_w3 = 10, 16, 8      # hidden layer widths

# Inputs: a scalar-like advantage signal and a batch of observations.
Advantage = tf.placeholder(tf.float32)
STATE_IN = tf.placeholder(tf.float32, [None, size_in])

# Weight matrices, drawn from a narrow normal distribution.
A_1 = tf.Variable(tf.random_normal([size_in, size_w1], stddev=0.01), name='A_1')
A_2 = tf.Variable(tf.random_normal([size_w1, size_w2], stddev=0.01), name='A_2')
A_3 = tf.Variable(tf.random_normal([size_w2, size_w3], stddev=0.01), name='A_3')
out = tf.Variable(tf.random_normal([size_w3, size_out], stddev=0.01), name='out')

# Forward pass.
L_1 = tf.matmul(STATE_IN, A_1)
L_2 = tf.matmul(L_1, A_2)
L_3 = tf.matmul(L_2, A_3)
L_out = tf.sigmoid(tf.matmul(L_3, out))

# With policy gradient the network output is used directly as the policy (pi).
# NOTE(review): the sigmoid outputs are not normalised across actions, and the
# loss averages log(L_out) over every action output rather than only the
# action that was taken. Changing that needs an action input fed by the
# training loop, so it is left as-is here.
loss = tf.negative(tf.reduce_mean(tf.multiply(tf.log(L_out), Advantage)))
train = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

In [316]:
# Critic network: four bias-free matmul layers mapping an observation to a
# single state-value estimate, trained by regression towards `targetV`.
targetV = tf.placeholder(tf.float32)
STATE_IN_c = tf.placeholder(tf.float32, [None, size_in])
C_1 = tf.Variable(tf.random_normal([size_in, size_w1], stddev=.01), name='C_1')
C_2 = tf.Variable(tf.random_normal([size_w1, size_w2], stddev=.01), name='C_2')
C_3 = tf.Variable(tf.random_normal([size_w2, size_w3], stddev=.01), name='C_3')
# Named 'out_c' so it no longer shares the name 'out' with the actor's
# output weights.
out_c = tf.Variable(tf.random_normal([size_w3, 1], stddev=.01), name='out_c')

Lc_1 = tf.matmul(STATE_IN_c, C_1)
Lc_2 = tf.matmul(Lc_1, C_2)
Lc_3 = tf.matmul(Lc_2, C_3)
# Linear output head. The previous tanh head limited the estimate to (-1, 1),
# while the regression targets used by the training loop are
# r + gamma * V(s') and -100, which a bounded output can never reach.
# (The earlier commented-out linear variant multiplied by the actor's `out`
# weights instead of `out_c`.)
Lc_out = tf.matmul(Lc_3, out_c)

# Mean squared error between the estimate and the fed target.
loss_c = tf.reduce_mean(tf.square(Lc_out - targetV))
train_c = tf.train.AdamOptimizer(lr).minimize(loss_c)

In [317]:
# Online actor-critic training: after every environment step, run one update
# of the actor (weighted by the advantage) and one of the critic (regressed
# towards the target), both evaluated at the state the action was taken in.
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    # Keep `state` in the (1, size_in) batch shape the placeholders expect.
    state = np.reshape(env.reset(), newshape=[1, size_in])
    ep = 1
    step = 0

    # `<=` so that exactly `total_episode` episodes are played
    # (the previous `<` stopped one episode short).
    while ep <= total_episode:
        step += 1
        #env.render()
        Q_prev = sess.run(L_out, feed_dict={STATE_IN: state})

        # Epsilon-greedy action selection on the actor's output.
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q_prev)

        new_state, r, d, _ = env.step(action)
        new_state = np.reshape(new_state, newshape=[1, size_in])

        if d:
            # Negative reward (penalty) so the action that ended the episode
            # is discouraged.
            # NOTE(review): `d` may also be set when the environment reaches
            # its step limit; confirm whether that case should be penalised.
            advantage = -100
            target = -100
        else:
            # One critic pass for both V(state) and V(new_state).
            values = sess.run(
                Lc_out,
                feed_dict={STATE_IN_c: np.concatenate([state, new_state], axis=0)},
            )
            value = values[0][0]
            value_next = values[1][0]
            target = r + gamma * value_next
            advantage = target - value

        # Train on the state the action was taken in. Previously `state` had
        # already been replaced by `new_state` (or by the next episode's reset
        # observation), so both networks were updated on the wrong state.
        sess.run(train, feed_dict={STATE_IN: state, Advantage: advantage})
        sess.run(train_c, feed_dict={STATE_IN_c: state, targetV: target})

        if d:
            # Decay exploration once per finished episode.
            epsilon = 1 / ((ep / 100) + 1)
            print("episode {}, score : {} ".format(ep, step))
            step = 0
            ep += 1
            state = np.reshape(env.reset(), newshape=[1, size_in])
        else:
            state = new_state

env.close()

episode 1, score : 37 
episode 2, score : 23 
episode 3, score : 11 
episode 4, score : 13 
episode 5, score : 17 
episode 6, score : 24 
episode 7, score : 14 
episode 8, score : 10 
episode 9, score : 102 
episode 10, score : 51 
episode 11, score : 40 
episode 12, score : 32 
episode 13, score : 25 
episode 14, score : 27 
episode 15, score : 13 
episode 16, score : 20 
episode 17, score : 12 
episode 18, score : 17 
episode 19, score : 43 
episode 20, score : 14 
episode 21, score : 12 
episode 22, score : 14 
episode 23, score : 12 
episode 24, score : 21 
episode 25, score : 32 
episode 26, score : 19 
episode 27, score : 29 
episode 28, score : 34 
episode 29, score : 26 
episode 30, score : 23 
episode 31, score : 26 
episode 32, score : 50 
episode 33, score : 20 
episode 34, score : 17 
episode 35, score : 34 
episode 36, score : 44 
episode 37, score : 31 
episode 38, score : 11 
episode 39, score : 17 
episode 40, score : 11 
episode 41, score : 15 
episode 42, score : 12 


KeyboardInterrupt: 