# A2C (Advantage Actor Critic)
---
## 환경
* state : continuous[4] (카트 위치, 카트 속도, 막대 각도, 막대 각속도)
* action : discrete[2] (카트를 왼쪽 / 오른쪽으로 밀기)
* 막대가 쓰러지지 않고 버틴 매 스텝마다 +1 점. (아래 코드에서는 에피소드가 끝나는 스텝에 -100 의 벌점을 target 으로 사용한다.)

## 네트워크
* input : state (array[4])
* output : Actor 는 action 별 확률 (prob), Critic 은 state value (실수 1개)
---
* A2C 알고리즘은 Critic 을 도입한 Policy Gradient 알고리즘이다
* 기본적으로 on-policy 알고리즘이기 때문에, 연속된 샘플 사이의 상관관계(자기상관) 때문에 학습 성능이 떨어질 수 있다.
* 이를 완화하려면 A3C 처럼 여러 환경을 병렬로 사용하거나, off-policy 기법(Importance Sampling)을 사용해야 한다.
* 혹은 여러 스텝의 샘플을 모아서 batch 방식으로 학습한다.

In [12]:
import numpy as np
import tensorflow as tf
import gym
from gym.envs.registration import register
from keras.layers import Dense
from keras.models import Sequential
from keras import backend as K
from keras.optimizers import Adam
import random

In [34]:
# Build the CartPole-v1 environment that the agent interacts with.
env = gym.make('CartPole-v1')

In [43]:
# Training hyperparameters shared by the actor, the critic and the training loop.
gamma = 0.95            # discount factor applied to the next-state value
lr = 1e-4               # learning rate for both Adam optimizers
epsilon = 1             # initial exploration rate for epsilon-greedy action selection
total_episode = 1000    # number of episodes to train for

# Location for saved variables (not referenced by the cells shown in this notebook).
save_file = 'C:\\Users\\김민수\\Documents\\GitHub\\RL\\vars\\cartpole_pg'

In [44]:
# Actor network: maps a state to one probability per action.
size_in = env.observation_space.shape[0]
size_out = env.action_space.n
size_w1 = 12
size_w2 = 18
size_w3 = 8

# Advantage estimate fed at training time; it scales the log-probabilities in the loss.
Advantage = tf.placeholder(tf.float32)
STATE_IN = tf.placeholder(tf.float32, [None, size_in])
A_1 = tf.Variable(tf.random_normal([size_in, size_w1], stddev=.01), name='A_1')
A_2 = tf.Variable(tf.random_normal([size_w1, size_w2], stddev=.01), name='A_2')
A_3 = tf.Variable(tf.random_normal([size_w2, size_w3], stddev=.01), name='A_3')
out = tf.Variable(tf.random_normal([size_w3, size_out], stddev=.01), name='out')

# NOTE(review): the hidden layers are bare matmuls (no bias, no activation), so
# the three of them compose into a single linear map. Left as-is on purpose.
L_1 = tf.matmul(STATE_IN, A_1)
L_2 = tf.matmul(L_1, A_2)
L_3 = tf.matmul(L_2, A_3)

# Softmax instead of an element-wise sigmoid: the actions are mutually exclusive,
# so the outputs must form one normalized distribution. The arg-max action is the
# same under both, so greedy action selection is unaffected.
L_out = tf.nn.softmax(tf.matmul(L_3, out))

# Clip before the log so a saturated output cannot yield log(0) = -inf.
# NOTE(review): this loss averages over every action output rather than only the
# action that was taken; selecting the taken action needs an extra input from the
# training loop.
loss = -tf.reduce_mean(tf.log(tf.clip_by_value(L_out, 1e-8, 1.0)) * Advantage)
train = tf.train.AdamOptimizer(lr).minimize(loss)

In [45]:
# Critic network: maps a state to a single real-valued state-value estimate.
targetV = tf.placeholder(tf.float32)
STATE_IN_c = tf.placeholder(tf.float32, [None, size_in])
C_1 = tf.Variable(tf.random_normal([size_in, size_w1], stddev=.01), name='C_1')
C_2 = tf.Variable(tf.random_normal([size_w1, size_w2], stddev=.01), name='C_2')
C_3 = tf.Variable(tf.random_normal([size_w2, size_w3], stddev=.01), name='C_3')
# Named 'out_c' so it does not reuse the actor output weight's graph name 'out'.
out_c = tf.Variable(tf.random_normal([size_w3, 1], stddev=.01), name='out_c')

# NOTE(review): as in the actor, the hidden layers are bare matmuls with no bias
# or activation.
Lc_1 = tf.matmul(STATE_IN_c, C_1)
Lc_2 = tf.matmul(Lc_1, C_2)
Lc_3 = tf.matmul(Lc_2, C_3)

# Linear output: the regression targets used in training (a -100 terminal penalty
# and r + gamma * V(s')) lie outside (0, 1), which a sigmoid output could never reach.
Lc_out = tf.matmul(Lc_3, out_c)

# Mean squared error between the predicted value and the supplied target.
loss_c = tf.reduce_mean(tf.square(Lc_out - targetV))
train_c = tf.train.AdamOptimizer(lr).minimize(loss_c)

In [None]:
# One-step actor-critic training loop.
#
# Each environment step: choose an action (epsilon-greedy over the actor output),
# step the environment, form a one-step TD target and advantage from the critic,
# then update both networks on the state in which the action was taken.

# Target used for the step that ends an episode, to discourage the action that
# led there.
# NOTE(review): CartPole-v1 presumably also ends an episode on reaching its step
# limit; such episodes receive this penalty as well -- confirm that is intended.
TERMINAL_PENALTY = -100

# Actor update that credits only the action actually taken. The module-level
# `loss` multiplies the advantage into the log of every action output, so it
# cannot tell which action earned the advantage.
ACTION = tf.placeholder(tf.int32, [None])
chosen_prob = tf.reduce_sum(L_out * tf.one_hot(ACTION, size_out), axis=1)
# Clip before the log so a saturated output cannot yield log(0) = -inf.
actor_loss = -tf.reduce_mean(
    tf.log(tf.clip_by_value(chosen_prob, 1e-8, 1.0)) * Advantage)
train_actor = tf.train.AdamOptimizer(lr).minimize(
    actor_loss, var_list=[A_1, A_2, A_3, out])

# Created after every optimizer so that the Adam slot variables are initialized too.
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    state = np.reshape(env.reset(), newshape=[1, size_in])
    ep = 1
    step = 0

    # `ep` starts at 1, so `<=` is needed to run exactly `total_episode` episodes.
    while ep <= total_episode:
        step += 1
        Q_prev = sess.run(L_out, feed_dict={STATE_IN: state})

        # Epsilon-greedy action selection over the actor output.
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q_prev)

        new_state, r, d, _ = env.step(action)
        new_state = np.reshape(new_state, newshape=[1, size_in])

        # Critic estimate for the state the action was taken in.
        value = sess.run(Lc_out, feed_dict={STATE_IN_c: state})[0][0]

        if d:
            # No bootstrap from the next state once the episode has ended.
            target = TERMINAL_PENALTY
        else:
            value_next = sess.run(Lc_out, feed_dict={STATE_IN_c: new_state})[0][0]
            target = r + gamma * value_next
        # Advantage is the TD error relative to the critic's baseline in both cases.
        advantage = target - value

        # Train actor and critic together, on the pre-step state, BEFORE `state`
        # is advanced or reset; otherwise the update is applied to the wrong state.
        sess.run(train_actor, feed_dict={STATE_IN: state,
                                         ACTION: [action],
                                         Advantage: advantage})
        sess.run(train_c, feed_dict={STATE_IN_c: state, targetV: target})

        if d:
            # Decay exploration as episodes accumulate, report, and start over.
            epsilon = 1 / ((ep/100) + 1)
            print("episode {}, score : {} ".format(ep, step))
            step = 0
            ep += 1
            state = np.reshape(env.reset(), newshape=[1, size_in])
        else:
            state = new_state

env.close()

episode 1, score : 33 
episode 2, score : 25 
episode 3, score : 56 
episode 4, score : 11 
episode 5, score : 26 
episode 6, score : 14 
episode 7, score : 15 
episode 8, score : 54 
episode 9, score : 33 
episode 10, score : 29 
episode 11, score : 18 
episode 12, score : 15 
episode 13, score : 20 
episode 14, score : 8 
episode 15, score : 18 
episode 16, score : 19 
episode 17, score : 16 
episode 18, score : 39 
episode 19, score : 20 
episode 20, score : 20 
episode 21, score : 20 
episode 22, score : 10 
episode 23, score : 13 
episode 24, score : 41 
episode 25, score : 27 
episode 26, score : 91 
episode 27, score : 28 
episode 28, score : 36 
episode 29, score : 26 
episode 30, score : 15 
episode 31, score : 49 
episode 32, score : 23 