# In [4]:
import argparse
import random

import gym
import numpy as np
import tensorflow as tf

# Which game to run (presumably the Gym environment id; usage is not shown here).
GAME = 'CartPole-v0'
# How many of the most recent scores are averaged together.
MAX_SCORE_QUEUE_SIZE = 100
# Name of the directory intended to hold the recorded videos.
OUT_DIR = 'cartpole-experiment'

def get_options(argv=None):
    """Parse the training hyper-parameters from the command line.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default) argparse falls back to ``sys.argv[1:]``. Pass an explicit
            list (for example ``[]``) when the host process owns ``sys.argv``
            and its arguments would be rejected as unrecognized.

    Returns:
        argparse.Namespace with one attribute per option declared below
        (MAX_EPISODE, ACTION_DIM, OBSERVATION_DIM, GAMMA, INIT_EPS, FINAL_EPS,
        EPS_DECAY, EPS_ANNEAL_STEPS, LR, MAX_EXPERIENCE, BATCH_SIZE, H1_SIZE,
        H2_SIZE, H3_SIZE).
    """
    parser = argparse.ArgumentParser()

    # Episode budget and environment dimensions.
    parser.add_argument('--MAX_EPISODE', type=int, default=3000,
                        help="max number of episodes iteration")
    parser.add_argument('--ACTION_DIM', type=int, default=2,
                        help='number of actions one can take')
    parser.add_argument('--OBSERVATION_DIM', type=int, default=4,
                        help='number of observations one can see')

    # Q-learning discount and the epsilon schedule for random action sampling.
    parser.add_argument('--GAMMA', type=float, default=0.9,
                        help='discount factor of Q learning')
    parser.add_argument('--INIT_EPS', type=float, default=1.0,
                        help='initial probability for randomly sampling action')
    parser.add_argument('--FINAL_EPS', type=float, default=1e-5,
                        help='finial probability for randomly sampling action')
    parser.add_argument('--EPS_DECAY', type=float, default=0.95,
                        help='epsilon decay rate')
    parser.add_argument('--EPS_ANNEAL_STEPS', type=int, default=10,
                        help='steps interval to decay epsilon')

    # Optimisation and replay-memory settings.
    parser.add_argument('--LR', type=float, default=1e-4,
                        help='learning rate')
    parser.add_argument('--MAX_EXPERIENCE', type=int, default=2000,
                        help='size of experience replay memory')
    parser.add_argument('--BATCH_SIZE', type=int, default=256,
                        help='mini batch size')

    # Hidden layer widths.
    parser.add_argument('--H1_SIZE', type=int, default=128,
                        help='size of hidden layer 1')
    parser.add_argument('--H2_SIZE', type=int, default=128,
                        help='size of hidden layer 2')
    parser.add_argument('--H3_SIZE', type=int, default=128,
                        help='size of hidden layer 3')

    return parser.parse_args(argv)

# The network has 3 hidden layers, each with 128 nodes by default; hidden layers use ReLU and the final output is passed through tf.squeeze.

class QAgent:
    """Q-value network: three ReLU hidden layers followed by a linear output.

    Holds the weight and bias variables, builds the Q-value graph for a given
    observation placeholder, and picks epsilon-greedy actions.
    """

    def __init__(self, option):
        """Create the weight and bias variables of the network.

        Args:
            option: Object exposing OBSERVATION_DIM, H1_SIZE, H2_SIZE, H3_SIZE
                and ACTION_DIM as attributes (e.g. the result of
                get_options()).
        """
        # Three hidden layers; each is 128 units wide with the default options.
        self.W1 = self.weight_variable([option.OBSERVATION_DIM, option.H1_SIZE])
        self.b1 = self.bias_variable([option.H1_SIZE])
        self.W2 = self.weight_variable([option.H1_SIZE, option.H2_SIZE])
        self.b2 = self.bias_variable([option.H2_SIZE])
        # W3 maps layer 2 to layer 3, so its input dimension must be H2_SIZE.
        self.W3 = self.weight_variable([option.H2_SIZE, option.H3_SIZE])
        self.b3 = self.bias_variable([option.H3_SIZE])
        self.W4 = self.weight_variable([option.H3_SIZE, option.ACTION_DIM])
        self.b4 = self.bias_variable([option.ACTION_DIM])

    def xavier_initializer(self, shape):
        """Return a uniform random tensor of `shape` used to initialise variables.

        Values are drawn from [-bound, bound) with bound = sqrt(6 / sum(shape));
        for 1-D shapes (biases) 1 is added to the sum first.
        """
        dim_sum = np.sum(shape)
        if len(shape) == 1:
            dim_sum += 1
        bound = np.sqrt(6.0 / dim_sum)
        return tf.random.uniform(shape, minval=-bound, maxval=bound)

    def weight_variable(self, shape):
        """Create a tf.Variable of `shape` initialised by xavier_initializer."""
        return tf.Variable(self.xavier_initializer(shape))

    def bias_variable(self, shape):
        """Create a tf.Variable of `shape` initialised by xavier_initializer."""
        return tf.Variable(self.xavier_initializer(shape))

    def add_value_net(self, options):
        """Build the Q-value graph on top of a fresh observation placeholder.

        Hidden layers use ReLU; the output layer is linear and passed through
        tf.squeeze.

        Args:
            options: Object exposing OBSERVATION_DIM.

        Returns:
            Tuple (observation placeholder, Q tensor).
        """
        # tf.compat.v1 matches the session API the rest of the file uses.
        observation = tf.compat.v1.placeholder(
            tf.float32, [None, options.OBSERVATION_DIM])
        h1 = tf.nn.relu(tf.matmul(observation, self.W1) + self.b1)
        h2 = tf.nn.relu(tf.matmul(h1, self.W2) + self.b2)
        h3 = tf.nn.relu(tf.matmul(h2, self.W3) + self.b3)
        Q = tf.squeeze(tf.matmul(h3, self.W4) + self.b4)
        return observation, Q

    def sample_action(self, Q, feed, eps, options):
        """Choose an action epsilon-greedily and return it one-hot encoded.

        Args:
            Q: Object whose ``eval(feed_dict=...)`` yields the action values.
            feed: Feed dictionary handed to ``Q.eval``.
            eps: Exploration threshold; a random action is taken when
                ``random.random() <= eps``.
            options: Object exposing ACTION_DIM.

        Returns:
            numpy array of length ACTION_DIM with a 1 at the chosen action.
        """
        act_values = Q.eval(feed_dict=feed)

        if random.random() <= eps:
            # Explore: pick any action index uniformly at random.
            action_index = random.randrange(options.ACTION_DIM)
        else:
            # Exploit: pick the action with the largest Q-value.
            action_index = np.argmax(act_values)

        action = np.zeros(options.ACTION_DIM)
        action[action_index] = 1
        return action


# Backward-compatible alias for the original, misspelled class name.
QAjent = QAgent
    
    
    
def train(env):
    
    options = get_obtions()
    agent =QAgent(options)
    sess = tf.compat.v1.InteractiveSession()
    
    obs, Q1 = agent.add_value_net(options)
    act = tf.placeholder(tf.float32, [None, options.ACTION_DIM])
    rwd = tf.placeholder(tf.float32, [None, ])
    next_obs,Q2 = agent.add_value_net(options)