# Pendulum
---
>action 과 state 가 모두 continuous 한 환경이다. 목표는 막대를 수직으로 오래 유지하는 것이다. 종료조건은 따로 없다. 최대 시간을 두는 것이 좋을 수도 있다.


* state : 막대 각도 theta 의 cos, sin 값과 각속도 (theta_dt)
* action : 조인트에 작용하는 -2 ~ 2 사이 토크
* reward : -(theta^2 + 0.1*theta_dt^2 + 0.001*action^2)
---

In [1]:
import tensorflow as tf
import numpy as np
import gym
import random
from collections import deque
import dqn

In [3]:
# Create the environment.
env = gym.make('Pendulum-v0')

In [7]:
# Network input width: length of the environment's observation vector.
size_s = env.observation_space.shape[0]
# Network output width: first dimension of the action space's shape.
# NOTE(review): this is the dimensionality of the action vector, not a count
# of discrete actions; the training code indexes Q-values by action, so
# confirm this is the intended output size for a continuous-action env.
size_out = env.action_space.shape[0]
# Discount factor applied to the bootstrapped next-state value.
gamma = 0.95
print(size_s)

3


In [4]:
def copy(*, dest_scope_name = 'target', src_scope_name = 'main') :
    """Build the ops that copy the learning network into the target network.

    Both arguments are keyword-only.

    Args:
        dest_scope_name: variable scope whose trainable variables are
            overwritten.
        src_scope_name: variable scope whose trainable variables are read.

    Returns:
        A list of assign ops, one per (source, destination) variable pair.
        Nothing is copied until the ops are run in a session. Variables are
        paired positionally with ``zip``, so pairing stops at the shorter
        collection; a scope name that matches no variables therefore yields
        an empty list.
    """
    sources = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    targets = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)

    return [target.assign(source) for source, target in zip(sources, targets)]


In [6]:
def replay_train(mainDQN, targetDQN, train_batch) :
    """Experience replay: fit the main network on one batch of transitions.

    Args:
        mainDQN: network being trained; supplies the current Q-values via
            ``predict`` and is fitted via ``update``.
        targetDQN: temporarily frozen network; supplies the bootstrapped
            next-state value via ``predict``.
        train_batch: iterable of
            ``(state, action, reward, next_state, done)`` tuples.

    Returns:
        Whatever ``mainDQN.update(x_stack, y_stack)`` returns.
    """
    # Seed each list with an empty (0, width) array so that an empty batch
    # still produces correctly shaped inputs for update().
    x_rows = [np.empty(0).reshape(0, size_s)]
    y_rows = [np.empty(0).reshape(0, size_out)]

    for state, action, reward, next_state, done in train_batch :

        # Current Q-values come from the main network.
        Q = mainDQN.predict(state)

        if done :
            # Terminal step: fixed negative target.
            Q[0, action] = -100
        else :
            # The bootstrap value comes from the frozen target network, which
            # keeps the regression target from moving with every update.
            # (Fixed-target DQN; the max is both chosen and evaluated by the
            # target network, so this is not Double Q-learning.)
            Q[0, action] = reward + gamma*np.max(targetDQN.predict(next_state))

        # x is the network input, y is the regression target.
        x_rows.append(state)
        y_rows.append(Q)

    # Stack once at the end rather than re-copying the growing arrays on
    # every sample.
    return mainDQN.update(np.vstack(x_rows), np.vstack(y_rows))

In [7]:
total_episode = 1000
batch_size = 10
# Replay memory. maxlen makes the deque discard the oldest transition by
# itself once 50000 entries are stored.
replay_buffer = deque(maxlen=50000)
ep = 0
step = 0
#saver = tf.train.Saver()
#save_file = 'C:\\Users\\김민수\\Documents\\GitHub\\RL\\vars\\cartpole_dqn'
# Step counts of the 10 most recent episodes (sliding window).
mean_reward = deque(maxlen=10)

with tf.Session() as sess:

    # Two networks are created:
    # - the main network, which is actually trained;
    # - the target network, which is held fixed between syncs and provides
    #   the training targets.
    mainDQN = dqn.DQN(sess, size_s, size_out, 8, 6, name='main')
    targetDQN = dqn.DQN(sess, size_s, size_out, 8, 6, name='target')

    tf.global_variables_initializer().run()

    # Copy main -> target once before training starts. The scope names must
    # be the names the networks were created with ('main' / 'target'); any
    # other name matches no variables and silently yields an empty op list,
    # so the target network would never be synced.
    # Assumes dqn.DQN uses `name` as its variable scope - TODO confirm.
    copy_ops = copy(dest_scope_name='target', src_scope_name='main')
    sess.run(copy_ops)

    for episode in range(total_episode) :
        e = 1. / ((episode / 20) + 1)    # exploration rate, decays per episode
        done = False
        step_count = 0
        state = env.reset()

        while not done :

            # Epsilon-greedy action selection.
            # NOTE(review): sample() draws from the environment's continuous
            # action space while the greedy branch yields an integer index;
            # both are stored in the same buffer and later used as an index
            # into the Q-values by replay_train. Confirm the action handling
            # (a discrete torque table is probably needed).
            if np.random.rand(1) < e :
                action = env.action_space.sample()
            else :
                action = np.argmax(mainDQN.predict(state))

            next_state, reward, done, _ = env.step(action)

            # Terminal step gets a fixed penalty.
            if done :
                reward = -100

            # Do not train here; just store the transition in replay memory.
            replay_buffer.append((state, action, reward, next_state, done))

            state = next_state
            step_count += 1

            # Hard cap on episode length.
            if step_count > 1000 :
                break

        print ('episode : {}, steps : {}'.format(episode, step_count))

        # Always record the newest episode; the deque's maxlen drops the
        # oldest entry once 10 are stored.
        mean_reward.append(step_count)

        # Checkpointing is currently disabled:
        # if(np.mean(mean_reward) > 800):
        #     saver.save(sess, save_file)
        #     break

        # Every 10th episode (episodes 1, 11, 21, ...) train on random
        # minibatches drawn from replay memory. Skipped while the memory
        # holds fewer transitions than one minibatch, which random.sample
        # would reject.
        if episode % 10 == 1 and len(replay_buffer) >= batch_size :
            for _ in range(50) :
                minibatch = random.sample(replay_buffer, batch_size)
                loss, _ = replay_train(mainDQN, targetDQN, minibatch)

            print('loss :', loss)
            sess.run(copy_ops)   # sync the target network with the main one

    env.close()

episode : 0, steps : 21
episode : 1, steps : 27
loss : 445.75748
episode : 2, steps : 20
episode : 3, steps : 24
episode : 4, steps : 16
episode : 5, steps : 11
episode : 6, steps : 24
episode : 7, steps : 14
episode : 8, steps : 12
episode : 9, steps : 11
episode : 10, steps : 10
episode : 11, steps : 10
loss : 506.68994
episode : 12, steps : 35
episode : 13, steps : 14
episode : 14, steps : 46
episode : 15, steps : 28
episode : 16, steps : 88
episode : 17, steps : 40
episode : 18, steps : 31
episode : 19, steps : 55
episode : 20, steps : 39
episode : 21, steps : 53
loss : 1.3592613
episode : 22, steps : 12
episode : 23, steps : 9
episode : 24, steps : 15
episode : 25, steps : 10
episode : 26, steps : 9
episode : 27, steps : 10
episode : 28, steps : 9
episode : 29, steps : 15
episode : 30, steps : 8
episode : 31, steps : 10
loss : 4.49893
episode : 32, steps : 110
episode : 33, steps : 68
episode : 34, steps : 70
episode : 35, steps : 89
episode : 36, steps : 145
episode : 37, steps :