# CartPole
---
>continuous 한 state 를 갖는 환경 중에서 가장 단순한 환경이다 (action 은 왼쪽/오른쪽 두 가지로 discrete 하다). 매 프레임마다 reward 를 제공하는 dense reward 환경이기 때문에 학습이 매우 용이하다.
또한 terminal condition 은 카트가 일정 범위를 넘어가거나 막대가 일정 각도 이상 기울어지는 것이기 때문에 명확하다. DQN 으로 학습할 때는 step 마다 학습하지 않고 데이터만 쌓아 두었다가, 나중에 학습과 타겟 신경망 업데이트를 하는 것이 최적이다.



* state : 카트 위치, 카트 속력, 막대 각도, 막대 각속도 - continuous
* action : 카트를 오른쪽/왼쪽으로 push
* reward : 매 프레임마다 +1, episode 가 끝날 때 임의로 보상 제공 가능
---

In [1]:
from keras.layers import Dense
from keras.models import Sequential
import numpy as np
import gym
from gym.envs.registration import register
from keras import optimizers
from keras import initializers
import random
from collections import deque

Using TensorFlow backend.


In [2]:
# Create the CartPole-v1 environment. `env` is a module-level global that the
# agent definition, the training loop and the evaluation loop below all read.
'''
환경 생성
'''
env = gym.make('CartPole-v1')

In [3]:
class DQN_Agent:
    """DQN agent with a replay memory, an online network and a target network.

    The online network (``model``) is the one that gets fitted; the target
    network (``target_model``) supplies the bootstrap values for the TD target
    and is only refreshed when ``update_target_model`` is called.

    Args:
        n_state: Number of components in one observation vector.
        n_action: Number of discrete actions.
    """

    def __init__(self, n_state, n_action):
        self.n_state = n_state
        self.n_action = n_action

        self.gamma = .95          # discount factor used in the TD target
        # NOTE: lr is not passed to the optimizer; build_model compiles with
        # the string 'adam', i.e. the optimizer's default settings.
        self.lr = .01
        self.epsilon = .2         # exploration rate; read/overwritten by the caller
        self.batch_size = 16      # transitions per training step
        self.train_start = 1000   # not read inside this class; available to callers

        # Replay memory of (state, action, reward, state_next, done) tuples.
        # It is unbounded here; the caller is responsible for trimming it.
        self.memory = deque()

        self.model = self.build_model()
        self.target_model = self.build_model()

        # Start with both networks holding identical weights.
        self.update_target_model()

    def build_model(self):
        """Build and compile a small MLP mapping a state to one Q-value per action."""
        model = Sequential()
        # Sized from the constructor arguments rather than the global `env`,
        # so the agent does not depend on module-level state.
        model.add(Dense(12, input_dim=self.n_state, activation='tanh'))
        model.add(Dense(8, activation='tanh'))
        # Linear output: Q-values are unbounded returns (rewards here include
        # +1 per step and a large negative penalty), so a tanh output, which
        # is confined to [-1, 1], cannot represent them.
        model.add(Dense(self.n_action, activation='linear'))

        model.compile(loss='mse', optimizer='adam')
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def append_sample(self, state, action, reward, state_next, done):
        """Store one transition in the replay memory.

        The second parameter used to be misspelled ``actoin`` while the body
        stored a global named ``action``; it now stores the argument itself.
        """
        self.memory.append((state, action, reward, state_next, done))

    def train_model(self):
        """Fit the online network on one random mini-batch from the memory.

        Does nothing when the memory holds fewer than ``batch_size``
        transitions (``random.sample`` would raise ``ValueError`` otherwise).
        """
        if len(self.memory) < self.batch_size:
            return

        mini_batch = random.sample(self.memory, self.batch_size)
        states = np.zeros((self.batch_size, self.n_state))
        states_next = np.zeros((self.batch_size, self.n_state))
        actions, rewards, dones = [], [], []

        for i, (state, action, reward, state_next, done) in enumerate(mini_batch):
            # Flatten so both (n_state,) and (1, n_state) inputs fit one row.
            states[i] = np.reshape(state, -1)
            states_next[i] = np.reshape(state_next, -1)
            actions.append(action)
            rewards.append(reward)
            dones.append(done)

        # Start from the current predictions so that only the entry of the
        # action actually taken contributes to the loss.
        target = self.model.predict(states)
        target_val = self.target_model.predict(states_next)

        for i in range(self.batch_size):
            if dones[i]:
                # Terminal transition: there is no next state to bootstrap from.
                target[i][actions[i]] = rewards[i]
            else:
                # Bootstrap from the best next-state Q-VALUE (amax), not from
                # the index of the best action (argmax).
                target[i][actions[i]] = rewards[i] + self.gamma * np.amax(target_val[i])

        self.model.fit(states, target, batch_size=self.batch_size, epochs=1, verbose=0)

In [4]:
# Training cell: collect transitions with an epsilon-greedy policy and train
# the agent on one mini-batch every few episodes.
max_episode = 1000

agent = DQN_Agent(env.observation_space.shape[0], env.action_space.n)

for ep in range(max_episode):

    state = np.reshape(env.reset(), [1, agent.n_state])
    done = False
    # Decay exploration with the episode index. The previous code divided a
    # counter that was never incremented, which pinned epsilon at 1.0 and made
    # every action random.
    agent.epsilon = 1. / ((ep / 10) + 1)

    while not done:

        # Epsilon-greedy action selection.
        if random.random() > agent.epsilon:
            action = np.argmax(agent.model.predict(state))
        else:
            action = env.action_space.sample()

        # Advance the environment by one step.
        state_next, reward, done, info = env.step(action)
        state_next = np.reshape(state_next, [1, agent.n_state])

        # Penalize failure. Episodes cut off by the time limit are not
        # failures; gym versions that report it set 'TimeLimit.truncated'
        # in `info` (absent key -> treated as a normal termination).
        if done and not info.get('TimeLimit.truncated', False):
            reward = -100

        # Store the transition and keep the replay memory bounded.
        agent.append_sample(state, action, reward, state_next, done)
        if len(agent.memory) > 50000:
            agent.memory.popleft()

        # Move on to the next state; without this every stored transition and
        # every greedy action would be based on the episode's first state.
        state = state_next

    # Train and sync the target network every 5 episodes, once the memory
    # holds at least one full mini-batch.
    if ep % 5 == 0 and len(agent.memory) >= agent.batch_size:
        agent.train_model()
        agent.update_target_model()

env.close()

In [5]:
# Evaluation cell: run the greedy policy for 1000 episodes and print each
# episode's index together with its total reward.
for ep in range(1000):
    s = env.reset()
    reward = 0
    done = False

    while not done:
        # Pick the action with the highest predicted Q-value.
        q_values = agent.model.predict(np.reshape(s, [1, 4]))
        s, r, done, _ = env.step(np.argmax(q_values))
        reward += r

    print(ep, reward)
    reward = 0
env.close()

0 9.0
1 10.0
2 8.0
3 11.0
4 10.0
5 9.0
6 10.0
7 11.0
8 8.0
9 8.0
10 9.0
11 11.0
12 8.0
13 10.0
14 9.0
15 9.0
16 10.0
17 9.0
18 8.0
19 10.0
20 10.0
21 9.0
22 10.0
23 10.0
24 9.0
25 10.0
26 9.0
27 10.0
28 8.0
29 9.0
30 9.0
31 9.0
32 10.0
33 9.0
34 9.0
35 8.0
36 8.0
37 10.0
38 9.0
39 10.0
40 9.0
41 10.0
42 8.0
43 9.0
44 9.0
45 10.0
46 10.0
47 9.0
48 9.0
49 9.0
50 9.0
51 8.0
52 9.0
53 9.0
54 11.0
55 9.0
56 8.0
57 9.0
58 10.0
59 9.0
60 9.0
61 9.0
62 8.0
63 10.0
64 10.0
65 10.0
66 10.0
67 8.0
68 8.0
69 9.0
70 11.0
71 10.0
72 10.0
73 9.0
74 9.0
75 9.0
76 10.0
77 8.0
78 10.0
79 9.0
80 8.0
81 10.0
82 9.0
83 9.0
84 10.0
85 10.0
86 9.0
87 10.0
88 9.0
89 10.0
90 9.0
91 10.0
92 10.0
93 10.0
94 10.0
95 8.0
96 10.0
97 10.0
98 10.0
99 9.0
100 11.0
101 9.0
102 10.0
103 10.0
104 11.0
105 10.0
106 8.0
107 9.0
108 8.0
109 10.0
110 9.0
111 10.0
112 9.0
113 8.0
114 9.0
115 10.0
116 9.0
117 10.0
118 9.0
119 8.0
120 10.0
121 10.0
122 9.0
123 9.0
124 8.0
125 9.0
126 9.0
127 9.0
128 10.0
129 10.0
130 10.0
131 9