## 초기 라이브러리 및 하이퍼파라미터 설정

In [None]:

import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# Name of the Gym environment to train on.
ENV_NAME = "CartPole-v1"

# Discount factor applied to the estimated future reward in the Q-target.
GAMMA = 0.95
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.001

# Maximum number of transitions kept in the replay memory.
MEMORY_SIZE = 1_000_000
# Number of transitions sampled per replay step; replay is skipped while the
# memory holds fewer transitions than this.
BATCH_SIZE = 20

# Epsilon-greedy exploration rate: initial value, lower bound, and the
# multiplicative decay applied after each replay step.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995



## DQN network 구현

In [16]:

class DQNSolver:
    """DQN agent state: exploration rate, replay memory and Q-value network."""

    def __init__(self, observation_space, action_space):
        """Create the replay memory and build the Q-value network.

        Args:
            observation_space: Number of features in one state vector; used
                as the input size of the network.
            action_space: Number of discrete actions; the network emits one
                Q-value per action.
        """
        # Start fully exploratory.
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        # Bounded replay memory: once full, the oldest entries are dropped.
        self.memory = deque(maxlen=MEMORY_SIZE)

        # Feed-forward network mapping a state to one Q-value per action.
        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        # Linear output because Q-values are unbounded regression targets.
        self.model.add(Dense(self.action_space, activation="linear"))
        # The learning rate is passed positionally: later Keras releases
        # renamed the `lr` keyword to `learning_rate`, and the positional
        # form is accepted by both.
        self.model.compile(loss="mse", optimizer=Adam(LEARNING_RATE))

    


## Replay memory 정의

In [11]:
    def remember(self, state, action, reward, next_state, done):
        # agent의 state, action, reward, next_state, 게임이 끝났는지 등을 메모리 버퍼에 저장
        self.memory.append((state, action, reward, next_state, done))

    

## Action 함수 정의

In [12]:
    def act(self, state):
        #exploration_rate가 초반에는 1.0 이지만, 학습을 거급하면서 점점 작아진다.
        if np.random.rand() < self.exploration_rate:
            # Random으로 action을 선택
            return random.randrange(self.action_space)
        # agent가 직접 action을 선택
        q_values = self.model.predict(state)
        # 오른쪽, 왼쪽으로 예측한 값 중에 확률값이 높은것을 선택해서 return 한다
        return np.argmax(q_values[0])

## 학습 Process 정의

In [9]:
    def experience_replay(self):
        """Train the network on a random batch of stored transitions.

        Does nothing until the memory holds at least ``BATCH_SIZE``
        transitions. For each sampled transition the Q-value of the taken
        action is fitted towards ``reward`` when the transition is terminal,
        or ``reward + GAMMA * max(Q(state_next))`` otherwise. After the batch
        the exploration rate is decayed, but never below ``EXPLORATION_MIN``.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            q_update = reward
            if not terminal:
                # Bootstrap from the best predicted Q-value of the next state.
                # verbose=0 avoids a progress bar for every prediction.
                next_q = self.model.predict(state_next, verbose=0)[0]
                q_update = reward + GAMMA * np.amax(next_q)
            # Keep the predictions for the other actions as their own targets
            # and overwrite only the entry of the action that was taken.
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        # Shift gradually from exploration towards exploitation.
        self.exploration_rate = max(
            EXPLORATION_MIN, self.exploration_rate * EXPLORATION_DECAY
        )

## Main 실행 함수

In [13]:

def cartpole(max_runs=None):
    """Train a DQN agent on ``ENV_NAME`` and print the score of every episode.

    Args:
        max_runs: Number of episodes to play before returning. ``None``
            (the default) keeps training until the process is interrupted.
    """
    env = gym.make(ENV_NAME)
    observation_space = env.observation_space.shape[0]  # length of a state vector
    action_space = env.action_space.n  # number of discrete actions
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    try:
        while max_runs is None or run < max_runs:
            run += 1
            state = env.reset()
            # Add a leading batch dimension before feeding the network.
            state = np.reshape(state, [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the agent play
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # Negate the reward on the step that ends the episode so the
                # agent is penalised for it.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    break
                # Learn from stored transitions after every non-terminal step.
                dqn_solver.experience_replay()
    finally:
        # Release the environment even when training is interrupted.
        env.close()


# Start training only when this file is run as a script, not when imported.
if __name__ == "__main__":
    cartpole()

AttributeError: 'DQNSolver' object has no attribute 'act'

In [18]:

import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# Name of the Gym environment to train on.
ENV_NAME = "CartPole-v1"

# Discount factor applied to the estimated future reward in the Q-target.
GAMMA = 0.95
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.001

# Maximum number of transitions kept in the replay memory.
MEMORY_SIZE = 1_000_000
# Number of transitions sampled per replay step; replay is skipped while the
# memory holds fewer transitions than this.
BATCH_SIZE = 20

# Epsilon-greedy exploration rate: initial value, lower bound, and the
# multiplicative decay applied after each replay step.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995


class DQNSolver:
    """Deep Q-Network agent with a replay memory and epsilon-greedy exploration."""

    def __init__(self, observation_space, action_space):
        """Create the replay memory and build the Q-value network.

        Args:
            observation_space: Number of features in one state vector; used
                as the input size of the network.
            action_space: Number of discrete actions; the network emits one
                Q-value per action.
        """
        # Start fully exploratory; experience_replay() decays this value.
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        # Bounded replay memory: once full, the oldest entries are dropped.
        self.memory = deque(maxlen=MEMORY_SIZE)

        # Feed-forward network mapping a state to one Q-value per action.
        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        # Linear output because Q-values are unbounded regression targets.
        self.model.add(Dense(self.action_space, activation="linear"))
        # The learning rate is passed positionally: later Keras releases
        # renamed the `lr` keyword to `learning_rate`, and the positional
        # form is accepted by both.
        self.model.compile(loss="mse", optimizer=Adam(LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append the transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        With probability ``self.exploration_rate`` a uniformly random action
        index is returned; otherwise the action with the highest predicted
        Q-value is chosen.

        Args:
            state: Model input for a single state, including the batch
                dimension (only row 0 of the prediction is used).

        Returns:
            int: Index of the chosen action.
        """
        if np.random.rand() < self.exploration_rate:
            # Explore: pick any action uniformly at random.
            return random.randrange(self.action_space)
        # Exploit: verbose=0 keeps Keras from printing a progress bar on
        # every single environment step.
        q_values = self.model.predict(state, verbose=0)
        # Convert to a plain int so both branches return the same type.
        return int(np.argmax(q_values[0]))

    def experience_replay(self):
        """Train the network on a random batch of stored transitions.

        Does nothing until the memory holds at least ``BATCH_SIZE``
        transitions. For each sampled transition the Q-value of the taken
        action is fitted towards ``reward`` when the transition is terminal,
        or ``reward + GAMMA * max(Q(state_next))`` otherwise. After the batch
        the exploration rate is decayed, but never below ``EXPLORATION_MIN``.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            q_update = reward
            if not terminal:
                # Bootstrap from the best predicted Q-value of the next state.
                next_q = self.model.predict(state_next, verbose=0)[0]
                q_update = reward + GAMMA * np.amax(next_q)
            # Keep the predictions for the other actions as their own targets
            # and overwrite only the entry of the action that was taken.
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        # Shift gradually from exploration towards exploitation.
        self.exploration_rate = max(
            EXPLORATION_MIN, self.exploration_rate * EXPLORATION_DECAY
        )

        
def cartpole(max_runs=None):
    """Train a DQN agent on ``ENV_NAME`` and print the score of every episode.

    Args:
        max_runs: Number of episodes to play before returning. ``None``
            (the default) keeps training until the process is interrupted.
    """
    env = gym.make(ENV_NAME)
    observation_space = env.observation_space.shape[0]  # length of a state vector
    action_space = env.action_space.n  # number of discrete actions
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    try:
        while max_runs is None or run < max_runs:
            run += 1
            state = env.reset()
            # Add a leading batch dimension before feeding the network.
            state = np.reshape(state, [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the agent play
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                # Negate the reward on the step that ends the episode so the
                # agent is penalised for it.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    break
                # Learn from stored transitions after every non-terminal step.
                dqn_solver.experience_replay()
    finally:
        # Release the environment even when training is interrupted.
        env.close()


# Start training only when this file is run as a script, not when imported.
if __name__ == "__main__":
    cartpole()

Run: 1, exploration: 1.0, score: 10
Run: 2, exploration: 0.985074875, score: 13
Run: 3, exploration: 0.9275689688183278, score: 13
Run: 4, exploration: 0.8475428503023453, score: 19
Run: 5, exploration: 0.798065677681905, score: 13
Run: 6, exploration: 0.7219385759785162, score: 21
Run: 7, exploration: 0.5878229785513479, score: 42
Run: 8, exploration: 0.43080185560799106, score: 63
Run: 9, exploration: 0.3936343764094253, score: 19
Run: 10, exploration: 0.16128775296900558, score: 179
Run: 11, exploration: 0.0660860453679829, score: 179
Run: 12, exploration: 0.05169406930342616, score: 50
Run: 13, exploration: 0.04125526245077418, score: 46
Run: 14, exploration: 0.034100160462149656, score: 39
Run: 15, exploration: 0.024991218342344988, score: 63
Run: 16, exploration: 0.01964695745288379, score: 49
Run: 17, exploration: 0.015215016325303928, score: 52
Run: 18, exploration: 0.010874738754866477, score: 68
Run: 19, exploration: 0.01, score: 97
Run: 20, exploration: 0.01, score: 197
Run:

KeyboardInterrupt: 