In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gym

In [2]:
#애니메이션을 만드는 함수
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display

def display_frames_as_gif(frames):
    plt.figure(figsize=(frames[0].shape[1]/72.0, frames[0].shape[0]/72.0), dpi = 72)
    patch = plt.imshow(frames[0])
    plt.axis('off')
    
    def animate(i):
        patch.set_data(frames[i])
    
    anim = animation.FuncAnimation(plt.gcf(), animate, frames = len(frames), interval = 50)
    writergif = animation.PillowWriter(fps=30) 
    anim.save("movie_cartpole.gif",writer=writergif)
    display(display_animation(anim, default_mode='loop'))

In [24]:
#수레를 무작위로 움직임
frames = []
env = gym.make('CartPole-v0')
observation = env.reset() #환경 초기화

for step in range(0,200):
    frames.append(env.render(mode='rgb_array')) #v프레임에 각 시각의 이미지 추가
    action = np.random.choice(2) #0-수레를 왼쪽으로, 1-수레를 오른쪽으로
    observation, reward, done, info = env.step(action) #action실행
    
#애니메이션파일 재생
display_frames_as_gif(frames)

The 'clear_temp' parameter of setup() was deprecated in Matplotlib 3.3 and will be removed two minor releases later. If any parameter follows 'clear_temp', they should be passed as keyword, not positionally.
  frame_prefix, clear_temp=False)


AttributeError: 'HTMLWriter' object has no attribute '_temp_names'

### 상태의 이산변수 변환

In [36]:
#상수 정의
ENV = 'CartPole-v0' #태스크 이름
NUM_DIZITIZED = 6 #각 상태를 이산변수로 변환할 구간 수

#CartPole 실행
env = gym.make(ENV) #실행할 태스크 선택
observation = env.reset() #환경 초기화

#이산값으로 만들 구간 계산
def bins(clip_min, clip_max, num): #관측된 상태(연속값)를 이산값으로 변환하는 구간 계산
    return np.linspace(clip_min, clip_max, num+1)[1:-1]

In [37]:
#관측된 상태를 이산값으로 변환
def digitize_state(observation):
    cart_pos, cart_v, pole_angle, pole_v = observation
    #순서대로 수레의 위치, 수레의 속도, 봉의 각도, 봉의 각속도
    digitized = [
        np.digitize(cart_pos, bins = bins(-2.4, 2.4, NUM_DIZITIZED)),
        np.digitize(cart_v, bins = bins(-3.0, 3.0, NUM_DIZITIZED)),
        np.digitize(cart_pos, bins = bins(-0.5, 0.5, NUM_DIZITIZED)),
        np.digitize(cart_pos, bins = bins(-2.0, 2.0, NUM_DIZITIZED))]
    return sum([x*(NUM_DIZITIZED**i)for i, x in enumerate(digitized)])

### Q-learning 구현
with 3 classes
- Agent: 수레, 'Q함수 수정', '다음 행동 결정'
- Brain: '수레가 관찰한 상태를 이산변수로 변환', 'Q테이블 수정', 'Q테이블을 이용해 행동 결정'
- Environment: OpenAI Gyem이 실행되는 실행환경, 'run'

#### (1) agent가 observation_t를 Brain에 전달
#### (2) Brain은 전달받은 현재 상태를 이산변수로 변환하고, Q테이블을 참조해서 행동을 결정해서, agent에게 행동 전달
#### (3) agent는 environment에 행동action_t를 전달하고 
#### (4) Environment는 다음 상태 observation_t+1과 즉각보상 reward_t+1을 agent에게 반환
#### (5) agent가 전달한, 현재상태, 행동, 다음 상태, 즉각보상 (transition) 정보를 바탕으로, Brain은 Q테이블 수정

In [3]:
#상수 정의
ENV = 'CartPole-v0'
NUM_DIZITIZED = 6 #이산 변수로 변환할 구간 수
GAMMA = 0.99 #시간할인율
ETA = 0.5 #학습률
MAX_STEPS = 200 #한 에피소드당 최대 단계 수
NUM_EPISODES = 1000 #최대 에피소드 수

In [4]:
class Agent:
    def __init__(self, num_states, num_actions):
        self.brain = Brain(num_states, num_actions) #agent가 행동을 결정하는 두뇌 역할
        
    def update_Q_function(self, observation, action, reward, observation_next):
        self.brain.update_Q_table(observation, action, reward, observation_next)
    
    def get_action(self, observation, step):
        action = self.brain.decide_action(observation, step)
        return action

In [5]:
class Brain:
    def __init__(self, num_states, num_actions):
        self.num_actions= num_actions #행동 가짓수(왼 or 오)구함
        self.q_table = np.random.uniform(low=0, high=1, size=(NUM_DIZITIZED**num_states, num_actions)) #Q테이블 생성
        
    def bins(self, clip_min, clip_max, num): #상태를 이산변수 구간으로 나누기
        return np.linspace(clip_min, clip_max, num+1)[1:-1]
    
    def digitize_state(self, observation): #관측된 상태를 이산변수로 변환
        cart_pos, cart_v, pole_angle, pole_v = observation
        digitized = [
            np.digitize(cart_pos, bins = self.bins(-2.4, 2.4, NUM_DIZITIZED)),
            np.digitize(cart_v, bins = self.bins(-3.0, 3.0, NUM_DIZITIZED)),
            np.digitize(pole_angle, bins = self.bins(-0.5, 0.5, NUM_DIZITIZED)),
            np.digitize(pole_v, bins = self.bins(-2.0, 2.0, NUM_DIZITIZED))
        ]
        x = sum([x*(NUM_DIZITIZED**i)for i, x in enumerate(digitized)])
        return x
        
    def update_Q_table(self,observation, action, reward, observation_next): #Q-leaning : Q테이블 수정
        state = self.digitize_state(observation) #현재 상태를 이산변수로 변환
        state_next = self.digitize_state(observation_next) #다음 상태를 이산변수로 변환
        Max_Q_next = max(self.q_table[state_next][:])
        self.q_table[state, action] = self.q_table[state, action]+ ETA*(reward+ GAMMA*Max_Q_next - self.q_table[state,action])
       
    def decide_action(self, observation, episode): #엡실론-그리디 -> 최적 행동 비중을 늘림
        state = self.digitize_state(observation)
        epsilon = 0.5*(1/(episode+1))
        
        if epsilon <= np.random.uniform(0,1):
            action = np.argmax(self.q_table[state][:])
        else:
            action = np.random.choice(self.num_actions)
        return action
        

In [6]:
class Environment:
    def __init__(self):
        self.env = gym.make(ENV)
        num_states = self.env.observation_space.shape[0] #테스크의 상태변수 수를 구함
        num_actions = self.env.action_space.n #가능한 행동 수 구함
        self.agent = Agent(num_states, num_actions) #에이전트 객체 생성
    
    def run(self):
        complete_episodes = 0 #성공한 에피소드 수(195이상 단계를 버틴 경우==성공)
        is_episode_final = False #에피소드 마지막인지 여부
        framse = [] #에니메이션 만드는 데 사용할 이미지 저장하는 변수
        
        for episode in range(NUM_EPISODES): #에피소드 수만큼 반복
            observation = self.env.reset() #환경 초기화
            
            for step in range(MAX_STEPS): #한 에피소드씩
                if is_episode_final is True: #마지막 에피소드면 frames에 각 단계의 이미지 저장
                    frames.append(self.env.render(mode='rgb_array'))
                #행동 선택
                action = self.agent.get_action(observation, episode)
                #행동 a_t를 실행해 s_t+1, r_t+1 계산
                observation_next, _, done, _ = self.env.step(action)
                 
                if done: #200단계 넘거나 일정 각도 이상 기울면 done이 참
                    if step <195:
                        reward = -1 #봉이 쓰러지면 패널티 -1
                        complete_episodes = 0 #195 이상 버티면 성공처리
                    else:
                        reward = 1
                        complete_episodes += 1
                    
                else: 
                    reward = 0 #에피 소드 중에는 보상 없다
                 
                #다음단계 상태 observation_next로 Q함수 수정
                self.agent.update_Q_function(observation, action, reward, observation_next)
                
                #다음단계 상태 관측
                observation = observation_next
                
                #에피소드 마무리
                if done:
                    print('{0} Episode: Finished after {1} time steps'.format(episode, step+1))
                    break
            
            if is_episode_final is True: 
                display_frames_as_gif(frames)
                break
            
            if complete_episodes >= 10: 
                print('10 에피소드 연속 성공')
                is_episode_final = True #다음 에피소드가 마지막이됨

In [7]:
#main
cartpole_env = Environment()
cartpole_env.run()

0 Episode: Finished after 15 time steps
1 Episode: Finished after 21 time steps
2 Episode: Finished after 13 time steps
3 Episode: Finished after 10 time steps
4 Episode: Finished after 14 time steps
5 Episode: Finished after 9 time steps
6 Episode: Finished after 21 time steps
7 Episode: Finished after 13 time steps
8 Episode: Finished after 10 time steps
9 Episode: Finished after 24 time steps
10 Episode: Finished after 45 time steps
11 Episode: Finished after 10 time steps
12 Episode: Finished after 10 time steps
13 Episode: Finished after 13 time steps
14 Episode: Finished after 31 time steps
15 Episode: Finished after 31 time steps
16 Episode: Finished after 35 time steps
17 Episode: Finished after 43 time steps
18 Episode: Finished after 38 time steps
19 Episode: Finished after 15 time steps
20 Episode: Finished after 73 time steps
21 Episode: Finished after 69 time steps
22 Episode: Finished after 39 time steps
23 Episode: Finished after 110 time steps
24 Episode: Finished after

196 Episode: Finished after 200 time steps
197 Episode: Finished after 140 time steps
198 Episode: Finished after 200 time steps
199 Episode: Finished after 99 time steps
200 Episode: Finished after 60 time steps
201 Episode: Finished after 66 time steps
202 Episode: Finished after 200 time steps
203 Episode: Finished after 141 time steps
204 Episode: Finished after 126 time steps
205 Episode: Finished after 109 time steps
206 Episode: Finished after 81 time steps
207 Episode: Finished after 200 time steps
208 Episode: Finished after 141 time steps
209 Episode: Finished after 117 time steps
210 Episode: Finished after 85 time steps
211 Episode: Finished after 122 time steps
212 Episode: Finished after 200 time steps
213 Episode: Finished after 88 time steps
214 Episode: Finished after 148 time steps
215 Episode: Finished after 133 time steps
216 Episode: Finished after 44 time steps
217 Episode: Finished after 128 time steps
218 Episode: Finished after 83 time steps
219 Episode: Finish

The 'clear_temp' parameter of setup() was deprecated in Matplotlib 3.3 and will be removed two minor releases later. If any parameter follows 'clear_temp', they should be passed as keyword, not positionally.
  frame_prefix, clear_temp=False)


AttributeError: 'HTMLWriter' object has no attribute '_temp_names'