In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gym

In [2]:
ENV = 'CartPole-v0' # environment id passed to gym.make
GAMMA = 0.99 # discount factor applied to future rewards
MAX_STEPS = 200 # maximum number of steps per episode (not referenced by the cells shown here)
NUM_EPISODES = 1000 # maximum number of episodes; the training loop runs NUM_EPISODES * NUM_PROCESSES updates

NUM_PROCESSES = 32 # number of environments stepped simultaneously
NUM_ADVANCED_STEP = 5 # number of steps collected per update when computing the n-step (advantage) returns
# Constants used when computing the A2C loss
value_loss_coef = 0.5
entropy_coef = 0.01
max_grad_norm = 0.5

In [3]:
# Memory class

class RolloutStorage(object):
    '''Fixed-length rollout buffer used for n-step advantage learning.

    Holds ``num_steps`` transitions for ``num_processes`` parallel
    environments. ``observations``, ``masks`` and ``returns`` have one extra
    leading slot (``num_steps + 1``): slot 0 is the state the rollout starts
    from and slot ``t + 1`` is what was observed after step ``t``.
    '''
    def __init__(self, num_steps, num_processes, obs_shape):
        '''Allocate the buffers.

        Args:
            num_steps: number of transitions stored per rollout.
            num_processes: number of environments stepped in parallel.
            obs_shape: size of one observation, either a single int
                (feature count) or an iterable of dimension sizes.
        '''
        # Accept a bare feature count as well as a shape tuple / torch.Size.
        try:
            obs_dims = tuple(obs_shape)
        except TypeError:
            obs_dims = (int(obs_shape),)

        # Remembered so insert() wraps on this buffer's own length rather
        # than on a module-level constant.
        self.num_steps = num_steps

        self.observations = torch.zeros(num_steps + 1, num_processes, *obs_dims)
        self.masks = torch.ones(num_steps + 1, num_processes, 1)
        self.rewards = torch.zeros(num_steps, num_processes, 1)
        self.actions = torch.zeros(num_steps, num_processes, 1).long()

        self.returns = torch.zeros(num_steps + 1, num_processes, 1)
        self.index = 0  # position the next transition is written to

    def insert(self, current_obs, action, reward, mask):
        '''Store one transition at the current index and advance the index.

        ``current_obs`` and ``mask`` describe the state reached *after* the
        step, so they are written one slot ahead of ``reward`` / ``action``.
        '''
        self.observations[self.index + 1].copy_(current_obs)
        self.masks[self.index + 1].copy_(mask)
        self.rewards[self.index].copy_(reward)
        self.actions[self.index].copy_(action)

        self.index = (self.index + 1) % self.num_steps

    def after_update(self):
        '''Carry the newest observation and mask over to slot 0.

        Called once a full rollout has been consumed, so that the next
        rollout starts from the state the previous one ended in.
        '''
        self.observations[0].copy_(self.observations[-1])
        self.masks[0].copy_(self.masks[-1])

    def compute_returns(self, next_value, gamma=None):
        '''Fill ``self.returns`` with discounted returns for every stored step.

        Args:
            next_value: value bootstrapped for the state after the last
                stored step; written into ``returns[-1]``.
            gamma: discount factor. When omitted, the module-level ``GAMMA``
                is used.
        '''
        if gamma is None:
            gamma = GAMMA

        self.returns[-1] = next_value
        # Walk backwards; masks[t + 1] multiplies the bootstrapped term, so a
        # zero mask stops the return from flowing across that step.
        for ad_step in reversed(range(self.rewards.size(0))):
            self.returns[ad_step] = (self.returns[ad_step + 1] * gamma * self.masks[ad_step + 1]
                                     + self.rewards[ad_step])

In [4]:
# Neural Net
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    '''Actor-critic network: two shared hidden layers feeding a policy head and a value head.'''

    def __init__(self, n_in, n_mid, n_out):
        '''Build the layers.

        Args:
            n_in: number of input features.
            n_mid: width of both hidden layers.
            n_out: number of discrete actions (size of the policy head).
        '''
        super().__init__()
        # Shared trunk
        self.fc1 = nn.Linear(n_in, n_mid)
        self.fc2 = nn.Linear(n_mid, n_mid)
        # Heads: action logits and scalar state value
        self.actor = nn.Linear(n_mid, n_out)
        self.critic = nn.Linear(n_mid, 1)

    def forward(self, x):
        '''Forward pass; returns ``(state_value, action_logits)`` in that order.'''
        hidden = F.relu(self.fc2(F.relu(self.fc1(x))))
        return self.critic(hidden), self.actor(hidden)

    def act(self, x):
        '''Sample one action index per row of ``x`` from the softmax policy.'''
        _, logits = self(x)
        return F.softmax(logits, dim=1).multinomial(num_samples=1)

    def get_value(self, x):
        '''Return only the critic's state-value estimate for ``x``.'''
        return self(x)[0]

    def evaluate_actions(self, x, actions):
        '''Score already-taken actions under the current policy.

        Returns:
            A tuple of the state values for ``x``, the log-probability of
            each entry of ``actions``, and the policy entropy averaged over
            the batch.
        '''
        state_values, logits = self(x)

        log_pi = F.log_softmax(logits, dim=1)
        pi = F.softmax(logits, dim=1)

        chosen_log_pi = log_pi.gather(1, actions)
        policy_entropy = -(log_pi * pi).sum(-1).mean()

        return state_values, chosen_log_pi, policy_entropy

In [5]:
import torch
from torch import optim

class Brain(object):
    '''Owns the optimizer and performs one A2C gradient step per rollout.'''

    def __init__(self, actor_critic, lr=0.01):
        '''
        Args:
            actor_critic: network exposing ``evaluate_actions(obs, actions)``
                and ``parameters()``.
            lr: Adam learning rate.
        '''
        self.actor_critic = actor_critic
        self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=lr)

    def update(self, rollouts):
        '''Run one A2C update from a filled rollout buffer.

        Reads ``rollouts.observations`` (num_steps + 1, num_processes, *obs),
        ``rollouts.actions`` (num_steps, num_processes, 1) and
        ``rollouts.returns`` (num_steps + 1, num_processes, 1). The loss
        coefficients and the gradient-norm limit come from the module-level
        ``value_loss_coef``, ``entropy_coef`` and ``max_grad_norm``.
        '''
        # Take every dimension from the buffer itself so the update stays
        # correct for any observation size / rollout length.
        obs_shape = rollouts.observations.size()[2:]
        num_steps, num_processes = rollouts.actions.size()[:2]

        # Switch to training mode *before* the forward pass so that any
        # mode-dependent layers behave consistently with the backward pass.
        self.actor_critic.train()

        # Flatten (step, process) into one batch dimension for the network.
        values, action_log_probs, entropy = self.actor_critic.evaluate_actions(
            rollouts.observations[:-1].view(-1, *obs_shape),
            rollouts.actions.view(-1, 1))

        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        # Advantage = n-step return - predicted state value.
        advantages = rollouts.returns[:-1] - values

        # Critic loss: mean squared advantage.
        value_loss = advantages.pow(2).mean()

        # Actor objective; the advantage is detached so that this term only
        # produces gradients through the log-probabilities.
        action_gain = (action_log_probs * advantages.detach()).mean()

        total_loss = (value_loss * value_loss_coef - action_gain - entropy * entropy_coef)

        self.optimizer.zero_grad()
        total_loss.backward()
        nn.utils.clip_grad_norm_(self.actor_critic.parameters(), max_grad_norm)

        self.optimizer.step()

In [6]:
import copy

class Envionment:
    '''Trains an A2C agent on NUM_PROCESSES copies of ENV stepped in lock-step.

    The class name keeps its original spelling because the driver cell
    instantiates it under this name.
    '''

    @staticmethod
    def _reset_env(env):
        '''Reset ``env`` and return only the observation.

        Older gym versions return the observation alone; gym >= 0.26 returns
        ``(observation, info)``.
        '''
        result = env.reset()
        if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
            return result[0]
        return result

    @staticmethod
    def _step_env(env, action):
        '''Step ``env`` and return ``(observation, reward, done)``.

        Accepts both the 4-tuple step result of older gym versions and the
        5-tuple ``(obs, reward, terminated, truncated, info)`` of gym >= 0.26.
        '''
        result = env.step(action)
        if len(result) == 5:
            obs, reward, terminated, truncated, _ = result
            return obs, reward, bool(terminated or truncated)
        obs, reward, done, _ = result
        return obs, reward, done

    def run(self):
        '''Entry point: create the environments, train, and always close them.'''
        envs = []
        try:
            for _ in range(NUM_PROCESSES):
                envs.append(gym.make(ENV))
            self._train(envs)
        finally:
            # Release the environments even if training raises or is interrupted.
            for env in envs:
                env.close()

    def _train(self, envs):
        '''Run the A2C training loop on the already-created ``envs``.'''
        n_in = envs[0].observation_space.shape[0]
        n_out = envs[0].action_space.n
        n_mid = 32
        actor_critic = Net(n_in, n_mid, n_out)
        global_brain = Brain(actor_critic)

        obs_shape = n_in
        rollouts = RolloutStorage(NUM_ADVANCED_STEP, NUM_PROCESSES, obs_shape)
        episode_rewards = torch.zeros([NUM_PROCESSES, 1])  # reward accumulated in the running episode
        final_rewards = torch.zeros([NUM_PROCESSES, 1])    # total reward of each env's last finished episode
        obs_np = np.zeros([NUM_PROCESSES, obs_shape])
        reward_np = np.zeros([NUM_PROCESSES, 1])
        done_np = np.zeros([NUM_PROCESSES, 1])
        each_step = np.zeros(NUM_PROCESSES)  # steps taken so far in each env's running episode
        episode = 0  # number of episodes finished by environment 0

        # Initial observations go into slot 0 of the rollout buffer.
        initial_obs = np.array([self._reset_env(env) for env in envs])
        rollouts.observations[0].copy_(torch.from_numpy(initial_obs).float())

        for _ in range(NUM_EPISODES * NUM_PROCESSES):
            for step in range(NUM_ADVANCED_STEP):
                # Choose an action for every environment.
                with torch.no_grad():
                    action = actor_critic.act(rollouts.observations[step])

                actions = action.squeeze(1).numpy()

                # Advance every environment by one step.
                for i, env in enumerate(envs):
                    obs_np[i], reward_np[i], done_np[i] = self._step_env(env, int(actions[i]))

                    if done_np[i]:
                        # Only environment 0 reports progress.
                        if i == 0:
                            print('%d Episode : Finished after %d steps' %(episode, each_step[i]+1))
                            episode += 1

                        # Terminal reward: -1 if the episode ended early,
                        # +1 if it lasted long enough to count as a success.
                        if each_step[i] < 195:
                            reward_np[i] = -1.0
                        else:
                            reward_np[i] = 1.0

                        each_step[i] = 0
                        # The stored next observation is the first one of the new episode.
                        obs_np[i] = self._reset_env(env)

                    else:
                        reward_np[i] = 0.0
                        each_step[i] += 1

                reward = torch.from_numpy(reward_np).float()
                episode_rewards += reward

                # mask is 0.0 where the episode just ended, 1.0 elsewhere.
                masks = torch.from_numpy(1.0 - done_np).float()

                # Where an episode ended, latch its total into final_rewards
                # and restart the running total.
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                # .float() copies, so the buffer never aliases the reused obs_np array.
                current_obs = torch.from_numpy(obs_np).float()

                # Store this step's transition in the rollout buffer.
                rollouts.insert(current_obs, action, reward, masks)

            # Bootstrap from the value predicted for the last stored state.
            with torch.no_grad():
                next_value = actor_critic.get_value(rollouts.observations[-1])

            rollouts.compute_returns(next_value)

            global_brain.update(rollouts)
            rollouts.after_update()

            # Stop once every environment's most recent episode earned +1.
            if final_rewards.sum().item() >= NUM_PROCESSES:
                print('success')
                break

In [8]:
# Build the trainer and start training; a progress line is printed each time
# environment 0 finishes an episode, and 'success' is printed if the stop
# condition inside run() is reached.
cartpole_env = Envionment()
cartpole_env.run()

0 Episode : Finished after 37 steps
1 Episode : Finished after 14 steps
2 Episode : Finished after 26 steps
3 Episode : Finished after 19 steps
4 Episode : Finished after 23 steps
5 Episode : Finished after 25 steps
6 Episode : Finished after 15 steps
7 Episode : Finished after 19 steps
8 Episode : Finished after 16 steps
9 Episode : Finished after 14 steps
10 Episode : Finished after 17 steps
11 Episode : Finished after 200 steps
12 Episode : Finished after 75 steps
13 Episode : Finished after 148 steps
14 Episode : Finished after 200 steps
15 Episode : Finished after 200 steps
16 Episode : Finished after 197 steps
17 Episode : Finished after 200 steps
18 Episode : Finished after 10 steps
19 Episode : Finished after 70 steps
20 Episode : Finished after 9 steps
21 Episode : Finished after 11 steps
22 Episode : Finished after 11 steps
23 Episode : Finished after 14 steps
24 Episode : Finished after 118 steps
25 Episode : Finished after 200 steps
26 Episode : Finished after 17 steps
27 E