## 7章　播放打磚塊遊戲Breakout的程式

In [1]:
# import套件
import numpy as np
from collections import deque
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import gym
from gym import spaces
from gym.spaces.box import Box


In [2]:
# import套件
import matplotlib.pyplot as plt
%matplotlib inline


# Helper that renders recorded frames as an animation
# Reference: http://nbviewer.jupyter.org/github/patrickmineault/xcorr-notebooks/blob/master/Render%20OpenAI%20gym%20as%20GIF.ipynb
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display

def display_frames_as_gif(frames):
    """Animate a sequence of image frames, save it, and show it inline.

    The animation is written to 'breakout.mp4' and then displayed through
    JSAnimation's ``display_animation`` in looping mode.

    Args:
        frames: indexable sequence of image arrays. ``shape[0]`` and
            ``shape[1]`` of the first frame set the figure height and width.
    """
    first = frames[0]
    rows, cols = first.shape[0], first.shape[1]
    # At 72 dpi a figure of (pixels / 72) inches maps one pixel to one dot.
    figure = plt.figure(figsize=(cols / 72.0 * 1, rows / 72.0 * 1), dpi=72)
    image = plt.imshow(first)
    plt.axis('off')

    def draw_frame(frame_index):
        # Swap the displayed image data for the requested frame.
        image.set_data(frames[frame_index])

    movie = animation.FuncAnimation(
        figure, draw_frame, frames=len(frames), interval=20)

    movie.save('breakout.mp4')  # write the animation to disk
    display(display_animation(movie, default_mode='loop'))
    

In [3]:
# 設定執行環境
# 参考：https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py

import cv2
cv2.ocl.setUseOpenCL(False)


class NoopResetEnv(gym.Wrapper):
    """Wrapper that plays a random number of no-op steps after each reset.

    Taking no action for the first few steps lets the game initialise
    normally and avoids always learning from one specific initial state.
    """

    def __init__(self, env, noop_max=30):
        """Wrap ``env``; ``noop_max`` is the largest number of no-op steps."""
        gym.Wrapper.__init__(self, env)
        self.noop_max = noop_max
        # When not None, this fixed count is used instead of a random draw.
        self.override_num_noops = None
        self.noop_action = 0
        # Action index 0 must be the no-op for this wrapper to make sense.
        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'

    def reset(self, **kwargs):
        """Reset the env, then take between 1 and ``noop_max`` no-op steps.

        Returns:
            The observation produced by the last no-op step, or by the
            reset that follows a no-op step which ended the episode.
        """
        self.env.reset(**kwargs)
        noops = self.override_num_noops
        if noops is None:
            noops = self.unwrapped.np_random.randint(
                1, self.noop_max + 1)  # pylint: disable=E1101
        assert noops > 0
        obs = None
        for _step in range(noops):
            obs, _reward, done, _info = self.env.step(self.noop_action)
            if done:
                obs = self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        """Forward the action unchanged to the wrapped environment."""
        return self.env.step(ac)


class EpisodicLifeEnv(gym.Wrapper):
    """Wrapper that reports the loss of a life as the end of an episode.

    The underlying game is only truly reset once it reports ``done`` itself;
    after a lost life the next episode continues from the current game state.
    """

    def __init__(self, env):
        """Wrap ``env`` and start with no recorded lives."""
        gym.Wrapper.__init__(self, env)
        self.lives = 0
        # True when the wrapped env itself reported done on the last step.
        self.was_real_done = True

    def step(self, action):
        """Step the env and flag ``done`` whenever a life has been lost.

        Returns:
            The wrapped env's ``(obs, reward, done, info)`` with ``done``
            forced to True when the life count dropped but is still positive.
        """
        obs, reward, done, info = self.env.step(action)
        self.was_real_done = done
        # Check current lives, make loss of life terminal, then update the
        # stored count so that bonus lives are handled as well.
        lives = self.env.unwrapped.ale.lives()
        if 0 < lives < self.lives:
            # For Qbert the game sometimes stays at lives == 0 for a few
            # frames, so lives > 0 is required here: that way the reset only
            # happens once the environment itself advertises done.
            done = True
        self.lives = lives
        return obs, reward, done, info

    def reset(self, **kwargs):
        """Fully reset only when the game itself was over.

        Otherwise a single no-op step (action 0) is taken to move on from
        the lost-life state, and its observation is returned.
        """
        if self.was_real_done:
            obs = self.env.reset(**kwargs)
        else:
            # no-op step to advance from terminal/lost life state
            obs, _, _, _ = self.env.step(0)
        self.lives = self.env.unwrapped.ale.lives()
        return obs


class MaxAndSkipEnv(gym.Wrapper):
    """Wrapper that repeats each action for ``skip`` consecutive frames.

    Rewards over the repeated frames are summed, and the returned
    observation is the element-wise maximum over a two-slot frame buffer
    that is filled on the last two frames of the repeat window.
    """

    def __init__(self, env, skip=4):
        """Wrap ``env``; ``skip`` is how many frames each action is held."""
        gym.Wrapper.__init__(self, env)
        # Most recent raw observations (for max pooling across time steps).
        frame_shape = env.observation_space.shape
        self._obs_buffer = np.zeros((2,) + frame_shape, dtype=np.uint8)
        self._skip = skip

    def step(self, action):
        """Repeat ``action``, sum the rewards and max-pool the buffer.

        Returns:
            ``(max_frame, total_reward, done, info)``; ``done`` and ``info``
            come from the last frame that was actually stepped.
        """
        total_reward = 0.0
        done = None
        final_index = self._skip - 1
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            if i == final_index - 1:
                self._obs_buffer[0] = obs
            elif i == final_index:
                self._obs_buffer[1] = obs
            total_reward += reward
            if done:
                break
        # If the loop broke early the buffer still holds older frames; the
        # observation on the done=True frame does not matter.
        return self._obs_buffer.max(axis=0), total_reward, done, info

    def reset(self, **kwargs):
        """Reset the wrapped environment and return its observation."""
        return self.env.reset(**kwargs)


class WarpFrame(gym.ObservationWrapper):
    """Observation wrapper producing 84x84 single-channel frames.

    This is the frame format used in the Nature DQN paper, according to the
    original note in this file.
    """

    def __init__(self, env):
        """Wrap ``env`` and declare the (84, 84, 1) uint8 observation space."""
        gym.ObservationWrapper.__init__(self, env)
        self.width = self.height = 84
        target_shape = (self.height, self.width, 1)
        self.observation_space = spaces.Box(
            low=0, high=255, shape=target_shape, dtype=np.uint8)

    def observation(self, frame):
        """Convert an RGB frame to grayscale, resize it, add a channel axis."""
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        target_size = (self.width, self.height)
        resized = cv2.resize(gray, target_size, interpolation=cv2.INTER_AREA)
        return resized[:, :, np.newaxis]


class WrapPyTorch(gym.ObservationWrapper):
    """Observation wrapper that moves the channel axis to the front.

    Observations of shape (H, W, C) are returned as (C, H, W), the axis
    order used for the image batches fed to the network in this file.
    """

    def __init__(self, env=None):
        """Wrap ``env`` and declare the transposed observation space."""
        super(WrapPyTorch, self).__init__(env)
        obs_shape = self.observation_space.shape
        # The declared shape must follow the same axis permutation (2, 0, 1)
        # that observation() applies, so that non-square frames are described
        # correctly as well.
        self.observation_space = Box(
            self.observation_space.low[0, 0, 0],
            self.observation_space.high[0, 0, 0],
            [obs_shape[2], obs_shape[0], obs_shape[1]],
            dtype=self.observation_space.dtype)

    def observation(self, observation):
        """Return ``observation`` with its axes permuted to (2, 0, 1)."""
        return observation.transpose(2, 0, 1)


In [4]:
# 播放專用的執行環境


class EpisodicLifeEnvPlay(gym.Wrapper):
    """Playback variant of the episodic-life wrapper.

    The episode is reported as done as soon as the life count falls below
    ``initial_lives``, and ``reset`` always resets the wrapped environment
    completely (so the bricks are reset too).
    """

    def __init__(self, env, initial_lives=5):
        """Wrap ``env``.

        Args:
            env: environment to wrap; its unwrapped form must expose
                ``ale.lives()``.
            initial_lives: life count the game starts with. Defaults to 5,
                the value previously hard-coded for Breakout.
        """
        gym.Wrapper.__init__(self, env)
        self.initial_lives = initial_lives

    def step(self, action):
        """Step the env and end the episode once any life has been lost."""
        obs, reward, done, info = self.env.step(action)
        # Fewer lives than at the start means a ball has been missed.
        if self.env.unwrapped.ale.lives() < self.initial_lives:
            done = True

        return obs, reward, done, info

    def reset(self, **kwargs):
        """Always perform a full reset of the wrapped environment."""
        return self.env.reset(**kwargs)


class MaxAndSkipEnvPlay(gym.Wrapper):
    """Playback variant of the action-repeat wrapper.

    Each action is repeated for ``skip`` frames and the rewards are summed.
    The raw observation of the last stepped frame is returned unchanged; no
    max-pooling over frames is applied.
    """

    def __init__(self, env, skip=4):
        """Wrap ``env``; ``skip`` is how many frames each action is held."""
        gym.Wrapper.__init__(self, env)
        self._skip = skip

    def step(self, action):
        """Repeat ``action`` and return the last frame with the summed reward.

        Returns:
            ``(obs, total_reward, done, info)`` where ``obs``, ``done`` and
            ``info`` come from the last frame that was actually stepped.
        """
        total_reward = 0.0
        done = None
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                # Stop repeating once the episode has ended.
                break

        return obs, total_reward, done, info

    def reset(self, **kwargs):
        """Reset the wrapped environment and return its observation."""
        return self.env.reset(**kwargs)


In [5]:
# 定義產生執行環境的函數

# 多工執行環境
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv


def make_env(env_id, seed, rank):
    """Return a zero-argument factory that builds one wrapped environment.

    Args:
        env_id: id passed to ``gym.make``.
        seed: base random seed.
        rank: index of this environment; added to ``seed``.
    """
    def _thunk():
        """Build the environment; this callable is what SubprocVecEnv takes."""
        env = MaxAndSkipEnv(gym.make(env_id), skip=4)
        env.seed(seed + rank)  # seed the random number generator
        # NoopResetEnv and EpisodicLifeEnv are left disabled here;
        # EpisodicLifeEnvPlay is applied instead.
        return WrapPyTorch(WarpFrame(EpisodicLifeEnvPlay(env)))

    return _thunk


def make_env_play(env_id, seed, rank):
    """Build the environment used for rendering the played episode.

    Unlike ``make_env`` this returns the environment itself rather than a
    factory, and leaves WarpFrame and WrapPyTorch disabled so the
    observations are not converted.

    Args:
        env_id: id passed to ``gym.make``.
        seed: base random seed.
        rank: offset added to ``seed``.
    """
    env = MaxAndSkipEnvPlay(gym.make(env_id), skip=4)
    env.seed(seed + rank)  # seed the random number generator
    return EpisodicLifeEnvPlay(env)


In [6]:
# 設定常數

ENV_NAME = 'BreakoutNoFrameskip-v4' 
# Use BreakoutNoFrameskip-v4 rather than Breakout-v0.
# Per the original note, v0 randomly skips 2-4 frames; this variant does not.
# Reference: https://becominghuman.ai/lets-build-an-atari-ai-part-1-dqn-df57e8ff3b26
# https://github.com/openai/gym/blob/5cb12296274020db9bb6378ce54276b31e7002da/gym/envs/__init__.py#L371
    
NUM_SKIP_FRAME = 4 # number of frames to skip
NUM_STACK_FRAME = 4  # number of consecutive frames stacked into one state
NOOP_MAX = 30  # upper bound on the random number of no-op frames at reset
NUM_PROCESSES = 16 # number of parallel environment processes
NUM_ADVANCED_STEP = 5  # number of steps over which rewards are accumulated
GAMMA = 0.99  # time discount rate

TOTAL_FRAMES=10e6  # total number of frames used for training
NUM_UPDATES = int(TOTAL_FRAMES / NUM_ADVANCED_STEP / NUM_PROCESSES)  # total number of network updates
# With the values above, NUM_UPDATES evaluates to 125,000


In [7]:
# Coefficients used when computing the A2C loss
value_loss_coef = 0.5
entropy_coef = 0.01
max_grad_norm = 0.5

# Settings for the RMSprop optimizer
lr = 7e-4
eps = 1e-5
alpha = 0.99


In [8]:
# 使用GPU的設定
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print(device)


cpu


In [9]:
# 定義記憶體物件


class RolloutStorage(object):
    """Buffer holding the transitions needed for n-step advantage learning.

    All tensors are indexed as ``[step, process, ...]``. ``observations``,
    ``masks`` and ``returns`` have ``num_steps + 1`` entries along the step
    axis; ``rewards`` and ``actions`` have ``num_steps``.
    """

    def __init__(self, num_steps, num_processes, obs_shape):
        """Allocate zero-filled storage (masks start at one) on ``device``.

        Args:
            num_steps: number of steps stored before each update.
            num_processes: number of parallel environments.
            obs_shape: shape of one stacked observation, e.g. (4, 84, 84).
        """
        # Kept so that insert() wraps its index with this instance's own
        # length rather than a module-level constant.
        self.num_steps = num_steps

        # *obs_shape unpacks the tuple: (4, 84, 84) -> 4, 84, 84
        self.observations = torch.zeros(
            num_steps + 1, num_processes, *obs_shape).to(device)

        self.masks = torch.ones(num_steps + 1, num_processes, 1).to(device)
        self.rewards = torch.zeros(num_steps, num_processes, 1).to(device)
        self.actions = torch.zeros(
            num_steps, num_processes, 1).long().to(device)

        # Discounted sum of rewards for every step
        self.returns = torch.zeros(num_steps + 1, num_processes, 1).to(device)
        self.index = 0  # position the next insert() writes to

    def insert(self, current_obs, action, reward, mask):
        """Store one transition at the current index and advance the index.

        The observation and mask are written at ``index + 1`` (they belong
        to the state reached after the action); reward and action are
        written at ``index``.
        """
        self.observations[self.index + 1].copy_(current_obs)
        self.masks[self.index + 1].copy_(mask)
        self.rewards[self.index].copy_(reward)
        self.actions[self.index].copy_(action)

        self.index = (self.index + 1) % self.num_steps  # wrap around

    def after_update(self):
        """Carry the last observation and mask over to slot 0.

        Called once the stored steps have been consumed, so the next
        rollout starts from the most recent state.
        """
        self.observations[0].copy_(self.observations[-1])
        self.masks[0].copy_(self.masks[-1])

    def compute_returns(self, next_value):
        """Fill ``returns`` with the discounted reward sums.

        The computation runs backwards from the last step, bootstrapping
        from ``next_value``; a mask of 0 cuts the return at an episode end.

        Args:
            next_value: value estimate for the state after the final step.
        """
        self.returns[-1] = next_value
        for ad_step in reversed(range(self.rewards.size(0))):
            self.returns[ad_step] = self.returns[ad_step + 1] * \
                GAMMA * self.masks[ad_step + 1] + self.rewards[ad_step]


In [10]:
# 建置A2C的深度神經網路


def init(module, gain):
    """Initialise a layer's parameters in place and return the layer.

    The weight gets an orthogonal initialisation scaled by ``gain``; the
    bias is filled with zeros.
    """
    weight = module.weight.data
    nn.init.orthogonal_(weight, gain=gain)
    bias = module.bias.data
    nn.init.constant_(bias, 0)
    return module


class Flatten(nn.Module):
    """Layer that flattens everything after the first axis into one axis."""

    def forward(self, x):
        """Return a view of ``x`` with shape ``(x.size(0), -1)``."""
        batch_size = x.size(0)
        return x.view(batch_size, -1)


class Net(nn.Module):
    """Actor-critic network: shared convolutional trunk with two heads.

    ``critic`` outputs one state value per input; ``actor`` outputs one
    score per action.
    """

    def __init__(self, n_out):
        """Build and initialise the layers; ``n_out`` is the action count."""
        super(Net, self).__init__()

        def with_gain(gain):
            """Return a function that applies ``init`` with a fixed gain."""
            def apply(module):
                return init(module, gain=gain)
            return apply

        relu_init = with_gain(nn.init.calculate_gain('relu'))

        # Convolutional trunk. The input has NUM_STACK_FRAME channels.
        # Output size per layer: (in - kernel + 2 * padding) / stride + 1,
        # so an 84x84 input shrinks 84 -> 20 -> 9 -> 7.
        self.conv = nn.Sequential(
            relu_init(nn.Conv2d(NUM_STACK_FRAME, 32, kernel_size=8, stride=4)),
            nn.ReLU(),
            relu_init(nn.Conv2d(32, 64, kernel_size=4, stride=2)),
            nn.ReLU(),
            relu_init(nn.Conv2d(64, 64, kernel_size=3, stride=1)),
            nn.ReLU(),
            Flatten(),  # flatten the feature maps to one axis
            relu_init(nn.Linear(64 * 7 * 7, 512)),  # 64 maps of 7x7 -> 512
            nn.ReLU()
        )

        # Critic head: a state value, so a single output
        self.critic = with_gain(1.0)(nn.Linear(512, 1))

        # Actor head: one output per available action
        self.actor = with_gain(0.01)(nn.Linear(512, n_out))

        # Put the network into training mode
        self.train()

    def forward(self, x):
        """Return ``(critic_output, actor_output)`` for a batch of images."""
        scaled = x / 255.0  # rescale pixel values from 0-255 to 0-1
        features = self.conv(scaled)
        return self.critic(features), self.actor(features)

    def act(self, x):
        """Sample one action per row of ``x`` from the policy distribution."""
        _, logits = self(x)
        # dim=1 is the action axis
        return F.softmax(logits, dim=1).multinomial(num_samples=1)

    def get_value(self, x):
        """Return the critic's state value for ``x``."""
        return self(x)[0]

    def evaluate_actions(self, x, actions):
        """Return state values, log-probs of ``actions`` and mean entropy.

        Args:
            x: batch of observations.
            actions: indices of the actions taken, gathered along dim 1.
        """
        value, logits = self(x)

        # dim=1 is the action axis for both softmax variants
        log_probs = F.log_softmax(logits, dim=1)
        probs = F.softmax(logits, dim=1)

        action_log_probs = log_probs.gather(1, actions)
        dist_entropy = -(log_probs * probs).sum(-1).mean()

        return value, action_log_probs, dist_entropy


In [11]:
# 定義智能體的大腦類別，所有智能體共用這個類別


class Brain(object):
    """Shared "brain" of all agents: the network plus its optimizer."""

    def __init__(self, actor_critic, filename='weight_end.pth'):
        """Load trained weights into ``actor_critic`` and set up RMSprop.

        Args:
            actor_critic: the Net instance to drive.
            filename: path of the saved state dict. Defaults to
                'weight_end.pth'; another checkpoint such as
                'weight_112500.pth' can be passed instead.
        """
        self.actor_critic = actor_critic

        # NOTE(review): torch.load unpickles the file, so only load
        # checkpoints that come from a trusted source.
        param = torch.load(filename, map_location='cpu')
        self.actor_critic.load_state_dict(param)

        # Optimizer used to update the parameters
        self.optimizer = optim.RMSprop(
            actor_critic.parameters(), lr=lr, eps=eps, alpha=alpha)

    def update(self, rollouts):
        """Run one A2C update from a filled rollout buffer.

        Step and process counts are read from ``rollouts`` itself, so the
        update matches the buffer even if the module-level constants have
        been changed since the buffer was created.
        """
        obs_shape = rollouts.observations.size()[2:]  # e.g. [4, 84, 84]
        num_steps, num_processes = rollouts.rewards.size()[:2]

        # Flatten (step, process) into one batch axis for the forward pass
        values, action_log_probs, dist_entropy = self.actor_critic.evaluate_actions(
            rollouts.observations[:-1].view(-1, *obs_shape),
            rollouts.actions.view(-1, 1))

        # Restore the (step, process, 1) layout
        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        advantages = rollouts.returns[:-1] - values
        value_loss = advantages.pow(2).mean()

        # detach() so the advantages act as constants in the policy term
        action_gain = (advantages.detach() * action_log_probs).mean()

        total_loss = (value_loss * value_loss_coef -
                      action_gain - dist_entropy * entropy_coef)

        self.optimizer.zero_grad()  # reset gradients
        total_loss.backward()  # backpropagation
        # Limit the gradient norm to max_grad_norm so that the parameters
        # do not change too abruptly
        nn.utils.clip_grad_norm_(self.actor_critic.parameters(), max_grad_norm)

        self.optimizer.step()  # update the parameters


In [12]:
# 這是執行Breakout的環境的類別

# Playback runs a single environment, so override the process count set earlier
NUM_PROCESSES = 1


class Environment:
    """Runs Breakout with the loaded network and records the played frames."""

    def run(self):
        """Play until an episode scores above 300 or the updates run out.

        A separate, unconverted environment is stepped with the same actions
        as the first vectorised environment to collect frames for rendering.
        The frames of the last episode are rendered with
        ``display_frames_as_gif``.

        Returns:
            list: the frames collected for the last episode (may be empty).
        """
        # Seed the random number generators
        seed_num = 1
        torch.manual_seed(seed_num)
        if use_cuda:
            torch.cuda.manual_seed(seed_num)

        # Build the environments (single torch thread, as before)
        torch.set_num_threads(1)
        envs = [make_env(ENV_NAME, seed_num, i) for i in range(NUM_PROCESSES)]
        envs = SubprocVecEnv(envs)  # run the environments in subprocesses
        env_play = None

        # Frames collected for the animation
        frames = []

        try:
            # Network shared by all agents; constructing Brain loads the
            # saved weights into it.
            n_out = envs.action_space.n  # number of discrete actions
            actor_critic = Net(n_out).to(device)
            global_brain = Brain(actor_critic)

            # Storage: one observation has a single channel; stack
            # NUM_STACK_FRAME of them along the channel axis.
            obs_shape = envs.observation_space.shape  # (1, 84, 84)
            obs_shape = (obs_shape[0] * NUM_STACK_FRAME,
                         *obs_shape[1:])  # (4, 84, 84)
            current_obs = torch.zeros(NUM_PROCESSES, *obs_shape).to(device)
            rollouts = RolloutStorage(
                NUM_ADVANCED_STEP, NUM_PROCESSES, obs_shape)
            # Reward of the running episode / of the last finished episode
            episode_rewards = torch.zeros([NUM_PROCESSES, 1])
            final_rewards = torch.zeros([NUM_PROCESSES, 1])

            # Initial state: the newest frame goes into the last channel
            obs = envs.reset()
            obs = torch.from_numpy(obs).float()
            current_obs[:, -1:] = obs

            # Store the current state as the first rollout observation
            rollouts.observations[0].copy_(current_obs)

            # Environment used only for collecting frames to render
            env_play = make_env_play(ENV_NAME, seed_num, 0)
            env_play.reset()

            main_end = False

            for j in tqdm(range(NUM_UPDATES)):

                # Stop once an episode has exceeded the target reward
                if main_end:
                    break

                for step in range(NUM_ADVANCED_STEP):

                    # Choose the actions
                    with torch.no_grad():
                        action = actor_critic.act(rollouts.observations[step])

                    cpu_actions = action.squeeze(1).cpu().numpy()

                    # One step in every environment
                    obs, reward, done, info = envs.step(cpu_actions)

                    # Rewards: (NUM_PROCESSES,) -> (NUM_PROCESSES, 1) tensor,
                    # accumulated per episode
                    reward = np.expand_dims(np.stack(reward), 1)
                    reward = torch.from_numpy(reward).float()
                    episode_rewards += reward

                    # mask is 0 where the episode is done and 1 otherwise
                    masks = torch.FloatTensor(
                        [[0.0] if done_ else [1.0] for done_ in done])

                    # Keep the last finished episode's reward: unchanged
                    # while running, replaced when the episode is done
                    final_rewards *= masks
                    final_rewards += (1 - masks) * episode_rewards

                    # Step the rendering environment with the first
                    # environment's action and keep the frame
                    obs_play, _, _, _ = env_play.step(cpu_actions[0])
                    frames.append(obs_play)
                    if done[0]:  # the first environment finished an episode
                        print(episode_rewards[0][0].numpy())  # its reward

                        # Finish once the reward exceeds 300
                        if (episode_rewards[0][0].numpy()) > 300:
                            main_end = True
                            break
                        else:
                            env_play.reset()
                            frames = []  # discard this episode's frames

                    # Zero the running reward of finished episodes
                    episode_rewards *= masks

                    # Move the masks to the compute device
                    masks = masks.to(device)

                    # Clear the stacked frames of finished episodes;
                    # masks is broadcast from (N, 1) to (N, 1, 1, 1)
                    current_obs *= masks.unsqueeze(2).unsqueeze(2)

                    # Shift the frame stack by one and append the new frame
                    obs = torch.from_numpy(obs).float()
                    current_obs[:, :-1] = current_obs[:, 1:]
                    current_obs[:, -1:] = obs

                    # Store this step's transition
                    rollouts.insert(current_obs, action.data, reward, masks)

                # Value estimate of the state after the last stored step
                with torch.no_grad():
                    next_value = actor_critic.get_value(
                        rollouts.observations[-1]).detach()

                # Discounted returns for all stored steps
                rollouts.compute_returns(next_value)

                # The network update stays disabled during playback
                # global_brain.update(rollouts)
                rollouts.after_update()
        finally:
            # Shut down the worker processes and the rendering environment
            # even when the run is interrupted by an error.
            envs.close()
            if env_play is not None:
                env_play.close()

        # Save and show the animation (nothing to render without frames)
        if frames:
            display_frames_as_gif(frames)
        return frames


In [None]:
# Instantiate the playback environment and run it; keep whatever run() returns
breakout_env = Environment()
frames = breakout_env.run()