## 6.4 利用PyTorch建置Prioritized Experience Replay

In [1]:
# import套件
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gym


In [2]:
# 宣告繪製動畫的繪圖函數
# 参考URL http://nbviewer.jupyter.org/github/patrickmineault
# /xcorr-notebooks/blob/master/Render%20OpenAI%20gym%20as%20GIF.ipynb
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display


def display_frames_as_gif(frames):
    """
    Render a list of image frames as a looping animation with controls.

    The animation is also written to
    'movie_cartpole_prioritized_experience_replay.mp4'.
    """
    first_frame = frames[0]
    # Size the figure so that one image pixel maps to one point at 72 dpi
    width_in = first_frame.shape[1] / 72.0
    height_in = first_frame.shape[0] / 72.0
    plt.figure(figsize=(width_in, height_in), dpi=72)

    image = plt.imshow(first_frame)
    plt.axis('off')

    def animate(i):
        # Swap the displayed image for the i-th frame
        image.set_data(frames[i])

    anim = animation.FuncAnimation(
        plt.gcf(), animate, frames=len(frames), interval=50)

    # Save the movie file, then show it inline
    anim.save('movie_cartpole_prioritized_experience_replay.mp4')
    display(display_animation(anim, default_mode='loop'))


In [3]:
# 產生namedtuple
from collections import namedtuple

# One stored experience: the fields are (state, action, next_state, reward)
Transition = namedtuple(
    'Transition', ('state', 'action', 'next_state', 'reward'))


In [4]:
# Constants
ENV = 'CartPole-v0'  # name of the task (passed to gym.make)
GAMMA = 0.99  # time discount rate
MAX_STEPS = 200  # number of steps in one episode
NUM_EPISODES = 500  # maximum number of episodes


In [5]:
# 定義儲存經驗的記憶體類別


class ReplayMemory:
    """Fixed-capacity ring buffer of Transition records."""

    def __init__(self, CAPACITY):
        self.capacity = CAPACITY  # maximum number of stored transitions
        self.memory = []  # the stored transitions
        self.index = 0  # slot that the next push writes to

    def push(self, state, action, state_next, reward):
        """Store transition = (state, action, state_next, reward) in memory."""
        slot = self.index

        # Grow the list by one placeholder while there is still room
        if len(self.memory) < self.capacity:
            self.memory.append(None)

        # Keep the values together with their field names via the namedtuple
        self.memory[slot] = Transition(state, action, state_next, reward)

        # Advance the write position, wrapping around at capacity
        self.index = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Return batch_size stored transitions chosen at random."""
        stored = self.memory
        return random.sample(stored, batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)


In [6]:
# Memory class that stores the TD errors

TD_ERROR_EPSILON = 0.0001  # bias added to every |TD error| so no entry has zero priority


class TDerrorMemory:
    """Fixed-capacity ring buffer of TD errors, used as sampling priorities."""

    def __init__(self, CAPACITY):
        self.capacity = CAPACITY  # maximum number of stored TD errors
        self.memory = []  # the stored TD errors
        self.index = 0  # slot that the next push writes to

    def push(self, td_error):
        """Store one TD error, overwriting the oldest entry once full."""

        if len(self.memory) < self.capacity:
            self.memory.append(None)  # grow while there is still room

        self.memory[self.index] = td_error
        self.index = (self.index + 1) % self.capacity  # advance and wrap

    def __len__(self):
        """Return the number of TD errors currently stored."""
        return len(self.memory)

    def get_prioritized_indexes(self, batch_size):
        """Draw batch_size indexes with probability proportional to |TD error|.

        Each entry's priority is abs(td_error) + TD_ERROR_EPSILON. Indexes are
        drawn with replacement and returned in ascending order as a list of
        Python ints.

        Raises:
            ValueError: if the memory is empty.
        """
        if not self.memory:
            raise ValueError('cannot sample indexes from an empty TD-error memory')

        # Running total of the priorities; entry i owns the interval
        # (cumulative[i - 1], cumulative[i]] of the total priority mass.
        priorities = np.absolute(
            np.asarray(self.memory, dtype=float)) + TD_ERROR_EPSILON
        cumulative = np.cumsum(priorities)

        # Draw batch_size random numbers over the total mass, sorted ascending
        rand_list = np.sort(
            np.random.uniform(0, cumulative[-1], batch_size))

        # First entry whose running total reaches each random number. The
        # previous hand-written loop advanced one step past that entry and so
        # returned the neighbour of the entry that actually matched.
        indexes = np.searchsorted(cumulative, rand_list, side='left')

        # Guard against floating-point rounding at the upper edge
        indexes = np.minimum(indexes, len(self.memory) - 1)

        return indexes.tolist()

    def update_td_error(self, updated_td_errors):
        """Replace all stored TD errors with updated_td_errors."""
        self.memory = updated_td_errors


In [7]:
# Build the deep neural network
# Network definition (PyTorch nn.Module, written in a Chainer-like style)
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Fully connected network: n_in -> n_mid -> n_mid -> n_out."""

    def __init__(self, n_in, n_mid, n_out):
        super().__init__()
        # Two hidden layers of width n_mid followed by a linear output layer
        self.fc1 = nn.Linear(n_in, n_mid)
        self.fc2 = nn.Linear(n_mid, n_mid)
        self.fc3 = nn.Linear(n_mid, n_out)

    def forward(self, x):
        """Apply ReLU after each hidden layer; the output layer stays linear."""
        activation = x
        for hidden_layer in (self.fc1, self.fc2):
            activation = F.relu(hidden_layer(activation))
        return self.fc3(activation)


In [8]:
# 這是相當於智能體大腦的類別、將執行PrioritizedExperienceReplay

import random
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F

BATCH_SIZE = 32  # number of transitions in one mini-batch
CAPACITY = 10000  # maximum length of the replay memory and of the TD-error memory


class Brain:
    """The agent's "brain": Double DQN with prioritized experience replay.

    Holds the main and target Q-networks, the replay memory of transitions
    and a parallel memory of TD errors that serve as sampling priorities.
    """

    def __init__(self, num_states, num_actions):
        self.num_actions = num_actions  # number of actions (2 for CartPole: left or right)

        # Memory object that stores the experienced transitions
        self.memory = ReplayMemory(CAPACITY)

        # Build the neural networks (main and target share one architecture)
        n_in, n_mid, n_out = num_states, 32, num_actions
        self.main_q_network = Net(n_in, n_mid, n_out)
        self.target_q_network = Net(n_in, n_mid, n_out)
        print(self.main_q_network)  # print the network layout

        # Optimizer; only the main network is trained directly
        self.optimizer = optim.Adam(
            self.main_q_network.parameters(), lr=0.0001)

        # Memory object that stores the TD errors
        self.td_error_memory = TDerrorMemory(CAPACITY)

    def replay(self, episode):
        """Train the main Q-network on one mini-batch of replayed experience.

        Does nothing until at least BATCH_SIZE transitions are stored.
        """

        # 1. Check the memory size
        if len(self.memory) < BATCH_SIZE:
            return

        # 2. Build the mini-batch
        (self.batch, self.state_batch, self.action_batch, self.reward_batch,
         self.non_final_next_states) = self.make_minibatch(episode)

        # 3. Compute the training target for Q(s_t, a_t)
        self.expected_state_action_values = self.get_expected_state_action_values()

        # 4. Update the network parameters
        self.update_main_q_network()

    def decide_action(self, state, episode):
        """Choose an action for state with epsilon-greedy exploration.

        Epsilon decays as 0.5 / (episode + 1). Returns a LongTensor of
        size 1x1 holding the action index.
        """
        epsilon = 0.5 * (1 / (episode + 1))

        if epsilon <= np.random.uniform(0, 1):
            self.main_q_network.eval()  # switch the network to inference mode
            with torch.no_grad():
                # max(1)[1] is the index of the largest network output;
                # view(1, 1) reshapes it from size 1 to size 1x1
                action = self.main_q_network(state).max(1)[1].view(1, 1)
        else:
            # Random action index, already in the 1x1 LongTensor format
            action = torch.LongTensor(
                [[random.randrange(self.num_actions)]])

        return action

    def make_minibatch(self, episode):
        """2. Build the mini-batch.

        Returns (batch, state_batch, action_batch, reward_batch,
        non_final_next_states). batch is a Transition whose fields are
        tuples of length BATCH_SIZE; the other items are concatenated
        tensors. non_final_next_states has zero rows when every sampled
        transition is terminal.
        """

        # 2.1 Take a mini-batch worth of data from memory
        if episode < 30:
            # Uniform sampling during the first 30 episodes
            transitions = self.memory.sample(BATCH_SIZE)
        else:
            # Afterwards sample according to the stored TD errors
            indexes = self.td_error_memory.get_prioritized_indexes(BATCH_SIZE)
            transitions = [self.memory.memory[n] for n in indexes]

        # 2.2 Turn BATCH_SIZE records of (state, action, state_next, reward)
        # into one record of (states, actions, next_states, rewards)
        batch = Transition(*zip(*transitions))

        # 2.3 Concatenate the per-step tensors along the batch dimension,
        # e.g. BATCH_SIZE state tensors of size 1x4 become one BATCH_SIZEx4
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = self._cat_non_final_next_states(
            batch.next_state, state_batch)

        return batch, state_batch, action_batch, reward_batch, non_final_next_states

    @staticmethod
    def _cat_non_final_next_states(next_states, state_batch):
        """Concatenate the next states that are not None.

        torch.cat rejects an empty list, so when every transition is
        terminal a zero-row slice of state_batch is returned instead.
        """
        non_final = [s for s in next_states if s is not None]
        if not non_final:
            return state_batch[:0]
        return torch.cat(non_final)

    def _next_state_values(self, next_states, non_final_next_states):
        """Return the Double-DQN value of each next state.

        For a non-terminal next state the main network chooses the action
        and the target network evaluates it. Terminal entries (None) keep
        the value 0. The result has one element per entry of next_states
        and carries no gradient.
        """
        next_state_values = torch.zeros(len(next_states))

        # Positions that have a next state. Integer indexes are used rather
        # than a uint8 mask, whose use for indexing is deprecated in PyTorch.
        non_final_idx = [i for i, s in enumerate(next_states)
                         if s is not None]
        if not non_final_idx:
            return next_state_values

        with torch.no_grad():
            # Action a_m with the largest Q value according to the main network
            a_m = self.main_q_network(
                non_final_next_states).max(1)[1].view(-1, 1)
            # Q value of a_m according to the target network; squeeze(1)
            # keeps a 1-D result even when only one row is present
            values = self.target_q_network(
                non_final_next_states).gather(1, a_m).squeeze(1)

        next_state_values[torch.LongTensor(non_final_idx)] = values
        return next_state_values

    def get_expected_state_action_values(self):
        """3. Compute the training target for Q(s_t, a_t).

        Also stores the main network's Q(s_t, a_t) for the batch in
        self.state_action_values (size BATCH_SIZEx1, with gradient).
        """

        # 3.1 Switch the networks to inference mode
        self.main_q_network.eval()
        self.target_q_network.eval()

        # 3.2 Q(s_t, a_t) from the main network: the network outputs one Q
        # value per action, and gather picks the one of the action taken
        self.state_action_values = self.main_q_network(
            self.state_batch).gather(1, self.action_batch)

        # 3.3 Value of the next state, 0 where there is no next state
        next_state_values = self._next_state_values(
            self.batch.next_state, self.non_final_next_states)

        # Target from the Q-learning update rule
        expected_state_action_values = self.reward_batch + GAMMA * next_state_values

        return expected_state_action_values

    def update_main_q_network(self):
        """4. Update the parameters of the main Q-network."""

        # 4.1 Switch the network to training mode
        self.main_q_network.train()

        # 4.2 Loss (smooth_l1_loss is the Huber loss). The target has size
        # [minibatch], so unsqueeze makes it [minibatch x 1]
        loss = F.smooth_l1_loss(self.state_action_values,
                                self.expected_state_action_values.unsqueeze(1))

        # 4.3 Update the parameters
        self.optimizer.zero_grad()  # reset the gradients
        loss.backward()  # backpropagation
        self.optimizer.step()  # apply the update

    def update_target_q_network(self):  # added for DDQN
        """Copy the main Q-network's parameters into the target Q-network."""
        self.target_q_network.load_state_dict(self.main_q_network.state_dict())

    def update_td_error_memory(self):  # added for prioritized experience replay
        """Recompute the TD error of every stored transition.

        The TD-error memory is replaced by a list with one value per entry
        of the replay memory. Does nothing when the replay memory is empty.
        """
        transitions = self.memory.memory
        if not transitions:
            return

        # Switch the networks to inference mode
        self.main_q_network.eval()
        self.target_q_network.eval()

        # Build one batch out of the whole memory
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = self._cat_non_final_next_states(
            batch.next_state, state_batch)

        # Q(s_t, a_t) from the main network. No gradient is needed here, and
        # squeeze(1) turns size [N x 1] into [N]
        with torch.no_grad():
            state_action_values = self.main_q_network(
                state_batch).gather(1, action_batch).squeeze(1)

        next_state_values = self._next_state_values(
            batch.next_state, non_final_next_states)

        # TD error
        td_errors = (reward_batch + GAMMA * next_state_values) - \
            state_action_values

        # Store the result as a plain Python list
        self.td_error_memory.update_td_error(
            td_errors.detach().numpy().tolist())


In [9]:
# 是於CartPole執行的智能體類別、等同立有棒子的推車


class Agent:
    """The agent acting in CartPole (the cart carrying the pole).

    Every method forwards to the Brain instance held in self.brain.
    """

    def __init__(self, num_states, num_actions):
        """Create the Brain for the given numbers of states and actions."""
        brain = Brain(num_states, num_actions)
        self.brain = brain  # the part that decides the agent's actions

    def update_q_function(self, episode):
        """Update the Q function by replaying stored experience."""
        brain = self.brain
        brain.replay(episode)

    def get_action(self, state, episode):
        """Return the action the brain decides on for state."""
        return self.brain.decide_action(state, episode)

    def memorize(self, state, action, state_next, reward):
        """Store (state, action, state_next, reward) in the replay memory."""
        replay_memory = self.brain.memory
        replay_memory.push(state, action, state_next, reward)

    def update_target_q_function(self):
        """Bring the target Q-network in line with the main Q-network."""
        brain = self.brain
        brain.update_target_q_network()

    def memorize_td_error(self, td_error):  # added for prioritized experience replay
        """Store one TD error in the TD-error memory."""
        td_memory = self.brain.td_error_memory
        td_memory.push(td_error)

    def update_td_error_memory(self):  # added for prioritized experience replay
        """Refresh the TD errors held in the TD-error memory."""
        brain = self.brain
        brain.update_td_error_memory()
        

In [10]:
# 這是執行CartPole的環境的類別


class Environment:
    """Creates the CartPole task plus an Agent and runs the training loop."""

    def __init__(self):
        self.env = gym.make(ENV)  # create the task to run
        num_states = self.env.observation_space.shape[0]  # number of state variables of the task
        num_actions = self.env.action_space.n  # number of actions (2 for CartPole: left or right)
        # Create the Agent that acts in this environment
        self.agent = Agent(num_states, num_actions)

    def run(self):
        '''Run the training loop for up to NUM_EPISODES episodes.'''
        episode_10_list = np.zeros(10)  # step counts of the last 10 episodes, used for the printed average
        complete_episodes = 0  # number of consecutive episodes that ended at step index 195 or later
        episode_final = False  # flag marking the final episode
        frames = []  # would hold the images of the final episode for the animation (rendering is commented out below)

        for episode in range(NUM_EPISODES):  # repeat for the maximum number of episodes
            # NOTE(review): the code assumes reset() returns only the observation
            # and step() returns a 4-tuple — confirm against the installed gym version
            observation = self.env.reset()  # initialise the environment

            state = observation  # use the observation directly as state s
            state = torch.from_numpy(state).type(
                torch.FloatTensor)  # convert the NumPy array to a PyTorch tensor
            state = torch.unsqueeze(state, 0)  # size 4 becomes size 1x4

            for step in range(MAX_STEPS):  # loop over one episode

                # The animation capture is commented out
                # if episode_final is True:   # in the final episode, store each step's image in frames
                    # frames.append(self.env.render(mode='rgb_array'))

                action = self.agent.get_action(state, episode)  # decide the action

                # Execute action a_t to obtain s_{t+1} and the done flag;
                # .item() extracts the plain value from the action tensor
                observation_next, _, done, _ = self.env.step(
                    action.item())  # the environment's reward and info are not used, hence _

                # Assign the reward and set state_next depending on whether the episode ended
                if done:  # the episode ended (step limit reached or pole tilted too far, per the original note)
                    state_next = None  # there is no next state, so store None

                    # Append this episode's step count to the last-10 list
                    episode_10_list = np.hstack(
                        (episode_10_list[1:], step + 1))

                    if step < 195:
                        reward = torch.FloatTensor(
                            [-1.0])  # ended early: reward -1 as a penalty
                        complete_episodes = 0  # reset the streak of successes
                    else:
                        reward = torch.FloatTensor([1.0])  # lasted long enough: reward 1
                        complete_episodes = complete_episodes + 1  # extend the streak
                else:
                    reward = torch.FloatTensor([0.0])  # ordinary step: reward 0
                    state_next = observation_next  # use the observation directly as the state
                    state_next = torch.from_numpy(state_next).type(
                        torch.FloatTensor)  # convert the NumPy array to a PyTorch tensor
                    state_next = torch.unsqueeze(
                        state_next, 0)  # size 4 becomes size 1x4

                # Store the experience in the replay memory
                self.agent.memorize(state, action, state_next, reward)

                # Store a TD error in the TD-error memory
                self.agent.memorize_td_error(0)  # a placeholder 0; the real values are computed at the end of the episode

                # Update the Q function with prioritized experience replay
                self.agent.update_q_function(episode)

                # Advance to the next state
                state = state_next

                # End-of-episode handling
                if done:
                    print('%d Episode: Finished after %d steps：10回合的平均step = %.1lf' % (
                        episode, step + 1, episode_10_list.mean()))

                    # Refresh the contents of the TD-error memory
                    self.agent.update_td_error_memory()

                    # Added for DDQN: every 2 episodes copy the main network into the target Q-network
                    if(episode % 2 == 0):
                        self.agent.update_target_q_function()
                    break

            if episode_final is True:
                # The animation output is commented out
                # Save and draw the animation
                #display_frames_as_gif(frames)
                break

            # Ten consecutive successful episodes count as learning completed
            if complete_episodes >= 10:
                print('連續10次成功')
                episode_final = True  # the next episode becomes the final one (the one meant for the animation)


In [11]:
# Main: create the environment and run the training
cartpole_env = Environment()
cartpole_env.run()


Net(
  (fc1): Linear(in_features=4, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=2, bias=True)
)
0 Episode: Finished after 10 steps：10回合的平均step = 1.0
1 Episode: Finished after 14 steps：10回合的平均step = 2.4
2 Episode: Finished after 10 steps：10回合的平均step = 3.4
3 Episode: Finished after 10 steps：10回合的平均step = 4.4
4 Episode: Finished after 8 steps：10回合的平均step = 5.2
5 Episode: Finished after 8 steps：10回合的平均step = 6.0
6 Episode: Finished after 9 steps：10回合的平均step = 6.9
7 Episode: Finished after 12 steps：10回合的平均step = 8.1
8 Episode: Finished after 10 steps：10回合的平均step = 9.1
9 Episode: Finished after 9 steps：10回合的平均step = 10.0
10 Episode: Finished after 9 steps：10回合的平均step = 9.9
11 Episode: Finished after 10 steps：10回合的平均step = 9.5
12 Episode: Finished after 11 steps：10回合的平均step = 9.6
13 Episode: Finished after 9 steps：10回合的平均step = 9.5
14 Episode: Finished after 11 steps：10回合的平均step = 9.8
15 Episode: Finished

144 Episode: Finished after 123 steps：10回合的平均step = 169.9
145 Episode: Finished after 127 steps：10回合的平均step = 162.6
146 Episode: Finished after 200 steps：10回合的平均step = 163.7
147 Episode: Finished after 156 steps：10回合的平均step = 159.3
148 Episode: Finished after 188 steps：10回合的平均step = 162.8
149 Episode: Finished after 161 steps：10回合的平均step = 158.9
150 Episode: Finished after 141 steps：10回合的平均step = 153.0
151 Episode: Finished after 142 steps：10回合的平均step = 153.9
152 Episode: Finished after 171 steps：10回合的平均step = 158.5
153 Episode: Finished after 149 steps：10回合的平均step = 155.8
154 Episode: Finished after 124 steps：10回合的平均step = 155.9
155 Episode: Finished after 158 steps：10回合的平均step = 159.0
156 Episode: Finished after 170 steps：10回合的平均step = 156.0
157 Episode: Finished after 159 steps：10回合的平均step = 156.3
158 Episode: Finished after 130 steps：10回合的平均step = 150.5
159 Episode: Finished after 180 steps：10回合的平均step = 152.4
160 Episode: Finished after 141 steps：10回合的平均step = 152.4
161 Episode: F

KeyboardInterrupt: 