# 範例 : Transformer decoder
***
- 實做 Transformer decoder 以更了解　Transformer 
- 應用 Transformer decoder 建立一個簡單的 ptt 貼文回應器 驗證 Transformer decoder 可以運行

# [教學目標]
- 了解如何實作 transformer decoder 和其結構
- 了解如何應用 transformer decoder 並證明 decoder 可以作用


# [範例重點]
- 觀察 TransformerDecoder 的建立
- 觀察 TransformerDecoderLayer 的建立
-- 使用 encoder 相同的 MultiHeadAttentionSubLayer
-- 使用 encoder 相同的 PosFeedForwardSubLayer
- 觀察如何使用 建立的 TransformerDecoder 
-- 使用 TransformerDecoder 做序列生成 SequenceGenerate
-- 如何使用 SequenceGenerate 模型 訓練一個 ptt 回應機

# [範例結構]
- TransformerDecoder 模型和 SequenceGenerate 實作
- ptt 資料準備
- 應用 SequenceGenerate 訓練 ptt answer machine

In [1]:
# import 需要的 packages
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


from sklearn.model_selection import train_test_split
import csv

import numpy as np
import re
import random
import math
import time

import csv
import spacy

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
from torchtext.data import Field, BucketIterator, TabularDataset

In [11]:
# 連接個人資料 讀取 ＰＴＴ 訓練資料和儲存模型
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 實做 TransformerDecoder
- 如果只使用 Transformer decoder，而不和 encoder 一起使用
-- skip_encoder_attn 不需要和 encoder attention
-- enc_hidden　和 enc_mask　不用輸入

In [4]:
## 啟動參數
## hidden_dim 內部 embedding 大小
## feedforward_dim  feedforward 中間層大小
## n_dec_layers 幾層 Transformer Layers
## n_attn_heads 幾個 attention heads 
## dropout dropout 比例
## dec_voca_length  字彙集合大小
## max_pos_length  最大 decode 序列長度
## device 
## skip_encoder_attn 不需要和 encoder attention

## 輸入值
## dec_seq 解碼序列　（句子）
## enc_hidden　編碼的　hidden embedding　(Optional)
## dec_mask 解碼遮罩
## enc_mask　編碼遮罩　(Optional)

class TransformerDecoder(nn.Module):
  """Transformer decoder: token + position embeddings, a stack of decoder layers, and a vocabulary projection.

  Args:
    hidden_dim: size of the embeddings and of every layer's hidden state.
    feedforward_dim: inner size of each layer's position-wise feed-forward sublayer.
    n_dec_layers: number of TransformerDecoderLayer modules to stack.
    n_attn_heads: number of attention heads in each layer.
    dropout: dropout probability.
    dec_voca_length: vocabulary size (rows of the token embedding, width of the output logits).
    max_pos_length: number of learned position embeddings; sequences longer than this cannot be embedded.
    device: device on which the position indices are created.
    skip_encoder_attn: passed to every layer; True builds decoder-only layers without encoder attention.
  """

  def __init__(self, hidden_dim, feedforward_dim, n_dec_layers, n_attn_heads, dropout, dec_voca_length, max_pos_length , device , skip_encoder_attn = False):
    super().__init__()
    self.device = device

    # Token embedding and learned position embedding share the same width.
    self.dec_tok_embedding = nn.Embedding(dec_voca_length, hidden_dim )
    self.dec_pos_embedding = nn.Embedding(max_pos_length, hidden_dim)

    # Stack of n_dec_layers decoder layers.
    self.transformer_decoder_layers = nn.ModuleList([TransformerDecoderLayer(hidden_dim,
                                          feedforward_dim,
                                          n_dec_layers,
                                          n_attn_heads,
                                          dropout,
                                          device, skip_encoder_attn) for _ in range(n_dec_layers)])

    # Output layer: one logit per vocabulary entry.
    self.full_conn_out = nn.Linear(hidden_dim, dec_voca_length)
    self.dropout = nn.Dropout(dropout)
    # NOTE: sqrt(hidden_dim) is stored but forward() never applies it to the
    # embeddings; it is kept so the attribute stays available to existing code.
    self.scale = torch.sqrt(torch.FloatTensor([hidden_dim])).to(device)

  def forward(self, dec_seq, enc_hidden , dec_mask, enc_mask):
    """Decode a batch of token-id sequences into vocabulary logits.

    Args:
      dec_seq: token ids, indexed as [batch size, decode sequence len].
      enc_hidden: encoder hidden states, or None when the layers skip encoder attention.
      dec_mask: mask forwarded to every layer's self-attention.
      enc_mask: mask forwarded to every layer's encoder attention, or None.

    Returns:
      (output, encoder_decoder_attention, decoder_self_attention): the logits from the
      output layer and the attention values returned by the LAST layer. Both attention
      values are None when the stack contains no layers.
    """
    batch_size = dec_seq.shape[0]
    dec_len = dec_seq.shape[1]

    # Position indices 0..dec_len-1 for every sequence, created directly on the target device.
    pos = torch.arange(0, dec_len, device = self.device).unsqueeze(0).repeat(batch_size, 1)

    # Add position embeddings to token embeddings, then apply dropout.
    dec_seq = self.dropout(self.dec_tok_embedding(dec_seq)  + self.dec_pos_embedding(pos))

    # Bind the attention results up front so an empty layer stack still returns cleanly.
    encoder_decoder_attention = None
    decoder_self_attention = None

    for layer in self.transformer_decoder_layers:
      dec_seq, encoder_decoder_attention , decoder_self_attention = layer(dec_seq, enc_hidden, dec_mask, enc_mask)

    # Project every position's hidden state onto the vocabulary.
    output = self.full_conn_out(dec_seq)

    return output, encoder_decoder_attention , decoder_self_attention


# 實做 TransformerDecoderLayer
- 實作 TransformerDecoder 中堆疊多層使用的 TransformerDecoderLayer
- 如果只使用 decoder 則不用 encoder attention, --> skip_encoder_attn = True 

In [5]:
## 啟動參數
## hidden_dim 內部 embedding 大小
## feedforward_dim  feedforward 中間層大小
## n_dec_layers 幾層 Transformer Layers
## n_attn_heads 幾個 attention heads 
## dropout dropout 比例
## device 
## skip_encoder_attn 不需要和 encoder attention

## 輸入值
## dec_seq 解碼序列　（句子）
## enc_hidden　編碼的　hidden embedding　(Optional)
## dec_mask 解碼遮罩
## enc_mask　編碼遮罩　(Optional)

class TransformerDecoderLayer(nn.Module):
  """One decoder layer: self-attention, optional encoder attention, then feed-forward.

  Every sublayer output goes through dropout, is added to its input (residual)
  and is layer-normalised. With skip_encoder_attn=True the encoder-attention
  sublayer is not created and forward() reports None for its attention.
  n_dec_layers and device are accepted for signature compatibility; device is
  forwarded to the attention sublayers.
  """

  def __init__(self, hidden_dim, feedforward_dim, n_dec_layers, n_attn_heads, dropout, device, skip_encoder_attn = False):
    super().__init__()

    self.skip_encoder_attn = skip_encoder_attn

    # Masked self-attention over the decoder sequence.
    self.self_attention_sublayer = MultiHeadAttentionSubLayer(hidden_dim, n_attn_heads, dropout, device)
    self.self_attn_layernorm = nn.LayerNorm(hidden_dim)

    # Attention over encoder states, only when an encoder is in use.
    if not skip_encoder_attn:
      self.encoder_attention_sublayer = MultiHeadAttentionSubLayer(hidden_dim, n_attn_heads, dropout, device)
      self.encoder_attn_layernorm = nn.LayerNorm(hidden_dim)

    # Position-wise feed-forward network.
    self.positionwise_feedforward = PosFeedForwardSubLayer(hidden_dim, feedforward_dim, dropout)
    self.feedforward_layernorm = nn.LayerNorm(hidden_dim)

    self.dropout = nn.Dropout(dropout)

  def forward(self, dec_seq, enc_hidden, dec_mask, enc_mask):
    """Run the layer.

    Args:
      dec_seq: decoder hidden states for this layer.
      enc_hidden: encoder hidden states; unused when encoder attention is skipped.
      dec_mask: mask for the self-attention sublayer.
      enc_mask: mask for the encoder-attention sublayer; unused when skipped.

    Returns:
      (dec_seq, encoder_decoder_attention, decoder_self_attention), where
      encoder_decoder_attention is None if encoder attention is skipped.
    """
    # Self-attention, then Add & Norm.
    self_attn_out, decoder_self_attention = self.self_attention_sublayer(dec_seq, dec_seq, dec_seq, dec_mask)
    dec_seq = self.self_attn_layernorm(dec_seq + self.dropout(self_attn_out))

    if self.skip_encoder_attn:
      encoder_decoder_attention = None
    else:
      # Queries come from the decoder, keys and values from the encoder; then Add & Norm.
      enc_attn_out, encoder_decoder_attention = self.encoder_attention_sublayer(dec_seq, enc_hidden, enc_hidden, enc_mask)
      dec_seq = self.encoder_attn_layernorm(dec_seq + self.dropout(enc_attn_out))

    # Feed-forward, then Add & Norm.
    feedforward_out = self.positionwise_feedforward(dec_seq)
    dec_seq = self.feedforward_layernorm(dec_seq + self.dropout(feedforward_out))

    return dec_seq, encoder_decoder_attention, decoder_self_attention


# 實做 MultiHeadAttentionSubLayer
- 實作 encoder and decoder 同時共用的 MultiHeadAttention SubLayer 


In [6]:
## 啟動參數
## hidden_dim 內部 embedding 大小
## n_attn_heads 幾個 attention heads 
## dropout dropout 比例
## device 

## 輸入值
## query_input, --> Q
## key_input, --> K
## value_input, --> V
## mask 遮罩

class MultiHeadAttentionSubLayer(nn.Module):
  """Multi-head scaled dot-product attention shared by encoder and decoder.

  Args:
    hidden_dim: width of the inputs and outputs; must be divisible by n_attn_heads.
    n_attn_heads: number of attention heads.
    dropout: dropout probability applied to the attention weights.
    device: device on which the scaling constant is stored.
  """

  def __init__(self, hidden_dim, n_attn_heads, dropout, device):
    super().__init__()

    # The hidden width has to split evenly across the heads.
    assert hidden_dim % n_attn_heads ==0

    self.hidden_dim = hidden_dim
    self.n_attn_heads = n_attn_heads
    self.head_dim = hidden_dim // n_attn_heads

    # Projections for queries, keys and values (Wq, Wk, Wv).
    self.full_conn_q = nn.Linear(hidden_dim, hidden_dim)
    self.full_conn_k = nn.Linear(hidden_dim, hidden_dim)
    self.full_conn_v = nn.Linear(hidden_dim, hidden_dim)

    # Final linear map applied to the concatenated heads.
    self.full_conn_o = nn.Linear(hidden_dim, hidden_dim)

    self.dropout = nn.Dropout(dropout)

    # Dividing by sqrt(head_dim) keeps the dot products from growing with the head width.
    self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

  def _to_heads(self, projected, batch_size):
    """Reshape [batch, len, hid dim] into [batch, n heads, len, head dim]."""
    per_head = projected.view(batch_size, -1, self.n_attn_heads, self.head_dim)
    return per_head.transpose(1, 2)

  def forward(self, query_input, key_input, value_input, mask = None):
    """Attend from the queries to the keys/values.

    Args:
      query_input: [batch size, query len, hid dim].
      key_input: [batch size, key len, hid dim].
      value_input: [batch size, value len, hid dim].
      mask: optional tensor broadcastable to the score tensor; positions where
        it equals 0 are filled with -1e10 before the softmax.

    Returns:
      (x, attention): x is [batch size, query len, hid dim]; attention is
      [batch size, n heads, query len, key len].
    """
    batch_size = query_input.shape[0]

    # Project, then split into heads.
    Q = self._to_heads(self.full_conn_q(query_input), batch_size)
    K = self._to_heads(self.full_conn_k(key_input), batch_size)
    V = self._to_heads(self.full_conn_v(value_input), batch_size)

    # Per-head similarity: swap the last two axes of K so Q @ K^T is computed for each head.
    scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

    if mask is not None:
      scores = scores.masked_fill(mask == 0, -1e10)

    attention = torch.softmax(scores, dim = -1)

    # Weighted sum of the values, with dropout on the weights.
    context = torch.matmul(self.dropout(attention), V)

    # Move heads back next to head_dim and merge them into hid dim.
    context = context.transpose(1, 2).contiguous()
    context = context.view(batch_size, -1, self.hidden_dim)

    return self.full_conn_o(context), attention

# 實做 PosFeedForwardSubLayer
- 實作 encoder and decoder 同時共用的 PosFeedForward SubLayer 

In [7]:
class PosFeedForwardSubLayer(nn.Module):
  """Position-wise feed-forward sublayer shared by encoder and decoder.

  Applies Linear(hidden_dim -> ff_dim), ReLU, dropout, then
  Linear(ff_dim -> hidden_dim) to the last axis of the input.
  """

  def __init__(self, hidden_dim, ff_dim, dropout):
    super().__init__()
    self.full_conn_1 = nn.Linear(hidden_dim, ff_dim)
    self.full_conn_2 = nn.Linear(ff_dim,  hidden_dim)

    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Map x of shape [batch size, seq len, hid dim] to a tensor of the same shape."""
    # Expand to the inner width and apply the non-linearity.
    expanded = torch.relu(self.full_conn_1(x))
    # Regularise, then project back to the hidden width.
    return self.full_conn_2(self.dropout(expanded))


# 實做 SequenceGenerate 
- 處理 序列生成工作
- 呼叫 TransformerDecoder
-- 不使用 encoder decoder attention 子層


In [8]:
## 啟動參數
## decoder　Transformer decoder
## dec_pad_idx decoder padding index  
## device 

## 輸入值
## dec_seq 解碼序列（訓練時輸入的句子）
class SequenceGenerate(nn.Module):
  """Sequence generator built on a decoder that is called without encoder inputs.

  Args:
    decoder: callable taking (dec_seq, enc_hidden, dec_mask, enc_mask) and
      returning (output, encoder_decoder_attention, decoder_self_attention).
    dec_pad_idx: vocabulary index of the padding token.
    device: device on which the causal mask is created.
  """

  def __init__(self, decoder, dec_pad_idx, device):
    super().__init__()
    self.decoder = decoder
    self.dec_pad_idx = dec_pad_idx
    self.device = device

  def make_dec_mask(self, dec_seq):
    """Combine the padding mask and the causal mask.

    dec_seq is [batch size, seq len]. The result is a boolean tensor of shape
    [batch size, 1, seq len, seq len] that is True where query position i may
    attend to key position j: j is not padding and j <= i.
    """
    seq_len = dec_seq.shape[1]

    # [batch size, 1, 1, seq len]: True for every non-padding key position.
    not_padding = dec_seq.ne(self.dec_pad_idx)[:, None, None, :]

    # [seq len, seq len]: lower triangle, so a position never sees later ones.
    causal = torch.ones((seq_len, seq_len), device = self.device).tril().bool()

    return not_padding & causal

  def forward(self, dec_seq):
    """Return (output, decoder_self_attention) for token ids of shape [batch size, seq len]."""
    mask = self.make_dec_mask(dec_seq)

    # No encoder is involved: pass None for its hidden states and mask, and
    # discard the encoder-decoder attention the decoder returns.
    output, _unused_enc_attn, decoder_self_attention = self.decoder(dec_seq, None, mask, None)

    return output, decoder_self_attention

# PTT 資料準備

- 我們的資料來源是 https://github.com/zake7749/Gossiping-Chinese-Corpus
- 詳情請看 github


In [12]:
!wget https://raw.githubusercontent.com/zake7749/Gossiping-Chinese-Corpus/master/data/Gossiping-QA-Dataset-2_0.csv

--2021-02-07 09:01:46--  https://raw.githubusercontent.com/zake7749/Gossiping-Chinese-Corpus/master/data/Gossiping-QA-Dataset-2_0.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 61356419 (59M) [text/plain]
Saving to: ‘Gossiping-QA-Dataset-2_0.csv’


2021-02-07 09:01:48 (319 MB/s) - ‘Gossiping-QA-Dataset-2_0.csv’ saved [61356419/61356419]



In [14]:
mv *.csv /content/drive/MyDrive/cupoy/data/

In [16]:
data_dir = '/content/drive/MyDrive/cupoy/data/'
# Load every CSV row into memory. newline='' hands line-ending handling to the
# csv module, which it needs to parse quoted fields containing line breaks.
with open(data_dir + 'Gossiping-QA-Dataset-2_0.csv' , encoding='utf-8', newline='') as fin:
  csvreader = csv.reader(fin)
  ptt_qa_pairs = list(csvreader)

# Show one (question, answer) pair and the total row count as a sanity check.
print ("Sample: " , ptt_qa_pairs[1000][0:2] )
print ("Total records:" , len(ptt_qa_pairs))

Sample:  ['油價又要噴出了??', '政府：中油臺電內部控管不佳；財團：民營化砍肥貓']
Total records: 774115


# do training test split 如果已經分過了 可以跳過這段

In [17]:


print ("Total records after filtering :" , len(ptt_qa_pairs))
# Hold out 10000 pairs for validation. The results are named *_rows so that
# (re-)running this cell never rebinds `train`, which this notebook also uses
# as the name of its training function.
train_rows, val_rows = train_test_split(ptt_qa_pairs, test_size=10000)

print ("training data:{} , develop data: {} ".format(len(train_rows),len(val_rows)))

def write_csv(trn_data, file_path ):
    """Write question/answer pairs to a two-column CSV file.

    Args:
        trn_data: iterable of rows whose first two items are the question and
            the answer strings.
        file_path: destination path; an existing file is overwritten.

    Both columns of every output row hold the same "question|answer" string,
    so the two columns can be tokenized differently downstream.
    """
    with open(file_path ,'w', newline='', encoding='utf-8') as fout:
        writer = csv.writer (fout)
        for itm in trn_data:
            joined = itm[0] + "|" + itm[1]
            writer.writerow ([joined, joined])

file_path = data_dir + 'train.csv'
write_csv(train_rows, file_path )

file_path = data_dir + 'val.csv'
write_csv(val_rows, file_path )

# A test split, if one is created, can be written the same way:
#file_path = data_dir + 'test.csv'
# write_csv(test, file_path )

Total records after filtering : 774115
training data:764115 , develop data: 10000 


# 資料處理

In [18]:
def tokenize_cmn(text):
  """Split a "question|answer" string into character tokens joined by "<sep>".

  Backslashes are removed, the string is split on "|", and the first two parts
  are used as question and answer. Every non-whitespace character becomes one
  token; punctuation is kept. Input without "|" is treated as a question with
  an empty answer.

  Returns:
    question characters + ["<sep>"] + answer characters.
  """
  parts = text.replace("\\","").split("|")
  question = parts[0]
  answer = parts[1] if len(parts) > 1 else ""

  # No character-class filter is applied here: the tokens this function has
  # always produced (see the sample printed after build_vocab) include
  # punctuation such as '?', and only whitespace is dropped.
  return [ch for ch in question if ch.strip()] + ["<sep>"] + [ch for ch in answer if ch.strip()]

def tokenize_trg(text):
  """Build the training target for a "question|answer" string.

  Backslashes are removed and the string is split on "|". Each non-whitespace
  question character, plus the separator position, is replaced by "<pad>";
  the non-whitespace answer characters are kept as tokens. Input without "|"
  is treated as a question with an empty answer.

  Returns:
    One "<pad>" per question character + ["<pad>"] + answer characters.
  """
  parts = text.replace("\\","").split("|")
  question = parts[0]
  answer = parts[1] if len(parts) > 1 else ""

  # Punctuation is kept and only whitespace is dropped, so the result has one
  # entry per non-whitespace character plus one for the separator.
  return ['<pad>' for ch in question if ch.strip()] + ["<pad>"] + [ch for ch in answer if ch.strip()]

# Field for the model input: text is tokenized with tokenize_cmn, wrapped in
# <sos>/<eos>, lowercased, and batched with the batch dimension first.
CMN_FIELD = Field(tokenize = tokenize_cmn, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)

# Field for the training target: same settings, but tokenized with tokenize_trg.
TRG_FIELD = Field(tokenize = tokenize_trg, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)

# Read train.csv / val.csv from data_dir (no header row is skipped). The two
# entries in `fields` name the CSV columns 'qa' and 'trg' and assign each its
# Field (presumably in column order - confirm against the torchtext docs).
train_dataset, dev_dataset = TabularDataset.splits(
    path = data_dir , format = 'csv', skip_header = False,
    train='train.csv', validation='val.csv',
    fields=[
        ('qa', CMN_FIELD),
        ('trg', TRG_FIELD)
    ]
)

# 我們要使用的資料格式
- 建立 vocabulary
- qa: ptt 上蒐集的問題和回答 中間用 “sep”隔開
- trg: 我們的訓練目標只有回答的部分，其他的字元（包括“sep”）我們都以 “pad” 取代 , 計算 loss 的時候系統會忽略 ”pad“ token 註記的目標

In [19]:
# Build the vocabulary from the training set only, with a minimum token frequency of 2.
CMN_FIELD.build_vocab(train_dataset, min_freq = 2)
# The target field reuses the same vocabulary object, so both fields map tokens to the same indices.
TRG_FIELD.vocab = CMN_FIELD.vocab
print ("中文語料的字元表長度: " , len(CMN_FIELD.vocab) )
# Print one validation example to compare the input tokens with the target tokens.
print ("Sample Q and A:", dev_dataset[0].qa)
print ("Sample Target:",  dev_dataset[0].trg  )

中文語料的字元表長度:  6519
Sample Q and A: ['為', '何', '血', '壓', '計', '不', '能', '在', '網', '路', '上', '賣', '?', '<sep>', '哪', '天', '在', '網', '路', '上', '買', '到', '有', '問', '題', '的', '黑', '心', '貨', '就', '不', '要', '出', '來', '開', '記', '者', '會']
Sample Target: ['<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '哪', '天', '在', '網', '路', '上', '買', '到', '有', '問', '題', '的', '黑', '心', '貨', '就', '不', '要', '出', '來', '開', '記', '者', '會']


# 準備 train_iterator and valid_iterator

In [20]:
# Number of examples per batch.
BATCH_SIZE = 128

# Use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Batch iterators for the training and validation sets. Examples are sorted
# within each batch by the token length of their 'qa' field, and the batches
# are created on `device`.
train_iterator, valid_iterator = BucketIterator.splits(
    (train_dataset, dev_dataset), 
     batch_size = BATCH_SIZE,
     sort_within_batch = True,
     sort_key = lambda x : len(x.qa),
     device = device)

# model training and evaluate function
- 注意 我們要輸入的字和目標要shift 一位 
- 也就是輸入 為', '什', '麼', '淘', '寶', '一', '堆', '賣', '家', '能', '國', '內', '免', '運', '?', '<sep>' --> 希望輸出 '有'
- 輸入 為', '什', '麼', '淘', '寶', '一', '堆', '賣', '家', '能', '國', '內', '免', '運', '?', '<sep>', '有' --> 希望輸出 '的'

In [21]:
def train(model, iterator, optimizer, criterion, clip):
    """Train the model for one pass over the iterator and return the mean batch loss.

    Args:
        model: callable returning (logits, attention) for a batch of token ids.
        iterator: sized iterable of batches exposing `.qa` and `.trg` tensors.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking (flattened logits, flattened targets).
        clip: maximum gradient norm passed to clip_grad_norm_.

    The input is `qa` without its last token and the target is `trg` without
    its first token, so position t is trained to predict token t + 1.
    """
    model.train()

    running_loss = 0

    for step, batch in enumerate(iterator):
        inputs, targets = batch.qa, batch.trg

        optimizer.zero_grad()

        # Shifted input: drop the final token.
        logits, _ = model(inputs[:, :-1])

        # Flatten to [batch size * (len - 1), vocab] and [batch size * (len - 1)].
        vocab_size = logits.shape[-1]
        flat_logits = logits.contiguous().view(-1, vocab_size)
        # Shifted target: drop the first token.
        flat_targets = targets[:, 1:].contiguous().view(-1)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()

        # Clip the gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

        # Progress report every 1000 batches.
        if step % 1000 == 0:
            print ("Train Batch:" , step , "Loss:" , loss.item())

    return running_loss / len(iterator)

In [22]:
def evaluate(model, iterator, criterion):
    """Return the mean batch loss over the iterator without updating the model.

    Args:
        model: callable returning (logits, attention) for a batch of token ids.
        iterator: sized iterable of batches exposing `.qa` and `.trg` tensors.
        criterion: loss taking (flattened logits, flattened targets).

    The model is switched to eval mode and gradients are disabled. Input and
    target are shifted by one position exactly as in training.
    """
    model.eval()

    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            inputs, targets = batch.qa, batch.trg

            # Shifted input: drop the final token.
            logits, _ = model(inputs[:, :-1])

            # Flatten to [batch size * (len - 1), vocab] and [batch size * (len - 1)].
            vocab_size = logits.shape[-1]
            flat_logits = logits.contiguous().view(-1, vocab_size)
            # Shifted target: drop the first token.
            flat_targets = targets[:, 1:].contiguous().view(-1)

            running_loss += criterion(flat_logits, flat_targets).item()

    return running_loss / len(iterator)

# 實際建立模型
- 設定重要參數
-- 建立一個 hidden embedding 256，三層decoder layer，八個attention heads
-- position wise feedforward 中間層 512 dropout 0.1 learning rate: 0.0005
-- 最長句長 70
- 如果要保留訓練出來的模型，建議和 vocabulary 一起儲存

In [23]:
# Folder intended for model checkpoints.
# NOTE(review): this path ('My Drive/cupoy/transformer/model/') differs from the
# 'MyDrive/cupoy/data/' folder used by the later save/load cells - confirm which is intended.
model_dir =  '/content/drive/My Drive/cupoy/transformer/model/'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters.
VOC_SIZE = len(CMN_FIELD.vocab)   # vocabulary size: embedding rows and output logits
MAX_SENT_LENGTH = 70              # passed as max_pos_length: size of the position-embedding table
HID_DIM = 256                     # hidden / embedding width
DEC_LAYERS = 3                    # number of decoder layers
DEC_HEADS = 8                     # attention heads per layer
DEC_FF_DIM = 512                  # inner width of the feed-forward sublayer
DEC_DROPOUT = 0.1                 # dropout probability
LEARNING_RATE = 0.0005            # Adam learning rate

# Decoder-only stack: skip_encoder_attn = True builds the layers without encoder attention.
dec = TransformerDecoder(HID_DIM, DEC_FF_DIM,
              DEC_LAYERS, 
              DEC_HEADS,  
              DEC_DROPOUT, 
              VOC_SIZE, MAX_SENT_LENGTH,
              device , skip_encoder_attn = True)

# Index of the padding token; used for the attention mask and ignored by the loss.
CMN_PAD_IDX = CMN_FIELD.vocab.stoi[CMN_FIELD.pad_token]

# Sequence-generation model wrapping the decoder, with its optimizer and loss.
model = SequenceGenerate(dec, CMN_PAD_IDX, device).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index = CMN_PAD_IDX)




In [24]:
def initialize_weights(m):
    """Apply Xavier-uniform initialisation to a module's weight if it has more than one dimension.

    Modules without a `weight` attribute, and modules whose weight is
    one-dimensional or scalar, are left untouched. Intended for `model.apply`.
    """
    if not hasattr(m, 'weight'):
        return
    if m.weight.dim() <= 1:
        return
    nn.init.xavier_uniform_(m.weight.data)
model.apply(initialize_weights);

# 實際訓練
- Ｔ4 大約 四分半一個 epoch
- 訓練十個 epoch 就有一定的成績了
- 如果沒時間訓練 也可以下載我們訓練好的權重

In [25]:
!nvidia-smi

Sun Feb  7 09:04:42 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P0    27W /  70W |    985MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [26]:
# Number of passes over the training data, and the gradient-norm clipping threshold.
N_EPOCHS = 10
CLIP = 1

# Lowest validation loss seen so far; starts above any realistic loss.
best_valid_loss = 9999999

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    # One epoch of training followed by a validation pass.
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    # NOTE(review): both checkpoint saves below are commented out, so this loop
    # only tracks best_valid_loss and never writes 'model-ptt-best.pt'.
    # Re-enable them to keep the trained weights.
    #epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    #torch.save(model.state_dict(), model_dir + 'model-ptt-{}.pt'.format(epoch))
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        #torch.save(model.state_dict(), model_dir + 'model-ptt-best.pt')

    
    # Report wall-clock time and both losses for this epoch.
    print ("Epoch {} training time: {:.2f} sec Training Loss: {:.3f} , Valiation Loss: {:.3f}".format( epoch , end_time - start_time , train_loss , valid_loss))
 



Train Batch: 0 Loss: 8.81271743774414
Train Batch: 1000 Loss: 4.903470039367676
Train Batch: 2000 Loss: 4.646539211273193
Train Batch: 3000 Loss: 4.509260177612305
Train Batch: 4000 Loss: 4.4734907150268555
Train Batch: 5000 Loss: 4.539374828338623
Epoch 0 training time: 267.01 sec Training Loss: 4.690 , Valiation Loss: 4.285
Train Batch: 0 Loss: 4.341724872589111
Train Batch: 1000 Loss: 4.124035835266113
Train Batch: 2000 Loss: 4.3822221755981445
Train Batch: 3000 Loss: 4.215755939483643
Train Batch: 4000 Loss: 4.091890811920166
Train Batch: 5000 Loss: 4.0102620124816895
Epoch 1 training time: 266.83 sec Training Loss: 4.253 , Valiation Loss: 4.143
Train Batch: 0 Loss: 4.1995954513549805
Train Batch: 1000 Loss: 4.152331352233887
Train Batch: 2000 Loss: 4.133296012878418
Train Batch: 3000 Loss: 4.128142356872559
Train Batch: 4000 Loss: 4.153467178344727
Train Batch: 5000 Loss: 4.1189680099487305
Epoch 2 training time: 265.88 sec Training Loss: 4.145 , Valiation Loss: 4.072
Train Batch:

# 如果要保留訓練出來的模型，建議和 vocabulary 一起儲存

In [29]:
# Save the vocabulary next to the data so the token-to-index mapping can be restored later.
model_dir =  '/content/drive/MyDrive/cupoy/data/'
torch.save(CMN_FIELD.vocab, model_dir + 'vocab.pt')

# 讀取訓練最佳結果
-- 如果下載我們的訓練結果 別忘了讀取 vocabulary

In [30]:
# Set the folder first so that everything below loads from the same place,
# even when this cell is run without the vocabulary-saving cell before it.
model_dir =  '/content/drive/MyDrive/cupoy/data/'

# Restore the previously saved vocabulary and share it with the target field.
CMN_FIELD.vocab = torch.load( model_dir + 'vocab.pt')
TRG_FIELD.vocab = CMN_FIELD.vocab

# 'model-ptt-best.pt' must already exist in model_dir (for example downloaded
# weights); torch.load raises FileNotFoundError otherwise.
# map_location lets weights saved on a GPU load on a CPU-only runtime.
model.load_state_dict(torch.load( model_dir + 'model-ptt-best.pt', map_location = device))
#model.load_state_dict(torch.load(model_dir + 'model-8.pt'))
test_loss = evaluate(model, valid_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f}')

FileNotFoundError: ignored

# 使用訓練結果產生回答
- 用模型每一步最佳猜測產生回答

In [None]:

def simple_answer_ptt_question(sentence, qa_field, model, device, max_len = 50):
    """Greedily generate an answer to a question, one character at a time.

    Args:
        sentence: the question, as a string (or a sequence of single tokens).
        qa_field: field providing `init_token`, `eos_token`, `vocab.stoi` and `vocab.itos`.
        model: callable returning (logits, decoder_self_attention) for a
            [1, len] tensor of token ids.
        device: device on which the input tensor is created.
        max_len: maximum number of tokens to generate.

    Returns:
        (answer, decoder_self_attention): the generated tokens joined into a
        string (without the end-of-sequence token) and the attention returned
        by the last model call, or None if no call was made.
    """
    model.eval()

    # Match the training tokenizer: one lowercase token per character, with
    # whitespace and backslashes dropped.
    tokens = [token.lower() for token in sentence if token.strip() and token != "\\"]

    tokens = [qa_field.init_token] + tokens + ["<sep>"]

    qa_indexes = [qa_field.vocab.stoi[token] for token in tokens]

    # Everything appended after the prompt is the answer.
    answer_start = len(qa_indexes)
    eos_index = qa_field.vocab.stoi[qa_field.eos_token]

    # Bound up front so that max_len <= 0 still returns cleanly.
    decoder_self_attention = None

    for _ in range(max_len):
        qa_tensor = torch.LongTensor(qa_indexes).unsqueeze(0).to(device)
        with torch.no_grad():
            dec_qa, decoder_self_attention  = model(qa_tensor)

        # Greedy choice: most likely token at the last position.
        pred_token = dec_qa.argmax(2)[:,-1].item()

        # Stop at end-of-sequence without storing it, so the final real token
        # is kept even when generation ends by reaching max_len.
        if pred_token == eos_index:
            break
        qa_indexes.append(pred_token)

    answer = "".join(qa_field.vocab.itos[i] for i in qa_indexes[answer_start:])

    return answer,  decoder_self_attention

# Fun Time
-- 自己上 ptt 找新的標題來玩吧

In [None]:
# Each assignment overwrites the previous one, so only the last question is
# asked; reorder or comment out lines to try a different one.
question = "日月光 是找老婆的好地方嗎"
question = '長這麼大，做過最壞的事是什麼？'
question = '看到前女友生小孩是什麼感覺'
question = '把中國人惹翻了 會怎麼樣嗎？'
question = '泰國人民為何不推翻王室?'
# The generation helper defined in this notebook is simple_answer_ptt_question.
qa_result, _ = simple_answer_ptt_question(question, CMN_FIELD, model, device, max_len = 50)

print (qa_result)