In [1]:
# パッケージのimport
import numpy as np
import random
import math 

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F 

In [2]:
torch.manual_seed(12)
np.random.seed(12)

In [3]:
"""
embedding
position encoding
dropout

###1Hopping###
Layer Normalization
self_attention
dropout
Layer Normalization
FFN
dropout

Layer Normalization
self_attention
dropout
Layer Normalization
FFN
dropout
############

linear
"""

'\nembedding\nposition encoding\ndropout\n\n###1Hopping###\nLayer Normalization\nself_attention\ndropout\nLayer Normalization\nFFN\ndropout\n\nLayer Normalization\nself_attention\ndropout\nLayer Normalization\nFFN\ndropout\n############\n\nlinear\n'

In [4]:
#埋め込み層
class Embedder(nn.Module):
    def __init__(self, text_embedding_vectors):
        super(Embedder, self).__init__()
        
        #更新はしない
        self.embeddings = nn.Embedding.from_pretrained(
            embeddings=text_embedding_vectors, freeze=True)

    def forward(self, x):
        x_vec = self.embeddings(x)

        return x_vec

In [5]:
#PositonalEncoding
class PositionalEncoder(nn.Module):

    def __init__(self, d_model=300, max_seq_len=140):
        super().__init__()

        self.d_model = d_model

        pe = torch.zeros(max_seq_len, d_model)

        # GPUが使える場合はGPUへ送る、ここでは省略。実際に学習時には使用する
        # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # pe = pe.to(device)

        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i + 1] = math.cos(pos /
                                          (10000 ** ((2 * (i + 1))/d_model)))

        self.pe = pe.unsqueeze(0)

        self.pe.requires_grad = False

    def forward(self, x):
        ret = math.sqrt(self.d_model)*x + self.pe
        return ret


In [6]:
class MultiheadAttention(nn.Module):
    def __init__(self, d_model, head_num, dropout_rate):
        super().__init__()
        """
        d_model：出力層の次元(head_bumの倍数)
        head_num：ヘッドの数
        dropout_rate
        """
        self.d_model = d_model
        self.head_num = head_num
        self.dropout_rate = dropout_rate
    
        #特徴量変換
        self.q_linear = nn.Linear(d_model, d_model) 
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        #出力の全結合層
        self.out = nn.Linear(d_model, d_model)
        self.attention_dropout_layer = nn.Dropout(dropout_rate)   
    
    def forward(self, q, k, v, mask):
        #key, query, valueを生成
        q = self.q_linear(q) # [batch_size, max_seq_len, d_model]
        k = self.q_linear(k) 
        v = self.q_linear(v)
        
        #head_numに分割
        q = self._split_head(q) # [batch_size, head_num, max_seq_len, d_model/head_num]
        k = self._split_head(k)
        v = self._split_head(v)
        
        print('q.shape', q.size())
        #queryとkeyの関連度の計算と、Scaled Dot-production
        print('k.transpose(2,3).shape', k.transpose(2, 3).shape)
        weights = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(self.d_model)
        print(weights.shape)
        
        #maskをかける
        mask = mask.unsqueeze(1)
        weights = weights.masked_fill(mask==0, -1e9)# [batch_size, head_num, max_seq_len, max_seq_len]
        
        #AttentionWeightを計算
        attention_weight = F.softmax(weights, dim=-1)# [batch_size, head_num, q_length, k_length]
        print(attention_weight.shape)
        
        #AttentionWeightよりvalueから情報を引き出す
        attention_output = torch.matmul(attention_weight, v)# [batch_size, head_num, q_length, d_model/head_num]
        attention_output = self._combine_head(attention_output)
        output = self.out(attention_output)
        
        
        return output, attention_weight
        
    def _split_head(self, x):
        """
        x.size:[batch_size, length, d_model]
        """
        batch_size, length, d_model = x.size()
        x = x.view(batch_size, length, self.head_num, self.d_model//self.head_num) #reshape
        return x.permute(0, 2, 1, 3)
    
    #outputする前に分割したheadを戻す。
    def _combine_head(self, x):
        """
        x.size:[batch_size, head_num, length, d_model//head_num]
        """
        batch_size, _, length, _  = x.size()
        x = x.permute(0, 2, 1, 3)
        return x.reshape(batch_size, length, self.d_model)

In [7]:
"""
class SelfAttention(MultiheadAttention):
    def forward(self, x, mask):
        return super.forward(q=x, k=x, v=x, mask=mask)
"""

'\nclass SelfAttention(MultiheadAttention):\n    def forward(self, x, mask):\n        return super.forward(q=x, k=x, v=x, mask=mask)\n'

In [8]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=1024, dropout=0.1):
        super().__init__()

        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """
        x size=[batch_size, length, d_model]
        return size=[batch_size, length, d_model]
        """
        x = self.linear_1(x)
        x = self.dropout(F.relu(x))
        x = self.linear_2(x)
        return x

In [9]:
class TransformerBlock(nn.Module):
    def __init__(self, d_model, head_num, dropout=0.1):
        super().__init__()

        # LayerNormalization層
        # https://pytorch.org/docs/stable/nn.html?highlight=layernorm
        self.norm_1 = nn.LayerNorm(d_model)
        self.norm_2 = nn.LayerNorm(d_model)
        
        # Attention層
        self.attn = MultiheadAttention(d_model, head_num, dropout)

        # Attentionのあとの全結合層2つ
        self.ff = FeedForward(d_model)

        # Dropout
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        # SelfAttention
        x_normlized = self.norm_1(x)
        output, normlized_weights = self.attn(
            x_normlized, x_normlized, x_normlized, mask)
        x2 = x + self.dropout_1(output)
        # FFN
        x_normlized2 = self.norm_2(x2)
        output = x2 + self.dropout_2(self.ff(x_normlized2))

        return output, normlized_weights

In [10]:
# 動作確認
def text_to_ids(text_list, vcb):
    result = torch.zeros(140, dtype=torch.long)
    result[0] = vcb['<cls>']
    for i, word in enumerate(text_list):
        if word in vcb:
            result[i+1] = vcb[word]
        else:
            result[i+1] = vcb['<unk>']
    for j in range(i+1, 139):
        result[j+1] = vcb['<pad>']
    return result


import numpy as np
import pickle

x = np.load('omomi.npy')
x = torch.from_numpy(x.astype(np.float32)).clone()
#辞書
itos = pickle.load(open('itos.pkl', 'rb'))
stoi = pickle.load(open('stoi.pkl', 'rb'))


import MeCab

mecab = MeCab.Tagger('-Owakati')
# バッチサイズ分文章とラベルのセットを取り出す
s = 'その棚にある赤いりんごはとてもまずい'
result = [tok for tok in mecab.parse(s).split()]
print(result)
#id列に変換
result = text_to_ids(result, stoi)


# モデル構築
net1 = Embedder(x)
net2 = PositionalEncoder(d_model=300, max_seq_len=140)
net3 = TransformerBlock(d_model=300, head_num=5)

# maskの作成
input_pad = 1  # 単語のIDにおいて、'<pad>': 1 なので
input_mask = (result != input_pad)
print(input_mask)

# 入出力
x1 = net1(result)  # 単語をベクトルに
x2 = net2(x1)  # Positon情報を足し算
x3, normlized_weights = net3(x2, input_mask)  # Self-Attentionで特徴量を変換

print("入力のテンソルサイズ：", x2.shape)
print("出力のテンソルサイズ：", x3.shape)
print("Attentionのサイズ：", normlized_weights.shape)

['その', '棚', 'に', 'ある', '赤い', 'りんご', 'は', 'とても', 'まずい']
tensor([ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        F