In [1]:
%matplotlib inline

In [2]:
from __future__ import unicode_literals, print_function, division
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torchtext import data
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def tokenize(text):
    return [token for token in list(text)]

TEXT = data.Field(tokenize=tokenize,init_token = '<sos>' ,eos_token = '<eos>', pad_token='<pad>')
fields = [('src',TEXT),('trg',TEXT)]

train_data = data.TabularDataset.splits(path="./data/text/mai2000a_src_trg.tsv", format='tsv', fields=[('source',TEXT),('target',TEXT)])

In [4]:
from sklearn.model_selection import train_test_split
def make_target(src):
    #trg = []
    #for line in src:
        #trg.append(list(line)[1:])
    #return trg_words
    return [line[1:] for line in src]

input_data_path = "./data/text/mai2000a_token.txt"
with  open(input_data_path, 'r', encoding='utf-8') as f:
    input_lines = f.read().split('\n')  # 行ごとのリストに
train_src, val_src = train_test_split(input_lines, train_size=0.9)
val_src, test_src = train_test_split(val_src, train_size=0.5)

train_trg = make_target(train_src)
val_trg = make_target(val_src)
test_trg = make_target(test_src)
print(val_src[0])
print(val_trg[0])

刑事免責は、ロッキード事件で、米国関係者の嘱託尋問調書を有罪立証の証拠として提出したケースがあるが、不起訴を確約した上で作成されたため、最高裁大法廷は９５年２月、丸紅ルートの上告審判決で「刑事免責を与えた供述の証拠採用は認められない」との司法判断を示している。
事免責は、ロッキード事件で、米国関係者の嘱託尋問調書を有罪立証の証拠として提出したケースがあるが、不起訴を確約した上で作成されたため、最高裁大法廷は９５年２月、丸紅ルートの上告審判決で「刑事免責を与えた供述の証拠採用は認められない」との司法判断を示している。


In [5]:
def dataset_from_list(src, trg, field):
    fields = [('src', field), ('trg', field)]
    examples =[]
    for src_line, trg_line in zip(src, trg):
        examples.append(data.Example.fromlist([src_line,trg_line], fields))
    return data.Dataset(examples, fields)

train_data = dataset_from_list(train_src, train_trg, TEXT)
val_data = dataset_from_list(val_src, val_trg, TEXT)
test_data = dataset_from_list(test_src, test_trg, TEXT)

In [6]:
# 辞書作成
TEXT.build_vocab(train_data)

In [11]:
batch_size = 20
val_batch_size = 100
batch_sizes = [batch_size, val_batch_size, val_batch_size]
train_iter, val_iter, test_iter = data.BucketIterator.splits((train_data, val_data, test_data), batch_sizes=batch_sizes, sort_key=lambda x: len(x.src))

In [8]:
class Embedder(nn.Module):
    '''idで示されている単語をベクトルに変換します'''
    def __init__(self, n_tokens, emb_size):
        super(Embedder, self).__init__()
        self.embeddings = nn.Embedding(n_tokens, emb_size)
        # freeze=Trueによりバックプロパゲーションで更新されず変化しなくなります
    def forward(self, x):
        x_vec = self.embeddings(x)
        return x_vec

In [9]:
emb_size = 300
n_tokens = len(TEXT.vocab.stoi)

In [12]:
batch = next(iter(train_iter))
net1 = Embedder(n_tokens, emb_size)
x = batch.src.transpose(0,1)
x1 = net1(x)  # 単語をベクトルに
print("入力のテンソルサイズ：", x.shape)
print("出力のテンソルサイズ：", x1.shape)

入力のテンソルサイズ： torch.Size([20, 182])
出力のテンソルサイズ： torch.Size([20, 182, 300])


In [13]:
class PositionalEncoder(nn.Module):
    '''入力された単語の位置を示すベクトル情報を付加する'''

    def __init__(self, d_model=300):
        super().__init__()
        self.d_model = d_model  # 単語ベクトルの次元数

        # 単語の順番（pos）と埋め込みベクトルの次元の位置（i）によって一意に定まる値の表をpeとして作成

    def forward(self, x):
        seq_len = x.size(1)
        pe = torch.zeros(seq_len, self.d_model)

        # GPUが使える場合はGPUへ送る、ここでは省略。実際に学習時には使用する
        # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # pe = pe.to(device)

        for pos in range(seq_len):
            for i in range(0, self.d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/self.d_model)))
                pe[pos, i + 1] = math.cos(pos /
                                          (10000 ** ((2 * (i + 1))/self.d_model)))

        # 表peの先頭に、ミニバッチ次元となる次元を足す
        pe = pe.unsqueeze(0)

        # 勾配を計算しないようにする
        pe.requires_grad = False
        # 入力xとPositonal Encodingを足し算する
        # xがpeよりも小さいので、大きくする
        ret = math.sqrt(self.d_model)*x + pe
        return ret

In [14]:
# 動作確認

# モデル構築
net2 = PositionalEncoder(d_model=emb_size)

# 入出力
x = batch.src.transpose(0,1)
x1 = net1(x)  # 単語をベクトルに
x2 = net2(x1)

print("入力のテンソルサイズ：", x1.shape)
print("出力のテンソルサイズ：", x2.shape)

入力のテンソルサイズ： torch.Size([20, 182, 300])
出力のテンソルサイズ： torch.Size([20, 182, 300])


In [15]:
class Attention(nn.Module):
    '''Transformerは本当はマルチヘッドAttentionですが、
    分かりやすさを優先しシングルAttentionで実装します'''

    def __init__(self, d_model=300):
        super().__init__()

        # SAGANでは1dConvを使用したが、今回は全結合層で特徴量を変換する
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)

        # 出力時に使用する全結合層
        self.out = nn.Linear(d_model, d_model)

        # Attentionの大きさ調整の変数
        self.d_k = d_model

    def forward(self, q, k, v, mask):
        # 全結合層で特徴量を変換
        k = self.k_linear(k)
        q = self.q_linear(q)
        v = self.v_linear(v)

        # Attentionの値を計算する
        # 各値を足し算すると大きくなりすぎるので、root(d_k)で割って調整
        weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.d_k)

        # ここでmaskを計算
        mask = mask.unsqueeze(1)
        weights = weights.masked_fill(mask == 0, -1e9)

        # softmaxで規格化をする
        normlized_weights = F.softmax(weights, dim=-1)

        # AttentionをValueとかけ算
        output = torch.matmul(normlized_weights, v)

        # 全結合層で特徴量を変換
        output = self.out(output)

        return output, normlized_weights

In [16]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=1024, dropout=0.1):
        '''Attention層から出力を単純に全結合層2つで特徴量を変換するだけのユニットです'''
        super().__init__()

        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = self.linear_1(x)
        x = self.dropout(F.relu(x))
        x = self.linear_2(x)
        return x

In [17]:
class TransformerBlock(nn.Module):
    def __init__(self, d_model, dropout=0.1):
        super().__init__()

        # LayerNormalization層
        # https://pytorch.org/docs/stable/nn.html?highlight=layernorm
        self.norm_1 = nn.LayerNorm(d_model)
        self.norm_2 = nn.LayerNorm(d_model)

        # Attention層
        self.attn = Attention(d_model)

        # Attentionのあとの全結合層2つ
        self.ff = FeedForward(d_model)

        # Dropout
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        # 正規化とAttention
        x_normlized = self.norm_1(x)
        output, normlized_weights = self.attn(
            x_normlized, x_normlized, x_normlized, mask)
        
        x2 = x + self.dropout_1(output)

        # 正規化と全結合層
        x_normlized2 = self.norm_2(x2)
        output = x2 + self.dropout_2(self.ff(x_normlized2))

        return output, normlized_weights


In [18]:
# 動作確認

# モデル構築
net3 = TransformerBlock(d_model=300)

# maskの作成
input_pad = 1  # 単語のIDにおいて、'<pad>': 1 なので
input_mask = (x != input_pad)
print(input_mask[0])

# 入出力
x1 = net1(x)  # 単語をベクトルに
x2 = net2(x1)  # Positon情報を足し算
x3, normlized_weights = net3(x2, input_mask)  # Self-Attentionで特徴量を変換

print("入力のテンソルサイズ：", x2.shape)
print("出力のテンソルサイズ：", x3.shape)
print("Attentionのサイズ：", normlized_weights.shape)

tensor([ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, 

In [27]:
class LinearHead(nn.Module):
    '''Transformer_Blockの出力を使用し、最後にクラス分類させる'''

    def __init__(self, d_model, output_dim):
        super().__init__()

        # 全結合層
        self.linear = nn.Linear(d_model, output_dim)  # output_dimはポジ・ネガの2つ

        # 重み初期化処理
        nn.init.normal_(self.linear.weight, std=0.02)
        nn.init.normal_(self.linear.bias, 0)

    def forward(self, x):
        # batch length embedding
        output = torch.zeros(x.size(0), x.size(1))
        for i in range(x.size(1)):
            x0 = x[:, i, :]  # 各ミニバッチの各文の先頭の単語の特徴量（300次元）を取り出す
            output[:,:,] = self.linear(x0)
        #　ここをなんとかする 
        return output

In [28]:
# 最終的なTransformerモデルのクラス
class TransformerModel(nn.Module):
    '''Transformerでクラス分類させる'''

    def __init__(self, n_tokens, d_model=300):
        super().__init__()

        # モデル構築
        self.net1 = Embedder(n_tokens, d_model)
        self.net2 = PositionalEncoder(d_model=d_model)
        self.net3_1 = TransformerBlock(d_model=d_model)
        self.net3_2 = TransformerBlock(d_model=d_model)
        self.net4 = LinearHead(output_dim=n_tokens, d_model=d_model)

    def forward(self, x, mask):
        x1 = self.net1(x)  # 単語をベクトルに
        x2 = self.net2(x1)  # Positon情報を足し算
        x3_1, normlized_weights_1 = self.net3_1(
            x2, mask)  # Self-Attentionで特徴量を変換
        x3_2, normlized_weights_2 = self.net3_2(
            x3_1, mask)  # Self-Attentionで特徴量を変換
        x4 = self.net4(x3_2)  # 最終出力
        return x4, normlized_weights_1, normlized_weights_2

In [29]:
n_tokens = len(TEXT.vocab.itos)
emb_size = 300
model = TransformerModel(n_tokens, emb_size)

In [30]:
input_mask = (x != input_pad)

x = batch.src.transpose(0,1)

outputs, normlized_weights_1, normlized_weights_2 = model(x, input_mask)


TypeError: add(): argument 'other' (position 1) must be Tensor, not list

In [34]:
criterion = nn.NLLLoss()
import time

def train_batch(model, iterator, criterion):
    model.train()
    
    lr = 5.0 # learning rate
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    for batch in iterator:
        src, trg = get_batch(batch)
        src = src.to(device)
        trg = trg.to(device)
        optimizer.zero_grad()
        output = model(src)
        loss = criterion(output, trg)  # 損失を計算
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        total_loss += loss.item()  # lossの合計を更新
    return total_loss / len(train_data)

def train(model, iterator, optimizer):
    model.train()
    lr = 5.0 # learning rate
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    for batch in iterator:
        src, trg = get_batch(batch)
        for i in range(batch_size):
            src = src[:,i]
            trg = trg[:,i]
            src = src.to(device)
            trg = trg.to(device)
            optimizer.zero_grad()
            output = model(src)
            loss = criterion(output, trg)  # 損失を計算
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()
            total_loss += loss.item()  # lossの合計を更新
            
    return total_loss / len(train_data)

def evaluate_batch(model, iterator, criterion):
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            for i in range(batch_size):
                src, trg = get_batch(batch, i)
                output = model(src)
                loss = criterion(output, trg).item()
    return loss / len(val_data)

def evaluate(model, iterator, criterion):
    model.eval()
    loss = 0.0
    with torch.no_grad():
        for batch in iterator:
            src, trg = get_batch(batch)
            for i in range(batch_size):
                src = src[:,i]
                trg = trg[:,i]
                src = src.to(device)
                trg = trg.to(device)
                output = model(src)
                loss += criterion(output, trg).item()
    return loss / len(val_data)

def train_iterator(model, num_epochs):
    print("---start---")
    torch.backends.cudnn.benchmark = True
    best_val_loss = float("inf")
    best_model = None
    
    for epoch in range(num_epochs):
        epoch_start_time = time.time()
        train_loss = train(model, train_iter, criterion)
        val_loss = evaluate(model, val_iter, criterion)
        print('-' * 89)
        print('end epoch:{:3d}|time:{:5.2f}s'.format(epoch, (time.time - epoch_start_time)))
        print('train loss:{:5.2f} | valid loss:{:5.2f}'.format(train_loss, val_loss))
        print('-' * 89)
        if epoch_loss < best_val_loss:
            best_val_loss = epoch_loss
            best_model = model
        start_time = time.time()
    return best_model

In [31]:
def get_batch(batch):
    src = batch.src[:-1,]
    trg = batch.trg
    return src, trg

In [35]:
train_iterator(model, 3)


test_loss = evaluate(best_model, test_iter, criterion)
print('=' * 89)
print('| End of training | test loss {:5.2f} '.format(test_loss, math.exp(test_loss)))
print('=' * 89)


---start---


ValueError: Expected target size (149, 4118), got torch.Size([149])

In [None]:
def test(model, sentence):
    outputs = model(sentence)
    return outputs

def testRandomly(model,n=10):
    with torch.no_grad():
        for i in range(n):
            text = random.choice(test_src)
            print('>'+text)
            input_tokens = list(test_src)
            input_tokens.insert(0,'<sos>')
            input_tokens.append('<eos>')
            outputs = test(model, input_tokens)
            for j, output in enumerate(outputs):
                if len(input_tokens) <= j: 
                    break
                topv_list, topi_list = output[0].topk(5)
                print(input_tokens[j] + ":")
                output_tokens = []
                for index in topi_list:
                    output_tokens.append(data.itos[index.item()])
            print(output_tokens)

In [None]:
torch.save(best_model.state_dict(), "./weights/transformer_"+str(epochs)+".h5")