In [1]:
%matplotlib inline

In [2]:
from __future__ import unicode_literals, print_function, division
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torchtext import data

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def tokenize(text):
    return [token for token in list(text)]

TEXT = data.Field(tokenize=tokenize,init_token = '<sos>' ,eos_token = '<eos>', pad_token='<pad>')
fields = [('src',TEXT),('trg',TEXT)]

train_data = data.TabularDataset.splits(path="./data/text/mai2000a_src_trg.tsv", format='tsv', fields=[('source',TEXT),('target',TEXT)])

In [4]:
from sklearn.model_selection import train_test_split
def make_target(src):
    #trg = []
    #for line in src:
        #trg.append(list(line)[1:])
    #return trg_words
    return [line[1:] for line in src]

input_data_path = "./data/text/mai2000a_token.txt"
with  open(input_data_path, 'r', encoding='utf-8') as f:
    input_lines = f.read().split('\n')  # 行ごとのリストに
train_src, val_src = train_test_split(input_lines, train_size=0.9)
val_src, test_src = train_test_split(val_src, train_size=0.5)

train_trg = make_target(train_src)
val_trg = make_target(val_src)
test_trg = make_target(test_src)
print(val_src[0])
print(val_trg[0])

ジャズのアート化、コンサート化を推進したと近年かなり激しい批判が飛び交ったが、それが白人アーティストからだけ出ている点に、私は一抹の寂しさを覚える。
ャズのアート化、コンサート化を推進したと近年かなり激しい批判が飛び交ったが、それが白人アーティストからだけ出ている点に、私は一抹の寂しさを覚える。


In [5]:
def dataset_from_list(src, trg, field):
    fields = [('src', field), ('trg', field)]
    examples =[]
    for src_line, trg_line in zip(src, trg):
        examples.append(data.Example.fromlist([src_line,trg_line], fields))
    return data.Dataset(examples, fields)

train_data = dataset_from_list(train_src, train_trg, TEXT)
val_data = dataset_from_list(val_src, val_trg, TEXT)
test_data = dataset_from_list(test_src, test_trg, TEXT)

In [6]:
# 辞書作成
TEXT.build_vocab(train_data)

In [7]:
batch_size = 20
val_batch_size = 100
batch_sizes = [batch_size, val_batch_size, val_batch_size]
train_iter, val_iter, test_iter = data.BucketIterator.splits((train_data, val_data, test_data), batch_sizes=batch_sizes, sort_key=lambda x: len(x.src))

In [8]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout = 0.1, max_len = 5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p = dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype = torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float()*(- math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[: x.size(0), :]
        return self.dropout(x)

In [9]:
class Embedder(nn.Module):
    def __init__(self, n_tokens, emb_size):
        super(Embedder, self).__init__()
        self.embedding = nn.Embedding(n_tokens, emb_size)
        self.init_weights()
    def init_weights(self):
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
    def forward(self, src):
        output = self.embedding(src)
        return output

In [10]:
class PosisionalEncoder(nn.Module):
    def __init__(self, n_input, dropout = 0.5):
        super(PosisionalEncoder, self).__init__()
        self.pos_encoder = PositionalEncoding(n_input, dropout)
        self.n_input = n_input
    def forward(self, src):
        src = src * math.sqrt(self.n_input)
        output = self.pos_encoder(src)
        return output

In [11]:
class TransformerBlock(nn.Module):
    def __init__(self, n_input, n_head, n_hidden, n_layers, dropout = 0.5):
        super(TransformerBlock, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        encoder_layers = TransformerEncoderLayer(n_input, n_head, n_hidden, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, n_layers)
    
    def _generate_square_subsequent_mask (self, sz):
        mask = (torch.triu(torch.ones (sz, sz )) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
    
    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask
        output = self.transformer_encoder(src, self.src_mask)
        return output

In [23]:
class LinearHead(nn.Module):
    def __init__(self, emb_size, n_tokens, dropout = 0.5):
        super(LinearHead, self).__init__()
        self.linear = nn.Linear(emb_size, n_tokens)
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, src):
        output = self.linear(src)
        output = self.dropout(output)
        output = self.softmax(output)
        return output

In [29]:
class TransformerModel(nn.Module):
    def __init__(self, n_token, emb_size, n_head, n_hidden, n_layers, dropout = 0.5):
        super(TransformerModel, self).__init__()
        self.embedding = Embedder(n_tokens, emb_size)
        self.pos_encoder = PosisionalEncoder(emb_size, dropout)
        self.transformer = TransformerBlock(emb_size, n_head, n_hidden, n_layers, dropout)
        self.linear = LinearHead(emb_size, n_tokens, dropout)

    def forward(self, src):
        output = self.embedding(src)
        output = self.pos_encoder(output)
        output = self.transformer(output)
        output = self.linear(output)
        return output

In [30]:
n_tokens = len(TEXT.vocab.stoi) # the size of vocabulary
emb_size = 200 # embedding dimension
n_hidden = 256 # the dimension of the feedforward network model in nn.TransformerEncoder
n_layers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
n_head = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value

In [31]:
def get_batch(batch):
    src = batch.src[:-1,]
    trg = batch.trg
    return src, trg

In [32]:
emb = Embedder(n_tokens, emb_size).to(device)
pos = PosisionalEncoder(emb_size, dropout).to(device)
trf = TransformerBlock(emb_size, n_head, n_hidden, n_layers, dropout).to(device)
lnh = LinearHead(emb_size, n_tokens, dropout).to(device)

batch = next(iter(train_iter))
src, trg = get_batch(batch)
src= src.to(device)
trg = trg.to(device)
print("バッチのテンソルサイズ：src:{}|trg:{}".format(src.size(),trg.size()))
output = emb(src)
print("入力のテンソルサイズ：", output.shape)
output = pos(output)
output = trf(output)
#output = lnh(output)
print("出力のテンソルサイズ：", output.shape)

バッチのテンソルサイズ：src:torch.Size([208, 20])|trg:torch.Size([208, 20])
入力のテンソルサイズ： torch.Size([208, 20, 200])
出力のテンソルサイズ： torch.Size([208, 20, 200])


In [33]:
model = TransformerModel(n_tokens, emb_size, n_head, n_hidden, n_layers, dropout).to(device)

In [34]:
criterion = nn.NLLLoss()
import time

def train_batch(model, iterator, criterion):
    model.train()
    
    lr = 5.0 # learning rate
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    for batch in iterator:
        src, trg = get_batch(batch)
        src = src.to(device)
        trg = trg.to(device)
        optimizer.zero_grad()
        output = model(src)
        loss = criterion(output, trg)  # 損失を計算
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        total_loss += loss.item()  # lossの合計を更新
    return total_loss / len(train_data)

def train(model, iterator, optimizer):
    model.train()
    lr = 5.0 # learning rate
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    for batch in iterator:
        src, trg = get_batch(batch)
        for i in range(batch_size):
            src = src[:,i]
            trg = trg[:,i]
            src = src.to(device)
            trg = trg.to(device)
            optimizer.zero_grad()
            output = model(src)
            loss = criterion(output, trg)  # 損失を計算
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()
            total_loss += loss.item()  # lossの合計を更新
            
    return total_loss / len(train_data)

def evaluate_batch(model, iterator, criterion):
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            for i in range(batch_size):
                src, trg = get_batch(batch, i)
                output = model(src)
                loss = criterion(output, trg).item()
    return loss / len(val_data)

def evaluate(model, iterator, criterion):
    model.eval()
    loss = 0.0
    with torch.no_grad():
        for batch in iterator:
            src, trg = get_batch(batch)
            for i in range(batch_size):
                src = src[:,i]
                trg = trg[:,i]
                src = src.to(device)
                trg = trg.to(device)
                output = model(src)
                loss += criterion(output, trg).item()
    return loss / len(val_data)

def train_iterator(model, num_epochs):
    print("---start---")
    torch.backends.cudnn.benchmark = True
    best_val_loss = float("inf")
    best_model = None
    
    for epoch in range(num_epochs):
        epoch_start_time = time.time()
        train_loss = train(model, train_iter, criterion)
        val_loss = evaluate(model, val_iter, criterion)
        print('-' * 89)
        print('end epoch:{:3d}|time:{:5.2f}s'.format(epoch, (time.time - epoch_start_time)))
        print('train loss:{:5.2f} | valid loss:{:5.2f}'.format(train_loss, val_loss))
        print('-' * 89)
        if epoch_loss < best_val_loss:
            best_val_loss = epoch_loss
            best_model = model
        start_time = time.time()
    return best_model

In [35]:
train_iterator(model, 3)


test_loss = evaluate(best_model, test_iter, criterion)
print('=' * 89)
print('| End of training | test loss {:5.2f} '.format(test_loss, math.exp(test_loss)))
print('=' * 89)


---start---


ValueError: Expected target size (149, 4118), got torch.Size([149])

In [None]:
def test(model, sentence):
    outputs = model(sentence)
    return outputs

def testRandomly(model,n=10):
    with torch.no_grad():
        for i in range(n):
            text = random.choice(test_src)
            print('>'+text)
            input_tokens = list(test_src)
            input_tokens.insert(0,'<sos>')
            input_tokens.append('<eos>')
            outputs = test(model, input_tokens)
            for j, output in enumerate(outputs):
                if len(input_tokens) <= j: 
                    break
                topv_list, topi_list = output[0].topk(5)
                print(input_tokens[j] + ":")
                output_tokens = []
                for index in topi_list:
                    output_tokens.append(data.itos[index.item()])
            print(output_tokens)

In [None]:
torch.save(best_model.state_dict(), "./weights/transformer_"+str(epochs)+".h5")