In [None]:
from WikiIterator import wiki_sentences

In [None]:
# Prepare the data: 24k / 3k / 3k sentences for training / validation / test.
train_size = 24000
valid_size = 3000
test_size = 3000

# One generator is shared by all three splits, so every split continues where
# the previous one stopped and the splits never overlap.
gen = wiki_sentences()
for filename, size in zip(
        ['sp/wiki_train.txt', 'sp/wiki_valid.txt', 'sp/wiki_test.txt'],
        [train_size, valid_size, test_size]):

    # Stream straight into the file instead of growing one huge string, and
    # open the file up front so that a split is still written (and the
    # shortfall reported) when the generator runs out before `size` sentences.
    written = 0
    with open(filename, 'w', encoding='utf-8') as f:
        for text in gen:
            f.write(text + '\n')
            written += 1
            if written >= size:
                break

    if written < size:
        print('warning: {} received only {} of {} requested sentences'.format(
            filename, written, size))

In [None]:
# 下記コマンドを走らせてSentencePieceのモデルを用意しておく
# spm_train --input=sp/wiki_train.txt --model_prefix=sp/sp --vocab_size=8000

In [None]:
# Load the trained SentencePiece model.
# Expects sp/sp.model to exist already; the spm_train command quoted in the
# previous cell uses --model_prefix=sp/sp.
import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load('sp/sp.model')

In [None]:
# Peek at the head of the corpus: print the first five sentences as
# space-separated SentencePiece pieces.
for i, text in enumerate(wiki_sentences(), start=1):
    pieces = sp.EncodeAsPieces(text)
    print(' '.join(pieces))
    if i >= 5:
        break

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable

import numpy as np
import pandas as pd
import glob
import os
import time
import math

# True when CUDA is available; checked before moving batches and the model to the GPU.
use_gpu = torch.cuda.is_available()

In [None]:
# Reads the train / valid / test sentence files under sp/ and encodes them
# with the SentencePiece model.
class Corpus:
    """Token-id view of the three dataset splits.

    Attributes:
        sp: SentencePieceProcessor loaded from 'sp/sp.model'.
        train, valid, test: 1-D int64 tensors holding the piece ids of every
            line of the corresponding file, concatenated in file order.
    """

    def __init__(self):
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load('sp/sp.model')

        self.train = self.assign_ids(self.load_from_file('sp/wiki_train.txt'))
        self.valid = self.assign_ids(self.load_from_file('sp/wiki_valid.txt'))
        self.test = self.assign_ids(self.load_from_file('sp/wiki_test.txt'))

    def load_from_file(self, filename):
        """Return the UTF-8 file's content split on '\\n'.

        A file ending with a newline therefore yields a trailing empty string.
        """
        with open(filename, 'r', encoding='utf-8') as handle:
            contents = handle.read()
        return contents.split('\n')

    def assign_ids(self, texts):
        """Encode every text and return all ids as one flat int64 tensor."""
        flat_ids = [
            piece_id
            for line in texts
            for piece_id in self.sp.EncodeAsIds(line)
        ]
        return torch.from_numpy(np.array(flat_ids, dtype=np.int64))

In [None]:
# Definition of the RNN language model.
class RNNModel(nn.Module):
    """Language model made of an embedding, a recurrent stack and a linear decoder.

    Args:
        rnn_type: 'LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU'.
        ntoken: vocabulary size (embedding rows and decoder outputs).
        ninp: embedding width.
        nhid: hidden units per recurrent layer.
        nlayers: number of recurrent layers.
        dropout: probability used for the embedding/output dropout and passed
            on to the recurrent module.
        tie_weights: share the decoder weight with the embedding weight;
            requires nhid == ninp.

    Raises:
        ValueError: for an unknown rnn_type, or when tie_weights is set and
            nhid != ninp.
    """

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super().__init__()
        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

        # Sub-modules are created in a fixed order: embedding, recurrent
        # stack, decoder (their default initialisation draws random numbers).
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = self._build_rnn(rnn_type, ninp, nhid, nlayers, dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Optional weight tying, see
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

    @staticmethod
    def _build_rnn(rnn_type, ninp, nhid, nlayers, dropout):
        """Instantiate the recurrent module selected by rnn_type."""
        if rnn_type in ['LSTM', 'GRU']:
            rnn_cls = getattr(nn, rnn_type)
            return rnn_cls(ninp, nhid, nlayers, dropout=dropout)

        nonlinearities = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}
        if rnn_type not in nonlinearities:
            raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
        return nn.RNN(ninp, nhid, nlayers,
                      nonlinearity=nonlinearities[rnn_type], dropout=dropout)

    def init_weights(self):
        """Uniform(-0.1, 0.1) for embedding and decoder weights, zero decoder bias."""
        bound = 0.1
        self.encoder.weight.data.uniform_(-bound, bound)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-bound, bound)

    def forward(self, input, hidden):
        """Return (logits, new_hidden).

        The logits keep the first two dimensions of the recurrent output and
        have the decoder's output size as their last dimension.
        """
        embedded = self.drop(self.encoder(input))
        rnn_out, hidden = self.rnn(embedded, hidden)
        rnn_out = self.drop(rnn_out)

        steps = rnn_out.size(0)
        batch = rnn_out.size(1)
        width = rnn_out.size(2)
        # The linear decoder is applied to a 2-D (steps * batch, width) view.
        logits = self.decoder(rnn_out.view(steps * batch, width))
        return logits.view(steps, batch, logits.size(1)), hidden

    def init_hidden(self, bsz):
        """Return a zero hidden state of shape (nlayers, bsz, nhid).

        For 'LSTM' a pair of such tensors is returned. The tensors are created
        from the first model parameter via `.new(...)`.
        """
        reference = next(self.parameters()).data

        def zeros():
            return Variable(reference.new(self.nlayers, bsz, self.nhid).zero_())

        if self.rnn_type == 'LSTM':
            return (zeros(), zeros())
        return zeros()

In [None]:
# Actually load the corpus: Corpus() reads the three split files under sp/
# and encodes each of them into a tensor of token ids.
corpus = Corpus()

In [None]:
# For data "a b c d e ... x" and a batch size of 4, batchify returns:
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘
def batchify(data, bsz):
    """Reshape a 1-D sequence into bsz columns of consecutive elements.

    Trailing elements that do not fill a complete row are dropped. The result
    is moved to the GPU when the module-level `use_gpu` flag is set.
    """
    rows = data.size(0) // bsz
    # Keep only the leading rows * bsz elements so the reshape is exact.
    usable = data.narrow(0, 0, rows * bsz)
    # Each column holds one contiguous stretch of the original sequence.
    batched = usable.view(bsz, -1).t().contiguous()
    return batched.cuda() if use_gpu else batched

In [None]:
# Split each dataset into batch columns.
batch_size = 20        # columns used for training
eval_batch_size = 10   # columns used for validation and test
train_data = batchify(data=corpus.train, bsz=batch_size)
val_data = batchify(data=corpus.valid, bsz=eval_batch_size)
test_data = batchify(data=corpus.test, bsz=eval_batch_size)

In [None]:
# Configuration of the model tried in this run.
emsize = 200     # embedding width
nhid = 200       # hidden units per recurrent layer
nlayers = 1
dropout = 0.5
tied = False     # share embedding and decoder weights when True
ntokens = corpus.sp.GetPieceSize()
model = RNNModel(
    rnn_type='GRU', ntoken=ntokens, ninp=emsize, nhid=nhid,
    nlayers=nlayers, dropout=dropout, tie_weights=tied,
)
if use_gpu:
    model.cuda()

criterion = nn.CrossEntropyLoss()

In [None]:
# Training hyperparameters.
log_interval = 200  # print running statistics every this many batches
clip = 0.25         # maximum gradient norm passed to the clipping call
lr = 20             # initial learning rate of the hand-written SGD step
bptt = 35           # each chunk fed to the model spans at most 35 tokens

def repackage_hidden(h):
    """Detach hidden states from the graph that produced them.

    Args:
        h: a tensor, or an arbitrarily nested iterable of tensors (e.g. the
            (h, c) pair of an LSTM).

    Returns:
        A detached tensor for a tensor input, otherwise a tuple with every
        element repackaged recursively.
    """
    # Since the Tensor/Variable merge (PyTorch 0.4) `type(h)` is torch.Tensor,
    # so the former `type(h) == Variable` test never matched and the function
    # recursed into the tensor itself until it hit a 0-d element.
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)

# get_batch cuts one (input, target) pair out of a batchified tensor, starting
# at row i and spanning at most `bptt` rows. With the example output of
# batchify and a bptt of 2, i = 0 yields:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# The slicing runs along dimension 0 (the sequence dimension); the batch
# dimension (dimension 1) was already laid out by batchify. The target is the
# input shifted down by one row and flattened to 1-D.
def get_batch(source, i, evaluation=False):
    rows_left = len(source) - 1 - i
    # The last chunk may be shorter than bptt: one row must remain for the target.
    seq_len = bptt if bptt < rows_left else rows_left
    stop = i + seq_len
    data = Variable(source[i:stop], volatile=evaluation)
    target = Variable(source[i + 1:stop + 1].view(-1))
    return data, target

def evaluate(data_source):
    """Return the average per-position cross-entropy of `model` on data_source.

    Args:
        data_source: batchified tensor with eval_batch_size columns.

    Returns:
        A Python float: the loss averaged over every scored position
        (len(data_source) - 1 rows; the first row has no preceding input).

    Raises:
        ValueError: if data_source has fewer than two rows, i.e. nothing to score.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.0
    scored_rows = 0
    ntokens = corpus.sp.GetPieceSize()
    hidden = model.init_hidden(eval_batch_size)
    # no_grad replaces the removed `volatile` flag: no graph is recorded, so
    # the hidden state can be carried over without being re-wrapped.
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            # criterion returns the mean over the chunk; weight it by the
            # chunk length. `.item()` yields a float (indexing a 0-dim loss
            # tensor with [0] is an error on current PyTorch).
            total_loss += len(data) * criterion(output_flat, targets).item()
            scored_rows += len(data)
    if scored_rows == 0:
        raise ValueError('data_source must contain at least two rows to evaluate')
    return total_loss / scored_rows

def train():
    """Run one pass over train_data with a hand-written, gradient-clipped SGD step.

    Reads the module-level `model`, `criterion`, `train_data`, `batch_size`,
    `bptt`, `clip`, `lr` and `log_interval`, plus `epoch` (set by the calling
    epoch loop) for the progress line. Prints running loss and perplexity
    every `log_interval` batches and returns nothing.
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.0
    start_time = time.time()
    ntokens = corpus.sp.GetPieceSize()
    hidden = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs
        # (the un-suffixed `clip_grad_norm` is the deprecated spelling).
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        for p in model.parameters():
            # Parameters that took no part in the forward pass have no gradient.
            if p.grad is not None:
                p.data.add_(p.grad.data, alpha=-lr)

        # Accumulate a plain float; a 0-dim loss tensor cannot be indexed with [0].
        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0.0
            start_time = time.time()
            
# Best validation loss seen so far; stays None until the first epoch has been
# evaluated. (`lr` is already defined with the other hyperparameters above,
# so the former `lr = lr` self-assignment was a no-op and has been dropped.)
best_val_loss = None

In [None]:
# Epoch loop: train, validate, checkpoint the best model, anneal lr otherwise.
# At any point you can hit Ctrl + C to break out of training early.
epochs = 100
try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        # Compare against None explicitly: a truthiness test would also treat
        # a best loss of exactly 0.0 as "no best yet".
        if best_val_loss is None or val_loss < best_val_loss:
            with open('model.pt', 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

In [None]:
# Load the best saved model.
# model.pt is the whole module object written by torch.save(model, f) in the
# epoch loop, so loading it unpickles arbitrary objects: only load files you
# produced yourself.
# NOTE(review): recent PyTorch releases reportedly default torch.load to
# weights_only=True, which would reject a fully pickled module - confirm
# against the installed version.
with open('model.pt', 'rb') as f:
    model = torch.load(f)

In [None]:
# Score the reloaded model on the held-out test split and print a framed summary.
test_loss = evaluate(test_data)
rule = '=' * 89
summary = '| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss))
print('\n'.join([rule, summary, rule]))

In [None]:
# Generate ten random sentences from the trained model.
temperature = 1.0

# Follow the device the model actually lives on instead of calling .cuda()
# unconditionally, so the cell also runs on CPU-only machines.
device = next(model.parameters()).device
bos_id = corpus.sp.PieceToId('▁')

# Sampling must not use dropout and needs no autograd graph.
model.eval()
with torch.no_grad():
    for j in range(10):
        # The hidden state starts from zeros (init_hidden), not random values.
        hidden = model.init_hidden(1)

        # Start every sentence from the '▁' piece.
        input = torch.tensor([[bos_id]], dtype=torch.long, device=device)
        results = []
        for i in range(100):
            output, hidden = model(input, hidden)
            # Unnormalised sampling weights: exp(logit / temperature).
            word_weights = output.squeeze().div(temperature).exp()
            # Convert the sampled index to a Python int before handing it to
            # SentencePiece and feeding it back as the next input.
            word_idx = int(torch.multinomial(word_weights, 1)[0])
            input.fill_(word_idx)
            word = corpus.sp.IdToPiece(word_idx)

            # A bare '▁' piece marks the end of the sentence.
            if word == '▁':
                break

            results.append(word)

        print(' '.join(results).replace('▁', '\n'))