In [1]:
%matplotlib inline

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import string
import random
import time
import math

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import torch.autograd as autograd
from torch.autograd import Variable

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [4]:
class Data:
    def __init__(self):
        self.token2index = {"#":0}
        self.token2count = {}
        self.index2token = {0:"#"}
        self.n_tokens = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for token in list(sentence):
            self.addtoken(token)

    def addtoken(self, token):
        if token not in self.token2index:
            self.token2index[token] = self.n_tokens
            self.token2count[token] = 1
            self.index2token[self.n_tokens] = token
            self.n_tokens += 1
        else:
            self.token2count[token] += 1

In [5]:
MAX_LENGTH = 50


def filterPair(p):
    return len(p[0]) < MAX_LENGTH and len(p[1]) < MAX_LENGTH

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]
    
def prepareData(input_data_path, num_samples=10000):
    with  open(input_data_path, 'r', encoding='utf-8') as f:
        input_lines = f.read().split('\n')  # 行ごとのリストに
    print("Counting tokens:")
    data = Data()
    target_lines = []
    source_lines = []
    for i,line in enumerate(input_lines):
        line = line.strip()
        if len(line) < 5:
            continue
        input_tokens = list(line)
        target_tokens = input_tokens[1:-1]
        target_lines.append("".join(target_tokens))
        source_lines.append(line)
        data.addSentence(line)
    min_samples = min(num_samples, len(input_lines)-1)
    pairs = [[i,t]for (i,t) in zip(source_lines[:min_samples],target_lines[:min_samples])]
    #pairs.append(source_lines)
    #pairs.append(target_lines)
    pairs = filterPairs(pairs)

    print("Counted tokens:")
    print('data:{}'.format(data.n_tokens))
    return data, pairs

In [6]:
def indexesFromSentence(data, sentence):
    return [data.token2index[token] for token in list(sentence)]

def tensorFromSentence(data, sentence):
    indexes = indexesFromSentence(data, sentence)
    return torch.tensor(indexes, dtype=torch.long, device=device)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(data, pair[0])
    target_tensor = tensorFromSentence(data, pair[1])
    return (input_tensor, target_tensor)

In [7]:
from sklearn.model_selection import train_test_split

input_data_path = "./data/text/mai2000a_token.txt"
#input_data_path = "./data/text/processed/merged_token.txt"
data, pairs = prepareData(input_data_path, 30000000)
train_pairs, val_pairs = train_test_split(pairs, train_size=0.8)
val_pairs, test_pairs = train_test_split(val_pairs, train_size=0.5)
print("train:"+ str(len(train_pairs)))
print("val:"+ str(len(val_pairs)))
print("test:"+ str(len(val_pairs)))
print(random.choice(train_pairs))

Counting tokens:
Counted tokens:
data:4164
train:157069
val:19634
test:19634
['少子化で、国立の教員養成大学や教育学部の教員就職率は低下の一途をたどる。', '子化で、国立の教員養成大学や教育学部の教員就職率は低下の一途をたどる']


In [8]:
class BDLSTMPredictor(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim):
        super(BDLSTMPredictor, self).__init__()
        self.hidden_dim = hidden_dim
        self.embeds = nn.Embedding(input_dim, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, bidirectional=True)
        self.linear = nn.Linear(hidden_dim * 2, input_dim)
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)
        self.hidden = self.initHidden()

    def forward(self, input, hidden):
        embeds = self.embeds(input)
        lstm_out, hidden = self.lstm(embeds.view(len(input), 1, -1), hidden)
        cbow = self.make_cbow(lstm_out, hidden)
        output = self.linear(cbow)
        output = self.dropout(output)
        output = self.softmax(output)
        return output, hidden
    
    def make_cbow(self, lstm_out, hidden):
        # input_dim, 1, hidden_dim*2
        lstm_out = lstm_out.squeeze()
        # input_dim, hidden_dim*2
        forward, reverse = torch.chunk(lstm_out,2,dim=1)
        # input_dim, hidden_dim
        output = []
        for n in range(1, len(forward)-1):
            output.append(torch.cat([forward[n-1,:], reverse[n+1,:]], dim=0))
        output = torch.stack(output, dim=0)
        # input_dim - 2, hidden_dim
        return output
    
    def initHidden(self):
        return (autograd.Variable(torch.zeros(2, 1, self.hidden_dim)).cuda(),
                autograd.Variable(torch.zeros(2, 1, self.hidden_dim)).cuda())

In [16]:
def train(input_tensor, target_tensor, model, optimizer, criterion, max_length=MAX_LENGTH):
    model.train()
    loss = 0
    hidden = model.initHidden()
    output, hidden = model(input_tensor, hidden)
    if output.size(0) != target_tensor.size(0):
        print(len(output), len(target_tensor))
    loss = criterion(output, target_tensor)
    loss.backward()
    optimizer.step()
    return loss.item()

In [17]:
def validation(input_tensor, target_tensor, model, criterion):
    model.eval()
    with torch.no_grad():
        hidden = model.initHidden()
        loss = 0
        output, hidden = model(input_tensor, hidden)
        loss = criterion(output, target_tensor) 
    return loss.item()

In [18]:
def trainEpochs(model, train_pairs, val_pairs, epochs=1, print_every=10000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_train_losses = []
    plot_val_losses = []
    print_train_loss_total = 0  # Reset every print_every
    print_val_loss_total = 0  # Reset every print_every
    plot_train_loss_total = 0  # Reset every plot_every
    plot_val_loss_total = 0  # Reset every plot_every

    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()
    train_length = len(train_pairs)
    val_length = len(val_pairs)
    for epoch in range(epochs):
        #Train
        print("epoch:",epoch)
        for j, train_pair in enumerate(train_pairs):
            i = j+1
            input_tensor = train_pair[0].squeeze()
            target_tensor = train_pair[1].squeeze()
            loss = train(input_tensor, target_tensor, model, optimizer, criterion)
            print_train_loss_total += loss
            plot_train_loss_total += loss

            if i % print_every == 0:
                print_loss_avg = print_train_loss_total / print_every
                print_train_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, i / train_length),
                                             i, i / train_length * 100, print_loss_avg))

            if i % plot_every == 0:
                plot_loss_avg = plot_train_loss_total / plot_every
                plot_train_losses.append(plot_loss_avg)
                plot_train_loss_total = 0
        
        #Validation        
        for j, val_pair in enumerate(val_pairs):
            i = j+1
            input_tensor = train_pair[0]
            target_tensor = train_pair[1]
            loss = train(input_tensor, target_tensor, model, optimizer, criterion)
            print_val_loss_total += loss
            plot_val_loss_total += loss

            if i % print_every == 0:
                print_loss_avg = print_val_loss_total / print_every
                print_val_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, i / val_length),
                                             i, i / val_length * 100, print_loss_avg))

            if i % plot_every == 0:
                plot_loss_avg = plot_val_loss_total / plot_every
                plot_val_losses.append(plot_loss_avg)
                plot_val_loss_total = 0
            

        showPlot(plot_train_losses, "./results/"+str(epoch)+"_train.png")
        showPlot(plot_val_losses, "./results/"+str(epoch)+"val.png")

In [19]:
import matplotlib.pyplot as plt
plt.switch_backend('Agg')
import matplotlib.ticker as ticker
import numpy as np
plt.style.use('ggplot')

def showPlot(points, figure_path):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)
    plt.savefig(figure_path)

In [20]:
train_length = len(train_pairs)
val_length = len(val_pairs)
training_pairs = [tensorsFromPair(train_pairs[i]) for i in range(train_length)]
validation_pairs = [tensorsFromPair(val_pairs[i]) for i in range(val_length)]

In [21]:
hidden_size = 256
embedding_dim = 256
model = BDLSTMPredictor(data.n_tokens, embedding_dim, hidden_size).to(device)


In [22]:
epochs = 10
trainEpochs(model, training_pairs, validation_pairs,epochs, print_every=1000)

epoch: 0
0m 24s (- 64m 38s) (1000 0%) 6.3068
0m 49s (- 63m 30s) (2000 1%) 6.4872
1m 13s (- 62m 29s) (3000 1%) 7.6076
1m 37s (- 62m 6s) (4000 2%) 9.2518
2m 1s (- 61m 43s) (5000 3%) 11.9199
2m 26s (- 61m 20s) (6000 3%) 15.5791
2m 51s (- 61m 9s) (7000 4%) 22.7322
3m 16s (- 60m 54s) (8000 5%) 31.3412
3m 40s (- 60m 33s) (9000 5%) 44.3634
4m 5s (- 60m 3s) (10000 6%) 64.1995
4m 29s (- 59m 34s) (11000 7%) 85.3763
4m 53s (- 59m 11s) (12000 7%) 112.2885
5m 18s (- 58m 44s) (13000 8%) 138.9999
5m 42s (- 58m 15s) (14000 8%) 168.8112
6m 6s (- 57m 50s) (15000 9%) 213.4909
6m 31s (- 57m 34s) (16000 10%) 247.5961
6m 56s (- 57m 11s) (17000 10%) 282.0239
7m 21s (- 56m 47s) (18000 11%) 318.9210
7m 46s (- 56m 27s) (19000 12%) 365.8254
8m 11s (- 56m 8s) (20000 12%) 412.4976
8m 37s (- 55m 51s) (21000 13%) 455.6107
9m 2s (- 55m 29s) (22000 14%) 494.6629
9m 27s (- 55m 8s) (23000 14%) 536.2443
9m 52s (- 54m 46s) (24000 15%) 574.0963
10m 17s (- 54m 23s) (25000 15%) 611.7957
10m 42s (- 54m 0s) (26000 16%) 662.209

RuntimeError: Locator attempting to generate 29016 ticks from -258.0 to 5545.0: exceeds Locator.MAXTICKS

Error in callback <function install_repl_displayhook.<locals>.post_execute at 0x000001FDC4526AE8> (for post_execute):


RuntimeError: Locator attempting to generate 29016 ticks from -258.0 to 5545.0: exceeds Locator.MAXTICKS

In [None]:
def evaluate(model, sentence):
    model.eval()
    with torch.no_grad():
        input_tensor = tensorFromSentence(data, sentence)
        input_length = input_tensor.size()[0]
        hidden = model.initHidden()
        output, hidden = model(input_tensor, hidden)
    return output

In [None]:
def evaluateRandomly(model, n=10):
    with open("./results/BiLSTM_results.txt",'w',encoding='utf-8') as sf:
        for i in range(n):
            pair = random.choice(test_pairs)
            output = evaluate(model, pair[0])
            input_tokens = list(pair[0])
            topv, topi = output.topk(5)
            sf.write(pair[0]+'\n')
            for n,i in enumerate(topi):
                sf.write(str(pair[1][n]) + str([data.index2token[index.item()] for index in i])+'\n')

In [None]:
evaluateRandomly(model, len(test_pairs))

In [None]:
torch.save(model.state_dict(), "./weights/token_level_bd_"+str(epochs)+".h5")