In [1]:
%matplotlib inline

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import string
import random
import time
import math

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import torch.autograd as autograd
from torch.autograd import Variable

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


#SOS_token = 0
EOS_token = 0

In [3]:
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [4]:
class Data:
    def __init__(self):
        self.token2index = {"#":0}
        self.token2count = {}
        self.index2token = {0:"#"}
        self.n_tokens = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for token in list(sentence):
            self.addtoken(token)

    def addtoken(self, token):
        if token not in self.token2index:
            self.token2index[token] = self.n_tokens
            self.token2count[token] = 1
            self.index2token[self.n_tokens] = token
            self.n_tokens += 1
        else:
            self.token2count[token] += 1

In [5]:
MAX_LENGTH = 50


def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and len(list(p[1])) < MAX_LENGTH

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]
    
def prepareData(input_data_path, num_samples=10000):
    with  open(input_data_path, 'r', encoding='utf-8') as f:
        input_lines = f.read().split('\n')  # 行ごとのリストに
    print("Counting tokens:")
    data = Data()
    target_lines = []
    for line in input_lines:
        input_tokens = list(line)
        target_tokens = input_tokens[1:]
        target_tokens.append("#")
        target_lines.append("".join(target_tokens))
        data.addSentence(line)
    min_samples = min(num_samples, min(len(input_lines)-1, len(target_lines)-1))
    pairs = [[i,t]for (i,t) in zip(input_lines[:min_samples],target_lines[:min_samples])]
    pairs.append(input_lines)
    pairs.append(target_lines)
    pairs = filterPairs(pairs)

    print("Counted tokens:")
    print('data:{}'.format(data.n_tokens))
    return data, pairs

In [6]:
from sklearn.model_selection import train_test_split

input_data_path = "./data/text/mai2000a_token.txt"
#input_data_path = "./data/text/mai2000a_normal.txt"
data, pairs = prepareData(input_data_path, 30000000)
train_pairs, val_pairs = train_test_split(pairs, train_size=0.8)
print("train:"+ str(len(train_pairs)))
print("val:"+ str(len(val_pairs)))
print(random.choice(train_pairs))

Counting tokens:
Counted tokens:
data:4164
train:157364
val:39341
['「玉竜」はチャボリュウノヒゲと呼ばれ、ガーデニングの寄せ植え材料としても人気があります。', '玉竜」はチャボリュウノヒゲと呼ばれ、ガーデニングの寄せ植え材料としても人気があります。#']


In [7]:
def indexesFromSentence(data, sentence):
    return [data.token2index[token] for token in list(sentence)]

def tensorFromSentence(data, sentence):
    indexes = indexesFromSentence(data, sentence)
    #indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(data, pair[0])
    target_tensor = tensorFromSentence(data, pair[1])
    return (input_tensor, target_tensor)

In [8]:
class LSTMPredictor(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim):
        super(LSTMPredictor, self).__init__()
        self.hidden_dim = hidden_dim
        self.embeds = nn.Embedding(input_dim, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, input_dim)
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)
        self.hidden = self.initHidden()

    def forward(self, input, hidden):
        embeds = self.embeds(input)
        lstm_out, hidden = self.lstm(embeds.view(len(input), 1, -1), hidden)
        output = self.linear(lstm_out.view(len(input), -1))
        output = self.dropout(output)
        output = self.softmax(output)
        #self.hidden = hidden
        return output, hidden

    def initHidden(self):
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)).cuda(),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)).cuda())

In [52]:
class BDLSTMPredictor(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim):
        super(BDLSTMPredictor, self).__init__()
        self.hidden_dim = hidden_dim
        self.embeds = nn.Embedding(input_dim, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, bidirectional=True)
        self.linear = nn.Linear(hidden_dim * 2, input_dim)
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)
        self.hidden = self.initHidden()

    def forward(self, input, hidden):
        embeds = self.embeds(input)
        lstm_out, hidden = self.lstm(embeds.view(len(input), 1, -1), hidden)
        output = self.linear(lstm_out.view(len(input), -1))
        output = self.dropout(output)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return (autograd.Variable(torch.zeros(2, 1, self.hidden_dim)).cuda(),
                autograd.Variable(torch.zeros(2, 1, self.hidden_dim)).cuda())

In [53]:
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, model, optimizer, criterion, max_length=MAX_LENGTH):
    model.train()
    hidden = model.initHidden()
    optimizer.zero_grad()
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    loss = 0

    for i in range(input_length):
        output, hidden = model(input_tensor[i], hidden)
        loss += criterion(output, target_tensor[i])
    loss.backward()

    optimizer.step()
    return loss.item() / target_length

In [54]:
def validation(input_tensor, target_tensor, model, criterion):
    model.eval()
    with torch.no_grad():
        hidden = model.initHidden()
        input_length = input_tensor.size(0)
        target_length = target_tensor.size(0)
        loss = 0
        for i in range(input_length):
            output, hidden = model(input_tensor[i], hidden)
            loss += criterion(output, target_tensor[i]) 
    return loss.item() / target_length

In [55]:
def trainEpochs(model, epochs=1, print_every=10000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_train_losses = []
    plot_val_losses = []
    print_train_loss_total = 0  # Reset every print_every
    print_val_loss_total = 0  # Reset every print_every
    plot_train_loss_total = 0  # Reset every plot_every
    plot_val_loss_total = 0  # Reset every plot_every

    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()
    train_length = len(train_pairs)
    val_length = len(val_pairs)
    training_pairs = [tensorsFromPair(random.choice(train_pairs)) for i in range(train_length)]
    validation_pairs = [tensorsFromPair(random.choice(val_pairs)) for i in range(val_length)]
    for epoch in range(epochs):
        #Train
        for j, train_pair in enumerate(training_pairs):
            i = j+1
            input_tensor = train_pair[0]
            target_tensor = train_pair[1]
            loss = train(input_tensor, target_tensor, model, optimizer, criterion)
            print_train_loss_total += loss
            plot_train_loss_total += loss

            if i % print_every == 0:
                print_loss_avg = print_train_loss_total / print_every
                print_train_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, i / train_length),
                                             i, i / train_length * 100, print_loss_avg))

            if i % plot_every == 0:
                plot_loss_avg = plot_train_loss_total / plot_every
                plot_train_losses.append(plot_loss_avg)
                plot_train_loss_total = 0
        
        #Validation        
        for j, val_pair in enumerate(validation_pairs):
            i = j+1
            input_tensor = train_pair[0]
            target_tensor = train_pair[1]
            loss = train(input_tensor, target_tensor, model, optimizer, criterion)
            print_val_loss_total += loss
            plot_val_loss_total += loss

            if i % print_every == 0:
                print_loss_avg = print_val_loss_total / print_every
                print_val_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, i / val_length),
                                             i, i / val_length * 100, print_loss_avg))

            if i % plot_every == 0:
                plot_loss_avg = plot_val_loss_total / plot_every
                plot_val_losses.append(plot_loss_avg)
                plot_val_loss_total = 0
            

    showPlot(plot_train_losses, "./results/"+str(epochs)+"_train.png")
    showPlot(plot_val_losses, "./results/"+str(epochs)+"val.png")

In [56]:
import matplotlib.pyplot as plt
plt.switch_backend('Agg')
import matplotlib.ticker as ticker
import numpy as np
plt.style.use('ggplot')

def showPlot(points, figure_path):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)
    plt.savefig(figure_path)

In [57]:
def evaluate(model, sentence, max_length=MAX_LENGTH):
    model.eval()
    with torch.no_grad():
        input_tensor = tensorFromSentence(data, sentence)
        input_length = input_tensor.size()[0]
        hidden = model.initHidden()
        outputs = []
        for i in range(input_length):
            output, hidden = model(input_tensor[i], hidden)
            outputs.append(output)
        return outputs

In [58]:
def evaluateRandomly(model, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        outputs = evaluate(model, pair[0])
        input_tokens = list(pair[0])
        for j, output in enumerate(outputs):
            if len(input_tokens) <= j: 
                break
            topv_list, topi_list = output[0].topk(5)
            print(input_tokens[j] + ":")
            output_tokens = []
            for index in topi_list:
                output_tokens.append(data.index2token[index.item()])
                
            print(output_tokens)

In [59]:
hidden_size = 256
embedding_dim = 256
epochs = 1
model = BDLSTMPredictor(data.n_tokens, embedding_dim, hidden_size).to(device)
trainEpochs(model, epochs, print_every=1000)

2m 20s (- 365m 42s) (1000 0%) 5.8933
3m 58s (- 309m 1s) (2000 1%) 5.2214
5m 33s (- 285m 56s) (3000 1%) 5.0051
7m 10s (- 274m 58s) (4000 2%) 4.9006
8m 49s (- 268m 43s) (5000 3%) 4.7509
10m 26s (- 263m 27s) (6000 3%) 4.6628
12m 6s (- 260m 13s) (7000 4%) 4.5875
13m 44s (- 256m 42s) (8000 5%) 4.5572
15m 24s (- 253m 59s) (9000 5%) 4.5110
17m 1s (- 250m 56s) (10000 6%) 4.4830
18m 40s (- 248m 26s) (11000 6%) 4.4565
20m 18s (- 246m 0s) (12000 7%) 4.4325
21m 58s (- 244m 4s) (13000 8%) 4.3691
23m 34s (- 241m 28s) (14000 8%) 4.2926
25m 13s (- 239m 20s) (15000 9%) 4.3114
26m 52s (- 237m 25s) (16000 10%) 4.2955
28m 32s (- 235m 40s) (17000 10%) 4.2663
30m 11s (- 233m 47s) (18000 11%) 4.2507
31m 50s (- 231m 51s) (19000 12%) 4.2392
33m 28s (- 229m 55s) (20000 12%) 4.2203
35m 6s (- 227m 55s) (21000 13%) 4.1611
36m 41s (- 225m 48s) (22000 13%) 4.1733
38m 18s (- 223m 47s) (23000 14%) 4.1355
39m 55s (- 221m 53s) (24000 15%) 4.1417
41m 31s (- 219m 49s) (25000 15%) 4.0963
43m 7s (- 217m 55s) (26000 16%) 4.1

IndexError: index 33 is out of bounds for dimension 0 with size 33

In [None]:
evaluateRandomly(model, 10)

outputs = evaluate(model, "だが「市場の論理」万能のグローバル化はどうか。")

In [None]:
torch.save(model.state_dict(), "./weights/token_level_bd_"+str(epochs)+".h5")