In [0]:
# !pip install tf-nightly

In [1]:
# prepare dataset

from utility import load_train_data
from collections import Counter
import os
import numpy as np


In [2]:
# import tensorflow

import tensorflow as tf
# Switch TF 1.x to eager mode: later cells call .numpy() on tensors and
# iterate the tf.data dataset directly with a Python for-loop.
tf.enable_eager_execution()

print("tf version: ", tf.VERSION)

tf version:  1.13.0-dev20190111


In [3]:
# for reproducibility: seed NumPy, Python's `random`, and TensorFlow

import random as rn

np.random.seed(42)
rn.seed(12345)
tf.set_random_seed(1234)

In [4]:
# prepare train data

def parse_file(file):
    """Lazily tokenize a text source, one sentence per line.

    Args:
        file: Iterable of text lines (e.g. an open text file).

    Yields:
        list[str]: The line with its trailing newline removed, split on
        single spaces. An empty line therefore yields [''].
    """
    for raw_line in file:
        yield raw_line.rstrip('\n').split(' ')

# TODO: current method does not allow the model to learn boundary beyond bigram.
def adjust_size(sentences, sentence_size):
    """Cut sentences into fixed-length chunks of ``sentence_size + 1`` tokens.

    Every sentence is prefixed with ``_BOS/_BOS``. Long sentences are split
    into consecutive chunks that overlap by one token, so the last token of a
    chunk is the first token of the next one. The final chunk gets
    ``_EOS/_EOS`` appended and is right-padded with ``_PAD/_PAD``.

    The extra token per chunk exists so that callers can later shift the
    chunk by one step to obtain (input, target) pairs of ``sentence_size``.

    The input sentences are never modified; each yielded chunk is a new list.

    Args:
        sentences: Iterable of token sequences (lists or tuples of str).
        sentence_size: Number of tokens per training step; must be >= 1.

    Yields:
        list[str]: Chunks of exactly ``sentence_size + 1`` tokens.

    Raises:
        ValueError: If ``sentence_size`` is smaller than 1 (raised when
            iteration starts, because this is a generator). Without this
            guard the splitting loop below would never terminate.
    """
    if sentence_size < 1:
        raise ValueError('sentence_size must be >= 1, got {}'.format(sentence_size))

    # One extra token per chunk for shifting the output later
    chunk_size = sentence_size + 1

    for sentence in sentences:
        # Build a fresh list so the caller's sentence is left untouched
        tokens = ['_BOS/_BOS']
        tokens.extend(sentence)

        # Split long sentence allowing overlap of 1 word
        while len(tokens) >= chunk_size:
            yield tokens[:chunk_size]
            tokens = tokens[sentence_size:]

        # At this point 1 <= len(tokens) <= sentence_size, so there is always
        # room for EOS and the remainder is never empty.
        tokens.append('_EOS/_EOS')
        tokens.extend(['_PAD/_PAD'] * (chunk_size - len(tokens)))
        yield tokens
        

def create_vocabulary(sentences, vocabulary_size):
    """Build the word list whose positions serve as word IDs.

    Args:
        sentences: Iterable of token sequences.
        vocabulary_size: Total number of entries wanted, including the
            ``_UNK/_UNK`` entry that always occupies ID 0.

    Returns:
        list[str]: ``_UNK/_UNK`` followed by the ``vocabulary_size - 1`` most
        frequent tokens, most frequent first.
    """
    frequencies = Counter()
    for tokens in sentences:
        frequencies.update(tokens)

    ranked = frequencies.most_common(vocabulary_size - 1)
    return ['_UNK/_UNK'] + [token for token, _ in ranked]


def convert_to_ids(sentences, vocabulary):
    """Map every token of every sentence to its vocabulary index.

    Args:
        sentences: Iterable of token sequences.
        vocabulary: List of tokens; a token's position is its ID. Must
            contain ``_UNK/_UNK`` if any out-of-vocabulary token occurs.

    Yields:
        list[int]: One ID list per sentence; tokens missing from the
        vocabulary are replaced by the ID of ``_UNK/_UNK``.
    """
    word_to_id = {token: index for index, token in enumerate(vocabulary)}

    for tokens in sentences:
        # The UNK lookup stays inside the conditional so it is only evaluated
        # when an unknown token is actually encountered.
        yield [
            word_to_id[token] if token in word_to_id else word_to_id['_UNK/_UNK']
            for token in tokens
        ]

        
# TODO: current batching ignores sentences that don't fit into the last batch.
def create_batches(sentences, batch_size):
    """Group sentences into full batches of shifted (input, output) pairs.

    Sentences that do not fill a complete final batch are dropped.

    Args:
        sentences: Sequence (supports ``len`` and slicing) of token-ID lists.
        batch_size: Number of sentences per batch.

    Yields:
        tuple[list, list]: ``(batch_input, batch_output)`` where each input
        is a sentence without its last element and each output is the same
        sentence without its first element.
    """
    batch_count = len(sentences) // batch_size

    for batch_index in range(batch_count):
        start = batch_index * batch_size
        chunk = sentences[start:start + batch_size]

        # Shift each sentence by 1 time step
        inputs = [sentence[:-1] for sentence in chunk]
        outputs = [sentence[1:] for sentence in chunk]

        yield inputs, outputs

        
def create_pair(sentences):
    """Split every sentence into a next-token prediction pair.

    Prints the number of sentences as a progress/sanity message.

    Args:
        sentences: Sized collection of token-ID lists.

    Returns:
        tuple[list, list]: ``(input_list, output_list)`` where
        ``input_list[k]`` is sentence k without its last element and
        ``output_list[k]`` is sentence k without its first element.
    """
    print("sentences count: ", len(sentences))

    input_list = [tokens[:-1] for tokens in sentences]
    output_list = [tokens[1:] for tokens in sentences]

    return input_list, output_list
        
def save_metadata(model_directory, vocabulary):
    """Write the vocabulary to ``<model_directory>/vocabulary.txt``.

    The directory (including missing parents) is created when needed. Only
    the vocabulary is persisted; no settings file is written.

    Args:
        model_directory: Target directory path.
        vocabulary: List of vocabulary entries; entry k is written on line k,
            so the line number equals the word ID. No trailing newline is
            written after the last entry.

    NOTE(review): the file is written with the platform default text
    encoding, matching how load_dictionary reads it back.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the directory in between.
    os.makedirs(model_directory, exist_ok=True)

    # Save vocabulary
    vocabulary_path = os.path.join(model_directory, 'vocabulary.txt')
    with open(vocabulary_path, 'w') as vocabulary_file:
        vocabulary_file.write('\n'.join(vocabulary))
        
        
def load_train_data(dataset_name, sentence_size, vocabulary_size, batch_size, model_directory ):
    """Read a tokenized corpus and turn it into next-word training pairs.

    Pipeline: parse lines -> cut into fixed-size chunks -> build vocabulary
    -> convert tokens to IDs -> save the vocabulary -> split each chunk into
    an (input, target) pair shifted by one token.

    Note: this definition shadows the ``load_train_data`` imported from
    ``utility`` at the top of the notebook.

    Args:
        dataset_name: Path of the text file, one space-separated sentence
            per line.
        sentence_size: Length of each input/target sequence.
        vocabulary_size: Vocabulary size including ``_UNK/_UNK``.
        batch_size: Unused here (batching is done later with tf.data); kept
            so existing call sites keep working.
        model_directory: Directory where ``vocabulary.txt`` is written.

    Returns:
        tuple[list, list]: ``(input_x, target_y)`` lists of word-ID lists.
    """
    # Use a context manager so the dataset file is closed once it has been
    # fully consumed; the lazy parse/adjust generators are drained by list()
    # while the file is still open.
    with open(dataset_name) as dataset_file:
        sentences = list(adjust_size(parse_file(dataset_file), sentence_size))

    vocabulary = create_vocabulary(sentences, vocabulary_size)
    sentences = list(convert_to_ids(sentences, vocabulary))
    save_metadata(model_directory, vocabulary)

    # target
    input_x, target_y = create_pair(sentences)

    return input_x, target_y


In [6]:
# Load and preprocess training data

dataset_name = "wiki_dataset_mecab_80000.txt"
sentence_size = 30  # length of each input/target sequence after the one-step shift
BATCH_SIZE = 64
batch_size = BATCH_SIZE
vocabulary_size = 50000  # includes the _UNK/_UNK entry that create_vocabulary puts at ID 0
model_directory = "models"  # vocabulary.txt is written here; convert_kana_to_kanji reads the same directory

input_x, target_y = load_train_data(dataset_name, sentence_size, vocabulary_size, BATCH_SIZE, model_directory)

sentences count:  80103


In [7]:
# check data

# The counts are now substituted into the "{}" placeholders; previously the
# braces were printed literally because .format() was never called.
print("input_x, count: {}".format(len(input_x)))
print("target_y, count: {}".format(len(target_y)))
print("input_x.0: ", input_x[0])
print("target_y.0: ", target_y[0])


input_x, count: {} 80103
target_y, count: {} 80103
input_x.0:  [3, 37098, 12, 9, 7833, 16536, 12, 6, 131, 16, 1471, 10, 19, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
target_y.0:  [37098, 12, 9, 7833, 16536, 12, 6, 131, 16, 1471, 10, 19, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [8]:
# prepare dataset

# BUFFER_SIZE = len(input_x)
# NOTE(review): 50000 is smaller than the 80103 examples reported by the load
# cell, so the shuffle buffer never holds the whole dataset at once.
BUFFER_SIZE = 50000

dataset = tf.data.Dataset.from_tensor_slices((input_x, target_y)).shuffle(BUFFER_SIZE)
# Drop the short final batch so every batch has exactly BATCH_SIZE rows; the
# model's initial hidden state is built with a fixed batch size.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)


In [9]:
# check dataset again: print the first example of one batch and its shape

for input_example, target_example in  dataset.take(1):
    input_x_0 = input_example.numpy()[0]
    print("input_x_0:", input_x_0, input_x_0.shape)
    output_y_0 = target_example.numpy()[0]
    print("output_y_0:", output_y_0, output_y_0.shape)

Instructions for updating:
Colocations handled automatically by placer.
input_x_0: [   3   48   42    5 2936   22 9774   16   33 1561    9 1040   11   24
   27    4    1    1    1    1    1    1    1    1    1    1    1    1
    1    1] (30,)
output_y_0: [  48   42    5 2936   22 9774   16   33 1561    9 1040   11   24   27
    4    1    1    1    1    1    1    1    1    1    1    1    1    1
    1    1] (30,)


In [10]:
# prepare model

def gru(units, backword_flg):
    """Create the GRU layer configuration shared by the model's recurrent layers.

    Args:
        units: Size of the GRU hidden state.
        backword_flg: Passed as ``go_backwards``; True makes the layer read
            the sequence in reverse.

    Returns:
        tf.keras.layers.GRU: Layer configured to return both the full output
        sequence and the final state.
    """
    layer_options = dict(
        return_sequences=True,
        return_state=True,
        recurrent_activation='sigmoid',
        recurrent_initializer='glorot_uniform',
        go_backwards=backword_flg,
    )
    return tf.keras.layers.GRU(units, **layer_options)
    
    
class KanaKanjiModel(tf.keras.Model):
    """GRU language model over word IDs, used to score conversion candidates.

    Forward pass: embedding -> forward GRU -> dropout -> dense layer emitting
    one logit per vocabulary entry at every time step.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units, batch_size):
        """Create the layers.

        Args:
            vocab_size: Number of word IDs (embedding rows and output logits).
            embedding_dim: Width of the word embeddings.
            rnn_units: Size of the GRU hidden state.
            batch_size: Batch size used by initialize_hidden_state().
        """
        super(KanaKanjiModel, self).__init__()
        self.batch_size = batch_size
        self.rnn_units = rnn_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_shape=(30,))
        self.gru_f = gru(self.rnn_units, False)
        # NOTE(review): gru_b is created but never used in call(); it is kept
        # so the set of tracked sub-layers (and thus saved checkpoints'
        # object layout) stays the same.
        self.gru_b = gru(self.rnn_units, True)
        self.vocab_size = vocab_size
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.dropout = tf.keras.layers.Dropout(0.5)

    def call(self, x, hidden, training):
        """Run the model on a batch of word-ID sequences.

        Args:
            x: Integer word IDs; the notebook feeds shape (batch, time).
            hidden: Initial state for the forward GRU.
            training: Forwarded to the dropout layer (True enables dropout).

        Returns:
            tuple: ``(output, state)`` where ``output`` holds the per-step
            vocabulary logits and ``state`` is the final GRU state.
        """
        x = self.embedding(x)
        output, state = self.gru_f(x, initial_state = hidden)
        output = self.dropout(output, training=training)
        output = self.fc(output)

        return output, state

    def initialize_hidden_state(self):
        """Return an all-zero GRU state of shape (batch_size, rnn_units)."""
        return tf.zeros((self.batch_size, self.rnn_units))

    def compute_output_shape(self, input_shape):
        """Report the shapes of the two tensors returned by call().

        Only needed when the subclassed model is used inside a
        functional-style model.

        Previously this method read the undefined attribute
        ``self.num_classes`` and returned hard-coded sizes (30, 50000, 400);
        the shapes are now derived from the input shape and the model's own
        configuration.

        Args:
            input_shape: Shape of ``x``, expected as (batch, time).

        Returns:
            list: ``[logits_shape, state_shape]`` as ``tf.TensorShape``
            objects: (batch, time, vocab_size) for the logits and
            (batch, rnn_units) for the final GRU state.
        """
        shape = tf.TensorShape(input_shape).as_list()
        batch_size, time_steps = shape[0], shape[1]

        logits_shape = tf.TensorShape([batch_size, time_steps, self.vocab_size])
        state_shape = tf.TensorShape([batch_size, self.rnn_units])

        return [logits_shape, state_shape]


In [11]:
# create model

hidden_size = 400
embedding_dim = hidden_size  # embeddings use the same width as the GRU state

model = KanaKanjiModel(vocabulary_size, embedding_dim, hidden_size, BATCH_SIZE)

In [14]:
# model.summary() is not working...

In [16]:
# speed up: replace the forward pass with its defun-wrapped version
# NOTE(review): this relies on tf.contrib, and running the cell prints the
# contrib-sunset notice recorded below — confirm a replacement before upgrading TF.
model.call = tf.contrib.eager.defun(model.call)


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [17]:
# Adam optimizer with its default hyper-parameters (no arguments are passed);
# the same object is also stored in the checkpoint.
optimizer3 = tf.train.AdamOptimizer()

In [18]:
# checkpoints: track both the optimizer and the model so training can resume

checkpoint_dir = './ck_20190222'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer3,
                                 model=model)

In [12]:
# training

import time

# Number of passes over the dataset
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()
    
    # The hidden state is reset to zeros at the start of every epoch
    # (see KanaKanjiModel.initialize_hidden_state).
    
    hidden_f = model.initialize_hidden_state()
    # hidden_f = tf.zeros((BATCH_SIZE, hidden_size))
    
    # NOTE(review): hidden_b is assigned but not used anywhere in this loop.
    hidden_b = hidden_f
    
    # Running sum of the per-batch values computed below (printed per epoch)
    loss2 = 0
    
    for (batch_n, (inp, target)) in enumerate(dataset):
        
        loss1 = 0
        
        with tf.GradientTape() as tape:
            # The state returned for this batch is fed into the next batch.
            # NOTE(review): the dataset is shuffled, so consecutive batches
            # are presumably unrelated sentences, and decoding starts from a
            # zero state — confirm this carry-over is intended.
            
            predictions, hidden_f  = model(inp, hidden_f, True)
            
            # Add a trailing axis to the labels before computing the loss.
            # NOTE(review): no weights/mask are passed, so padded positions
            # appear to count towards the loss — confirm.
            target = tf.expand_dims(target, 2)
            loss = tf.losses.sparse_softmax_cross_entropy(target, predictions)
            loss1 = tf.reduce_mean(loss)

        # batch_loss is the batch loss divided by the sequence length
        # (inp.shape[1]); it is only used for the epoch summary.
        loss1_np = float(loss1.numpy())
        batch_loss = (loss1_np / int(inp.shape[1]))
        
        grads = tape.gradient(loss1, model.variables)
        optimizer3.apply_gradients(zip(grads, model.variables))
        
        loss2 = loss2 + batch_loss
        
        # Progress report every 1000 batches (prints loss1, not batch_loss)
        if batch_n % 1000 == 0:
            template = 'Epoch {} Batch {} Loss {:.4f}'
            print(template.format(epoch+1, batch_n, loss1))
    
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    # loss2 is a sum over batches, not an average
    print ('Epoch {} Loss {:.4f}'.format(epoch+1, loss2))
    print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))



In [20]:
# save weights: write one more checkpoint (model + optimizer) under checkpoint_prefix

checkpoint.save(file_prefix = checkpoint_prefix)


'./ck_20190209/ckpt-5'

In [24]:
# rebuild model with batch size 1: decoding feeds a single word ID at a time
hidden_size = 400
embedding_dim = hidden_size
vocabulary_size = 50000

model = KanaKanjiModel(vocabulary_size, embedding_dim, hidden_size, 1)

In [26]:
# load model
# check latest checkpoints
# NOTE(review): the output recorded under this cell names './ck_20190209/ckpt-5',
# which does not match checkpoint_dir3 — presumably stale output from an
# earlier run; confirm.

checkpoint_dir3 = './ck_20190222'
tf.train.latest_checkpoint(checkpoint_dir3)

'./ck_20190209/ckpt-5'

In [27]:
# load model
# load weights into whichever `model` was built last (the batch-size-1
# instance if the rebuild cell ran); `optimizer3` must already exist.

checkpoint_dir = './ck_20190222'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer3,
                                 model=model)

# restore() returns a load-status object, printed here for inspection
status = checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
print("status: ", status)

status:  <tensorflow.python.training.checkpointable.util.CheckpointLoadStatus object at 0x1288e8128>


In [49]:
# this is based on decode.py of the original scripts

import collections
import heapq
import operator

# NOTE(review): not read by any function in this notebook; initialize_queues
# creates its own local variable with the same name.
hiragana_added = []

def load_dictionary(model_directory):
    """Load ``vocabulary.txt`` into a reading -> candidates lookup table.

    Each line has the form ``target/source`` (split at the first ``/``); the
    zero-based line number is the word ID.

    Args:
        model_directory: Directory containing ``vocabulary.txt``.

    Returns:
        collections.defaultdict: Maps each ``source`` string to a list of
        ``(target, word_id)`` tuples in file order. Because it is a
        defaultdict, indexing a missing key inserts an empty list; use
        ``in`` to test membership.

    Raises:
        ValueError: If a line contains no ``/`` separator.
    """
    vocabulary_path = os.path.join(model_directory, 'vocabulary.txt')
    dictionary = collections.defaultdict(list)

    # Context manager closes the file; previously the handle was leaked.
    with open(vocabulary_path) as vocabulary_file:
        for word_id, line in enumerate(vocabulary_file):
            target, source = line.rstrip('\n').split('/', 1)
            dictionary[source].append((target, word_id))

    return dictionary

def create_lattice(input_, dictionary):
    """Build the word lattice for an input string.

    ``lattice[i][j]`` lists the ``(target, word_id)`` candidates covering
    ``input_[j:i]``. The lattice has ``len(input_) + 2`` rows and
    ``len(input_) + 1`` columns; the extra last row holds only the
    end-of-sentence node in its last column.

    Args:
        input_: String to convert.
        dictionary: Mapping from source substring to a list of
            ``(target, word_id)`` pairs; must contain ``_UNK`` and ``_EOS``.

    Returns:
        list: The nested list described above.
    """
    length = len(input_)
    lattice = [[[] for _ in range(length + 1)] for _ in range(length + 2)]
    _, unk_id = dictionary['_UNK'][0]

    for end in range(1, length + 1):
        row = lattice[end]
        for start in range(end):
            surface = input_[start:end]
            if surface in dictionary:
                row[start].extend(
                    (target, word_id) for target, word_id in dictionary[surface]
                )
            elif len(surface) == 1:
                # Unknown single character: keep it verbatim as an _UNK node
                row[start].append((surface, unk_id))

    _, eos_id = dictionary['_EOS'][0]
    lattice[-1][-1].append(('', eos_id))
    return lattice


def initialize_queues(lattice, rnn_predictor, dictionary):
    """Create the per-position hypothesis queues and seed them with BOS.

    A hypothesis is a tuple ``(cost, string, state, prediction)``:
    ``cost`` is the accumulated negative log probability, ``string`` the
    text produced so far, ``state`` the GRU state after the last word and
    ``prediction`` the negative log-probabilities for the next word.

    Args:
        lattice: Lattice from create_lattice(); only its length is used.
        rnn_predictor: Model providing ``initialize_hidden_state()`` and a
            call signature ``(inputs, hidden, training)``.
        dictionary: Lookup table containing the ``_BOS`` entry.

    Returns:
        list: One queue (list) per lattice row; queue 0 holds the single
        beginning-of-sentence hypothesis, all others are empty.
    """
    initial_state = rnn_predictor.initialize_hidden_state()

    _, bos_id = dictionary['_BOS'][0]
    bos_input = tf.expand_dims([bos_id], 0)

    bos_logits, bos_state = rnn_predictor(bos_input, initial_state, False)

    # Remove the two leading axes, then convert logits to negative
    # log-probabilities
    bos_logits = tf.squeeze(bos_logits, 0)
    bos_logits = tf.squeeze(bos_logits, 0)
    bos_costs = -1 * tf.nn.log_softmax(bos_logits, axis=0)

    bos_state = tf.expand_dims(bos_state, 0)

    queues = [[] for _ in range(len(lattice))]
    queues[0].append((0.0, '', bos_state[0], bos_costs))
    return queues

def search(lattice, queues, rnn_predictor, beam_size, viterbi_size):
    """Fill the hypothesis queues by walking the lattice left to right.

    For every end position ``i`` the hypotheses ending at each start
    position ``j`` are extended with the lattice words in ``lattice[i][j]``,
    pruned, and then passed through the model once to obtain the state and
    next-word costs stored in ``queues[i]``.

    Args:
        lattice: Lattice from create_lattice().
        queues: Queues from initialize_queues(); modified in place.
        rnn_predictor: Model called as ``(inputs, hidden, training)``.
        beam_size: Maximum hypotheses kept per end position; values <= 0
            disable this pruning.
        viterbi_size: Maximum hypotheses kept per lattice word; values <= 0
            disable this pruning.

    Returns:
        list: The same ``queues`` object, where each entry is a tuple
        ``(cost, string, state, prediction)``.
    """
    # Breadth first search with beam pruning and viterbi-like pruning
    for i in range(len(lattice)):
        queue = []

        # create hypotheses without predicting next word
        for j in range(len(lattice[i])):
            for target, word_id in lattice[i][j]:
                
                word_queue = []
                for previous_cost, previous_string, previous_state_f, previous_prediction in queues[j]:
                    # previous_prediction holds negative log-probabilities,
                    # so a smaller accumulated cost means a better hypothesis.
                    cost = previous_cost + previous_prediction[word_id]
                    
                    string = previous_string + target
                    # Intermediate layout: (cost, string, word_id, state before this word)
                    hypothesis = (cost, string, word_id, previous_state_f)
                    word_queue.append(hypothesis)
                
                # prune word_queue to viterbi size
                if viterbi_size > 0:
                    word_queue = heapq.nsmallest(viterbi_size, word_queue, key=operator.itemgetter(0))
                    
                queue += word_queue
                
        # prune queue to beam size
        if beam_size > 0:
            queue = heapq.nsmallest(beam_size, queue, key=operator.itemgetter(0))
        
        # predict next word and state before continue
        for cost, string, word_id, previous_state_f in queue:
            
            # Shape the single word ID as a batch of one sequence of length one
            input_x0 = tf.expand_dims([word_id], 0)
              
            predictions, state_f = rnn_predictor(input_x0, [previous_state_f], False)
            
            # logits to negative log-probabilities (two leading axes removed first)
            predictions = tf.squeeze(predictions, 0)
            predictions = tf.squeeze(predictions, 0)
            predictions = -1 * tf.nn.log_softmax(predictions, axis=0)
        
            # expand_dims followed by [0] leaves the state's shape unchanged
            state_f = tf.expand_dims(state_f, 0)
            hypothesis = (cost, string, state_f[0], predictions)
            queues[i].append(hypothesis)

    return queues

def decode(source, dictionary, rnn_predictor, beam_size, viterbi_size):
    """Convert ``source`` and return the best result plus search details.

    Args:
        source: Input string to convert.
        dictionary: Lookup table from load_dictionary().
        rnn_predictor: Language model used for scoring.
        beam_size: Beam width passed to search().
        viterbi_size: Per-word pruning size passed to search().

    Returns:
        tuple: ``(top_result, candidates, lattice, queues)`` where
        ``candidates`` is a list of ``(string, cost)`` ordered from lowest
        to highest cost and ``top_result`` is the string of the first one.

    Raises:
        IndexError: If the search produced no complete hypothesis.
    """
    lattice = create_lattice(source, dictionary)
    queues = initialize_queues(lattice, rnn_predictor, dictionary)
    queues = search(lattice, queues, rnn_predictor, beam_size, viterbi_size)

    candidates = [(string, cost) for cost, string, _, _ in queues[-1]]

    # The final queue is only cost-ordered when beam pruning ran
    # (beam_size > 0). Sort explicitly so the first candidate is always the
    # cheapest one; the sort is stable, so an already ordered list is kept.
    candidates.sort(key=lambda candidate: candidate[1])

    top_result = candidates[0][0]
    return top_result, candidates, lattice, queues


In [50]:
def convert_kana_to_kanji(line, rnn_predictor):
    """Decode one kana line and print the conversion result(s).

    The vocabulary is loaded from the "models" directory on every call.
    With the debug switches as set below, every candidate is printed
    together with its cost. Nothing is returned.

    Args:
        line: Input string; a trailing newline is stripped.
        rnn_predictor: Language model passed on to decode().
    """
    # Debug switches
    print_nbest = True
    print_lattice = False
    print_queue = False

    # Search parameters
    beam_size = 5
    viterbi_size = 50000

    # Load vocabulary
    dictionary = load_dictionary("models")

    source = line.rstrip('\n')

    # Decode - this might take ~10 seconds per line
    result, candidates, lattice, queues = decode(source, dictionary, rnn_predictor, beam_size, viterbi_size)

    # Print decoded results
    if print_nbest:
        for string, cost in candidates:
            print(string, cost)
    else:
        print(result)

    # Print lattice for debug
    if print_lattice:
        for i, row in enumerate(lattice):
            for j, cell in enumerate(row):
                print('i = {}, j = {}'.format(i, j))
                for target, word_id in cell:
                    print(target, word_id)

    # Print queues for debug
    if print_queue:
        for i, queue in enumerate(queues):
            print('queue', i)
            for cost, string, _state, _prediction in queue:
                print(string, cost)

In [13]:
# test the model: prints each candidate conversion with its cost

convert_kana_to_kanji("ぱりのれきし", model)
