In [4]:
import os
import urllib.request

def download_data(url, force_download=True):
    """Download ``url`` into the current directory and return the local file name.

    The local name is the last "/"-separated component of ``url``.

    Args:
        url: Address of the file to fetch.
        force_download: When true, always download. When false, the download
            is skipped if a file with the target name already exists.

    Returns:
        The local file name (relative to the current working directory).
    """
    local_name = url.rsplit("/", 1)[-1]
    # Reuse an existing copy unless the caller explicitly asked for a refresh.
    if not force_download and os.path.exists(local_name):
        return local_name
    urllib.request.urlretrieve(url, local_name)
    return local_name

# Fetch the UniDic archive; force_download=False reuses a copy that is
# already present in the working directory instead of downloading again.
url_unidic = "https://unidic.ninjal.ac.jp/unidic_archive/cwj/2.3.0/unidic-cwj-2.3.0.zip"
unidic = download_data(url_unidic, force_download=False) 
print(unidic)

unidic-cwj-2.3.0.zip


In [5]:
# Fetch the mecab-ipadic-neologd master archive. The local name is derived
# from the URL's last path component, so it is saved as "master.zip".
url_neologdic = "https://github.com/neologd/mecab-ipadic-neologd/archive/master.zip"
neologdic = download_data(url_neologdic, force_download=False) 
print(neologdic)

master.zip


In [7]:
import zipfile

# List every member of the UniDic archive, then extract only the lexicon CSV
# (the single file read by the parsing cell) relative to the working directory.
with zipfile.ZipFile(unidic) as unidic_zip:
    print(unidic_zip.namelist())
    unidic_zip.extract('unidic-cwj-2.3.0/lex.csv')

['unidic-cwj-2.3.0/AUTHORS', 'unidic-cwj-2.3.0/BSD', 'unidic-cwj-2.3.0/ChaMame for Windows/', 'unidic-cwj-2.3.0/ChaMame for Windows/ChaMame Install guide.pdf', 'unidic-cwj-2.3.0/ChaMame for Windows/ChaMameSetup.msi', 'unidic-cwj-2.3.0/char.bin', 'unidic-cwj-2.3.0/char.def', 'unidic-cwj-2.3.0/COPYING', 'unidic-cwj-2.3.0/dicrc', 'unidic-cwj-2.3.0/feature.def', 'unidic-cwj-2.3.0/GPL', 'unidic-cwj-2.3.0/left-id.def', 'unidic-cwj-2.3.0/lex.csv', 'unidic-cwj-2.3.0/LGPL', 'unidic-cwj-2.3.0/matrix.bin', 'unidic-cwj-2.3.0/matrix.def', 'unidic-cwj-2.3.0/model.bin', 'unidic-cwj-2.3.0/model.def', 'unidic-cwj-2.3.0/rewrite.def', 'unidic-cwj-2.3.0/right-id.def', 'unidic-cwj-2.3.0/sys.dic', 'unidic-cwj-2.3.0/unk.def', 'unidic-cwj-2.3.0/unk.dic']


In [8]:
import csv
import re

# Build {latin-script spelling: katakana reading} from the UniDic lexicon.
unidict = {}
# Column 11 entries of interest look like "<katakana>-<ASCII text>".
# Raw string: "\s" inside a normal literal is an invalid escape sequence and
# triggers a warning on current Python versions.
# NOTE(review): for str patterns "\s" also matches non-ASCII whitespace such as
# U+3000, which may be how the ideographic space ends up in the symbol set.
pat = re.compile(r'^([ァ-ヶー]+)-([\s!-~]+)$')
# Explicit UTF-8 so the Japanese text does not depend on the platform's locale
# default; newline='' is what the csv module asks for on files it reads.
with open('unidic-cwj-2.3.0/lex.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        m = pat.match(row[11])
        if m:
            # group(2) is the ASCII spelling, group(1) its katakana reading.
            unidict[m.group(2)] = m.group(1)
print(len(unidict))

32103


In [9]:
# Locate the user-dictionary seed inside the NEologd archive and extract it.
with zipfile.ZipFile(neologdic) as neolog_zip:
    seed_members = [
        member for member in neolog_zip.namelist()
        if 'mecab-user-dict-seed' in member
    ]
    # The first matching member is used; an archive without one raises IndexError.
    mecab_user_dict_seed = seed_members[0]
    neolog_zip.extract(mecab_user_dict_seed)

In [10]:
import lzma

# Build {surface form: reading} from the NEologd seed, keeping only entries
# whose surface form (column 0) consists of printable ASCII / whitespace.
neodict = {}
# Raw string: "\s" inside a normal literal is an invalid escape sequence and
# triggers a warning on current Python versions.
pat = re.compile(r'^[\s!-~]+$')
# The seed is an xz-compressed CSV. Decode it explicitly as UTF-8 rather than
# with the platform's locale default, and pass newline='' as the csv module
# asks for on files it reads.
with lzma.open(mecab_user_dict_seed, mode='rt', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        if pat.match(row[0]):
            # Column 11 holds the reading that is stored for the entry.
            neodict[row[0]] = row[11]
print(len(neodict))

216588


In [11]:
# Spot-check one multi-word NEologd entry.
print(neodict['hey say jump'])

ヘイセイジャンプ


In [12]:
# Combine both dictionaries; on duplicate keys the NEologd reading wins
# because it is unpacked last.
mergedict = {**unidict, **neodict}
print(len(mergedict))

245438


In [13]:
# Parallel lists: words[i] is a source spelling and kanas[i] its reading.
# Keys and values of the same dict are iterated in matching order.
words = list(mergedict.keys())
kanas = list(mergedict.values())

In [14]:
# Inspect one word/reading pair by position.
idx = 1234
print(words[idx], kanas[idx])

abject アブジェクト


In [15]:
# Every distinct character that occurs in any word or any reading.
all_chars = set("".join(words)) | set("".join(kanas))
print(all_chars)
print(len(all_chars))

{'セ', 'B', 'f', 'ネ', ']', 'l', '(', 'v', 'Q', 'L', 'ズ', 'ュ', 'ユ', 'ェ', 'ロ', '?', 'w', 'マ', '%', ')', '@', 'ォ', 'ペ', 'ワ', 'ア', '`', 'ギ', 'グ', '\\', 'ゥ', 'V', 'K', 'k', 'n', 'b', 'キ', 'O', 'Z', 'P', 'ゲ', 'ャ', 'ビ', 'T', 'N', 'ホ', 'ヌ', 'シ', 'ポ', '3', 'ィ', 'ボ', '*', 'チ', ';', 'ョ', '_', '&', 'ツ', ':', 'ゼ', 'ガ', '/', '9', 'S', 'ヴ', '5', 'ヨ', '#', '}', 'ナ', 'ル', "'", 'ヲ', '-', '|', '7', 'z', 'U', ' ', '[', 'デ', 'M', 'ニ', 'u', 'ヂ', 'パ', 'Y', 'テ', 'ジ', ',', 'メ', '㋘', 'c', 'ノ', '4', 'ブ', 'a', 'タ', 'オ', 'F', 't', 'ァ', 'D', 'ソ', 'ケ', '+', 'フ', '1', 'ヘ', '\u3000', 'ト', '.', 'ク', 'x', '~', 'd', '<', 'ミ', 'ス', 'ピ', 'E', 'リ', 'J', '{', 'エ', 'ヒ', 'ヱ', 'ッ', 'ヅ', '>', 'q', 'p', 'm', 'W', 'イ', 'ヮ', 'g', 'ゾ', '!', 'i', 'C', '^', 'ド', 'ウ', 'レ', 'コ', '0', 'H', 'e', 'o', 'y', 'X', 'サ', '=', 'ヤ', 'ベ', 'モ', 'j', 'r', 'A', 'カ', '$', 'R', 'ダ', 'ー', 'プ', 'ム', 'ザ', 'h', 'ゴ', 'ラ', 'G', 'I', '8', '6', 'ヰ', 'バ', 'ハ', 's', '2', 'ン'}
181


In [16]:
# Sorted list of all characters; a character's position is its symbol index.
symbol_set = sorted(all_chars)

In [17]:
# Show the full symbol inventory in index order.
print(symbol_set)

[' ', '!', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\u3000', 'ァ', 'ア', 'ィ', 'イ', 'ゥ', 'ウ', 'ェ', 'エ', 'ォ', 'オ', 'カ', 'ガ', 'キ', 'ギ', 'ク', 'グ', 'ケ', 'ゲ', 'コ', 'ゴ', 'サ', 'ザ', 'シ', 'ジ', 'ス', 'ズ', 'セ', 'ゼ', 'ソ', 'ゾ', 'タ', 'ダ', 'チ', 'ヂ', 'ッ', 'ツ', 'ヅ', 'テ', 'デ', 'ト', 'ド', 'ナ', 'ニ', 'ヌ', 'ネ', 'ノ', 'ハ', 'バ', 'パ', 'ヒ', 'ビ', 'ピ', 'フ', 'ブ', 'プ', 'ヘ', 'ベ', 'ペ', 'ホ', 'ボ', 'ポ', 'マ', 'ミ', 'ム', 'メ', 'モ', 'ャ', 'ヤ', 'ュ', 'ユ', 'ョ', 'ヨ', 'ラ', 'リ', 'ル', 'レ', 'ロ', 'ヮ', 'ワ', 'ヰ', 'ヱ', 'ヲ', 'ン', 'ヴ', 'ー', '㋘']


In [18]:
def word_to_symbol_index(word):
    """Return the position in the global ``symbol_set`` of each character of ``word``.

    Raises:
        ValueError: if a character is not present in ``symbol_set``.
    """
    positions = []
    for char in word:
        positions.append(symbol_set.index(char))
    return positions

def symbol_index_to_word(indices):
    """Return the list of ``symbol_set`` entries selected by ``indices``.

    The result is a list of single symbols, not a joined string.
    """
    symbols = []
    for position in indices:
        symbols.append(symbol_set[position])
    return symbols

In [19]:
# Round-trip one word and its reading through the index mapping.
idx = 1234
indices_word = word_to_symbol_index(words[idx])
print(indices_word, symbol_index_to_word(indices_word))
indices_kana = word_to_symbol_index(kanas[idx])
print(indices_kana, symbol_index_to_word(indices_kana))

[64, 65, 73, 68, 66, 83] ['a', 'b', 'j', 'e', 'c', 't']
[96, 148, 118, 101, 109, 134] ['ア', 'ブ', 'ジ', 'ェ', 'ク', 'ト']


In [20]:
import numpy as np

# One integer array of symbol indices per source word.
dataX = [np.array(word_to_symbol_index(word)) for word in words]

In [21]:
# Inspect one encoded source word next to its decoded symbols.
idx = 2048
dataX[idx], symbol_index_to_word(dataX[idx])

(array([84, 77, 67, 68, 81, 70, 81, 78, 84, 77, 67]),
 ['u', 'n', 'd', 'e', 'r', 'g', 'r', 'o', 'u', 'n', 'd'])

In [22]:
# One integer array of symbol indices per reading.
dataY = [np.array(word_to_symbol_index(kana)) for kana in kanas]

In [23]:
# Same position as the source sample above, now for the encoded reading.
dataY[idx], symbol_index_to_word(dataY[idx])

(array([ 96, 177, 126, 179, 110, 167, 100, 177, 135]),
 ['ア', 'ン', 'ダ', 'ー', 'グ', 'ラ', 'ウ', 'ン', 'ド'])

In [24]:
# Print the sample pair as symbols first, then as raw index arrays.
print("SRC: ", symbol_index_to_word(dataX[idx]))
print("TRG: ", symbol_index_to_word(dataY[idx])) 
print("SRC: ", dataX[idx])
print("TRG: ", dataY[idx])

SRC:  ['u', 'n', 'd', 'e', 'r', 'g', 'r', 'o', 'u', 'n', 'd']
TRG:  ['ア', 'ン', 'ダ', 'ー', 'グ', 'ラ', 'ウ', 'ン', 'ド']
SRC:  [84 77 67 68 81 70 81 78 84 77 67]
TRG:  [ 96 177 126 179 110 167 100 177 135]


In [25]:
def shuffle_together(a, b, seed=None):
    """Apply one random permutation to two equally long arrays.

    Args:
        a: NumPy array (indexed with an integer index array).
        b: NumPy array with the same length as ``a``.
        seed: Optional seed. When given, the permutation comes from a private
            ``RandomState`` so the result is reproducible and the global NumPy
            random state is left untouched. When ``None``, the global NumPy
            random state is used.

    Returns:
        Tuple ``(a_shuffled, b_shuffled)`` in which elements that shared a
        position before the shuffle still share a position.
    """
    assert len(a) == len(b)
    if seed is None:
        p = np.random.permutation(len(a))
    else:
        p = np.random.RandomState(seed).permutation(len(a))
    return a[p], b[p]

def _as_object_array(seqs):
    """Pack variable-length sequences into a 1-D object array, one per slot."""
    packed = np.empty(len(seqs), dtype=object)
    for i, seq in enumerate(seqs):
        packed[i] = seq
    return packed

# The per-word arrays differ in length. np.array() on such a ragged list
# raises ValueError on recent NumPy releases, so build the object array
# explicitly.
dataX, dataY = _as_object_array(dataX), _as_object_array(dataY)

# Seed the global NumPy RNG so the train/validation split is reproducible.
np.random.seed(0)
dataX, dataY = shuffle_together(dataX, dataY)

N = int(len(dataX) * 0.9) # 90%

### First 4 indices are saved for special characters ###
# Adding 4 to an object array shifts every index inside every sequence.

trainX = dataX[:N] + 4
trainY = dataY[:N] + 4

valX = dataX[N:] + 4
valY = dataY[N:] + 4

In [26]:
PAD_SYMBOL = "<pad>" #0
UNK_SYMBOL = "<unk>" #1
BOS_SYMBOL = "<s>" #2
EOS_SYMBOL = "</s>" #3

VOCAB_SYMBOLS = [PAD_SYMBOL, UNK_SYMBOL, BOS_SYMBOL, EOS_SYMBOL]

# Regular symbols start at index 4; indices 0-3 go to the special symbols,
# which are added afterwards so they come last in the dict's insertion order.
vocab_dict = dict(zip(symbol_set, range(4, 4 + len(symbol_set))))
vocab_dict.update(zip(VOCAB_SYMBOLS, range(4)))

In [27]:
import json
# Source and target share one vocabulary, so the same mapping is written to
# both files. ensure_ascii=False emits the katakana characters as-is, so the
# files are written explicitly as UTF-8 instead of the locale default encoding.
for vocab_path in ('vocab.src.json', 'vocab.trg.json'):
    with open(vocab_path, 'w', encoding='utf-8') as fp:
        json.dump(vocab_dict, fp, indent=4, ensure_ascii=False)

In [36]:
import multiprocessing 
import logging
import sys
import os
import random

sys.path.append('./SageMaker_seq2seq_WordPronunciation')
from typing import List
from record_pb2 import Record ### record_pb2.py
from create_vocab_proto import write_worker, write_recordio, list_to_record_bytes, read_worker
import struct
import io

# Emit INFO-level messages; write_to_file reports its progress through `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [37]:
def write_to_file(np_dataX, np_dataY, file_type, output_file):
    """Encode (source, target) index sequences in worker processes and write them out.

    Pairs are handed round-robin to ``read_worker`` processes through one input
    queue per worker; all of them share a single output queue that is consumed
    by one ``write_worker`` process, which receives ``output_file``. Both worker
    functions are imported from ``create_vocab_proto`` and are not shown here.

    Args:
        np_dataX: Sequence of source sequences; each element must provide
            ``tolist()``.
        np_dataY: Target sequences, aligned index-by-index with ``np_dataX``.
        file_type: Label used only in the log output (e.g. 'train').
        output_file: Path passed to ``write_worker``.

    Worker shutdown runs in a ``finally`` block so the child processes are
    released and joined even if feeding the queues raises; the original
    exception then propagates to the caller.
    """
    num_read_workers = max(multiprocessing.cpu_count() - 1, 1)
    logger.info('Spawning %s encoding worker(s) for encoding %s datasets!', str(num_read_workers), file_type)

    q_in = [multiprocessing.Queue() for _ in range(num_read_workers)]
    q_out = multiprocessing.Queue()

    read_process = [multiprocessing.Process(target=read_worker, args=(q, q_out))
                    for q in q_in]
    for p in read_process:
        p.start()

    write_process = multiprocessing.Process(target=write_worker, args=(q_out, output_file))
    write_process.start()

    lines_ignored = 0 # No ignored lines in this example.
    lines_processed = 0

    try:
        for i, int_source in enumerate(np_dataX):
            int_source = int_source.tolist()
            int_target = np_dataY[i].tolist()
            item = (int_source, int_target) ### <class 'list'>, <class 'list'>

            if random.random() < 0.0001:
                ### Print some SRC-TRG pairs.
                print('===   ===   ===   ===   ===')
                print('SRC:', int_source)
                print(len(int_source), type(int_source), type(int_source[0])) # num <class 'list'> <class 'int'>
                print('---   ---   ---   ---   ---')
                print('TRG:', int_target)
                print(len(int_target), type(int_target), type(int_target[0])) # num <class 'list'> <class 'int'>

            # Round-robin distribution over the encoding workers.
            q_in[lines_processed % len(q_in)].put(item)

            lines_processed += 1

        logger.info("""Processed %s lines for encoding to protobuf. %s lines were ignored as they didn't have
                any content in either the source or the target file!""", lines_processed, lines_ignored)

        logger.info('Completed writing the encoding queue!')
    finally:
        # None is sent as the end-of-stream marker (presumably what the workers
        # in create_vocab_proto wait for -- confirm there). This must happen on
        # the error path too, otherwise the joins below would block forever.
        for q in q_in:
            q.put(None)
        for p in read_process:
            p.join()
        logger.info('Encoding finished! Writing records to "%s"', output_file)
        q_out.put(None)
        write_process.join()

    logger.info('Processed input and saved to "%s"', output_file)
    print('+++---+++---+++---+++---+++')

In [38]:
# Encode the training split; file_type only labels the log output.
file_type = 'train'
output_file = "train.rec"
write_to_file(trainX, trainY, file_type, output_file)

INFO:__main__:Spawning 1 encoding worker(s) for encoding train datasets!


===   ===   ===   ===   ===
SRC: [20, 27, 22, 80, 68]
5 <class 'list'> <class 'int'>
---   ---   ---   ---   ---
TRG: [148, 165, 113, 145, 131, 122, 167, 104, 119, 181, 161, 172, 100, 181, 156, 100]
16 <class 'list'> <class 'int'>
===   ===   ===   ===   ===
SRC: [55, 75, 72, 4, 43, 82, 78, 88, 87, 82, 4, 37, 68, 81, 78, 4, 47, 87, 71, 17]
20 <class 'list'> <class 'int'>
---   ---   ---   ---   ---
TRG: [157, 113, 138, 112, 181, 117, 104]
7 <class 'list'> <class 'int'>
===   ===   ===   ===   ===
SRC: [92, 88, 16, 81, 68, 4, 78, 76, 80]
9 <class 'list'> <class 'int'>
---   ---   ---   ---   ---
TRG: [170, 140, 111, 162]
4 <class 'list'> <class 'int'>
===   ===   ===   ===   ===
SRC: [54, 75, 76, 81, 78, 82, 4, 58, 76, 85, 72, 4, 38, 82, 80, 83, 68, 81, 92]
19 <class 'list'> <class 'int'>
---   ---   ---   ---   ---
TRG: [121, 181, 117, 104, 117, 104, 125, 181, 117, 104, 112, 169, 104]
13 <class 'list'> <class 'int'>
===   ===   ===   ===   ===
SRC: [87, 68, 79, 76, 86, 80, 68, 81]
8 <c

INFO:__main__:Processed 220894 lines for encoding to protobuf. 0 lines were ignored as they didn't have
                any content in either the source or the target file!
INFO:__main__:Completed writing the encoding queue!
INFO:__main__:Encoding finished! Writing records to "train.rec"
INFO:__main__:Processed input and saved to "train.rec"


+++---+++---+++---+++---+++


In [39]:
# Encode the validation split; file_type only labels the log output.
file_type = 'validation'
output_file = "val.rec"
write_to_file(valX, valY, file_type, output_file)

INFO:__main__:Spawning 1 encoding worker(s) for encoding validation datasets!
INFO:__main__:Processed 24544 lines for encoding to protobuf. 0 lines were ignored as they didn't have
                any content in either the source or the target file!
INFO:__main__:Completed writing the encoding queue!
INFO:__main__:Encoding finished! Writing records to "val.rec"
INFO:__main__:Processed input and saved to "val.rec"


+++---+++---+++---+++---+++


In [42]:
# S3 destination: bucket name and the key prefix shared by all uploaded files
# and by the training job's output path.
bucket = 'sagemaker-word2kana'
prefix = 'seq2seq/word2kana'  

import boto3

def upload_to_s3(bucket, prefix, channel, file):
    """Upload a local file to ``s3://<bucket>/<prefix>/<channel>/<file>``.

    Args:
        bucket: Name of the destination S3 bucket.
        prefix: Key prefix placed in front of the channel name.
        channel: Sub-folder of the prefix (e.g. 'train').
        file: Local path of the file; it is also used verbatim as the last
            part of the object key.
    """
    s3 = boto3.resource('s3')
    key = prefix + "/" + channel + '/' + file
    # Context manager so the file handle is closed once the upload returns
    # (or fails) instead of being left open.
    with open(file, "rb") as data:
        s3.Bucket(bucket).put_object(Key=key, Body=data)

# (channel, local file) pairs, uploaded in this order.
for _channel, _local_file in (
    ('train', 'train.rec'),
    ('validation', 'val.rec'),
    ('vocab', 'vocab.src.json'),
    ('vocab', 'vocab.trg.json'),
):
    upload_to_s3(bucket, prefix, _channel, _local_file)

In [43]:
# Region of the default boto3 session; used below to pick the training image.
region_name = boto3.Session().region_name

In [44]:
# Display the resolved region.
region_name

'us-east-2'

In [45]:
# Registry account that hosts the seq2seq image in each supported region.
_SEQ2SEQ_ECR_ACCOUNTS = {
    'us-west-2': '433757028032',
    'us-east-1': '811284229777',
    'us-east-2': '825641698319',
    'eu-west-1': '685385470294',
}
containers = {
    region: '{}.dkr.ecr.{}.amazonaws.com/seq2seq:latest'.format(account, region)
    for region, account in _SEQ2SEQ_ECR_ACCOUNTS.items()
}
# A region that is not listed above raises KeyError here.
container = containers[region_name]
print('Using SageMaker Seq2Seq container: {} ({})'.format(container, region_name))

Using SageMaker Seq2Seq container: 825641698319.dkr.ecr.us-east-2.amazonaws.com/seq2seq:latest (us-east-2)


In [50]:
from time import gmtime, strftime
from sagemaker import get_execution_role

# IAM role passed to the training job as RoleArn.
role = get_execution_role()
# Longest source word / reading, in characters; passed to the job below as
# the max_seq_len_source / max_seq_len_target hyperparameters.
source_sequence_length = max([len(w) for w in words])
target_sequence_length = max([len(k) for k in kanas])

# The timestamp has minute resolution, so the name repeats within one minute.
job_name = 'seq2seq-wrd-phn-p2-xlarge-' + strftime("%Y-%m-%d-%H-%M", gmtime())
print("Training job", job_name)

# Keyword arguments for sagemaker_client.create_training_job(): training image,
# role, output location, instance configuration, hyperparameters (all passed
# as strings) and three input channels -- train, vocab, validation -- each
# pointing at the S3 prefix that upload_to_s3 wrote to.
create_training_params = \
{
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },
    "RoleArn": role,
    "OutputDataConfig": {
        "S3OutputPath": "s3://{}/{}/".format(bucket, prefix)
    },
    "ResourceConfig": {
        # Seq2Seq does not support multiple machines. Currently, it only supports single machine, multiple GPUs
        "InstanceCount": 1,
        "InstanceType": "ml.p2.xlarge", # We suggest one of ["ml.p2.16xlarge", "ml.p2.8xlarge", "ml.p2.xlarge"]
        "VolumeSizeInGB": 50
    },
    "TrainingJobName": job_name,
    "HyperParameters": {
        # Please refer to the documentation for complete list of parameters
        "max_seq_len_source": str(source_sequence_length),
        "max_seq_len_target": str(target_sequence_length),
        "optimized_metric": "bleu", 
        "batch_size": "128", # Please use a larger batch size (256 or 512) if using ml.p2.8xlarge or ml.p2.16xlarge
        "checkpoint_frequency_num_batches": "1000",
        "rnn_num_hidden": "512",
        "num_layers_encoder": "1",
        "num_layers_decoder": "1",
        "num_embed_source": "512",
        "num_embed_target": "512",
        "checkpoint_threshold": "3",
        #"max_num_batches": "2100"
        # Training will stop after 2100 iterations/batches.
        # This is just for demo purposes. Remove the above parameter if you want a better model.
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 48 * 3600
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/train/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
        },
        {
            "ChannelName": "vocab",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/vocab/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/validation/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
        }
    ]
}

# Submit the training job, then read its status back once right away.
sagemaker_client = boto3.Session().client(service_name='sagemaker')
sagemaker_client.create_training_job(**create_training_params)

status = sagemaker_client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
print(status)

Training job seq2seq-wrd-phn-p2-xlarge-2018-12-23-08-00
InProgress


In [56]:
### Please keep on checking the status until this says "Completed". ###

# Query the job once and read both the status and the failure reason from the
# same response, instead of calling the API a second time.
job_description = sagemaker_client.describe_training_job(TrainingJobName=job_name)
status = job_description['TrainingJobStatus']
print(status)
# if the job failed, determine why
if status == 'Failed':
    # Fall back to a placeholder rather than raising KeyError if the response
    # carries no reason.
    message = job_description.get('FailureReason', '<no failure reason reported>')
    print('Training failed with the following error: {}'.format(message))
    raise Exception('Training job failed')

InProgress
