In [1]:
import os
import urllib

def download_data(url, force_download=True):
    """Download ``url`` into the current working directory and return the local file name.

    The local name is the last ``/``-separated component of ``url``.

    Args:
        url: Address of the file to fetch.
        force_download: When true (the default) the file is always fetched
            again; when false an already existing local copy is reused.

    Returns:
        The local file name (relative to the current working directory).

    Raises:
        ValueError: If ``url`` has no trailing file-name component.
    """
    # ``import urllib`` alone does not guarantee that the ``urllib.request``
    # submodule is loaded; import it explicitly so the call below cannot fail
    # with AttributeError on a fresh interpreter.
    import urllib.request

    fname = url.split("/")[-1]
    if not fname:
        raise ValueError("cannot derive a file name from URL: {!r}".format(url))
    if force_download or not os.path.exists(fname):
        urllib.request.urlretrieve(url, fname)
    return fname

# Fetch the UniDic archive, reusing a local copy when one already exists.
url_unidic = (
    "https://unidic.ninjal.ac.jp/unidic_archive/cwj/2.3.0/"
    "unidic-cwj-2.3.0.zip"
)
unidic = download_data(url_unidic, force_download=False)
print(unidic)

unidic-cwj-2.3.0.zip


In [2]:
# Fetch the mecab-ipadic-neologd master archive, reusing a local copy when present.
url_neologdic = (
    "https://github.com/neologd/mecab-ipadic-neologd/"
    "archive/master.zip"
)
neologdic = download_data(url_neologdic, force_download=False)
print(neologdic)

master.zip


In [3]:
import zipfile

# List the archive contents, then unpack only the lexicon CSV that is read later.
with zipfile.ZipFile(unidic) as unidic_zip:
    archive_members = unidic_zip.namelist()
    print(archive_members)
    unidic_zip.extract(member='unidic-cwj-2.3.0/lex.csv')

['unidic-cwj-2.3.0/AUTHORS', 'unidic-cwj-2.3.0/BSD', 'unidic-cwj-2.3.0/ChaMame for Windows/', 'unidic-cwj-2.3.0/ChaMame for Windows/ChaMame Install guide.pdf', 'unidic-cwj-2.3.0/ChaMame for Windows/ChaMameSetup.msi', 'unidic-cwj-2.3.0/char.bin', 'unidic-cwj-2.3.0/char.def', 'unidic-cwj-2.3.0/COPYING', 'unidic-cwj-2.3.0/dicrc', 'unidic-cwj-2.3.0/feature.def', 'unidic-cwj-2.3.0/GPL', 'unidic-cwj-2.3.0/left-id.def', 'unidic-cwj-2.3.0/lex.csv', 'unidic-cwj-2.3.0/LGPL', 'unidic-cwj-2.3.0/matrix.bin', 'unidic-cwj-2.3.0/matrix.def', 'unidic-cwj-2.3.0/model.bin', 'unidic-cwj-2.3.0/model.def', 'unidic-cwj-2.3.0/rewrite.def', 'unidic-cwj-2.3.0/right-id.def', 'unidic-cwj-2.3.0/sys.dic', 'unidic-cwj-2.3.0/unk.def', 'unidic-cwj-2.3.0/unk.dic']


In [4]:
import csv
import re

# Map each ASCII spelling to its katakana part, taken from column 11 of the
# UniDic lexicon when that field has the form "<katakana>-<ASCII text>".
unidict = {}
# Raw string: "\s" inside a normal string literal is an invalid escape sequence
# and triggers a DeprecationWarning/SyntaxWarning on newer Python versions.
pat = re.compile(r'^([ァ-ヶー]+)-([\s!-~]+)$')
# Explicit UTF-8 keeps the katakana match independent of the platform locale;
# newline='' is what the csv module expects so quoted fields are parsed correctly.
with open('unidic-cwj-2.3.0/lex.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        m = pat.match(row[11])
        if m:
            # group(2) is the ASCII spelling, group(1) the katakana before the hyphen.
            unidict[m.group(2)] = m.group(1)
print(len(unidict))

32103


In [5]:
# Find the archive member whose name contains 'mecab-user-dict-seed' and unpack it.
with zipfile.ZipFile(neologdic) as neolog_zip:
    seed_candidates = [
        member
        for member in neolog_zip.namelist()
        if 'mecab-user-dict-seed' in member
    ]
    # The first match is used; an archive without a match raises IndexError here.
    mecab_user_dict_seed = seed_candidates[0]
    neolog_zip.extract(mecab_user_dict_seed)

In [6]:
import lzma

# Map every seed entry whose first column consists only of whitespace and
# printable ASCII to the value in column 11 of the same row.
neodict = {}
# Raw string: "\s" inside a normal string literal is an invalid escape sequence
# and triggers a DeprecationWarning/SyntaxWarning on newer Python versions.
pat = re.compile(r'^[\s!-~]+$')
# Decode explicitly as UTF-8 so the result does not depend on the platform
# locale; newline='' is what the csv module expects from its input stream.
with lzma.open(mecab_user_dict_seed, mode='rt', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        if pat.match(row[0]):
            neodict[row[0]] = row[11]
print(len(neodict))

216588


In [7]:
# Spot-check: print the value stored in neodict for the key 'hey say jump'.
print(neodict['hey say jump'])

ヘイセイジャンプ


In [8]:
# Combine both dictionaries; on duplicate keys the neodict entry takes precedence.
mergedict = {**unidict, **neodict}
print(len(mergedict))

245438


In [9]:
# Parallel lists: words[i] is a key of mergedict and kanas[i] is its value.
words = list(mergedict.keys())
kanas = list(mergedict.values())

In [10]:
# Show one word together with the kana stored at the same position.
idx = 1234
sample_word, sample_kana = words[idx], kanas[idx]
print(sample_word, sample_kana)

Woobat ウーバット


In [11]:
# Collect every distinct character that occurs in any word or any kana string.
all_chars = set()
for entries in (words, kanas):
    for entry in entries:
        all_chars.update(entry)
print(all_chars)
print(len(all_chars))

{'[', 'Y', 'リ', 'ス', 'ク', 'ヤ', '?', 'z', 'シ', 'B', 'G', 'ゼ', 'パ', ')', '5', 'ダ', 'エ', 'モ', '_', 'ワ', 'H', '!', 'R', 'グ', 'o', 'ゾ', '2', '%', '$', 'ヮ', 'F', 'X', 'k', '㋘', 'ガ', '<', '.', 'ル', '{', 'b', 'x', 'ピ', '`', 'ミ', '}', 'ブ', 'プ', 'j', 'y', '|', 'ョ', 'P', 'ソ', 'レ', 'ゴ', '8', 'ウ', 'M', 'ン', 'm', 'フ', 'ネ', 'ト', 'v', ']', '^', 'c', 'ビ', 'ハ', 'f', '>', 'ペ', 'ヨ', 'ー', 'ド', 'O', '*', 'ヰ', 'ェ', 'q', 'I', 'ゲ', 'E', 'コ', 'イ', 'カ', 'w', 'ム', 'ヒ', '0', '(', '@', 'Z', 'テ', 'ベ', '9', 'D', 'ナ', 'メ', 'L', 'u', 'ユ', 'セ', 'ヱ', '-', 'ヂ', 'ジ', "'", '~', 'ッ', 'h', 'S', '/', 'チ', 'ヌ', ';', 'ヲ', 'i', 'ィ', ',', '=', 'ケ', 'T', 'U', 'A', 'ギ', 'オ', 'ヴ', 'n', 'V', 'ャ', '#', 'W', 'ゥ', 'C', 'ザ', 'ヅ', 'e', 'ロ', 'ュ', 's', 'ズ', 'ヘ', '6', '3', 'ノ', 't', 'ア', 'r', 'バ', 'ツ', 'K', 'サ', ':', '+', 'J', '4', '1', 'ニ', '\u3000', 'ラ', 'ァ', 'Q', 'N', 'ホ', 'タ', 'd', 'ボ', ' ', 'マ', 'ポ', 'ォ', 'キ', '&', 'g', 'デ', 'a', '7', 'l', 'p', '\\'}
181


In [12]:
# Sorted list of all characters; a character's position in it is its symbol index.
symbol_set = sorted(all_chars)

In [13]:
# Show the full sorted symbol table.
print(symbol_set)

[' ', '!', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\u3000', 'ァ', 'ア', 'ィ', 'イ', 'ゥ', 'ウ', 'ェ', 'エ', 'ォ', 'オ', 'カ', 'ガ', 'キ', 'ギ', 'ク', 'グ', 'ケ', 'ゲ', 'コ', 'ゴ', 'サ', 'ザ', 'シ', 'ジ', 'ス', 'ズ', 'セ', 'ゼ', 'ソ', 'ゾ', 'タ', 'ダ', 'チ', 'ヂ', 'ッ', 'ツ', 'ヅ', 'テ', 'デ', 'ト', 'ド', 'ナ', 'ニ', 'ヌ', 'ネ', 'ノ', 'ハ', 'バ', 'パ', 'ヒ', 'ビ', 'ピ', 'フ', 'ブ', 'プ', 'ヘ', 'ベ', 'ペ', 'ホ', 'ボ', 'ポ', 'マ', 'ミ', 'ム', 'メ', 'モ', 'ャ', 'ヤ', 'ュ', 'ユ', 'ョ', 'ヨ', 'ラ', 'リ', 'ル', 'レ', 'ロ', 'ヮ', 'ワ', 'ヰ', 'ヱ', 'ヲ', 'ン', 'ヴ', 'ー', '㋘']


In [14]:
def word_to_symbol_index(word, symbols=None):
    """Encode ``word`` as a list of positions in a symbol table.

    Args:
        word: Iterable of characters to encode.
        symbols: Optional sequence of symbols to look positions up in.
            Defaults to the module-level ``symbol_set``.

    Returns:
        A list holding, for each character of ``word``, its index in the table.

    Raises:
        ValueError: If a character is not present in the table.
    """
    if symbols is None:
        symbols = symbol_set
    return [symbols.index(char) for char in word]

def symbol_index_to_word(indices, symbols=None):
    """Decode symbol-table positions back into a list of characters.

    Args:
        indices: Iterable of integer positions.
        symbols: Optional sequence of symbols to index into.
            Defaults to the module-level ``symbol_set``.

    Returns:
        A list with one symbol per index (a list of characters, not a joined string).
    """
    if symbols is None:
        symbols = symbol_set
    return [symbols[idx] for idx in indices]

In [15]:
# Round-trip one word and its kana through the encoder and the decoder.
idx = 1234
indices_word = word_to_symbol_index(words[idx])
indices_kana = word_to_symbol_index(kanas[idx])
for encoded in (indices_word, indices_kana):
    print(encoded, symbol_index_to_word(encoded))

[54, 78, 78, 65, 64, 83] ['W', 'o', 'o', 'b', 'a', 't']
[100, 179, 142, 129, 134] ['ウ', 'ー', 'バ', 'ッ', 'ト']


In [16]:
import numpy as np

# One index array per word, in the same order as `words`.
dataX = [np.array(word_to_symbol_index(entry)) for entry in words]

In [17]:
# Spot-check one encoded entry of dataX: its index array next to the decoded characters.
idx = 2048
dataX[idx], symbol_index_to_word(dataX[idx])

(array([85, 68, 81, 77, 72, 66, 68]), ['v', 'e', 'r', 'n', 'i', 'c', 'e'])

In [18]:
# One index array per kana string, in the same order as `kanas`.
dataY = [np.array(word_to_symbol_index(entry)) for entry in kanas]

In [19]:
# Spot-check the dataY entry at the same position: index array and decoded characters.
dataY[idx], symbol_index_to_word(dataY[idx])

(array([178, 101, 169, 137, 179, 127, 101]),
 ['ヴ', 'ェ', 'ル', 'ニ', 'ー', 'チ', 'ェ'])

In [20]:
# Print one source/target pair, first decoded to characters and then as raw indices.
src_indices, trg_indices = dataX[idx], dataY[idx]
print("SRC: ", symbol_index_to_word(src_indices))
print("TRG: ", symbol_index_to_word(trg_indices))
print("SRC: ", src_indices)
print("TRG: ", trg_indices)

SRC:  ['v', 'e', 'r', 'n', 'i', 'c', 'e']
TRG:  ['ヴ', 'ェ', 'ル', 'ニ', 'ー', 'チ', 'ェ']
SRC:  [85 68 81 77 72 66 68]
TRG:  [178 101 169 137 179 127 101]


In [21]:
def shuffle_together(a, b, seed=None):
    """Shuffle two equally long arrays with one shared random permutation.

    Args:
        a: Array supporting integer-array indexing (e.g. ``numpy.ndarray``).
        b: Array of the same length as ``a``.
        seed: Optional seed. When given, the permutation comes from a dedicated
            ``numpy.random.RandomState(seed)`` and is reproducible; when omitted
            the global NumPy random state is used, as before.

    Returns:
        A tuple ``(a[p], b[p])`` where ``p`` is the permutation, so elements
        that were paired by position stay paired.

    Raises:
        AssertionError: If ``a`` and ``b`` differ in length.
    """
    assert len(a) == len(b), "a and b must have the same length"
    if seed is None:
        p = np.random.permutation(len(a))
    else:
        p = np.random.RandomState(seed).permutation(len(a))
    return a[p], b[p]

def _as_object_array(sequences):
    """Pack variable-length sequences into a 1-D object array, one sequence per slot."""
    # np.array() on sequences of unequal length is deprecated and raises
    # ValueError on recent NumPy releases, so fill an object array explicitly.
    packed = np.empty(len(sequences), dtype=object)
    for position, sequence in enumerate(sequences):
        packed[position] = sequence
    return packed

dataX, dataY = _as_object_array(dataX), _as_object_array(dataY)
dataX, dataY = shuffle_together(dataX, dataY)

N = int(len(dataX) * 0.9)  # first 90% for training, the rest for validation

# The first 4 indices are reserved for special symbols, so every symbol index
# is shifted up by that amount.
SPECIAL_SYMBOL_OFFSET = 4

trainX = dataX[:N] + SPECIAL_SYMBOL_OFFSET
trainY = dataY[:N] + SPECIAL_SYMBOL_OFFSET

valX = dataX[N:] + SPECIAL_SYMBOL_OFFSET
valY = dataY[N:] + SPECIAL_SYMBOL_OFFSET

In [22]:
# Special symbols occupy the first four vocabulary indices.
PAD_SYMBOL = "<pad>" #0
UNK_SYMBOL = "<unk>" #1
BOS_SYMBOL = "<s>" #2
EOS_SYMBOL = "</s>" #3

VOCAB_SYMBOLS = [PAD_SYMBOL, UNK_SYMBOL, BOS_SYMBOL, EOS_SYMBOL]

# Regular symbols follow, numbered from 4 in symbol_set order.
vocab_dict = dict(zip(symbol_set, range(4, 4 + len(symbol_set))))
# The special symbols are added afterwards with indices 0..3.
vocab_dict.update(
    {special: position for position, special in enumerate(VOCAB_SYMBOLS)}
)

In [23]:
import json
# Source and target share one vocabulary, so the same mapping is written twice.
# ensure_ascii=False emits the katakana characters verbatim, which needs an
# explicit UTF-8 encoding; the locale default may be unable to encode them.
for vocab_path in ('vocab.src.json', 'vocab.trg.json'):
    with open(vocab_path, 'w', encoding='utf-8') as fp:
        json.dump(vocab_dict, fp, indent=4, ensure_ascii=False)

In [24]:
import multiprocessing 
import logging
import sys
import os

# Add the local checkout directory to the module search path before the
# record_pb2 / create_vocab_proto imports that follow.
sys.path.append('./SageMaker_seq2seq_WordPronunciation')
from typing import List
from record_pb2 import Record ### record_pb2.py
from create_vocab_proto import write_worker, write_recordio, list_to_record_bytes, read_worker
import struct
import io


ImportError: No module named 'boto3'

In [25]:
# Inspect the module search path, e.g. to check that
# './SageMaker_seq2seq_WordPronunciation' was appended.
sys.path

['/usr/lib/python35.zip',
 '/usr/lib/python3.5',
 '/usr/lib/python3.5/plat-x86_64-linux-gnu',
 '/usr/lib/python3.5/lib-dynload',
 '',
 '/home/kei/.local/lib/python3.5/site-packages',
 '/usr/local/lib/python3.5/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/usr/local/lib/python3.5/dist-packages/IPython/extensions',
 '/home/kei/.ipython',
 './SageMaker_seq2seq_WordPronunciation']