### What is this?
In #7 we tried to use Seq2Seql but we couldn't figure it out.
Here we're going to use [working example](https://github.com/nicolas-ivanov/debug_seq2seq) step by step, so that we should be able to know what's going on.

### Data
We gonna use very small data set for faster iterations. Once we're done with whole process, we will increase data size.

### word2vec
Here we convert our vocabrary to vector, so that we can have smaller dimension and similarities between words.

In [1]:
## Read data file
import MeCab
EOS_SYMBOL = "$$$"
m = MeCab.Tagger()
m.parse('')

#f = open("/Users/higepon/Dropbox/Machine-Learning/dont_remove_data/webdb.txt")
f = open("/Users/higepon/Dropbox/Machine-Learning/dont_remove_data/shu.txt")

line = f.readline() 

lines = []
while line:
    res = m.parseToNode(line)

    words = []
    while res:
        words.append(res.surface)
        res = res.next
    words.append(EOS_SYMBOL)
    lines.append(words)
    line = f.readline()
print(lines[299])


['', '', '$$$']


In [2]:
from gensim.models import Word2Vec
from itertools import tee

window = 5              # window is the maximum distance between the current and predicted word within a sentence.
min_count = 1           # ignore all words with total frequency lower than this.
max_vocab_size = 20000  # limit RAM during vocabulary building; if there are more unique words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM. Set to None for no limit (default).
vec_size = 512         # size is the dimensionality of the feature vectors.
workers = 5             # use this many worker threads to train the model (=faster training with multicore machines

wv_model = Word2Vec(window=window,
                 min_count=min_count,
                 max_vocab_size=max_vocab_size,
                 size=vec_size,
                 workers=workers)
print("Word2Vec model", wv_model)

# Using Japanese data file instead
# Input for build_vocab is generator for [['excuse', 'me', '?', '$$$']], where $$$ is EOS and ### is Empty token
#lines = [
#    ["I", "am", "John", ".", "$$$"],
#    ["I", "don't", "want", "to", "spoil", "the", "party", ".", "EOS"]
#]
tokenized_lines, tokenized_lines_for_train, tokenized_lines_for_voc, tokenized_lines_for_nn_train = tee(lines, 4)
wv_model.build_vocab(tokenized_lines)
wv_model.train(tokenized_lines_for_train)

print(wv_model.most_similar(positive=['エンジニア'], negative=[], topn=3))


Word2Vec model Word2Vec(vocab=0, size=512, alpha=0.025)
[('弱体', 0.24591951072216034), ('進捗', 0.24193982779979706), ('当選', 0.2362770140171051)]


In [3]:
from collections import Counter

EMPTY_TOKEN = "###"
token_counter = Counter()
for line in tokenized_lines_for_voc:
    for token in line:
        token_counter.update([token])

token_voc = [token for token, _ in token_counter.most_common()[:max_vocab_size]]
token_voc.append(EMPTY_TOKEN)
index_to_token = dict(enumerate(token_voc))
print(index_to_token)



{0: '', 1: 'の', 2: '、', 3: '$$$', 4: '。', 5: 'は', 6: 'に', 7: 'た', 8: 'が', 9: 'を', 10: 'で', 11: 'て', 12: 'と', 13: 'し', 14: '「', 15: '」', 16: 'も', 17: '教育', 18: 'だ', 19: 'ない', 20: 'こと', 21: '年', 22: 'アーク', 23: 'ニュー', 24: 'い', 25: 'ある', 26: 'れ', 27: 'から', 28: 'いる', 29: '校', 30: 'な', 31: 'する', 32: 'か', 33: 'さ', 34: '・', 35: 'なっ', 36: 'という', 37: '人', 38: '学校', 39: 'たち', 40: 'う', 41: '—', 42: '先生', 43: 'ブッカー', 44: 'だっ', 45: 'ザッカーバーグ', 46: 'ば', 47: '公立', 48: 'です', 49: 'その', 50: '子ども', 51: 'アメリカ', 52: '寄付', 53: '(', 54: 'チャーター', 55: '生徒', 56: 'や', 57: '日本', 58: 'ん', 59: 'よう', 60: 'ドル', 61: 'スクール', 62: '1', 63: '住民', 64: '月', 65: '的', 66: 'なる', 67: 'なかっ', 68: 'しかし', 69: '改革', 70: 'この', 71: 'それ', 72: '2', 73: '貧困', 74: ')', 75: '億', 76: '州', 77: '学区', 78: ':', 79: '者', 80: 'まで', 81: '地域', 82: '知事', 83: '委員', 84: '日', 85: 'たら', 86: 'なく', 87: '話', 88: '子', 89: 'より', 90: '良い', 91: 'せ', 92: '市長', 93: 'ます', 94: '長', 95: '会', 96: 'アンダーソン', 97: 'れる', 98: 'クリス', 99: 'なら', 100: 'さん', 101: '金', 102: 'ため',

### The model

In [4]:
from seq2seq.models import SimpleSeq2Seq
from keras.models import Sequential

print("token_size", len(index_to_token))
INPUT_SEQUENCE_LENGTH = 64 #16
OUTPUT_SEQUENCE_LENGTH = 32 #6
#seq2seq = Sequential()
model = SimpleSeq2Seq(
    input_dim=vec_size,
    input_length=INPUT_SEQUENCE_LENGTH,
    hidden_dim=512,
    output_dim=len(index_to_token),
    output_length=OUTPUT_SEQUENCE_LENGTH,
    depth=3
)

#model.add(seq2seq)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

print("model compile done")

Using TensorFlow backend.


token_size 4254
model compile done


In [5]:
import numpy as np
# X_train.shape (32, 16, 256) = (len(sents_batch), INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE)
# Y_train.shape ?
num_batch = 1500
X = np.zeros((num_batch, INPUT_SEQUENCE_LENGTH, vec_size), dtype=np.float)
Y = np.zeros((num_batch, OUTPUT_SEQUENCE_LENGTH, len(index_to_token)), dtype=np.float)

token_to_index = dict(zip(index_to_token.values(), index_to_token.keys()))

prev_line = []
for i, line in enumerate(tokenized_lines_for_nn_train):
    if i == num_batch:
        break
    sentence = np.zeros(INPUT_SEQUENCE_LENGTH)
    osentence = np.zeros(OUTPUT_SEQUENCE_LENGTH)
    for j, word in enumerate(prev_line[:INPUT_SEQUENCE_LENGTH]):
        if word in wv_model.wv.vocab:
            X[i][j] = np.array(wv_model[word])
        else:
            print(word)
    # todo ずらす
    for j, word in enumerate(line[:OUTPUT_SEQUENCE_LENGTH]):
        if word in wv_model.wv.vocab:
            Y[i][j] = np.array(token_to_index[word])
        else:
            print(word)            
    i = i + 1
    prev_line = line
        
print(X)        
#        X[i][j] = word
#    X[i] = np.asarray(line[:INPUT_SEQUENCE_LENGTH])


model.fit(X, Y, batch_size=50, nb_epoch=5, show_accuracy=True, verbose=1, validation_split=0.05)

[[[  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
     0.00000000e+00   0.00000000e+00]
  [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
     0.00000000e+00   0.00000000e+00]
  [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
     0.00000000e+00   0.00000000e+00]
  ..., 
  [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
     0.00000000e+00   0.00000000e+00]
  [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
     0.00000000e+00   0.00000000e+00]
  [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
     0.00000000e+00   0.00000000e+00]]

 [[  2.15280000e-02   6.57147961e-03   3.63758579e-02 ...,  -4.82545607e-02
     5.59080616e-02  -4.05435544e-03]
  [ -4.19456977e-04  -8.01150047e-04  -5.95326419e-04 ...,  -4.63913195e-04
     6.22487685e-04  -1.88245918e-04]
  [  5.73351746e-04  -7.77725887e-04   2.22255418e-04 ...,  -4.84864286e-05
  

`model.compile(optimizer, loss, metrics=["accuracy"])`


Train on 1425 samples, validate on 75 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x193878780>

In [8]:
def predict(text):
    res = m.parseToNode(text)
    words = []
    while res:
        words.append(res.surface)
        res = res.next
    words.append(EOS_SYMBOL)
    X = np.zeros((1, INPUT_SEQUENCE_LENGTH, vec_size), dtype=np.float)
    for j, word in enumerate(words[:INPUT_SEQUENCE_LENGTH]):
        if word in wv_model.wv.vocab:
            X[0][j] = np.array(wv_model[word])
    t = model.predict(X)[0]
    indexes = np.argmax(t, axis=1)
    return ''.join([index_to_token[index] for index in indexes])
    
print(predict("ここで問題になったのは以下のことである。"))


    

買わ中原高める者弁弁測るPublic母親から教師呼ばわり
