### What is this?
In #7 we tried to use Seq2Seql but we couldn't figure it out.
Here we're going to use [working example](https://github.com/nicolas-ivanov/debug_seq2seq) step by step, so that we should be able to know what's going on.

### Data
We gonna use very small data set for faster iterations. Once we're done with whole process, we will increase data size.

### word2vec
Here we convert our vocabrary to vector, so that we can have smaller dimension and similarities between words.

In [1]:
## Read data file
import MeCab
EOS_SYMBOL = "$$$"
m = MeCab.Tagger()
m.parse('')

f = open("/Users/higepon/Dropbox/Machine-Learning/dont_remove_data/webdb.txt")
line = f.readline() 
lines = []
while line:
    res = m.parseToNode(line)

    words = []
    while res:
        words.append(res.surface)
        res = res.next
    words.append(EOS_SYMBOL)
    lines.append(words)
    line = f.readline()
print(lines[299])


['', '\u3000', '週', 'に', '一', '度', 'の', '週', '次', 'レビュー', 'で', '、', '上記', 'の', 'リスト', 'の', 'アイテム', 'を', 'おおよそ', '次', 'の', 'よう', 'に', '分類', 'する', '。', '', '$$$']


In [2]:
from gensim.models import Word2Vec
from itertools import tee

window = 5              # window is the maximum distance between the current and predicted word within a sentence.
min_count = 1           # ignore all words with total frequency lower than this.
max_vocab_size = 20000  # limit RAM during vocabulary building; if there are more unique words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM. Set to None for no limit (default).
vec_size = 256          # size is the dimensionality of the feature vectors.
workers = 5             # use this many worker threads to train the model (=faster training with multicore machines

wv_model = Word2Vec(window=window,
                 min_count=min_count,
                 max_vocab_size=max_vocab_size,
                 size=vec_size,
                 workers=workers)
print("Word2Vec model", wv_model)

# Using Japanese data file instead
# Input for build_vocab is generator for [['excuse', 'me', '?', '$$$']], where $$$ is EOS and ### is Empty token
#lines = [
#    ["I", "am", "John", ".", "$$$"],
#    ["I", "don't", "want", "to", "spoil", "the", "party", ".", "EOS"]
#]
tokenized_lines, tokenized_lines_for_train, tokenized_lines_for_voc, tokenized_lines_for_nn_train = tee(lines, 4)
wv_model.build_vocab(tokenized_lines)
wv_model.train(tokenized_lines_for_train)

print(wv_model.most_similar(positive=['エンジニア'], negative=[], topn=3))


Word2Vec model Word2Vec(vocab=0, size=256, alpha=0.025)
[('。', 0.9663853049278259), ('に', 0.9662169218063354), ('こと', 0.9649181962013245)]


In [3]:
from collections import Counter

EMPTY_TOKEN = "###"
token_counter = Counter()
for line in tokenized_lines_for_voc:
    for token in line:
        token_counter.update([token])

token_voc = [token for token, _ in token_counter.most_common()[:max_vocab_size]]
token_voc.append(EMPTY_TOKEN)
index_to_token = dict(enumerate(token_voc))
print(index_to_token)



{0: '', 1: '。', 2: 'の', 3: '$$$', 4: 'を', 5: 'に', 6: 'て', 7: 'が', 8: 'は', 9: '、', 10: 'で', 11: 'と', 12: 'し', 13: 'こと', 14: 'ない', 15: 'た', 16: 'する', 17: 'う', 18: 'も', 19: 'ある', 20: 'だ', 21: '\u3000', 22: 'いる', 23: 'か', 24: 'な', 25: '」', 26: '「', 27: '1', 28: 'ば', 29: 'だろ', 30: '-', 31: '的', 32: 'から', 33: 'よう', 34: '勉強', 35: 'さ', 36: 'ユーザー', 37: 'い', 38: 'なる', 39: 'プロダクト', 40: '#', 41: 'など', 42: 'その', 43: '時間', 44: '自分', 45: 'という', 46: '人', 47: 'チーム', 48: '筆者', 49: '/', 50: '何', 51: 'れ', 52: 'もの', 53: 'や', 54: 'よい', 55: 'マネージャ', 56: '）', 57: '問題', 58: 'エンジニア', 59: 'しよ', 60: 'たい', 61: '（', 62: 'ので', 63: '良い', 64: 'できる', 65: '常に', 66: 'この', 67: 'つ', 68: 'たり', 69: 'これ', 70: 'れる', 71: '思う', 72: '必要', 73: 'かも', 74: '場合', 75: '>', 76: 'しれ', 77: 'なく', 78: 'テスト', 79: ':', 80: '性', 81: '機能', 82: '次', 83: 'それ', 84: 'なっ', 85: '.', 86: 'しまう', 87: 'プロダクティビティ', 88: 'たら', 89: 'ため', 90: 'A', 91: 'そこ', 92: 'とき', 93: '2', 94: '？', 95: 'として', 96: '可能', 97: '技術', 98: 'まで', 99: 'B', 100: '<', 101: 'カバン', 102

### The model

In [4]:
from seq2seq.models import SimpleSeq2Seq
from keras.models import Sequential

print("token_size", len(index_to_token))
INPUT_SEQUENCE_LENGTH = 16
OUTPUT_SEQUENCE_LENGTH = 6
#seq2seq = Sequential()
model = SimpleSeq2Seq(
    input_dim=vec_size,
    input_length=INPUT_SEQUENCE_LENGTH,
    hidden_dim=512,
    output_dim=len(index_to_token),
    output_length=OUTPUT_SEQUENCE_LENGTH,
    depth=1
)

#model.add(seq2seq)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

print("model compile done")

Using TensorFlow backend.


token_size 2551
model compile done


In [5]:
import numpy as np
# X_train.shape (32, 16, 256) = (len(sents_batch), INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE)
# Y_train.shape ?
num_batch = 700
X = np.zeros((num_batch, INPUT_SEQUENCE_LENGTH, vec_size), dtype=np.float)
Y = np.zeros((num_batch, OUTPUT_SEQUENCE_LENGTH, len(index_to_token)), dtype=np.float)

token_to_index = dict(zip(index_to_token.values(), index_to_token.keys()))

prev_line = []
for i, line in enumerate(tokenized_lines_for_nn_train):
    if i == num_batch:
        break
    sentence = np.zeros(INPUT_SEQUENCE_LENGTH)
    osentence = np.zeros(OUTPUT_SEQUENCE_LENGTH)
    for j, word in enumerate(prev_line[:INPUT_SEQUENCE_LENGTH]):
        if word in wv_model.wv.vocab:
            X[i][j] = np.array(wv_model[word])
        else:
            print(word)
    # todo ずらす
    for j, word in enumerate(line[:OUTPUT_SEQUENCE_LENGTH]):
        if word in wv_model.wv.vocab:
            Y[i][j] = np.array(token_to_index[word])
        else:
            print(word)            
    i = i + 1
    prev_line = line
        
print(X)        
#        X[i][j] = word
#    X[i] = np.asarray(line[:INPUT_SEQUENCE_LENGTH])


model.fit(X, Y, batch_size=, nb_epoch=5, show_accuracy=True, verbose=1)

SyntaxError: invalid syntax (<ipython-input-5-3470baf103de>, line 35)

In [None]:
def predict(text):
    res = m.parseToNode(text)
    words = []
    while res:
        words.append(res.surface)
        res = res.next
    words.append(EOS_SYMBOL)
    X = np.zeros((1, INPUT_SEQUENCE_LENGTH, vec_size), dtype=np.float)
    for j, word in enumerate(words[:INPUT_SEQUENCE_LENGTH]):
        if word in wv_model.wv.vocab:
            X[0][j] = np.array(wv_model[word])
    t = model.predict(X)[0]
    indexes = np.argmax(t, axis=1)
    return ''.join([index_to_token[index] for index in indexes])
    
print(predict("べんきょうがんばれ"))


    