In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import numpy.matlib

In [None]:
# Seed NumPy's global random generator so NumPy-based random draws in this
# notebook (such as the np.random.choice sampling of test sentences) are repeatable.
np.random.seed(151)

# 演習3

## データのロード

In [None]:
import json

# Decode the corpus as UTF-8 explicitly. Without an encoding argument, open()
# falls back to the platform's locale encoding, so any non-ASCII text in the
# file could be decoded differently (or fail) from one machine to another.
with open("data/Gourmet.json", encoding="utf-8") as fp:
    # json.load parses straight from the file object.
    data = json.load(fp)

In [None]:
# Split the loaded corpus into its training and test portions.
train, test = data["train"], data["test"]

In [None]:
# Peek at the first training example.
train[0]

In [None]:
# Number of training examples.
len(train)

## 辞書の作成

In [None]:
# Vocabulary built from the training set only:
#   wdic     maps word -> integer id
#   wdic_inv maps id -> word (a list indexed by id)
# Id 0 is reserved for the end-of-sentence marker "</s>".
wdic = {"</s>": 0}
wdic_inv = ["</s>"]

for sentence in train:
    for token in sentence:
        if token in wdic:
            continue
        # Ids are handed out in first-seen order, so the next free id is
        # always the current length of the inverse table.
        wdic[token] = len(wdic_inv)
        wdic_inv.append(token)

# Next unused id (equivalently, the number of entries assigned so far).
count = len(wdic_inv)

In [None]:
# Vocabulary size, including the "</s>" entry.
len(wdic)

## 単語IDに変換

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Id of the end-of-sentence marker; it is appended to every sentence and is
# also used as the padding value below.
eos_id = wdic["</s>"]

# Training sentences: every word is in the dictionary because the dictionary
# was built from this same data.
train_ids = [[wdic[w] for w in words] + [eos_id] for words in train]

# Test sentences: words that never occurred in training fall back to id 0,
# which is the id assigned to "</s>".
test_ids = [[wdic.get(w, 0) for w in words] + [eos_id] for words in test]

# Right-pad every training sentence to the length of the longest one.
train_ids = pad_sequences(train_ids, padding="post", value=eos_id)

# Pad/cut the test sentences to the training width. truncating="post" cuts
# over-long sentences at the END; the default ("pre") would drop their first
# words, so column j of the id matrix would no longer correspond to word j of
# the original sentence, which the later display/evaluation code relies on.
test_ids = pad_sequences(
    test_ids,
    padding="post",
    truncating="post",
    value=eos_id,
    maxlen=train_ids.shape[1],
)

In [None]:
# Shapes of the padded id matrices; the test matrix was padded with
# maxlen=train_ids.shape[1], so both should have the same number of columns.
train_ids.shape, test_ids.shape

## Kerasで読み込める形に変換

In [None]:
from keras.utils.np_utils import to_categorical

In [None]:
# Next-word prediction setup: the input is every position except the last,
# and the target is the same sequence shifted one step to the left, so the
# target at position j is the word at position j + 1.
n_train, train_len = train_ids.shape
train_x = train_ids[:, :-1]
# One-hot encode the targets, then restore the (sentence, position, class) layout.
train_y = to_categorical(train_ids[:, 1:]).reshape((n_train, train_len - 1, -1))

n_test, test_len = test_ids.shape
test_x = test_ids[:, :-1]
# nb_classes is pinned to the training one-hot width so that the test targets
# have the same number of classes as the training targets.
test_y = to_categorical(test_ids[:, 1:], nb_classes=train_y.shape[2]).reshape((n_test, test_len - 1, -1))

In [None]:
# Shapes of the inputs and one-hot targets for the training and test sets.
train_x.shape, train_y.shape, test_x.shape, test_y.shape

## モデルの構築

In [None]:
from keras.models import Sequential
from keras.layers import Dense, TimeDistributed, Activation, LSTM, Dropout, Embedding, Masking

In [None]:
# Word-level language model: embed each word id, run an LSTM over the
# sequence, and emit a softmax over the vocabulary at every time step.
vocab_size = len(wdic)

model = Sequential([
    # mask_zero=True asks the embedding to treat id 0 as a masked value;
    # id 0 is the "</s>" id that is also used for padding.
    Embedding(vocab_size + 1, 100, mask_zero=True),
    # return_sequences=True keeps one output per time step.
    LSTM(output_dim=100, return_sequences=True),
    Dropout(0.5),
    Activation("tanh"),
    # The same Dense softmax layer is applied independently at each time step.
    TimeDistributed(Dense(vocab_size, activation="softmax")),
])
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

## 学習

In [None]:
%%time
# Train for 3 epochs in mini-batches of 100 sentences, with 10% of the
# training samples set aside as validation data (validation_split=0.1).
model.fit(train_x, train_y, batch_size=100, nb_epoch=3, validation_split=0.1)

## 学習済みモデルをロードします

通常、このようなモデルは学習に数時間以上(試行錯誤も含めると数日以上)かかるため、モデルを学習しておきました。学習済みモデルをロードして挙動を見てみましょう。

In [None]:
import keras.models

In [None]:
# Load a previously saved model from disk. The prediction and evaluation
# cells that follow use this loaded model (`hiden`), not the `model` trained above.
hiden = keras.models.load_model("data/hiden_no_tare_enshu3.h5")

## 次の単語を予測してみましょう

In [None]:
# Predicted class index for the test inputs; the display loop below reads
# pred[i] as a sequence of word ids (one per input position -- TODO confirm shape).
pred = hiden.predict_classes(test_x)

In [None]:
# Shape of the predicted class-index array.
pred.shape

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Print the predictions for 10 randomly drawn test sentences.
# np.random.choice samples with replacement by default, so an index may repeat.
for i in np.random.choice((test_x.shape[0]), 10):
    print("*** {} ***".format(i))
    # pred[i] has one entry per input position, whereas test[i] is the raw
    # sentence and can be shorter (the input was padded), longer (the input
    # was cut to the training width) or even empty. pd.DataFrame raises if the
    # two rows differ in length, so show only the positions both of them have.
    n_shown = min(len(test[i]), len(pred[i]))
    words = [wdic_inv[wid] for wid in pred[i][:n_shown]]
    # Row "t": the sentence's words; row "y": the word predicted at each position.
    df = pd.DataFrame(dict(t=test[i][:n_shown], y=words)).T
    print(df.to_string())

## トップ10以内に正解が入る割合はどの程度あるでしょうか

In [None]:
def evaluate_one(ans, dist, n=10):
    """Tell whether the correct word is among the n highest-scoring ids.

    Args:
        ans: The correct word. Words missing from ``wdic`` are looked up as id 0.
        dist: Score vector indexed by word id.
        n: How many of the highest-scoring ids count as a hit.

    Returns:
        True if the id of ``ans`` is one of the n ids with the largest
        scores in ``dist``, otherwise False.
    """
    # argsort is ascending, so the last n entries are the n best-scoring ids.
    best_ids = np.argsort(dist)[-n:]
    return wdic.get(ans, 0) in best_ids

def evaluate_line(target, line_pred):
    """Score the next-word predictions for a single sentence.

    The prediction at position i is checked against ``target[i + 1]``;
    positions that have no following word in ``target`` are not counted.

    Args:
        target: The sentence as a list of words.
        line_pred: One score vector per input position.

    Returns:
        A tuple ``(total, match, rate)``: the number of positions checked,
        the number of hits, and ``match / total`` (defined as 1 when no
        position could be checked).
    """
    # Pair each position's scores with the word that follows it; zip stops
    # as soon as either the predictions or the following words run out.
    hits = [evaluate_one(answer, dist) for dist, answer in zip(line_pred, target[1:])]
    total = len(hits)
    match = sum(hits)
    if total > 0:
        rate = match / total
    else:
        rate = 1
    return total, match, rate

def evaluate(data_x, corpus):
    """Compute top-n hit percentages of the loaded model over a data set.

    Args:
        data_x: Input id matrix passed to ``hiden.predict``.
        corpus: The original sentences; ``corpus[i]`` belongs to row i of ``data_x``.

    Returns:
        A tuple of two percentages: the hit rate pooled over every checked
        position of every sentence, and the mean of the per-sentence hit rates.
    """
    pred_dists = hiden.predict(data_x)
    checked_positions = 0
    hit_positions = 0
    rate_sum = 0.0
    for idx in range(len(pred_dists)):
        total, match, rate = evaluate_line(corpus[idx], pred_dists[idx])
        checked_positions += total
        hit_positions += match
        rate_sum += rate
    pooled_pct = hit_positions / checked_positions * 100
    per_sentence_pct = rate_sum / pred_dists.shape[0] * 100
    return pooled_pct, per_sentence_pct

# Report both percentages returned by evaluate() on the training set ...
acc, ave_acc = evaluate(train_x, train)
print("Acc (Train): {} %".format(acc))
print("AveAcc (Train): {} %".format(ave_acc))
# ... and on the held-out test set.
acc, ave_acc = evaluate(test_x, test)
print("Acc (Test): {} %".format(acc))
print("AveAcc (Test): {} %".format(ave_acc))