In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import numpy.matlib

In [3]:
np.random.seed(151)

# 演習3

## データのロード

In [4]:
import json
with open("data/Gourmet.json") as fp:
    data = json.loads(fp.read())

In [5]:
train = data["train"]
test = data["test"]

In [6]:
train[0]

['さらに', '肉', 'が', '厚い', '。']

In [7]:
len(train)

710

## 辞書の作成

In [8]:
wdic = {}
wdic_inv = ["</s>"]
wdic["</s>"] = 0

count = 1
for words in train:
    for w in words:
        if w not in wdic:
            wdic[w] = count
            wdic_inv.append(w)
            count += 1

In [9]:
len(wdic)

2389

## 単語IDに変換

In [10]:
from keras.preprocessing.sequence import pad_sequences

Using Theano backend.
Using gpu device 0: GRID K2 (CNMeM is enabled with initial size: 95.0% of memory, cuDNN 5110)


In [11]:
train_ids = []
for words in train:
    ids = []
    for w in words:
        ids.append(wdic[w])
    ids.append(wdic["</s>"])
    train_ids.append(ids)

test_ids = []
for words in test:
    ids = []
    for w in words:
        ids.append(wdic.get(w, 0))
    ids.append(wdic["</s>"])
    test_ids.append(ids)
    
train_ids = pad_sequences(train_ids, padding="post", value=wdic["</s>"])
test_ids = pad_sequences(test_ids, padding="post", value=wdic["</s>"], maxlen=train_ids.shape[1])

In [12]:
train_ids.shape, test_ids.shape

((710, 75), (178, 75))

## Kerasで読み込める形に変換

In [13]:
from keras.utils.np_utils import to_categorical

In [14]:
train_x = train_ids[:, :-1]
train_y = to_categorical(train_ids[:, 1:]).reshape((train_ids.shape[0], train_ids.shape[1] - 1, -1))
test_x = test_ids[:, :-1]
test_y = to_categorical(test_ids[:, 1:], nb_classes=train_y.shape[2]).reshape((test_ids.shape[0], test_ids.shape[1] - 1, -1))

In [15]:
train_x.shape, train_y.shape, test_x.shape, test_y.shape

((710, 74), (710, 74, 2389), (178, 74), (178, 74, 2389))

## モデルの構築

In [16]:
from keras.models import Sequential
from keras.layers import Dense, TimeDistributed, Activation, LSTM, Dropout, Embedding, Masking

In [17]:
model = Sequential()
model.add(Embedding(len(wdic)+1, 100, mask_zero=True))
model.add(LSTM(output_dim=100, return_sequences=True))
model.add(Dropout(0.5))
model.add(Activation("tanh"))
model.add(TimeDistributed(Dense(len(wdic), activation="softmax")))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

## 学習

In [18]:
%%time
model.fit(train_x, train_y, batch_size=100, nb_epoch=10, validation_split=0.1)

Train on 639 samples, validate on 71 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 36.7 s, sys: 13.5 s, total: 50.2 s
Wall time: 50.1 s


<keras.callbacks.History at 0x7fb15284f7b8>

## 学習済みデータをロードします

通常、このようなモデルは学習に数時間以上(試行錯誤も含めると数日以上)かかるため、モデルを学習しておきました。学習済みモデルをロードして挙動を見てみましょう。

In [19]:
import keras.models

In [20]:
hiden = keras.models.load_model("data/hiden_no_tare_enshu3.h5")

## 次の単語を予測してみましょう

In [21]:
pred = hiden.predict_classes(test_x)



In [22]:
pred.shape

(178, 74)

In [23]:
import matplotlib.pyplot as plt

In [24]:
for i in np.random.choice((test_x.shape[0]), 10):
    print("*** {} ***".format(i))
    words = []
    for wid in pred[i]:
        words.append(wdic_inv[wid])
        if len(words) >= len(test[i]):
            break
    df = pd.DataFrame(dict(t=test[i], y=words)).T
    print(df.to_string())

*** 65 ***
   0   1   2  3  4     5  6   7  8    9     10 11  12  13 14  15  16    17   18 19 20  21    22
t  つゆ   を  つけ  ず  、  そのまま  で  食べ  て    も  おいしい  し   、  つゆ  や  その  薬味  とともに  食べる  の  も  いい     。
y  食べ  種類   て  て  「     「  も   に  も  やっと     ー  た  いい  いい  も  ない  ない    ない   ない  。  。   。  </s>
*** 57 ***
  0    1    2   3   4   5   6  7  8  9    10 11 12  13  14    15
t  心    の   広い  友人  たち   を  持っ  て  い  て  よかっ  た  と  思っ   た     。
y  の  食生活  食生活   と   を  食べ   た  き  た  、    た  ば  ば   て  ので  </s>
*** 67 ***
    0   1     2     3   4  5  6  7
t  泣く   子     も    黙る   ３  斤  だ  ！
y  食べ  食べ  たこ焼き  たこ焼き  種類  ６  ！  ！
*** 52 ***
   0   1  2    3  4    5   6  7     8
t  私  たち  は  感づい  て  しまい  まし  た     。
y  は   が  、    、  も    、   た  。  </s>
*** 61 ***
      0    1  2    3   4  5     6
t    まず  小麦粉  を  こぼし  まし  た     。
y  炭水化物    が  生    生   た  。  </s>
*** 48 ***
   0   1  2  3    4    5  6   7  8   9       10 11 12  13 14  15 16   17 18    19    20
t  これ  画像  で  は  分かり  にくい  ん  です  が  実は  チ

## トップ10以内に正解が入る割合はどの程度あるでしょうか

In [27]:
def evaluate_one(ans, dist, n=10):
    top_n_wid = np.argsort(dist)[-n:]
    ans_wid = wdic.get(ans, 0)
    if ans_wid in top_n_wid:
        return True
    else:
        return False

def evaluate_line(target, line_pred):
    total = 0
    match = 0
    for i, dist in enumerate(line_pred):
        if i + 1 < len(target):
            total += 1
            if evaluate_one(target[i + 1], dist):
                match += 1
    rate = match / total if total > 0 else 1
    return total, match, rate

def evaluate(data_x, corpus):
    pred_dists = hiden.predict(data_x)
    t = 0
    m = 0
    r = 0.0
    for i, line_pred in enumerate(pred_dists):
        total, match, rate = evaluate_line(corpus[i], line_pred)
        t += total
        m += match
        r += rate
    return m / t * 100, r / pred_dists.shape[0] * 100

acc, ave_acc = evaluate(train_x, train)
print("Acc (Train): {} %".format(acc))
print("AveAcc (Train): {} %".format(ave_acc))
acc, ave_acc = evaluate(test_x, test)
print("Acc (Test): {} %".format(acc))
print("AveAcc (Test): {} %".format(ave_acc))

Acc (Train): 83.57603686635944 %
AveAcc (Train): 84.69880217063512 %
Acc (Test): 32.14831804281346 %
AveAcc (Test): 34.3718088856541 %
