## モジュールの準備

In [None]:
# spacy/ginza
!pip install ginza
!pip install ja-ginza
# ginzaのためのパッケージリソース一覧をリロード
# https://www.mojirca.com/2019/10/colab-load-ginza.html
import pkg_resources, imp
imp.reload(pkg_resources)



## 単語ベクトルの確認

In [None]:
import spacy
import pandas as pd

# Allow up to 100 characters per cell when rendering DataFrame columns.
pd.set_option('max_colwidth', 100)

nlp = spacy.load('ja_ginza')
sentence = '私は焼肉をよく食べる'
doc = nlp(sentence)

# One row per token: the token object itself next to its word vector.
tokens = list(doc)
vectors = [token.vector for token in doc]

pd.DataFrame({'token': tokens, 'vector': vectors})

In [None]:
# Shape of the first token's word vector.
vectors[0].shape

## データの準備

In [None]:
!mkdir data
!wget http://nlp.ist.i.kyoto-u.ac.jp/kuntt/KNBC_v1.0_090925_utf8.tar.bz2 -O data/KNBC_v1.0_090925_utf8.tar.bz2

In [None]:
%cd data
!tar xvf KNBC_v1.0_090925_utf8.tar.bz2
%cd ..

In [None]:
import re
import pandas as pd
import numpy as np

def get_sentences_from_text(filename):
  """Read the sentence column from a tab-separated corpus file.

  Every line is split on tabs and its second field is taken as the sentence.
  Lines that have no second field, sentences that are empty after stripping,
  and sentences that start with ``http`` (URLs) are skipped.

  Args:
    filename: Path of the TSV file. The file is decoded as UTF-8.

  Returns:
    list[str]: The remaining sentences, stripped, in file order.
  """
  sentences = []
  # Decode explicitly as UTF-8 instead of relying on the platform's default
  # encoding, which is not UTF-8 everywhere.
  with open(filename, 'r', encoding='utf-8') as f:
    for line in f:
      columns = line.split('\t')
      if len(columns) < 2:  # Blank or malformed line: no sentence column.
        continue
      sentence = columns[1].strip()
      if not sentence:  # Drop empty sentences.
        continue
      if sentence.startswith('http'):  # Drop URLs.
        continue
      sentences.append(sentence)
  return sentences

In [None]:
import os

root_dir = 'data/KNBC_v1.0_090925_utf8/corpus2'
targets = ['Gourmet', 'Keitai', 'Kyoto', 'Sports']

# One [target, sentence] row per sentence, read category by category from
# <root_dir>/<target>.tsv.
original_data = [
  [target, sentence]
  for target in targets
  for sentence in get_sentences_from_text(os.path.join(root_dir, f'{target}.tsv'))
]

original_df = pd.DataFrame(original_data, columns=['target', 'sentence'])

In [None]:
# Show both ends of the data set, then the number of sentences per target.
target_counts = original_df['target'].value_counts().to_frame()
display(original_df.head(), original_df.tail(), target_counts)

## LSTMによる分類タスク

### トレーニング用データに変換

In [None]:
import spacy
import numpy as np

nlp = spacy.load('ja_ginza')
# One-hot lookup table: column `target` holds the one-hot vector for that target.
# The dtype is pinned because pandas >= 2.0 changed the get_dummies default
# from uint8 to bool, which would silently turn the labels into booleans.
target2index = pd.get_dummies(targets, dtype='uint8')

def get_features_and_labels_for_spacy(original_df):
  """Convert sentences into token-vector sequences and one-hot labels.

  Uses the notebook-level ``nlp`` pipeline to tokenize each sentence and the
  notebook-level ``target2index`` table to look up each label.

  Args:
    original_df: DataFrame with 'sentence' and 'target' columns.

  Returns:
    tuple: ``(features, labels, max_feature_len)`` where
      features holds one sequence of token vectors per row: a regular array
        when every sentence has the same token count, otherwise a 1-D object
        array whose items are lists of vectors;
      labels is an array built from ``target2index[target].values`` per row;
      max_feature_len is the largest token count of any sentence (0 if the
        frame is empty).
  """
  features = []
  labels = []
  for sentence, target in zip(original_df['sentence'], original_df['target']):
    features.append([token.vector for token in nlp(sentence)])
    labels.append(target2index[target].values)

  max_feature_len = max((len(feature) for feature in features), default=0)

  if len({len(feature) for feature in features}) <= 1:
    # All sequences have the same length, so a regular array can be built.
    feature_array = np.asarray(features)
  else:
    # Sentences differ in length. NumPy >= 1.24 raises on ragged input to
    # np.asarray, so fill an object array one sequence at a time.
    feature_array = np.empty(len(features), dtype=object)
    for row, feature in enumerate(features):
      feature_array[row] = feature

  return feature_array, np.asarray(labels), max_feature_len

In [None]:
from sklearn.model_selection import train_test_split

# Vectorize every sentence in the corpus and print the longest sentence length (in tokens).
features, labels, max_feature_len = get_features_and_labels_for_spacy(original_df)
print(max_feature_len)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad every sentence to max_feature_len. The dtype must be given explicitly:
# pad_sequences defaults to int32, which would truncate the float word vectors.
features = pad_sequences(features, maxlen=max_feature_len, dtype='float32')
# A fixed random_state keeps the train/test split, and therefore the reported
# metrics, reproducible across runs.
(train_features, test_features, train_labels, test_labels) = train_test_split(
    features, labels, test_size=0.2, random_state=42)

print(train_features.shape)
print(train_labels.shape)
print(test_features.shape)
print(test_labels.shape)

### モデル構築

In [None]:
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model

# Input is one padded sentence: max_feature_len timesteps, each as wide as the
# last axis of the training features.
sequence_shape = (max_feature_len, train_features.shape[2])
inputs = Input(shape=sequence_shape)
encoded = LSTM(64)(inputs)
# One softmax unit per target category.
outputs = Dense(len(targets), activation='softmax')(encoded)

model = Model(inputs=inputs, outputs=outputs)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy', 'mse'],
)
model.summary()

### トレーニング実行

In [None]:
# Train for 3 epochs, holding out 10% of the training data for validation.
model.fit(train_features, train_labels, validation_split=0.1, verbose=1, epochs=3)

### クラシフィケーションレポート

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Reduce both the softmax outputs and the one-hot labels to class indices.
test_probabilities = model.predict(test_features)
predicted_test_labels = test_probabilities.argmax(axis=1)
numeric_test_labels = test_labels.argmax(axis=1)

report = classification_report(
    numeric_test_labels,
    predicted_test_labels,
    target_names=targets,
    output_dict=True,
)

# Transposed so each class (and each summary row) becomes a table row.
report_df = pd.DataFrame(report).transpose()
display(report_df)

### 手動で評価

In [None]:
# The target here is only a placeholder, but it must be one of `targets`
# because the feature helper looks its label up.
check_data = ('Kyoto', '金閣寺が見たいです。')

check_df = pd.DataFrame([check_data], columns=['target', 'sentence'])
check_features = get_features_and_labels_for_spacy(check_df)[0]
check_features = pad_sequences(check_features, maxlen=max_feature_len, dtype='float32')
check_predict = model.predict(check_features)[0]

# Report the most probable category together with its probability in percent.
best_index = check_predict.argmax()
confidence = round(check_predict[best_index] * 100, 1)
print(f'{targets[best_index]}: {confidence}%')

## LSTMによる自然言語生成

### トレーニング用データに変換

In [None]:
input_len = 8  # Number of consecutive tokens used as context for one prediction.

def get_features_and_labels_for_nlg(original_df):
    """Build sliding-window pairs for next-token prediction.

    Each sentence is tokenized with the notebook-level ``nlp`` pipeline. Every
    run of ``input_len`` consecutive tokens that is followed by at least one
    more token yields one sample: the vectors of the window as the feature and
    the text of the following token as the label.

    Args:
        original_df: DataFrame with a 'sentence' column.

    Returns:
        tuple: ``(features, labels)`` where features is an array of the
        stacked windows and labels is a list of the next-token strings.
    """
    window_vectors = []
    next_tokens = []
    for text in original_df['sentence']:
        doc = nlp(text)
        # The range is empty when the sentence has input_len tokens or fewer,
        # so such sentences contribute no samples.
        for start in range(len(doc) - input_len):
            stop = start + input_len
            window_vectors.append([token.vector for token in doc[start:stop]])
            next_tokens.append(doc[stop].text)

    return np.array(window_vectors), next_tokens

In [None]:
# Build (token-window, next-token) training pairs from every sentence in the corpus.
# Note: this rebinds `features` and `labels`, which held the classification data until now.
features, labels = get_features_and_labels_for_nlg(original_df)

In [None]:
# Sort the vocabulary so every token gets the same index on every run;
# the iteration order of a bare set of strings differs between Python processes.
token2index = {label: i for i, label in enumerate(sorted(set(labels)))}
# Inverse mapping, used to turn a predicted index back into its token text.
index2token = {i: token for token, i in token2index.items()}

In [None]:
from tensorflow.keras.utils import to_categorical

# Map each next-token label to its vocabulary index, then one-hot encode the indices.
onehot_labels = to_categorical([token2index[label] for label in labels])

### モデル構築

In [None]:
from tensorflow.keras.layers import Input, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.models import Model

# Per-sample input shape is the training tensor's shape without the sample axis.
inputs = Input(shape=features.shape[1:])
encoded = Bidirectional(LSTM(256))(inputs)
encoded = Dropout(0.1)(encoded)
# One softmax unit per column of the one-hot label matrix.
outputs = Dense(onehot_labels.shape[1], activation='softmax')(encoded)

model = Model(inputs=inputs, outputs=outputs)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy', 'mse'],
)
model.summary()

### トレーニング実行

In [None]:
# Train for 10 epochs, holding out 10% of the samples for validation.
model.fit(features, onehot_labels, validation_split=0.1, verbose=1, epochs=10)

### 実験

In [None]:
def sample_with_temp(preds, temperature=1.0):
    """Sample an index from a probability vector after temperature scaling.

    The probabilities are converted to log space, divided by ``temperature``,
    renormalized with a softmax, and one index is drawn from the result.
    Temperatures below 1 sharpen the distribution; above 1 they flatten it.

    Args:
        preds: 1-D sequence of non-negative probabilities.
        temperature: Positive scaling factor.

    Returns:
        The sampled index (as returned by ``np.argmax``).

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    preds = np.asarray(preds).astype('float64')
    # log(0) = -inf is the intended value for zero-probability entries, so the
    # divide-by-zero warning is suppressed here only.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum before exponentiating: at low temperatures every
    # exp() would otherwise underflow to 0 and the normalization would be 0/0.
    exp_preds = np.exp(logits - np.max(logits))
    preds = exp_preds / np.sum(exp_preds)
    probs = np.random.multinomial(1, preds, 1)

    return np.argmax(probs)

In [None]:
seq = 20  # Upper bound on the number of generated tokens.
test_sentence = 'この季節には京都に行って'
generated_parts = [test_sentence]

for _ in range(seq):
    # Re-tokenize the text so far and keep only the last input_len tokens as context.
    test_doc = nlp(test_sentence)[-input_len:]
    test_features = np.array([[token.vector for token in test_doc]])
    preds = model.predict(test_features)
    max_index = sample_with_temp(preds[0], 0.3)
    next_token = index2token[max_index]
    test_sentence += next_token
    generated_parts.append(next_token)
    # Stop as soon as the model emits a sentence-ending full stop.
    if next_token == '。':
        break

# Show the seed text and every generated token separated by '|'.
display_sentence = '|'.join(generated_parts)
print(display_sentence)