# AutoML Natural Languageのデモデータを利用したベンチマーク(自作モデル側)

## 概要

GCPのAutoML Natural Languageの性能を確認するために、AutoMLのデモデータを利用して自作モデルとの精度を比較します。
以下は自作モデルのソースコードです。

## データのロード

In [1]:
import pandas as pd
import glob
import os
import numpy as np

# Location of the demo dataset, relative to the notebook's working directory.
data_filename = os.path.join('data', 'happiness.csv')

# header=None: the file is read without a header row, so the columns are
# addressed by position (0 and 1) in the cells that follow.
data_df = pd.read_csv(filepath_or_buffer=data_filename, header=None)

# Last expression of the cell, so the frame is rendered as the cell output.
data_df

Unnamed: 0,0,1
0,We had a serious talk with some friends of our...,bonding
1,I meditated last night.,leisure
2,My grandmother start to walk from the bed afte...,affection
3,I picked my daughter up from the airport and w...,bonding
4,when i received flowers from my best friend,bonding
5,I went shopping,leisure
6,The phone that I have ordered in a local onlin...,enjoy_the_moment
7,I bought a new TV,achievement
8,I slow cooked a chuck roast on my stove that c...,achievement
9,I lost 2 kgs of weight after a month long effo...,achievement


## ラベルのベクトル化

In [2]:
from keras import utils

# Column 0 of the frame is used as the input text, column 1 as the label string.
printable_features = data_df[0].values
printable_labels = data_df[1].values

# Map every distinct label string to an integer class id, numbered in the
# order in which pandas' unique() returns them.
unique_labels = {label: class_id for class_id, label in enumerate(data_df[1].unique())}
class_count = len(unique_labels)

# Translate each row's label string to its class id, then one-hot encode.
# The public `utils.to_categorical` is used instead of reaching into the
# `np_utils` submodule, which is not available in later Keras releases.
label_ids = [unique_labels[label] for label in printable_labels]
labels = utils.to_categorical(label_ids, num_classes=class_count)

print(labels.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


(12697, 7)


## Word2Vecのロード

学習済みモデルとして[word2vec-slim](https://github.com/eyaler/word2vec-slim)を利用します。

In [3]:
import numpy as np
import os
from gensim.models import KeyedVectors

# Pretrained word2vec-slim vectors stored in the binary word2vec format.
word2vec_model_path = os.path.join('models', 'GoogleNews-vectors-negative300-SLIM.bin')
word2vec = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)

# Vocabulary list, positioned by embedding row index. It is read straight from
# the KeyedVectors object: going through its `.wv` attribute is a deprecated
# self-alias and only adds a DeprecationWarning to the cell output.
word2vec_origin = word2vec.index2word

# Lookup tables in both directions between row index and word.
index2word = dict(enumerate(word2vec_origin))
word2index = {word: index for index, word in enumerate(word2vec_origin)}

  import sys


## 文書毎の単語の最大数計算

In [7]:
from keras.preprocessing.text import text_to_word_sequence

# Token count of the longest sentence, as tokenised by text_to_word_sequence.
# Stays at 0 when there are no sentences (or none yields a token).
max_len = max(
    (len(text_to_word_sequence(sentence)) for sentence in printable_features),
    default=0,
)

print(max_len)

762


## 特徴量のベクトル化

In [8]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing import sequence

# One row per sentence, one column per token position; rows shorter than
# max_len keep their trailing zeros as padding.
features = np.zeros((len(printable_features), max_len), dtype=np.int32)
for i, sentence in enumerate(printable_features):
    for t, word in enumerate(text_to_word_sequence(sentence)):
        # Words missing from the word2vec vocabulary fall back to index 0.
        # A plain dict lookup with a default replaces the former bare
        # `except:`, which would also have hidden unrelated errors
        # (including KeyboardInterrupt).
        # NOTE(review): index 0 is therefore shared by padding, unknown words
        # and whichever real word sits at row 0 of the embedding matrix.
        features[i, t] = word2index.get(word.lower(), 0)

print(features.shape)

(12697, 762)


## 学習データとテストデータの分離

In [9]:
from sklearn.model_selection import train_test_split

# Random 70 % / 30 % partition of the encoded sentences and their one-hot
# labels. No random_state is passed, so the partition is not fixed by this cell.
tmp_data = train_test_split(features, labels, train_size=0.7, test_size=0.3)
train_features, valid_features, train_labels, valid_labels = (
    np.asarray(part) for part in tmp_data
)

# Report the shape of each of the four arrays.
for split_part in (train_features, train_labels, valid_features, valid_labels):
    print(split_part.shape)

(8887, 762)
(8887, 7)
(3810, 762)
(3810, 7)


## モデル構造の作成

In [10]:
from keras.models import Sequential
from keras.layers.core import Activation
from keras.layers import Dense, Dropout, GRU
from keras.layers.wrappers import Bidirectional
from keras.optimizers import RMSprop
from keras.callbacks import LambdaCallback, EarlyStopping, ModelCheckpoint, TensorBoard
from keras import layers
from keras.layers.normalization import BatchNormalization
from keras import Input, Model
import datetime

# The network input has the shape of a single row of train_features
# (one word index per token position).
input_tensor = Input(train_features[0].shape)

# Frozen (train_embeddings=False) embedding layer built from the pretrained
# vectors. It is requested from the KeyedVectors object itself: its `.wv`
# attribute is a deprecated self-alias that only triggers a DeprecationWarning.
common_input = word2vec.get_keras_embedding(train_embeddings=False)(input_tensor)

# Bidirectional GRU encoder over the embedded sequence.
x1 = Bidirectional(GRU(256))(common_input)

# Softmax over the label classes.
output_tensor = Dense(class_count, activation='softmax')(x1)

model = Model(input_tensor, output_tensor)
# The 'mae' metric is kept as-is because the training cell's callbacks monitor
# 'val_mean_absolute_error'.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['mae'])
model.summary()

  del sys.path[0]


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 762)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 762, 300)          89870100  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 512)               855552    
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 3591      
Total params: 90,729,243
Trainable params: 859,143
Non-trainable params: 89,870,100
_________________________________________________________________


## 学習の実行

In [14]:
# The checkpoint file name carries today's date (YYYYMMDD).
datestr = datetime.datetime.now().strftime('%Y%m%d')
model_path = os.path.join('models', f'automl_test_model_{datestr}.h5')

# TensorBoard logging, early stopping and best-only checkpointing; the latter
# two both watch the validation mean absolute error.
training_callbacks = [
    TensorBoard(log_dir='tflog'),
    EarlyStopping(monitor='val_mean_absolute_error', patience=5),
    ModelCheckpoint(model_path, monitor='val_mean_absolute_error', save_best_only=True),
]

# validation_split holds out a tenth of the training arrays for validation.
# The call stays the last expression so the History object is shown as output.
model.fit(
    train_features,
    train_labels,
    batch_size=256,
    epochs=100,
    validation_split=0.1,
    callbacks=training_callbacks,
)

Train on 7998 samples, validate on 889 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100


<keras.callbacks.History at 0x23218a6cb00>

## テストデータによる検証

In [15]:
# Model output for every held-out sample, reduced to the index of the
# highest-scoring class.
valid_probabilities = model.predict(valid_features)
predicted_valid_labels = np.argmax(valid_probabilities, axis=1)
# Collapse the one-hot ground truth back to class indices for comparison.
numeric_valid_labels = np.argmax(valid_labels, axis=1)

## クラシフィケーションレポートの表示

In [16]:
from sklearn.metrics import classification_report, confusion_matrix

# Display names ordered explicitly by class id, instead of handing the
# label -> id dict to scikit-learn and relying on its key iteration order.
target_names = sorted(unique_labels, key=unique_labels.get)

# Passing `labels` pins row i of the report to class id i, so the names stay
# aligned even if some class does not occur in this validation split.
print(classification_report(
    numeric_valid_labels,
    predicted_valid_labels,
    labels=list(range(len(target_names))),
    target_names=target_names,
))

                  precision    recall  f1-score   support

         bonding       0.72      0.91      0.80       506
         leisure       0.74      0.56      0.64       298
       affection       0.92      0.90      0.91      1277
enjoy_the_moment       0.57      0.55      0.56       408
     achievement       0.84      0.85      0.85      1187
          nature       0.91      0.53      0.67        73
        exercise       0.73      0.84      0.78        61

     avg / total       0.82      0.81      0.81      3810

