In [1]:
import re
import MeCab
import pickle
import numpy as np
import data_helpers as dh
import pandas as pd
import time
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.manifold import MDS,TSNE
from collections import Counter
from collections import OrderedDict
from gensim.models import word2vec
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

In [2]:
# Load the pickled DataFrame (presumably the MeCab-tokenised posts, going by
# the file name) and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
mecab_pickle_path = "../data/all_mecab.pickle"
All_df = pd.read_pickle(mecab_pickle_path)
All_df.head(3)

# senとsen_preの単語をIDに変換し、新たな列としてAll_dfに追加する

Unnamed: 0,sen,sen_pre,say_id,reply_id,group_id,name,body,16types_a,16types_b,argument_a,argument_b,epistemic_a,epistemic_b,social_a,social_b,coordination_a,coordination_b
0,"[よろしく, お願い, し, ます, ！, EOS]",[EOS],1,-1,1234568,まこぴす,よろしくお願いします！,5,5,1,1,1,1,0,0,0,0
1,"[よろしく, お願い, し, ます, EOS]","[よろしく, お願い, し, ます, ！, EOS]",31,-1,1234568,哲,よろしくお願いします,5,5,1,1,1,1,0,0,0,0
2,"[名前, な, の, が, 恥ずかしい, です, EOS, よろしく, お願い, し, ます...","[よろしく, お願い, し, ます, EOS]",70,-1,1234568,仙波,名前なのが恥ずかしいです…\nよろしくお願いします！,5,5,1,1,1,1,0,0,0,0


# senとsen_preの単語をIDに変換し、新たな列としてAll_dfに追加する

In [3]:
# Build the vocabulary from the `sen` / `sen_pre` token lists, convert every
# token to a word ID, and store the ID sequences as new columns of All_df.
sen = All_df['sen'].values
sen_pre = All_df['sen_pre'].values

# Build the word dictionary.
# `sen + sen_pre` adds the two arrays element-wise, which (assuming every cell
# holds a Python list of tokens -- TODO confirm) concatenates the two token
# lists of each row before counting.
wd_set = Counter([x for s in (sen + sen_pre) for x in s])
wd_ary = np.array(list(wd_set.keys()))
wd_cnt = np.array(list(wd_set.values()))

# Sort by descending frequency. One shared permutation is applied to both
# arrays so words and counts stay aligned by construction.
freq_order = np.argsort(wd_cnt)[::-1]
wd_ary = wd_ary[freq_order]
wd_cnt = wd_cnt[freq_order]

# Word -> ID dictionary: the ID is the frequency rank (0 = most frequent).
wd_to_id = {wd: i for i, wd in enumerate(wd_ary)}

# Print the ten most frequent words (fewer when the vocabulary is smaller).
for top_wd, top_cnt in zip(wd_ary[:10], wd_cnt[:10]):
    print("単語:",
          top_wd, "\t出現数:",
          top_cnt, "\tID:", wd_to_id[top_wd])

# Words that occur fewer than CUT_OFF times all share a single ID, which is
# the first ID after those of the words that are kept.
CUT_OFF = 2
other_id = np.sum(wd_cnt >= CUT_OFF)
print("words kinds:", len(wd_cnt), "words>=" + str(CUT_OFF) + ":",
      other_id)
wd_to_id.update({wd: other_id for wd in wd_ary[wd_cnt < CUT_OFF]})
# Reverse lookup. Every rare word maps to other_id, so that one entry ends up
# holding only the rare word that is visited last.
id_to_wd = {wd_to_id[wd]: wd for wd in wd_to_id.keys()}

# Convert the tokens of sen and sen_pre to IDs (kept as strings).
sen_id = []
sen_pre_id = []
for s, s_pre in zip(sen, sen_pre):
    sen_id.append([str(wd_to_id[wd]) for wd in s])
    sen_pre_id.append([str(wd_to_id[wd]) for wd in s_pre])

# Add the ID sequences as the two leading columns of All_df. When the columns
# already exist (the cell is being re-run) they are overwritten in place,
# because DataFrame.insert raises ValueError for an existing column name.
for col_loc, (col_name, col_values) in enumerate([('sen_id', sen_id),
                                                  ('sen_pre_id', sen_pre_id)]):
    if col_name in All_df.columns:
        All_df[col_name] = col_values
    else:
        All_df.insert(loc=col_loc, column=col_name, value=col_values)

単語: EOS 	出現数: 29580 	ID: 0
単語: の 	出現数: 10567 	ID: 1
単語: て 	出現数: 7408 	ID: 2
単語: です 	出現数: 7390 	ID: 3
単語: ます 	出現数: 7363 	ID: 4
単語: か 	出現数: 7285 	ID: 5
単語: 、 	出現数: 6959 	ID: 6
単語: に 	出現数: 6914 	ID: 7
単語: が 	出現数: 6893 	ID: 8
単語: は 	出現数: 6793 	ID: 9
words kinds: 6961 words>=2: 6649


In [4]:
# Preview the first rows again; sen_id and sen_pre_id were inserted at column
# positions 0 and 1 by the previous cell.
All_df.head(3)

Unnamed: 0,sen_id,sen_pre_id,sen,sen_pre,say_id,reply_id,group_id,name,body,16types_a,16types_b,argument_a,argument_b,epistemic_a,epistemic_b,social_a,social_b,coordination_a,coordination_b
0,"[39, 35, 12, 4, 18, 0]",[0],"[よろしく, お願い, し, ます, ！, EOS]",[EOS],1,-1,1234568,まこぴす,よろしくお願いします！,5,5,1,1,1,1,0,0,0,0
1,"[39, 35, 12, 4, 0]","[39, 35, 12, 4, 18, 0]","[よろしく, お願い, し, ます, EOS]","[よろしく, お願い, し, ます, ！, EOS]",31,-1,1234568,哲,よろしくお願いします,5,5,1,1,1,1,0,0,0,0
2,"[415, 19, 1, 8, 2253, 3, 0, 39, 35, 12, 4, 18, 0]","[39, 35, 12, 4, 0]","[名前, な, の, が, 恥ずかしい, です, EOS, よろしく, お願い, し, ます...","[よろしく, お願い, し, ます, EOS]",70,-1,1234568,仙波,名前なのが恥ずかしいです…\nよろしくお願いします！,5,5,1,1,1,1,0,0,0,0


# word2vec を利用し、単語のベクトル辞書を作成する

In [5]:
# Measure the longest ID sequence over both columns and load the word2vec
# model from the project helper module.
sen_id, sen_pre_id = (All_df[col].values for col in ('sen_id', 'sen_pre_id'))
sen_all = np.concatenate([sen_id, sen_pre_id])

max_sen_length = max(len(id_seq) for id_seq in sen_all)
print("max_sen_length:", max_sen_length)

# Presumably has to equal the vector size of the model loaded below -- TODO confirm.
word_vectors_size = 200

model = dh.get_w2v_model()

max_sen_length: 292


# （重要）各センテンスの長さを最大66トークンに切り詰める（66より短いものはそのまま）

In [6]:
# Cap every ID sequence at SEN_LENGTH_CAP tokens (longer sequences lose their
# tail, shorter ones are left as they are), then re-measure the longest one.
SEN_LENGTH_CAP = 66
All_df['sen_id'] = [ids[:SEN_LENGTH_CAP] for ids in All_df['sen_id']]
All_df['sen_pre_id'] = [ids[:SEN_LENGTH_CAP] for ids in All_df['sen_pre_id']]

sen_all = np.hstack((All_df['sen_id'].values, All_df['sen_pre_id'].values))
max_sen_length = max([len(id_seq) for id_seq in sen_all])
# The slicing above guarantees this; fail loudly if it ever stops holding.
assert max_sen_length <= SEN_LENGTH_CAP, "truncation did not cap the sequence length"
print("max_sen_length:", max_sen_length)

max_sen_length: 66


# データの準備
* sentences dataをpaddingし、word vectorsによりfeature vectorsを作る
* labels dataをone hotの型に変換する

In [7]:
def _ids_to_int_arrays(id_lists):
    """Turn a sequence of ID lists into an array of int32 ID arrays.

    Rows of differing length are returned as a 1-D ``dtype=object`` array whose
    elements are int32 arrays. The container is allocated explicitly because
    building a ragged array with a plain ``np.array([...])`` is deprecated
    since NumPy 1.19 and raises ValueError from NumPy 1.24 on.
    Rows that all share one length are stacked into a regular 2-D array, which
    is what the plain ``np.array([...])`` call produced for that case.

    Args:
        id_lists: sequence whose items are sequences of IDs convertible to int32.

    Returns:
        numpy.ndarray as described above.
    """
    rows = [np.array(ids, dtype=np.int32) for ids in id_lists]
    if len({len(row) for row in rows}) <= 1:
        return np.array(rows)
    ragged = np.empty(len(rows), dtype=object)
    for row_i, row in enumerate(rows):
        ragged[row_i] = row
    return ragged


# Data overview (duplicates)
print("全データ(All_df)：", All_df.shape)
# NOTE(review): All_drop_df is only reported here; the feature vectors below
# are still built from the full All_df, duplicates included.
All_drop_df = All_df.drop_duplicates(subset=['body', 'name']).reset_index(drop=True)
print("重複投稿を排除したデータ(All_drop_df)：", All_drop_df.shape, "\n")

# Feature vectors for the current sentence and for the preceding sentence.
all_sen = _ids_to_int_arrays(All_df['sen_id'].values)
x = dh.sen_to_fv(all_sen, max_sen_length, model, False)
print("input data(sen)：",x.shape)
all_sen_pre = _ids_to_int_arrays(All_df['sen_pre_id'].values)
x_pre = dh.sen_to_fv(all_sen_pre, max_sen_length, model, False)
print("input data(sen_pre)：",x_pre.shape)


全データ(All_df)： (12012, 19)
重複投稿を排除したデータ(All_drop_df)： (11357, 19) 

input data(sen)： (12012, 66, 200)
input data(sen_pre)： (12012, 66, 200)


In [8]:
# 16types: show the class distribution of both label columns, one-hot encode
# them, and bundle features and labels into the data-set object.
print("16types_a：", Counter(All_df['16types_a']), "\n")
print("16types_b：", Counter(All_df['16types_b']), "\n")

# Label set a: int32 class indices -> one-hot rows.
label_t16_a = np.array(All_df['16types_a'].values, dtype=np.int32)
y_t16_a = dh.labels_to_one_hot(label_t16_a, 16)
print("input data(t16_a)：", y_t16_a.shape)

# Label set b: label_t16_b keeps the raw column values, while the int32 copy
# lives under the name label_t_b.
label_t16_b = All_df['16types_b'].values
label_t_b = np.array(label_t16_b, dtype=np.int32)
y_t16_b = dh.labels_to_one_hot(label_t_b, 16)
print("input data(t16_b)：", y_t16_b.shape)

t16_data = dh.set_data_sets(x, y_t16_a, y_t16_b)

16types_a： Counter({1: 2425, 2: 1938, 6: 1307, 3: 1301, 4: 1230, 5: 1224, 8: 550, 9: 510, 7: 393, 14: 310, 15: 218, 11: 184, 0: 143, 10: 123, 12: 100, 13: 56}) 

16types_b： Counter({1: 2759, 2: 1994, 4: 1291, 5: 1262, 3: 1243, 6: 1131, 7: 729, 8: 407, 14: 327, 15: 231, 11: 193, 9: 183, 12: 113, 13: 90, 10: 34, 0: 25}) 

input data(t16_a)： (12012, 16)
input data(t16_b)： (12012, 16)


In [9]:
# Network Parameters
embed_dim = word_vectors_size
sents_len = max_sen_length
# The first layer multiplies the embedding-sized, sentence-averaged input, so
# its input width is tied to embed_dim instead of a separate hard-coded 200.
num_input = embed_dim
num_hidden = 200
num_classes = 16
# Both values are fed to the keep_prob placeholder; 1.0 keeps every unit.
train_dropout = 1.0
test_dropout = 1.0


# Training Parameters
learning_rate = 0.01
batch_size = 64
# Number of full mini-batches per epoch.
total_batch = int(t16_data.train.num_examples / batch_size)
print(total_batch)
training_epochs = 100

# tf Graph input
X = tf.placeholder(dtype=tf.float32, shape=[None, sents_len, embed_dim])
Y_a = tf.placeholder(dtype=tf.float32, shape=[None, num_classes])
Y_b = tf.placeholder(dtype=tf.float32, shape=[None, num_classes])
keep_prob = tf.placeholder(tf.float32)


def _init_dense_params(n_in, n_hidden, n_out):
    """Create the variables of one network with a single hidden layer.

    Args:
        n_in: width of the input to the hidden layer.
        n_hidden: number of hidden units.
        n_out: number of output units.

    Returns:
        (weights, biases): two dicts keyed 'h1' and 'out'. Weights are
        initialised with tf.truncated_normal(stddev=0.1), biases with the
        constant 0.1.
    """
    weights = {
        'h1': tf.Variable(tf.truncated_normal(stddev=0.1, shape=[n_in, n_hidden])),
        'out': tf.Variable(tf.truncated_normal(stddev=0.1, shape=[n_hidden, n_out]))
    }
    biases = {
        'h1': tf.Variable(tf.constant(value=0.1, shape=[n_hidden])),
        'out': tf.Variable(tf.constant(value=0.1, shape=[n_out]))
    }
    return weights, biases


# Store layers weight & bias: one independent parameter set per label set.
weights_a, biases_a = _init_dense_params(num_input, num_hidden, num_classes)
weights_b, biases_b = _init_dense_params(num_input, num_hidden, num_classes)

168


In [10]:
# Create model
def NN(x, weights, biases, dropout):
    """Average the input over axis 1, then apply a ReLU hidden layer and a linear output layer.

    Args:
        x: input tensor; it is reduced with ``tf.reduce_mean`` along axis 1
            before the first matrix multiplication.
        weights: dict whose 'h1' and 'out' entries are the right-hand sides of
            the two matrix multiplications.
        biases: dict whose 'h1' and 'out' entries are added after the
            respective matrix multiplication.
        dropout: forwarded positionally as the second argument of
            ``tf.nn.dropout``, applied to the hidden activations.

    Returns:
        The result of the output layer; no softmax is applied here.
    """
    pooled = tf.reduce_mean(x, axis=1)  # [None, embed_dim]

    hidden = tf.nn.relu(tf.matmul(pooled, weights['h1']) + biases['h1'])
    hidden = tf.nn.dropout(hidden, dropout)

    return tf.matmul(hidden, weights['out']) + biases['out']

In [11]:
# Construct model: two independent networks on the shared input X, one per
# label set (Y_a and Y_b).
y_pred_a = NN(X, weights_a, biases_a, keep_prob)
y_pred_b = NN(X, weights_b, biases_b, keep_prob)


def _mean_softmax_xent(labels, logits):
    """Batch mean of softmax_cross_entropy_with_logits_v2 for one network."""
    return tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits))


def _accuracy_ops(logits, labels):
    """Return (predicted class, true class, per-example hit, mean accuracy) ops."""
    predicted = tf.argmax(logits, 1)
    expected = tf.argmax(labels, 1)
    hits = tf.equal(predicted, expected)
    return predicted, expected, hits, tf.reduce_mean(tf.cast(hits, tf.float32))


# Define loss and optimizer.
# Earlier variants of the loss (a hand-written -sum(Y * log(softmax)) and the
# non-v2 softmax_cross_entropy_with_logits) were replaced by the v2 op.
loss_a = _mean_softmax_xent(Y_a, y_pred_a)
loss_b = _mean_softmax_xent(Y_b, y_pred_b)

optimizer_a = tf.train.AdamOptimizer(learning_rate).minimize(loss_a)
optimizer_b = tf.train.AdamOptimizer(learning_rate).minimize(loss_b)

# Evaluate model
pred_a, true_a, correct_prediction_a, accuracy_a = _accuracy_ops(y_pred_a, Y_a)
pred_b, true_b, correct_prediction_b, accuracy_b = _accuracy_ops(y_pred_b, Y_b)

In [12]:
# Train both networks on the same mini-batches and report per-epoch metrics.
sess = tf.InteractiveSession()
try:
    tf.global_variables_initializer().run()

    # Without at least one mini-batch the per-epoch report below would have no
    # batch to evaluate on.
    if total_batch < 1:
        raise ValueError("total_batch must be >= 1 (training set smaller than batch_size?)")

    # Training cycle
    all_test_x = t16_data.test.vectors_1
    all_test_y_t16_a = t16_data.test.labels_1
    all_test_y_t16_b = t16_data.test.labels_2
    display_step = 1  # print the metrics every `display_step` epochs
    start = time.time()
    for epoch_i in range(training_epochs):
        ave_cost_a = 0
        ave_cost_b = 0
        for batch_i in range(total_batch):
            batch_x, batch_y_t16_a, batch_y_t16_b = t16_data.train.next_batch(batch_size)
            # The two networks do not share any variables, so both training
            # steps can be executed by a single run on the same batch.
            _, c_a, _, c_b = sess.run(
                [optimizer_a, loss_a, optimizer_b, loss_b],
                feed_dict={
                    X: batch_x,
                    Y_a: batch_y_t16_a,
                    Y_b: batch_y_t16_b,
                    keep_prob: train_dropout
                })
            # Running mean of the batch losses over the epoch.
            ave_cost_a += c_a / total_batch
            ave_cost_b += c_b / total_batch
        # Display logs per epoch step
        if epoch_i % display_step == 0:
            # NOTE: the "train" accuracy is measured on the last mini-batch of
            # the epoch only, not on the whole training set.
            train_acc_a, train_acc_b = sess.run(
                [accuracy_a, accuracy_b],
                feed_dict={
                    X: batch_x,
                    Y_a: batch_y_t16_a,
                    Y_b: batch_y_t16_b,
                    keep_prob: test_dropout
                })
            test_acc_a, test_acc_b = sess.run(
                [accuracy_a, accuracy_b],
                feed_dict={
                    X: all_test_x,
                    Y_a: all_test_y_t16_a,
                    Y_b: all_test_y_t16_b,
                    keep_prob: test_dropout
                })
            print("Ep:%3d Bc:%4d" % (epoch_i + 1, batch_i + 1),
                  "| train_a=%.3f" % train_acc_a, "test_a=%.3f" % test_acc_a, "loss_a=%5.3f" % ave_cost_a,
                  "| train_b=%.3f" % train_acc_b, "test_b=%.3f" % test_acc_b, "loss_b=%5.3f" % ave_cost_b)
    end = time.time()
    print("Process Time :%.2f s" % (end - start))
finally:
    # Release the session even when training fails or is interrupted.
    sess.close()

Ep:  1 Bc: 168 | train_a=0.312 test_a=0.238 loss_a=2.194 | train_b=0.266 test_b=0.255 loss_b=2.208
Ep:  2 Bc: 168 | train_a=0.406 test_a=0.370 loss_a=1.985 | train_b=0.438 test_b=0.371 loss_b=1.988
Ep:  3 Bc: 168 | train_a=0.359 test_a=0.359 loss_a=1.893 | train_b=0.344 test_b=0.364 loss_b=1.892
Ep:  4 Bc: 168 | train_a=0.438 test_a=0.402 loss_a=1.839 | train_b=0.422 test_b=0.416 loss_b=1.834
Ep:  5 Bc: 168 | train_a=0.406 test_a=0.404 loss_a=1.830 | train_b=0.359 test_b=0.423 loss_b=1.819
Ep:  6 Bc: 168 | train_a=0.281 test_a=0.396 loss_a=1.790 | train_b=0.281 test_b=0.365 loss_b=1.791
Ep:  7 Bc: 168 | train_a=0.297 test_a=0.385 loss_a=1.774 | train_b=0.281 test_b=0.384 loss_b=1.780
Ep:  8 Bc: 168 | train_a=0.469 test_a=0.435 loss_a=1.752 | train_b=0.438 test_b=0.433 loss_b=1.750
Ep:  9 Bc: 168 | train_a=0.438 test_a=0.449 loss_a=1.742 | train_b=0.422 test_b=0.453 loss_b=1.750
Ep: 10 Bc: 168 | train_a=0.453 test_a=0.453 loss_a=1.726 | train_b=0.438 test_b=0.465 loss_b=1.730
Ep: 11 Bc:

Ep: 84 Bc: 168 | train_a=0.578 test_a=0.487 loss_a=1.568 | train_b=0.547 test_b=0.492 loss_b=1.572
Ep: 85 Bc: 168 | train_a=0.578 test_a=0.482 loss_a=1.575 | train_b=0.547 test_b=0.490 loss_b=1.587
Ep: 86 Bc: 168 | train_a=0.453 test_a=0.498 loss_a=1.572 | train_b=0.469 test_b=0.479 loss_b=1.574
Ep: 87 Bc: 168 | train_a=0.516 test_a=0.448 loss_a=1.578 | train_b=0.469 test_b=0.453 loss_b=1.585
Ep: 88 Bc: 168 | train_a=0.484 test_a=0.458 loss_a=1.565 | train_b=0.500 test_b=0.465 loss_b=1.568
Ep: 89 Bc: 168 | train_a=0.391 test_a=0.490 loss_a=1.562 | train_b=0.391 test_b=0.494 loss_b=1.573
Ep: 90 Bc: 168 | train_a=0.469 test_a=0.476 loss_a=1.569 | train_b=0.484 test_b=0.501 loss_b=1.580
Ep: 91 Bc: 168 | train_a=0.484 test_a=0.474 loss_a=1.563 | train_b=0.531 test_b=0.461 loss_b=1.581
Ep: 92 Bc: 168 | train_a=0.422 test_a=0.489 loss_a=1.559 | train_b=0.438 test_b=0.488 loss_b=1.573
Ep: 93 Bc: 168 | train_a=0.406 test_a=0.471 loss_a=1.576 | train_b=0.453 test_b=0.487 loss_b=1.583
Ep: 94 Bc: