In [1]:
import re
import MeCab
import pickle
import numpy as np
import data_helpers as dh
import pandas as pd
import time
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.manifold import MDS,TSNE
from collections import Counter
from collections import OrderedDict
from gensim.models import word2vec
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

In [2]:
All_df = pd.read_pickle("../data/all_mecab.pickle")
All_df.head(3)

# senとsen_preの単語をIDに変換し、新たな列としてAll_dfに追加する

Unnamed: 0,sen,sen_pre,say_id,reply_id,group_id,name,body,16types_a,16types_b,argument_a,argument_b,epistemic_a,epistemic_b,social_a,social_b,coordination_a,coordination_b
0,"[よろしく, お願い, し, ます, ！, EOS]",[EOS],1,-1,1234568,まこぴす,よろしくお願いします！,5,5,1,1,1,1,0,0,0,0
1,"[よろしく, お願い, し, ます, EOS]","[よろしく, お願い, し, ます, ！, EOS]",31,-1,1234568,哲,よろしくお願いします,5,5,1,1,1,1,0,0,0,0
2,"[名前, な, の, が, 恥ずかしい, です, EOS, よろしく, お願い, し, ます...","[よろしく, お願い, し, ます, EOS]",70,-1,1234568,仙波,名前なのが恥ずかしいです…\nよろしくお願いします！,5,5,1,1,1,1,0,0,0,0


# senとsen_preの単語をIDに変換し、新たな列としてAll_dfに追加する

In [3]:
sen = All_df['sen'].values
sen_pre = All_df['sen_pre'].values

# 単語辞書の作成
wd_set = Counter([x for s in (sen + sen_pre) for x in s])
wd_ary = np.array(list(wd_set.keys()))
wd_cnt = np.array(list(wd_set.values()))

# 出現頻度順にソート
wd_ary = wd_ary[np.argsort(wd_cnt)[::-1]]
wd_cnt.sort()
wd_cnt = wd_cnt[::-1]

# 単語ID辞書の作成
wd_to_id = {wd: i for i, wd in enumerate(wd_ary)}

# Top10の単語を出力
for i in range(10):
    print("単語:",
          list(wd_ary)[i], "\t出現数:",
          list(wd_cnt)[i], "\tID:", wd_to_id[list(wd_ary)[i]])

# 出現数CUT_OFF以下の単語のIDを統一
CUT_OFF = 2
print("words kinds:", len(wd_cnt), "words>=" + str(CUT_OFF) + ":",
      np.sum(wd_cnt >= CUT_OFF))
other_id = np.sum(wd_cnt >= CUT_OFF)
wd_to_id.update({wd: other_id for wd in wd_ary[wd_cnt < CUT_OFF]})
id_to_wd = {wd_to_id[wd]: wd for wd in wd_to_id.keys()}

# senとsen_preの単語をIDに変換
sen_id = []
sen_pre_id = []
for s, s_pre in zip(sen, sen_pre):
    sen_id.append([str(wd_to_id[wd]) for wd in s])
    sen_pre_id.append([str(wd_to_id[wd]) for wd in s_pre])

# 新し列としてAll_dfに追加
All_df.insert(loc=0, column='sen_id', value=sen_id)
All_df.insert(loc=1, column='sen_pre_id', value=sen_pre_id)

単語: EOS 	出現数: 29580 	ID: 0
単語: の 	出現数: 10567 	ID: 1
単語: て 	出現数: 7408 	ID: 2
単語: です 	出現数: 7390 	ID: 3
単語: ます 	出現数: 7363 	ID: 4
単語: か 	出現数: 7285 	ID: 5
単語: 、 	出現数: 6959 	ID: 6
単語: に 	出現数: 6914 	ID: 7
単語: が 	出現数: 6893 	ID: 8
単語: は 	出現数: 6793 	ID: 9
words kinds: 6961 words>=2: 6649


In [4]:
All_df.head(3)

Unnamed: 0,sen_id,sen_pre_id,sen,sen_pre,say_id,reply_id,group_id,name,body,16types_a,16types_b,argument_a,argument_b,epistemic_a,epistemic_b,social_a,social_b,coordination_a,coordination_b
0,"[39, 35, 12, 4, 18, 0]",[0],"[よろしく, お願い, し, ます, ！, EOS]",[EOS],1,-1,1234568,まこぴす,よろしくお願いします！,5,5,1,1,1,1,0,0,0,0
1,"[39, 35, 12, 4, 0]","[39, 35, 12, 4, 18, 0]","[よろしく, お願い, し, ます, EOS]","[よろしく, お願い, し, ます, ！, EOS]",31,-1,1234568,哲,よろしくお願いします,5,5,1,1,1,1,0,0,0,0
2,"[415, 19, 1, 8, 2222, 3, 0, 39, 35, 12, 4, 18, 0]","[39, 35, 12, 4, 0]","[名前, な, の, が, 恥ずかしい, です, EOS, よろしく, お願い, し, ます...","[よろしく, お願い, し, ます, EOS]",70,-1,1234568,仙波,名前なのが恥ずかしいです…\nよろしくお願いします！,5,5,1,1,1,1,0,0,0,0


# word2vec を利用し、単語のベクトル辞書を作成する

In [5]:
sen_id = All_df['sen_id'].values
sen_pre_id = All_df['sen_pre_id'].values
sen_all = np.hstack((sen_id, sen_pre_id))

max_sen_length = max([len(sen) for sen in sen_all])
print("max_sen_length:", max_sen_length)

word_vectors_size = 200

model = dh.get_w2v_model()

max_sen_length: 292


# （重要）各センテンスの長さを66に統一する

In [6]:
All_df['sen_id'] = [x[:66] for x in All_df['sen_id']]
All_df['sen_pre_id'] = [x[:66] for x in All_df['sen_pre_id']]

sen_all = np.hstack((All_df['sen_id'].values, All_df['sen_pre_id'].values))
max_sen_length = max([len(sen) for sen in sen_all])
print("max_sen_length:", max_sen_length)

max_sen_length: 66


# データの準備
* sentences dataをpaddingし、word vectorsによりfeature vectorsを作る
* labels dataをone hotの型に変換する

In [7]:
# データの整理（一致、重複）
print("全データ(All_df)：", All_df.shape)
All_drop_df = All_df.drop_duplicates(subset=['body', 'name']).reset_index(drop=True)
print("重複投稿を排除したデータ(All_drop_df)：", All_drop_df.shape, "\n")



all_sen = All_drop_df['sen_id'].values
all_sen = np.array([np.array(x, dtype=np.int32) for x in all_sen])
x = dh.sen_to_fv(all_sen, max_sen_length, model, False)
print("input data(sen)：",x.shape)
all_sen_pre = All_drop_df['sen_pre_id'].values
all_sen_pre = np.array([np.array(x, dtype=np.int32) for x in all_sen_pre])
x_pre = dh.sen_to_fv(all_sen_pre, max_sen_length, model, False)
print("input data(sen_pre)：",x_pre.shape)


全データ(All_df)： (12012, 19)
重複投稿を排除したデータ(All_drop_df)： (11357, 19) 

input data(sen)： (11357, 66, 200)
input data(sen_pre)： (11357, 66, 200)


In [8]:
# 16types-------------------------------------
print("16types_a：", Counter(All_df['16types_a']), "\n")
print("argument_a：", Counter(All_df['argument_a']), "\n")

label_t16_a = All_drop_df['16types_a'].values
label_t16_a = np.array(label_t16_a, dtype=np.int32)
y_t16_a = dh.labels_to_one_hot(label_t16_a, 16)
print("input data(t16_a)：", y_t16_a.shape)

label_arg_a = All_drop_df['argument_a'].values
label_arg_a = np.array(label_arg_a, dtype=np.int32)
y_arg_a = dh.labels_to_one_hot(label_arg_a, 6)
print("input data(arg_a)：", y_arg_a.shape)

data = dh.set_data_sets_2(x, x_pre, y_t16_a, y_arg_a)


print(data.test.labels_1.shape)
print(data.test.labels_2.shape)

16types_a： Counter({1: 2425, 2: 1938, 6: 1307, 3: 1301, 4: 1230, 5: 1224, 8: 550, 9: 510, 7: 393, 14: 310, 15: 218, 11: 184, 0: 143, 10: 123, 12: 100, 13: 56}) 

argument_a： Counter({1: 5804, 2: 3204, 0: 2352, 3: 509, 4: 120, 5: 23}) 

input data(t16_a)： (11357, 16)
input data(arg_a)： (11357, 6)
(1136, 16)
(1136, 6)


In [9]:
# Network Parameters
num_input = 200
num_hidden = 200
num_classes_1 = 16
num_classes_2 = 6
train_dropout = 1.0
test_dropout = 1.0
embed_dim = word_vectors_size
sents_len = max_sen_length


# Training Parameters
learning_rate = 0.01
batch_size = 64
total_batch = int(data.train.num_examples / batch_size)
print(total_batch)
training_epochs = 100

# tf Graph input
X = tf.placeholder(dtype=tf.float32, shape=[None, sents_len, embed_dim])
Y_1 = tf.placeholder(dtype=tf.float32, shape=[None, num_classes_1])
Y_2 = tf.placeholder(dtype=tf.float32, shape=[None, num_classes_2])
keep_prob = tf.placeholder(tf.float32)

# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.truncated_normal(stddev=0.1, shape=[num_input, num_hidden])),
    'out_1': tf.Variable(tf.truncated_normal(stddev=0.1, shape=[num_hidden, num_classes_1])),
    'out_2': tf.Variable(tf.truncated_normal(stddev=0.1, shape=[num_hidden, num_classes_2]))
}
biases = {
    'h1': tf.Variable(tf.constant(value=0.1, shape=[num_hidden])),
    'out_1': tf.Variable(tf.constant(value=0.1, shape=[num_classes_1])),
    'out_2': tf.Variable(tf.constant(value=0.1, shape=[num_classes_2]))
}

159


In [10]:
# Create model
def NN(x, weights, biases, dropout):

    avg = tf.reduce_mean(x, axis=1) # [None, embed_dim]

    h1 = tf.add(tf.matmul(avg, weights['h1']), biases['h1'])
    h1_relu = tf.nn.relu(h1)
    
    h1_drop = tf.nn.dropout(h1_relu, dropout)
    
    out_1 = tf.add(tf.matmul(h1_drop, weights['out_1']), biases['out_1'])
    out_2 = tf.add(tf.matmul(h1_drop, weights['out_2']), biases['out_2'])
    return out_1, out_2

In [11]:
# Construct model
y_pred_1, y_pred_2 = NN(X, weights, biases, keep_prob)

# y_softmax = tf.nn.softmax(y_pred)

# Define loss and optimizer
# type 1(old):
# loss = tf.reduce_mean(
#     -tf.reduce_sum(Y * tf.log(y_softmax), reduction_indices=[1]))
# type 2(server):
# loss = tf.reduce_mean(
#     tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=y_pred))
# type 3(new):
# loss = tf.reduce_mean(
#     tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y, logits=y_pred))


loss_1 = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=Y_1, logits=y_pred_1))

loss_2 = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=Y_2, logits=y_pred_2))

loss_joint = loss_1 + loss_2

optimizer_1 = tf.train.AdamOptimizer(learning_rate).minimize(loss_1)
optimizer_2 = tf.train.AdamOptimizer(learning_rate).minimize(loss_2)

optimizer_joint = tf.train.AdamOptimizer(learning_rate).minimize(loss_joint)


# Evaluate model
pred_1 = tf.argmax(y_pred_1, 1)
true_1 = tf.argmax(Y_1, 1)
correct_prediction_1 = tf.equal(pred_1, true_1)
accuracy_1 = tf.reduce_mean(tf.cast(correct_prediction_1, tf.float32))


pred_2 = tf.argmax(y_pred_2, 1)
true_2 = tf.argmax(Y_2, 1)
correct_prediction_2 = tf.equal(pred_2, true_2)
accuracy_2 = tf.reduce_mean(tf.cast(correct_prediction_2, tf.float32))

In [12]:
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

# Training cycle
all_test_x = data.test.vectors_1
all_test_y_1 = data.test.labels_1
all_test_y_2 = data.test.labels_2
start = time.time()
for epoch_i in range(training_epochs):
    ave_cost = 0
    for batch_i in range(total_batch):
        batch_x, _, batch_y_1, batch_y_2 = data.train.next_batch(batch_size, 2)
        _, c = sess.run(
            [optimizer_joint, loss_joint],
            feed_dict={
                X: batch_x,
                Y_1: batch_y_1,
                Y_2: batch_y_2,
                keep_prob: train_dropout
            })
        ave_cost += c / total_batch
    # Display logs per epoch step
    if epoch_i % 1 == 0:
        train_acc_1 = sess.run(
            accuracy_1,
            feed_dict={
                X: batch_x,
                Y_1: batch_y_1,
                keep_prob: test_dropout
            })
        train_acc_2 = sess.run(
            accuracy_2,
            feed_dict={
                X: batch_x,
                Y_2: batch_y_2,
                keep_prob: test_dropout
            })
        test_acc_1 = sess.run(
            accuracy_1,
            feed_dict={
                X: all_test_x,
                Y_1: all_test_y_1,
                keep_prob: test_dropout
            })
        test_acc_2 = sess.run(
            accuracy_2,
            feed_dict={
                X: all_test_x,
                Y_2: all_test_y_2,
                keep_prob: test_dropout
            })
        print("Ep:%3d Bc:%4d" % (epoch_i + 1, batch_i + 1),
              "| train_1=%.3f" % train_acc_1, "test_1=%.3f" % test_acc_1, 
              "| train_2=%.3f" % train_acc_2, "test_2=%.3f" % test_acc_2, "| loss_joint=%5.3f" % ave_cost)
end = time.time()
print("Process Time :%.2f s" % (end - start))
sess.close()

Ep:  1 Bc: 159 | train_1=0.234 test_1=0.158 | train_2=1.000 test_2=0.199 | loss_joint=3.624
Ep:  2 Bc: 159 | train_1=0.312 test_1=0.298 | train_2=0.469 test_2=0.523 | loss_joint=3.499
Ep:  3 Bc: 159 | train_1=0.297 test_1=0.298 | train_2=0.438 test_2=0.541 | loss_joint=3.238
Ep:  4 Bc: 159 | train_1=0.391 test_1=0.317 | train_2=0.453 test_2=0.518 | loss_joint=3.196
Ep:  5 Bc: 159 | train_1=0.344 test_1=0.304 | train_2=0.594 test_2=0.544 | loss_joint=3.158
Ep:  6 Bc: 159 | train_1=0.281 test_1=0.325 | train_2=0.531 test_2=0.546 | loss_joint=3.141
Ep:  7 Bc: 159 | train_1=0.375 test_1=0.347 | train_2=0.609 test_2=0.553 | loss_joint=3.081
Ep:  8 Bc: 159 | train_1=0.469 test_1=0.353 | train_2=0.469 test_2=0.561 | loss_joint=3.049
Ep:  9 Bc: 159 | train_1=0.328 test_1=0.359 | train_2=0.562 test_2=0.559 | loss_joint=3.012
Ep: 10 Bc: 159 | train_1=0.391 test_1=0.377 | train_2=0.609 test_2=0.562 | loss_joint=3.000
Ep: 11 Bc: 159 | train_1=0.391 test_1=0.371 | train_2=0.516 test_2=0.567 | loss_

Ep: 91 Bc: 159 | train_1=0.484 test_1=0.450 | train_2=0.562 test_2=0.583 | loss_joint=2.770
Ep: 92 Bc: 159 | train_1=0.422 test_1=0.438 | train_2=0.547 test_2=0.575 | loss_joint=2.761
Ep: 93 Bc: 159 | train_1=0.312 test_1=0.434 | train_2=0.562 test_2=0.555 | loss_joint=2.738
Ep: 94 Bc: 159 | train_1=0.359 test_1=0.436 | train_2=0.500 test_2=0.578 | loss_joint=2.737
Ep: 95 Bc: 159 | train_1=0.375 test_1=0.441 | train_2=0.562 test_2=0.577 | loss_joint=2.738
Ep: 96 Bc: 159 | train_1=0.391 test_1=0.424 | train_2=0.625 test_2=0.578 | loss_joint=2.740
Ep: 97 Bc: 159 | train_1=0.406 test_1=0.418 | train_2=0.562 test_2=0.584 | loss_joint=2.746
Ep: 98 Bc: 159 | train_1=0.406 test_1=0.440 | train_2=0.453 test_2=0.581 | loss_joint=2.735
Ep: 99 Bc: 159 | train_1=0.406 test_1=0.452 | train_2=0.516 test_2=0.580 | loss_joint=2.723
Ep:100 Bc: 159 | train_1=0.359 test_1=0.435 | train_2=0.547 test_2=0.581 | loss_joint=2.730
Process Time :42.15 s
