In [1]:
import re
import MeCab
import pickle
import numpy as np
import data_helpers as dh
import pandas as pd
import time
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.manifold import MDS,TSNE
from collections import Counter
from collections import OrderedDict
from gensim.models import word2vec
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

In [2]:
All_df = pd.read_pickle("../data/all_mecab.pickle")

sen = All_df['sen'].values
sen_pre = All_df['sen_pre'].values

# 単語辞書の作成
wd_set = Counter([x for s in (sen + sen_pre) for x in s])
wd_ary = np.array(list(wd_set.keys()))
wd_cnt = np.array(list(wd_set.values()))

# 出現頻度順にソート
wd_ary = wd_ary[np.argsort(wd_cnt)[::-1]]
wd_cnt.sort()
wd_cnt = wd_cnt[::-1]

# 単語ID辞書の作成
wd_to_id = {wd: i for i, wd in enumerate(wd_ary)}

# Top10の単語を出力
for i in range(10):
    print("単語:",
          list(wd_ary)[i], "\t出現数:",
          list(wd_cnt)[i], "\tID:", wd_to_id[list(wd_ary)[i]])

# 出現数CUT_OFF以下の単語のIDを統一
CUT_OFF = 2
print("words kinds:", len(wd_cnt), "words>=" + str(CUT_OFF) + ":",
      np.sum(wd_cnt >= CUT_OFF))
other_id = np.sum(wd_cnt >= CUT_OFF)
wd_to_id.update({wd: other_id for wd in wd_ary[wd_cnt < CUT_OFF]})
id_to_wd = {wd_to_id[wd]: wd for wd in wd_to_id.keys()}

# senとsen_preの単語をIDに変換
sen_id = []
sen_pre_id = []
for s, s_pre in zip(sen, sen_pre):
    sen_id.append([str(wd_to_id[wd]) for wd in s])
    sen_pre_id.append([str(wd_to_id[wd]) for wd in s_pre])

# 新し列としてAll_dfに追加
All_df.insert(loc=0, column='sen_id', value=sen_id)
All_df.insert(loc=1, column='sen_pre_id', value=sen_pre_id)

単語: EOS 	出現数: 29580 	ID: 0
単語: の 	出現数: 10567 	ID: 1
単語: て 	出現数: 7408 	ID: 2
単語: です 	出現数: 7390 	ID: 3
単語: ます 	出現数: 7363 	ID: 4
単語: か 	出現数: 7285 	ID: 5
単語: 、 	出現数: 6959 	ID: 6
単語: に 	出現数: 6914 	ID: 7
単語: が 	出現数: 6893 	ID: 8
単語: は 	出現数: 6793 	ID: 9
words kinds: 6961 words>=2: 6649


In [3]:
All_df

Unnamed: 0,sen_id,sen_pre_id,sen,sen_pre,say_id,reply_id,group_id,name,body,16types_a,16types_b,argument_a,argument_b,epistemic_a,epistemic_b,social_a,social_b,coordination_a,coordination_b
0,"[39, 35, 12, 4, 18, 0]",[0],"[よろしく, お願い, し, ます, ！, EOS]",[EOS],1,-1,1234568,まこぴす,よろしくお願いします！,5,5,1,1,1,1,0,0,0,0
1,"[39, 35, 12, 4, 0]","[39, 35, 12, 4, 18, 0]","[よろしく, お願い, し, ます, EOS]","[よろしく, お願い, し, ます, ！, EOS]",31,-1,1234568,哲,よろしくお願いします,5,5,1,1,1,1,0,0,0,0
2,"[414, 19, 1, 8, 2259, 3, 0, 39, 35, 12, 4, 18, 0]","[39, 35, 12, 4, 0]","[名前, な, の, が, 恥ずかしい, です, EOS, よろしく, お願い, し, ます...","[よろしく, お願い, し, ます, EOS]",70,-1,1234568,仙波,名前なのが恥ずかしいです…\nよろしくお願いします！,5,5,1,1,1,1,0,0,0,0
3,"[1835, 45, 143, 299, 56, 54, 27, 18, 0]","[414, 19, 1, 8, 2259, 3, 0, 39, 35, 12, 4, 18, 0]","[早速, 課題, やっ, ちゃ, い, ましょ, う, ！, EOS]","[名前, な, の, が, 恥ずかしい, です, EOS, よろしく, お願い, し, ます...",119,-1,1234568,まこぴす,早速課題やっちゃいましょう！,14,14,1,1,1,1,0,0,2,2
4,"[143, 299, 56, 54, 27, 0, 662, 9, 2355, 3, 18, 0]","[1835, 45, 143, 299, 56, 54, 27, 18, 0]","[やっ, ちゃ, い, ましょ, う, EOS, ｍｏｏｄｌｅ, は, ゴミ, です, ！,...","[早速, 課題, やっ, ちゃ, い, ましょ, う, ！, EOS]",163,-1,1234568,仙波,やっちゃいましょう\nmoodleはゴミです！,1,2,2,2,2,2,1,0,0,0
5,"[196, 903, 3, 112, 0]","[143, 299, 56, 54, 27, 0, 662, 9, 2355, 3, 18, 0]","[使い, にくい, です, 笑, EOS]","[やっ, ちゃ, い, ましょ, う, EOS, ｍｏｏｄｌｅ, は, ゴミ, です, ！,...",194,-1,1234568,まこぴす,使いにくいです(笑),2,2,2,2,2,2,1,0,0,0
6,"[1685, 3, 0]","[196, 903, 3, 112, 0]","[同意, です, EOS]","[使い, にくい, です, 笑, EOS]",302,-1,1234568,哲,同意です,1,1,2,2,2,2,2,2,0,0
7,"[1661, 1402, 4678, 730, 12, 2, 56, 10, 1, 3, 8...","[1685, 3, 0]","[以前, インタラクティブ, アート, 受講, し, て, い, た, の, です, が, ...","[同意, です, EOS]",309,-1,1234568,仙波,以前インタラクティブアート受講していたのですが、その時に課題が不具合で出せなくなっていた時期...,2,2,3,3,2,2,1,1,0,0
8,"[44, 424, 118, 168, 1866, 1, 181, 9, 4764, 2, ...","[1661, 1402, 4678, 730, 12, 2, 56, 10, 1, 3, 8...","[学習, 環境, として, 必要, 最低限, の, 機能, は, 備え, て, いる, と,...","[以前, インタラクティブ, アート, 受講, し, て, い, た, の, です, が, ...",385,-1,1234568,まこぴす,学習環境として必要最低限の機能は備えていると思うが、操作性の面ではPCの最低限の知識があるこ...,2,2,5,5,2,2,1,1,0,0
9,"[68, 1, 132, 3, 18, 112, 0]","[44, 424, 118, 168, 1866, 1, 181, 9, 4764, 2, ...","[自分, の, 意見, です, ！, 笑, EOS]","[学習, 環境, として, 必要, 最低限, の, 機能, は, 備え, て, いる, と,...",426,-1,1234568,まこぴす,自分の意見です！(笑),2,2,1,2,1,2,0,1,0,0


In [4]:
sen_id = All_df['sen_id'].values
sen_pre_id = All_df['sen_pre_id'].values
sen_all = np.hstack((sen_id, sen_pre_id))

max_sen_length = max([len(sen) for sen in sen_all])
print("max_sen_length:", max_sen_length)

# word_vectors_size = 200
# model = dh.get_w2v_model()

max_sen_length: 292


In [5]:
All_df['sen_id'] = [x[:66] for x in All_df['sen_id']]
All_df['sen_pre_id'] = [x[:66] for x in All_df['sen_pre_id']]

sen_all = np.hstack((All_df['sen_id'].values, All_df['sen_pre_id'].values))
max_sen_length = max([len(sen) for sen in sen_all])
print("max_sen_length:", max_sen_length)

max_sen_length: 66


In [6]:
# データの整理（一致、重複）
print("全データ：", All_df.shape)
All_drop_df = All_df.drop_duplicates(subset=['body', 'name']).reset_index(drop=True)
print("全データ(重複排除)：", All_drop_df.shape, "\n")

# 16types-------------------------------------
t16_df = All_drop_df
print("t16有効データ(ラベルなし排除)：", t16_df.shape)
valid = t16_df.shape[0]
t16_df = t16_df[t16_df['16types_a'] == t16_df['16types_b']].reset_index(drop=True)
print("t16一致データ(a==b)：", t16_df.shape)
print("t16一致率：", t16_df.shape[0], "/", valid, "=", t16_df.shape[0] / valid)
print(Counter(t16_df['16types_a']), "\n")

全データ： (12012, 19)
全データ(重複排除)： (11357, 19) 

t16有効データ(ラベルなし排除)： (11357, 19)
t16一致データ(a==b)： (7926, 19)
t16一致率： 7926 / 11357 = 0.6978955710134719
Counter({1: 1867, 2: 1325, 5: 1046, 3: 900, 4: 885, 6: 779, 8: 236, 7: 227, 14: 179, 9: 141, 11: 108, 15: 102, 12: 63, 13: 30, 0: 21, 10: 17}) 



In [7]:
all_sen = t16_df['sen_id'].values
all_sen = np.array([np.array(x, dtype=np.int32) for x in all_sen])
# x = dh.sen_to_fv(all_sen, max_sen_length, model, False)
x = dh.sen_to_same_length(all_sen, max_sen_length)
print("input data(sen)：", x.shape)

all_sen_pre = t16_df['sen_pre_id'].values
all_sen_pre = np.array([np.array(x, dtype=np.int32) for x in all_sen_pre])
# x_pre = dh.sen_to_fv(all_sen_pre, max_sen_length, model, False)
x_pre = dh.sen_to_same_length(all_sen_pre, max_sen_length)
print("input data(sen_pre)：", x_pre.shape)

input data(sen)： (7926, 66)
input data(sen_pre)： (7926, 66)


In [8]:
label_t16 = t16_df['16types_a'].values
label_t16 = np.array(label_t16, dtype=np.int32)
y_t16 = dh.labels_to_one_hot(label_t16, 16)
print("input data(t16_a)：", y_t16.shape)

data = dh.set_data_sets(x, x_pre, y_t16)

print(data.test.vectors_1.shape)
print(data.train.vectors_1.shape)
print(data.test.vectors_2.shape)
print(data.train.vectors_2.shape)
print(data.test.labels_1.shape)
print(data.train.labels_1.shape)

input data(t16_a)： (7926, 16)
(793, 66)
(7133, 66)
(793, 66)
(7133, 66)
(793, 16)
(7133, 16)


In [9]:
# embedding_size = word_vectors_size
num_classes = 16
sequence_length = max_sen_length
num_steps = max_sen_length
embedding_size = 200
num_lstm_hidden = 800
num_fc_hidden_1 = num_lstm_hidden
num_fc_hidden_2 = 200

train_dropout = 0.5
test_dropout = 1.0

# learning_rate = 0.01
global_step = tf.Variable(0)
learning_rate = tf.train.exponential_decay(
    0.01, global_step, 100, 0.95, staircase=True)

batch_size = 64
total_batch = int(data.train.num_examples / batch_size)
print("total_batch:", total_batch)
training_epochs = 40

# tf Graph input
X = tf.placeholder(dtype=tf.int32, shape=[None])
X_pre = tf.placeholder(dtype=tf.int32, shape=[None])
Y = tf.placeholder(dtype=tf.int32, shape=[None, num_classes])
keep_prob = tf.placeholder(tf.float32)



# Store layers weight & bias
weights = {
    'fc1': tf.Variable(tf.truncated_normal(stddev=0.1, shape=[num_fc_hidden_1, num_fc_hidden_2])),
    'out': tf.Variable(tf.truncated_normal(stddev=0.1, shape=[num_fc_hidden_2, num_classes]))
}
biases = {
    'fc1': tf.Variable(tf.constant(value=0.1, shape=[num_fc_hidden_2])),
    'out': tf.Variable(tf.constant(value=0.1, shape=[num_classes]))
}

total_batch: 111


In [12]:
def Bi_S2S(a, b, weights, biases, dropout):
    cell = tf.nn.rnn_cell.BasicLSTMCell(num_lstm_hidden)
    
    outputs, states = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(
            x_pre, x, cell, num_lstm_hidden, num_lstm_hidden, 200)

    # mean pooling
#     enc_output = tf.concat(enc_outputs, axis=2) # [batch_size,sequence_length,num_lstm_hidden*2]
#     dec_output = tf.concat(dec_outputs, axis=2)
#     enc_out = tf.reduce_mean(enc_output, axis=1) # [batch_size,num_lstm_hidden*2]
#     dec_out = tf.reduce_mean(dec_output, axis=1)
#     s2s_out = tf.concat([enc_out, dec_out], axis=1) # [batch_size,num_lstm_hidden*2*2]

    
#     enc_output = tf.concat(enc_outputs, axis=2) # [batch_size,sequence_length,num_lstm_hidden*2]
#     dec_output = tf.concat(dec_outputs, axis=2)
#     output = tf.concat([enc_output, dec_output], axis=2) # [batch_size,sequence_length,num_lstm_hidden*2*2]
#     s2s_out = tf.reduce_mean(output, axis=1) # [batch_size,num_lstm_hidden*2*2]
    
    
    # fully connected layer
    fc1 = tf.add(tf.matmul(outputs, weights['fc1']), biases['fc1'])
#     fc1 = tf.nn.relu(fc1)
#     fc1 = tf.nn.dropout(fc1, dropout)

    # before final softmax layer
    out = tf.add(tf.matmul(fc1, weights['out']), biases['out'])
    return out

In [13]:
y_pred = Bi_S2S(X, X_pre, weights, biases, keep_prob)

loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=y_pred))

optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)

# Evaluate model
pred = tf.argmax(y_pred, 1)
true = tf.argmax(Y, 1)
correct_prediction = tf.equal(pred, true)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

TypeError: inputs must be a sequence

In [None]:
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

# Training cycle
all_test_x = data.test.vectors_1
all_test_x_pre = data.test.vectors_2
all_test_y = data.test.labels_1

start = time.time()

for epoch_i in range(training_epochs):
    ave_cost = 0
    for batch_i in range(total_batch):
        batch_x, batch_x_pre, batch_y = data.train.next_batch(batch_size, 1)
        g = (epoch_i) * total_batch + (batch_i + 1)
        _, c = sess.run(
            [optimizer, loss],
            feed_dict={
                X: batch_x,
                X_pre: batch_x_pre,
                Y: batch_y,
                global_step: g,
                keep_prob: train_dropout
            })
        ave_cost += c / total_batch
    if epoch_i % 1 == 0:
        train_acc = sess.run(
            accuracy,
            feed_dict={
                X: batch_x,
                X_pre: batch_x_pre,
                Y: batch_y,
                keep_prob: test_dropout
            })

        test_acc = sess.run(
            accuracy,
            feed_dict={
                X: all_test_x,
                X_pre: all_test_x_pre,
                Y: all_test_y,
                keep_prob: test_dropout
            })
        print("Epoch:%3d Batch:%4d" % (epoch_i + 1, batch_i + 1),
              "| Train_loss=%5.3f" % ave_cost, "Train_acc=%.3f" % train_acc,
              "Test_acc=%.3f" % test_acc)

end = time.time()
print("Process Time :%.2f s" % (end - start))
sess.close()