In [1]:
import numpy as np

def load_dataset():
    # 学習データ
    x_train = np.load('../dataset/x_train.npy')
    t_train = np.load('../dataset/t_train.npy')
    
    # テストデータ
    x_test = np.load('../dataset/x_test.npy')

    return (x_train, x_test, t_train)

x_train, x_test, t_train = load_dataset()
print(x_test.shape)
print(x_test[1])

import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from keras.preprocessing.sequence import pad_sequences

### レイヤー定義 ###
class Embedding:
    def __init__(self, vocab_size, emb_dim, scale=0.08):
        self.V = tf.Variable(tf.random_normal([vocab_size, emb_dim], stddev=scale), name='V')

    def __call__(self, x):
        return tf.nn.embedding_lookup(self.V, x)
    
class RNN:
    def __init__(self, in_dim, hid_dim, seq_len=None, scale=0.08):
        self.in_dim = in_dim
        self.hid_dim = hid_dim
        
        glorot = tf.cast(tf.sqrt(6/(in_dim + hid_dim*2)), tf.float32)
        self.W = tf.Variable(tf.random_uniform([in_dim+hid_dim, hid_dim], minval=-glorot, maxval=glorot), name='W')
        self.b = tf.Variable(tf.zeros([hid_dim]), name='b')
        
        self.seq_len = seq_len
        self.initial_state = None

    def __call__(self, x):
        def fn(h_prev, x_and_m):
            x_t, m_t = x_and_m
            inputs = tf.concat([x_t, h_prev], -1)
            # RNN
            h_t = tf.nn.tanh(tf.matmul(inputs, self.W) + self.b)
            # マスクの適用
            h_t = m_t * h_t + (1 - m_t) * h_prev
          
            return h_t

        # 入力の時間順化
        # shape: [batch_size, max_seqence_length, in_dim] -> [max_seqence_length, batch_size, in_dim]
        x_tmaj = tf.transpose(x, perm=[1, 0, 2])
        
        # マスクの生成＆時間順化,パディングした部分がスキップされる仕掛けらしい
        mask = tf.cast(tf.sequence_mask(self.seq_len, tf.shape(x)[1]), tf.float32)
        mask_tmaj = tf.transpose(tf.expand_dims(mask, axis=-1), perm=[1, 0, 2])
        
        if self.initial_state is None:
            batch_size = tf.shape(x)[0]
            self.initial_state = tf.zeros([batch_size, self.hid_dim])
        
        h = tf.scan(fn=fn, elems=[x_tmaj, mask_tmaj], initializer=self.initial_state)
        
        return h[-1]

def tf_log(x):
    return tf.log(tf.clip_by_value(x, 1e-10, x))

class LSTM:
    def __init__(self, in_dim, hid_dim, seq_len = None, initial_state = None):
        self.in_dim = in_dim
        self.hid_dim = hid_dim

        glorot = tf.cast(tf.sqrt(6/(in_dim + hid_dim*2)), tf.float32)
        
        # 入力ゲート
        self.W_i = tf.Variable(tf.random_uniform([in_dim + hid_dim, hid_dim], minval=-glorot, maxval=glorot), name='W_i')
        self.b_i  = tf.Variable(tf.zeros([hid_dim]), name='b_i')
        
        # 忘却ゲート
        self.W_f = tf.Variable(tf.random_uniform([in_dim + hid_dim, hid_dim], minval=-glorot, maxval=glorot), name='W_f')
        self.b_f  = tf.Variable(tf.zeros([hid_dim]), name='b_f')

        # 出力ゲート
        self.W_o = tf.Variable(tf.random_uniform([in_dim + hid_dim, hid_dim], minval=-glorot, maxval=glorot), name='W_o')
        self.b_o  = tf.Variable(tf.zeros([hid_dim]), name='b_o')

        # セル
        self.W_c = tf.Variable(tf.random_uniform([in_dim + hid_dim, hid_dim], minval=-glorot, maxval=glorot), name='W_c')
        self.b_c  = tf.Variable(tf.zeros([hid_dim]), name='b_c')

        # マスク
        self.seq_len = seq_len
        
        self.initial_state = initial_state

    def __call__(self, x):
        def fn(prev_state, x_and_m):
            c_prev, h_prev = prev_state[0], prev_state[1]
            x_t, m_t = x_and_m
            
            inputs = tf.concat([x_t, h_prev], -1)
            
            # 各ゲート
            i_t = tf.nn.sigmoid(tf.matmul(inputs, self.W_i) + self.b_i)
            f_t = tf.nn.sigmoid(tf.matmul(inputs, self.W_f) + self.b_f)
            o_t = tf.nn.sigmoid(tf.matmul(inputs, self.W_o) + self.b_o)

            # セル
            c_t = f_t * c_prev + i_t * tf.nn.tanh(tf.matmul(inputs, self.W_c) + self.b_c)

            # 隠れ状態
            h_t = o_t * tf.nn.tanh(c_t)
            
            # マスクの適用
            c_t = m_t * c_t + (1 - m_t) * c_prev
            h_t = m_t * h_t + (1 - m_t) * h_prev

            return tf.stack([c_t, h_t])

        # 入力の時間順化
        x_tmaj = tf.transpose(x, perm=[1, 0, 2])
        
        # マスクの生成＆時間順化
        mask = tf.cast(tf.sequence_mask(self.seq_len, tf.shape(x)[1]), tf.float32)
        mask_tmaj = tf.transpose(tf.expand_dims(mask, axis=-1), perm=[1, 0, 2])
        
        if self.initial_state is None:
            batch_size = tf.shape(x)[0]
            self.initial_state = tf.stack([tf.zeros([batch_size, self.hid_dim]), tf.zeros([batch_size, self.hid_dim])])

        state_seq = tf.scan(fn=fn, elems=[x_tmaj, mask_tmaj], initializer=self.initial_state)
        
        return state_seq[-1][1]


### グラフ構築 ###
tf.reset_default_graph() # グラフ初期化

emb_dim = 100
hid_dim = 50
pad_index = 0
num_words = max([max(s) for s in np.hstack((x_train, x_test))])
x = tf.placeholder(tf.int32, [None, None], name='x')
t = tf.placeholder(tf.float32, [None, None], name='t')

seq_len = tf.reduce_sum(tf.cast(tf.not_equal(x, pad_index), tf.int32), axis=1)

h = Embedding(num_words, emb_dim)(x)
h = LSTM(emb_dim, hid_dim, seq_len)(h)
y = tf.layers.Dense(1, tf.nn.sigmoid)(h)

cost = -tf.reduce_mean(t*tf_log(y) + (1 - t)*tf_log(1 - y))

train = tf.train.AdamOptimizer().minimize(cost)
test = tf.round(y)

### データの準備 ###
x_train, x_valid, t_train, t_valid = train_test_split(x_train, t_train)

### 学習 ###
n_epochs = 1
batch_size = 50 # バッチサイズが大きいと、ResourceExhaustedErrorになることがあります

n_batches_train = len(x_train) // batch_size
n_batches_valid = len(x_valid) // batch_size
print(n_batches_train )
print(n_batches_valid)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

for epoch in range(n_epochs):
    # Train
    train_costs = []
    for i in range(n_batches_train):
        start = i * batch_size
        end = start + batch_size

        x_train_batch = np.array(pad_sequences(x_train[start:end], padding='post', value=pad_index)) # バッチ毎のPadding
        t_train_batch = np.array(t_train[start:end])[:, None]

        _, train_cost = sess.run([train, cost], feed_dict={x: x_train_batch, t: t_train_batch})
        train_costs.append(train_cost)

    # Valid
    valid_costs = []
    y_pred = []
    print("finish pre!")
    for i in range(n_batches_valid):
        start = i * batch_size
        end = start + batch_size

        x_valid_pad = np.array(pad_sequences(x_valid[start:end], padding='post', value=pad_index)) # バッチ毎のPadding
        t_valid_pad = np.array(t_valid[start:end])[:, None]

        pred, valid_cost = sess.run([test, cost], feed_dict={x: x_valid_pad, t: t_valid_pad})

        y_pred = pred.flatten().tolist()
        valid_costs.append(valid_cost)
        print('EPOCH: %i, Training Cost: %.3f, Validation Cost: %.3f, Validation F1: %.3f' % (epoch+i, np.mean(train_costs), np.mean(valid_costs), f1_score(t_valid_pad, y_pred, average='macro')))
            
### 出力 ###
x_test_pad1 = np.array(pad_sequences(x_test[:2500], padding='post', value=pad_index)) # バッチ毎のPadding
y_pred_final1 = sess.run(test, feed_dict={x: x_test_pad1})
x_test_pad2 = np.array(pad_sequences(x_test[2500:5000], padding='post', value=pad_index)) # バッチ毎のPadding
y_pred_final2 = sess.run(test, feed_dict={x: x_test_pad2})
x_test_pad3 = np.array(pad_sequences(x_test[5000:7500], padding='post', value=pad_index)) # バッチ毎のPadding
y_pred_final3 = sess.run(test, feed_dict={x: x_test_pad3})

y_pred_final=y_pred_final1#+y_pred_final2+y_pred_final3#+y_pred_final4
y_pred_final2=y_pred_final2
print("y_pred_final2 before:")
print(y_pred_final2.shape)
print(y_pred_final2[1])
#y_pred = np.argmax(y_pred, axis=1)
y_pred = y_pred_final.flatten().tolist()
y_pred2 = y_pred_final2.flatten().tolist()
print("y_pred_final2 after:")
print(y_pred_final2.shape)
print(y_pred_final2[1])

submission = pd.Series(y_pred, name='label')
submission.to_csv('../dataset/y_pred1_submission_pred.csv', header=True, index_label='id')
submission = pd.Series(y_pred2, name='label')
submission.to_csv('../dataset/y_ped2_submission_pred.csv', header=True, index_label='id')
sess.close()

(10000,)
[1, 207, 460, 4293, 23, 14, 22, 467, 4, 403, 21, 13, 67, 50, 26, 409, 37, 28, 4293, 237, 32, 13, 70, 135, 9, 51, 4, 1209, 2295, 13, 566, 264, 15, 6, 226, 3248, 84, 28, 4293, 23, 14, 22, 42, 60, 110, 14, 20, 763, 8, 15, 4, 192, 15, 6, 378, 202, 12, 87, 857, 242, 4, 1188, 7, 4, 22, 37, 435, 8, 31, 7, 148, 1370, 11, 6, 3255, 4482, 15, 1754, 3077, 2469, 5, 1040, 6, 52, 733, 319, 17, 89, 50, 9, 57, 96, 36, 100, 42, 62, 990, 18, 68, 205, 3077, 24710, 43, 168, 33, 68, 20, 261, 13, 131, 974, 13, 188, 6, 1796, 1466, 18, 14, 20, 13, 197, 15, 49, 7, 4, 85, 84, 487, 44, 14, 31, 238, 28, 12, 1471, 19, 160, 366, 13, 332, 4, 857, 262, 4, 415, 37, 8509, 4, 4349, 347, 23, 4, 1011, 7, 4, 953, 4, 2095, 11, 6073, 19773, 51, 4, 1209, 2295, 9, 15, 424, 8, 384, 553, 6073, 19773, 13, 484, 2738, 44, 15, 366, 13, 332, 12, 11, 4, 733, 84, 75, 26, 6, 12226, 171, 13, 135, 75, 909, 280, 6, 291, 8, 650, 14, 22, 858, 127, 259, 131, 28, 12, 48, 259, 127, 28, 12, 591, 3154, 72, 146, 8, 79, 1819]


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


600
200
finish pre!
EPOCH: 0, Training Cost: 0.477, Validation Cost: 0.328, Validation F1: 0.900
EPOCH: 1, Training Cost: 0.477, Validation Cost: 0.369, Validation F1: 0.840
EPOCH: 2, Training Cost: 0.477, Validation Cost: 0.341, Validation F1: 0.900
EPOCH: 3, Training Cost: 0.477, Validation Cost: 0.319, Validation F1: 0.898
EPOCH: 4, Training Cost: 0.477, Validation Cost: 0.335, Validation F1: 0.820
EPOCH: 5, Training Cost: 0.477, Validation Cost: 0.339, Validation F1: 0.840
EPOCH: 6, Training Cost: 0.477, Validation Cost: 0.354, Validation F1: 0.758
EPOCH: 7, Training Cost: 0.477, Validation Cost: 0.356, Validation F1: 0.833
EPOCH: 8, Training Cost: 0.477, Validation Cost: 0.351, Validation F1: 0.859
EPOCH: 9, Training Cost: 0.477, Validation Cost: 0.345, Validation F1: 0.838
EPOCH: 10, Training Cost: 0.477, Validation Cost: 0.350, Validation F1: 0.839
EPOCH: 11, Training Cost: 0.477, Validation Cost: 0.355, Validation F1: 0.859
EPOCH: 12, Training Cost: 0.477, Validation Cost: 0.34

EPOCH: 105, Training Cost: 0.477, Validation Cost: 0.356, Validation F1: 0.919
EPOCH: 106, Training Cost: 0.477, Validation Cost: 0.357, Validation F1: 0.800
EPOCH: 107, Training Cost: 0.477, Validation Cost: 0.357, Validation F1: 0.860
EPOCH: 108, Training Cost: 0.477, Validation Cost: 0.356, Validation F1: 0.940
EPOCH: 109, Training Cost: 0.477, Validation Cost: 0.357, Validation F1: 0.820
EPOCH: 110, Training Cost: 0.477, Validation Cost: 0.358, Validation F1: 0.797
EPOCH: 111, Training Cost: 0.477, Validation Cost: 0.357, Validation F1: 0.898
EPOCH: 112, Training Cost: 0.477, Validation Cost: 0.359, Validation F1: 0.814
EPOCH: 113, Training Cost: 0.477, Validation Cost: 0.358, Validation F1: 0.919
EPOCH: 114, Training Cost: 0.477, Validation Cost: 0.358, Validation F1: 0.820
EPOCH: 115, Training Cost: 0.477, Validation Cost: 0.359, Validation F1: 0.799
EPOCH: 116, Training Cost: 0.477, Validation Cost: 0.359, Validation F1: 0.900
EPOCH: 117, Training Cost: 0.477, Validation Cost: 0