In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import tensorflow as tf
from keras.datasets import imdb

Using TensorFlow backend.


In [2]:
# Hyperparameters
NUM_WORDS = 10000  # passed as `num_words` to imdb.load_data
SEQUENCE_LENGTH = 250  # every review is padded with 0 or truncated to this many tokens
EMBEDDING_DIM = 100  # number of columns of the embedding matrix
HIDDEN_SIZE = 150  # size handed to each LSTM cell (one forward, one backward)
ATTENTION_SIZE = 2  # width of each head's query / key / value projection
N_ATTENTION_HEAD = 2  # number of heads built by self_attention
# Fed to the `keep_prob` placeholder on training steps (1.0 is fed on test steps).
# NOTE(review): no op in the visible graph-building cells reads that placeholder,
# so this value appears to have no effect on the model — confirm.
KEEP_PROB = 0.8
BATCH_SIZE = 256  # samples per batch for both the train and the test generator
NUM_EPOCHS = 5  # number of train-then-test passes in the main loop

### 讀取數據

In [3]:
# Load the IMDB reviews as lists of word ids, limited by NUM_WORDS.
(X_train , y_train) , (X_test , y_test) = imdb.load_data(num_words = NUM_WORDS)

# If the installed numpy is newer than 1.16.2, the data can be loaded with
# the following workaround instead:
# np_load_old = np.load
# np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)
# (X_train , y_train) , (X_test , y_test) =\
# imdb.load_data(num_words = NUM_WORDS)
# np.load = np_load_old

### 數據預處理

In [4]:
# Word ids run from 0 to the largest id seen in training; the "+ 1" counts
# id 0, which is used as the 'pad' token below.
vocabulary_size = max(max(x) for x in X_train) + 1
# Drop any test-set id that falls outside that vocabulary.
X_test = [[w for w in x if w < vocabulary_size] for x in X_test]


def _pad_or_truncate(sentences , length , pad_id = 0):
    """Return a copy of every sentence forced to exactly `length` tokens.

    Longer sentences keep only their first `length` tokens; shorter ones are
    right-padded with `pad_id`. The input sentences are not modified, so the
    cell can be re-run without changing its result.

    Args:
        sentences: iterable of token-id sequences.
        length: target number of tokens per sentence.
        pad_id: token id appended as padding (0 means 'pad').

    Returns:
        A list of lists, each of length `length`.
    """
    fixed = []
    for sentence in sentences:
        tokens = list(sentence)[:length]
        fixed.append(tokens + [pad_id] * (length - len(tokens)))
    return fixed


X_train_ = _pad_or_truncate(X_train , SEQUENCE_LENGTH)
X_test_ = _pad_or_truncate(X_test , SEQUENCE_LENGTH)

X_train = np.array(X_train_)
X_test = np.array(X_test_)

In [5]:
# Input layer: placeholders that are fed on every sess.run call.
# Padded word ids, one row per review.
input_data = tf.placeholder(dtype = tf.int32 , shape = [None , SEQUENCE_LENGTH])
# One float label per review.
target = tf.placeholder(dtype = tf.float32 , shape = [None])
# Number of tokens before the padding starts, one per review.
seq_len = tf.placeholder(dtype = tf.int32 , shape = [None])
# Scalar fed with KEEP_PROB while training and 1.0 while testing.
keep_prob = tf.placeholder(dtype = tf.float32)

In [6]:
def lstm_cell(rnn_size):
    """Build one `tf.contrib.rnn.LSTMCell` with `rnn_size` units and return it."""
    return tf.contrib.rnn.LSTMCell(rnn_size)

def self_attention(embed_input , attention_size , n_attention_head):
    """Multi-head dot-product self-attention followed by a projected residual.

    Args:
        embed_input: rank-3 float tensor [batch , seq_len , k]; k must be
            statically known because it sizes the weight matrices.
        attention_size: width of each head's query / key / value projection.
        n_attention_head: number of independent heads to build.

    Returns:
        Tensor [batch , seq_len , n_attention_head * attention_size]:
        ReLU(concatenated heads + embed_input projected by the 'resnet' weight).

    NOTE(review): the scores are fed to the softmax without a 1/sqrt(d)
    scaling, and padded positions are not masked out — confirm this is intended.
    """
    input_dim = embed_input.shape[-1].value
    weight_init = tf.random_normal_initializer(stddev = 0.1)

    def new_weight(n_cols , name):
        # Fresh [input_dim , n_cols] matrix drawn from the shared initializer.
        return tf.Variable(weight_init([input_dim , n_cols]) , name = name)

    # 1. Create the (query , key , value) projection weights of every head.
    head_weights = []
    for head in range(n_attention_head):
        w_query = new_weight(attention_size , 'query_{}'.format(head))
        w_key = new_weight(attention_size , 'key_{}'.format(head))
        w_value = new_weight(attention_size , 'value_{}'.format(head))
        head_weights.append((w_query , w_key , w_value))

    # 2. Run each head: project, score every position pair, mix the values.
    head_outputs = []
    for w_query , w_key , w_value in head_weights:
        # Map the input into the attention_size-dimensional space.
        queries = tf.tensordot(embed_input , w_query , axes = 1)  # [batch , seq_len , attention_size]
        keys = tf.tensordot(embed_input , w_key , axes = 1)       # [batch , seq_len , attention_size]
        values = tf.tensordot(embed_input , w_value , axes = 1)   # [batch , seq_len , attention_size]

        keys_t = tf.transpose(keys , [0 , 2 , 1])                 # [batch , attention_size , seq_len]
        weights = tf.nn.softmax(tf.matmul(queries , keys_t) , axis = -1)  # [batch , seq_len , seq_len]
        head_outputs.append(tf.matmul(weights , values))          # [batch , seq_len , attention_size]

    # 3. Concatenate the heads along the feature axis.
    concat_width = attention_size * n_attention_head
    multi_head = tf.concat(head_outputs , axis = -1)  # [batch , seq_len , concat_width]

    # 4. Residual path: the input is linearly projected to concat_width so it
    #    can be added to the heads, then ReLU is applied to the sum.
    w_res = new_weight(concat_width , 'resnet')
    shortcut = tf.tensordot(embed_input , w_res , axes = 1)
    return tf.nn.relu(multi_head + shortcut)
    
def batch_generator(X , y , batch_size):
    """Yield shuffled `(X_batch , y_batch)` mini-batches forever.

    Each pass draws a fresh random permutation of the samples and yields
    consecutive full batches of it. The trailing `size % batch_size` samples
    of a pass are skipped, so every batch has exactly `batch_size` rows. The
    input arrays are never modified.

    Args:
        X: numpy array of samples, indexed along axis 0.
        y: numpy array of labels aligned with `X` along axis 0.
        batch_size: number of samples per batch, with 0 < batch_size <= len(X).

    Yields:
        Tuples `(X_batch , y_batch)` whose rows stay paired.

    Raises:
        ValueError: on the first `next()` call if `batch_size` is not in
            `1..X.shape[0]`. Without this check no batch could ever be
            formed and the generator would loop forever without yielding.
    """
    size = X.shape[0]
    if not 0 < batch_size <= size:
        raise ValueError(
            'batch_size must be between 1 and the number of samples ({}), got {}'.format(size , batch_size))

    while True:
        # Fancy indexing copies, so X and y themselves stay untouched.
        order = np.random.permutation(size)
        X_shuffled = X[order]
        y_shuffled = y[order]
        # Only start positions that leave room for a full batch.
        for start in range(0 , size - batch_size + 1 , batch_size):
            stop = start + batch_size
            yield X_shuffled[start : stop] , y_shuffled[start : stop]

def compute_sequence_length(x_batch):
    """Return, for each row of `x_batch`, how many tokens precede its first 0.

    A row that contains no 0 counts all of its tokens. The result is a numpy
    array with one entry per row.
    """
    def leading_tokens(row):
        # Walk the row until the first 0 (the pad id) and report how far we got.
        n_tokens = 0
        for token in row:
            if token == 0:
                return n_tokens
            n_tokens += 1
        return n_tokens

    return np.array([leading_tokens(row) for row in x_batch])

In [7]:
# Embedding layer: one trainable EMBEDDING_DIM-wide row per word id,
# initialised uniformly in [-1 , 1) and looked up with the input ids.
with tf.variable_scope('Embedding_layer'):
    initial_embeddings = tf.random_uniform([vocabulary_size , EMBEDDING_DIM] , -1.0 , 1.0)
    embeddings_var = tf.Variable(initial_embeddings)
    batch_embedded = tf.nn.embedding_lookup(embeddings_var , input_data)

# LSTM layer: a single-layer forward stack and a single-layer backward stack
# run over the embedded batch, using seq_len as each row's sequence length.
with tf.variable_scope('LSTM'):
    cell_fw = tf.contrib.rnn.MultiRNNCell([lstm_cell(HIDDEN_SIZE)])
    cell_bw = tf.contrib.rnn.MultiRNNCell([lstm_cell(HIDDEN_SIZE)])

    lstm_outputs , states = tf.nn.bidirectional_dynamic_rnn(cell_fw = cell_fw ,
                                                            cell_bw = cell_bw ,
                                                            inputs = batch_embedded ,
                                                            sequence_length = seq_len ,
                                                            dtype = tf.float32)
    # Join the forward and backward outputs along the feature axis.
    lstm_outputs = tf.concat(lstm_outputs , axis = -1)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument i

In [8]:
# Self Attention layer: stack n_layers self-attention blocks on the BiLSTM output.
with tf.variable_scope('Self_Attention_layer'):
    attention_output = lstm_outputs
    n_layers = 2
    for i in range(0 , n_layers):
        attention_output = self_attention(attention_output ,
                                          attention_size = ATTENTION_SIZE ,
                                          n_attention_head = N_ATTENTION_HEAD)
    # self_attention emits N_ATTENTION_HEAD * ATTENTION_SIZE features per time
    # step, so the flattened width must follow N_ATTENTION_HEAD. A hard-coded
    # factor would make the -1 in the reshape absorb the mismatch into the
    # batch axis as soon as the head count is changed.
    flat_dim = SEQUENCE_LENGTH * ATTENTION_SIZE * N_ATTENTION_HEAD
    attention_output = tf.reshape(attention_output , [-1 , flat_dim])

# Fully connected layer: one logit per review.
with tf.variable_scope('Fully_connected_layer'):
    W = tf.Variable(tf.truncated_normal([flat_dim , 1]  , stddev = 0.1))
    b = tf.Variable(tf.constant(0. , shape = [1]))
    y_pred = tf.nn.xw_plus_b(attention_output , W , b)
    # Drop the trailing unit axis so the logits line up with `target`.
    y_pred = tf.reshape(y_pred , [-1 , ])

with tf.variable_scope('optimizer'):
    # y_pred holds raw logits; the sigmoid is applied inside the loss.
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits = y_pred , labels = target))
    optimizer = tf.train.AdamOptimizer(1e-3).minimize(loss)
    # A prediction is correct when the rounded sigmoid output equals the label.
    correct = tf.cast(tf.equal(tf.round(tf.sigmoid(y_pred)) , target)  , tf.float32)
    accuracy = tf.reduce_mean(correct)

In [9]:
# Endless batch streams; each call to next() returns one (x , y) batch.
train_batch_generator = batch_generator(X = X_train , y = y_train , batch_size = BATCH_SIZE)
test_batch_generator = batch_generator(X = X_test , y = y_test , batch_size = BATCH_SIZE)

In [10]:
# A single session serves the whole run; it is not closed by this cell.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

for epoch in range(NUM_EPOCHS):
    # Running sums over the batches of this epoch; divided into means below.
    loss_train , loss_test = 0 , 0
    accuracy_train , accuracy_test = 0 , 0

    # ---- Training pass: one optimizer step per batch ----
    num_batches = X_train.shape[0] // BATCH_SIZE
    for batch_i in range(num_batches):
        x_batch , y_batch = next(train_batch_generator)
        seq_len_ = compute_sequence_length(x_batch)
        train_feed = {input_data : x_batch ,
                      target : y_batch ,
                      seq_len : seq_len_ ,
                      keep_prob : KEEP_PROB}

        loss_train_batch , acc_train_batch , _ = sess.run([loss , accuracy , optimizer] ,
                                                          feed_dict = train_feed)
        loss_train += loss_train_batch
        accuracy_train += acc_train_batch

        # Report progress on every 45th batch.
        if batch_i % 45 == 0:
            print('\n'.join(['=' * 30 ,
                             'epoch: {}'.format(epoch) ,
                             'batch_i : {}'.format(batch_i) ,
                             'train_loss : {:.2f}'.format(loss_train_batch) ,
                             'train_accuracy : {:.2%}'.format(acc_train_batch)]))

    loss_train /= num_batches
    accuracy_train /= num_batches

    # ---- Test pass: same metrics, but the optimizer op is not run ----
    num_batches = X_test.shape[0] // BATCH_SIZE
    for batch_i in range(num_batches):
        x_batch , y_batch = next(test_batch_generator)
        seq_len_ = compute_sequence_length(x_batch)
        test_feed = {input_data : x_batch ,
                     target : y_batch ,
                     seq_len : seq_len_ ,
                     keep_prob : 1.0}

        loss_test_batch , acc_test_batch = sess.run([loss , accuracy] ,
                                                    feed_dict = test_feed)

        if batch_i % 45 == 0:
            print('\n'.join(['=' * 30 ,
                             'epoch : {}'.format(epoch) ,
                             'batch_i : {}'.format(batch_i) ,
                             'test_loss : {:.2f}'.format(loss_test_batch) ,
                             'test_accuracy : {:.2%}'.format(acc_test_batch)]))
        loss_test += loss_test_batch
        accuracy_test += acc_test_batch

    loss_test /= num_batches
    accuracy_test /= num_batches

    # ---- Epoch summary: mean loss and accuracy of both passes ----
    banner = '*' * 30
    print(banner)
    print('epoch: {}'.format(epoch))
    print('train_loss_mean : {:.2f} , test_loss_mean : {:.2f}'.format(loss_train , loss_test))
    print('train_accuracy_mean : {:.2%} , test_accuracy_mean : {:.2%}'.format(accuracy_train , accuracy_test))
    print(banner)

epoch: 0
batch_i : 0
train_loss : 0.69
train_accuracy : 50.00%
epoch: 0
batch_i : 45
train_loss : 0.57
train_accuracy : 70.31%
epoch: 0
batch_i : 90
train_loss : 0.54
train_accuracy : 73.05%
epoch : 0
batch_i : 0
test_loss : 0.47
test_accuracy : 77.34%
epoch : 0
batch_i : 45
test_loss : 0.48
test_accuracy : 78.91%
epoch : 0
batch_i : 90
test_loss : 0.42
test_accuracy : 83.59%
******************************
epoch: 0
train_loss_mean : 0.56 , test_loss_mean : 0.46
train_accuracy_mean : 69.74% , test_accuracy_mean : 78.79%
******************************
epoch: 1
batch_i : 0
train_loss : 0.40
train_accuracy : 83.59%
epoch: 1
batch_i : 45
train_loss : 0.35
train_accuracy : 84.38%
epoch: 1
batch_i : 90
train_loss : 0.27
train_accuracy : 88.67%
epoch : 1
batch_i : 0
test_loss : 0.36
test_accuracy : 83.98%
epoch : 1
batch_i : 45
test_loss : 0.38
test_accuracy : 83.59%
epoch : 1
batch_i : 90
test_loss : 0.40
test_accuracy : 82.03%
******************************
epoch: 1
train_loss_mean : 0.37 , 