In [1]:
from __future__ import print_function
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.framework.ops import reset_default_graph
from tensorflow.contrib.layers import fully_connected, batch_norm
import tensorflow.contrib.rnn as rnn

from utils import *

Run the following code to apply the slurm restriction

In [2]:
import os
NUM_THREADS = int(os.environ['OMP_NUM_THREADS'])
sess = tf.Session(config=tf.ConfigProto(
    intra_op_parallelism_threads=NUM_THREADS,
    inter_op_parallelism_threads=NUM_THREADS))

Convenient functions to show the running time

In [3]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

All the hyperparameters

In [4]:
batch_size = 64
number_inputs = 66
number_outputs = 8
seq_len = 400 # max 700

Load all the data

In [5]:
# Get all the data
trainList_addr = './data/trainList'
validList_addr = './data/validList'
testList_addr = './data/testList'

start = time.time()
train_list, train_len_list = read_list(trainList_addr)
valid_list, valid_len_list = read_list(validList_addr)
test_list, test_len_list = read_list(testList_addr)

train_generator = generate_batch(train_list, train_len_list,
                                 max_seq_length=seq_len,
                                 batch_size=batch_size)

X_train, t_train, len_train, mask_train = train_generator.next()
# X_train, t_train, len_train = read_data(train_list, train_len_list,
#                                         max_seq_length=seq_len)
X_valid, t_valid, len_valid, mask_valid = read_data(valid_list, valid_len_list,
                                        max_seq_length=max(valid_len_list))
X_test, t_test, len_test, mask_test = read_data(test_list, test_len_list,
                                     max_seq_length=max(test_len_list))

timeSpent = time.time() - start
print("Time spent loading data: {}".format(asMinutes(timeSpent)))
print("X_train:", X_train.shape)
print("X_valid:", X_valid.shape)
print("X_test:", X_test.shape)

Time spent loading data: 0m 41s
X_train: (64, 400, 66)
X_valid: (1267, 683, 66)
X_test: (1267, 687, 66)


In [6]:
print(t_test.shape)
print(mask_test.shape)

(1267, 687)
(1267, 687)


In [7]:
learning_rate = 0.001
num_layers = 3
state_size = 100
num_units_l1 = 100
dropout = False
dropout_keep_rate = 0.5
clip_gradients = True
max_grad_norm = 5

reset_default_graph()

X_input = tf.placeholder(tf.float32, shape=[None, None, number_inputs], name='X_input')
X_length = tf.placeholder(tf.int32, shape=[None,], name='X_length')
t_input = tf.placeholder(tf.int32, shape=[None, None], name='t_input')
X_mask = tf.placeholder(tf.int32, shape=[None, None], name='X_mask')

def GRU_with_dropout(dropout=True):
    if dropout:
        return rnn.DropoutWrapper(rnn.GRUCell(state_size),
                                  output_keep_prob=dropout_keep_rate)
    else:
        return rnn.GRUCell(state_size)

cells = rnn.MultiRNNCell([GRU_with_dropout(dropout=dropout) for _ in range(num_layers)],
                         state_is_tuple=True)
# print(cells.state_size)
# init_state = (tf.zeros([batch_size, state_size]), ) * 3
rnn_outputs, output_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=cells, cell_bw=cells, inputs=X_input,
                                                 sequence_length=X_length, 
#                                                  initial_state_fw=init_state, initial_state_bw=init_state,
                                                 dtype=tf.float32)
# print(output_state[0][0].shape)
enc_outputs = tf.concat(rnn_outputs, 2)
outputs = tf.reshape(enc_outputs, [-1, state_size*2])
l1 = fully_connected(outputs, num_units_l1, normalizer_fn=batch_norm)
l_out = fully_connected(l1, number_outputs, activation_fn=None)

batch_size_shp = tf.shape(enc_outputs)[0]
seq_len_shp = tf.shape(enc_outputs)[1]
l_out_reshape = tf.reshape(l_out, [batch_size_shp, seq_len_shp, number_outputs])

y = l_out_reshape

The following code use softmax_cross_entropy_with_logits to resolve the NaN problem.

**This piece is working!!!**

The following code uses sparse_softmax_cross_entropy_with_logits but with number_outputs = 9. 

In [9]:
# Derive the mask to filter out zero-padding when calculating loss and accuracy
X_mask = tf.to_float(X_mask)
mask_sum = tf.reduce_sum(X_mask)
# Calculate loss
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=t_input, logits=y)
# print(cross_entropy.dtype)
cross_entropy *= X_mask
# tf.reduce_mean cannot be used because it will count zero-padding for weights.
loss = tf.reduce_sum(cross_entropy) / mask_sum
# Calculate accuracy. Need to exclude the padded zeros.

predictions = tf.to_int32(tf.argmax(y, 2))
correct = tf.to_float(tf.equal(predictions, t_input))
total_correct_preds = tf.reduce_sum(correct * X_mask)
accuracy =  total_correct_preds / mask_sum

# use global step to keep track of our iterations
global_step = tf.Variable(0, name='global_step', trainable=False)
# pick optimizer, try momentum or adadelta
optimizer = tf.train.AdamOptimizer(learning_rate)
# extract gradients for each variable
grads_and_vars = optimizer.compute_gradients(loss)
grads = [element[0] for element in grads_and_vars]
variables =  [element[1] for element in grads_and_vars]
if clip_gradients:
    grads = tf.clip_by_global_norm(grads, max_grad_norm)[0]
    
grad_norm = tf.global_norm(grads)
grads_and_vars = [(grads[i], variables[i]) for i in range(len(grads))]
# apply gradients and make trainable function
train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

In [None]:
# print("=" * 10 + "validating the model"+ "=" * 10)
# # test validation part
# # sess.run(tf.global_variables_initializer())
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     feed_dict = {X_input: X_valid, X_length: len_valid,
#                  t_input: t_valid, X_mask: mask_valid}
#     fetches = [y, t_input, predictions, cross_entropy, enc_outputs, rnn_outputs]
#     res = tuple(sess.run(fetches=fetches, feed_dict=feed_dict))
#     print("y:", res[0].shape)
#     print("t_input", res[1].shape)
#     print("predictions", res[2].shape)
#     print("Cross Entropy", res[3].shape)
#     print("enc_outputs", res[4].shape)
#     print("rnn_outputs", res[5][0].shape, res[5][1].shape)
#     # print(len_valid[0])
#     # print(res[5][0][0, 144:147, 0:10])
# print("=" * 10 + "Model validation finished"+ "=" * 10)

In [23]:
def evaluate(X_data, y_data, len_data, mask_data):
    num_examples = X_data.shape[0]
    sess = tf.get_default_session()
    total_correct = 0
    for offset in range(0, num_examples, batch_size):
        batch_x = X_data[offset:offset+batch_size, :, :]
        batch_y = y_data[offset:offset+batch_size, :]
        batch_len = len_data[offset:offset+batch_size]
        batch_mask = mask_data[offset:offset+batch_size, :]
        
        feed = {X_input: batch_x, X_length: batch_len,
                t_input: batch_y, X_mask: batch_mask}
        
        res = sess.run(total_correct_preds, feed_dict=feed_dict_val)
        total_correct += res
    return total_correct / np.sum(mask_data)

In [24]:
val_interval = batch_size * 5
print_interval = batch_size * 10
samples_to_process = 5e3
early_stopping = True
patience = 4
patience_count = 0

samples_processed = 0
samples_val = []
costs, accs_val, grads_norm = [], [], []
saver1 = tf.train.Saver()

sess = tf.Session()
sess.run(tf.global_variables_initializer())
writer = tf.summary.FileWriter('./graphs', sess.graph)
while samples_processed < samples_to_process:
    fetches_tr = [train_op, loss, accuracy, grad_norm]
    feed_dict_tr = {X_input: X_train, X_length: len_train,
                    t_input: t_train, X_mask: mask_train}
    res = tuple(sess.run(fetches=fetches_tr, feed_dict=feed_dict_tr))
    _, batch_cost, batch_acc, batch_grad_norm = res
    samples_processed += batch_size
    grads_norm += [batch_grad_norm]

    #validation data
    if samples_processed % val_interval == 0:
        acc_val = evaluate(X_valid, t_valid, len_valid, mask_valid)
        costs += [batch_cost]
        samples_val += [samples_processed]
        accs_val += [acc_val]

        if samples_processed % print_interval == 0:
            print("samples_processed: %d, batch_cost: %.3f, validation_accs: %.4f, patience_count: %d" % \
                  (samples_processed, batch_cost, acc_val, patience_count))

        if early_stopping:
            if len(accs_val) > patience and acc_val < accs_val[-2]:
                patience_count += 1
            if patience_count >= patience:
                break

saver1.save(sess, './bi_gru_300_dp')
writer.close()

samples_processed: 640, batch_cost: 1.433, validation_accs: 0.5035, patience_count: 0
samples_processed: 1280, batch_cost: 1.163, validation_accs: 0.6197, patience_count: 0
samples_processed: 1920, batch_cost: 0.982, validation_accs: 0.6684, patience_count: 0
samples_processed: 2560, batch_cost: 0.910, validation_accs: 0.6876, patience_count: 0
samples_processed: 3200, batch_cost: 0.873, validation_accs: 0.7014, patience_count: 0
samples_processed: 3840, batch_cost: 0.845, validation_accs: 0.7052, patience_count: 0
samples_processed: 4480, batch_cost: 0.830, validation_accs: 0.7079, patience_count: 1


RuntimeError: Attempted to use a closed Session.

In [None]:
# fetches_test = [accuracy, loss]
# feed_dict_test = {X_input: X_test, X_length: len_test,
#                   t_input: t_test, X_mask: mask_test}
# res_test = tuple(sess.run(fetches=fetches_test, feed_dict=feed_dict_test))

# acc_test, loss_test = res_test
# print("Test Accuracy: {:.4f}".format(acc_test))
# # print(type(gradients_and_vars))
# # print(predictions[0, :])
# # print(t_test[0, :])

In [None]:
# with open("feat66_len300_dynamic_padding.txt","w") as f:
#     for (sample_val, acc_val, cost) in zip(samples_val, accs_val, costs):
#         f.write("{0},{1},{2}\n".format(sample_val, acc_val, cost))

In [None]:
fig, ax1 = plt.subplots()
plt.plot(samples_val, accs_val, 'b-')
ax1.set_ylabel('Validation Accuracy', fontsize=15)
ax1.set_xlabel('Processed samples', fontsize=15)
plt.title('Accuracy & Cost with seq_len=300, dynamic padding', fontsize=20)
ax2 = ax1.twinx()
ax2.plot(samples_val, costs, 'r-')
ax2.set_ylabel('Training Cost', fontsize=15)
plt.grid('on')
plt.savefig("out.png")
plt.show()

In [None]:
print(len(grads_norm))
plt.plot(np.arange(len(grads_norm)), grads_norm)
plt.show()

The following code is used to plot the stored information for old experiment and is not being used currently.

In [None]:
# samples_30, accs_30, costs_30 = [], [], []
# with open("feat66_len30.txt","r") as f:
#     for line in f:
#         line = line.split(',')
#         samples_30.append(line[0])
#         accs_30.append(line[1])
#         costs_30.append(line[2])

In [None]:
# fig, ax1 = plt.subplots()
# plt.plot(samples_30, accs_30, 'b-')
# ax1.set_ylabel('Validation Accuracy', fontsize=15)
# ax1.set_xlabel('Processed samples', fontsize=15)
# plt.title('Accuracy & Cost with seq_len=30', fontsize=20)
# ax2 = ax1.twinx()
# ax2.plot(samples_30, costs_30, 'r-')
# ax2.set_ylabel('Training Cost', fontsize=15)
# plt.grid('on')
# plt.savefig("out.png")
# plt.show()