In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import modutils
import pickle
import time
import sklearn, sklearn.metrics

# Vocabulary-size setting. It is not referenced by any other visible cell —
# presumably it mirrors the word2vec preprocessing step (TODO confirm).
w2v_size = 9000
# Pickle holding a (full_dict, full_sentences, full_w2v) tuple, unpacked in the load cell.
w2v_file = '../DataSets/Quora/w2v_res_180119.pickle'
# CSV read with pandas; its `is_duplicate` column supplies the labels.
train_file = '../DataSets/Quora/train.csv'

In [4]:
%%time
# NOTE: pickle.load can execute arbitrary code while unpickling; only open
# pickles that were produced by this project's own preprocessing.
with open(w2v_file, 'rb') as f:
    (full_dict, full_sentences, full_w2v) = pickle.load(f)

# The sentence list is cut in the middle and the two halves are zipped into pairs.
# Presumably the first half holds every question1 and the second half every
# question2, in CSV row order — confirm against the code that wrote the pickle.
n_pairs = len(full_sentences) // 2
# An odd count would make zip() silently drop the trailing sentence.
assert len(full_sentences) == 2 * n_pairs, \
    'expected an even number of sentences, got {0}'.format(len(full_sentences))
full_seqs = list(zip(full_sentences[:n_pairs], full_sentences[n_pairs:]))

full_data = pd.read_csv(train_file)
# Labels are taken from full_data row by row, so the pair count has to match.
assert len(full_data) == len(full_seqs), \
    'train.csv has {0} rows but the pickle yields {1} question pairs'.format(len(full_data), len(full_seqs))
print(len(full_data))

242506
Wall time: 3.41 s


In [18]:
def create_seq(sentence, w2v_dict, length):
    """Turn a sequence of word ids into a fixed-length block of embedding rows.

    Parameters
    ----------
    sentence : iterable of int
        Word ids. Ids with ``x + 1 >= len(w2v_dict)`` are skipped.
    w2v_dict : indexable of vectors
        Embedding table; ``w2v_dict[i]`` is the vector for id ``i``.
    length : int
        Number of rows in the result. Longer sentences are truncated, shorter
        ones are padded at the end with all-zero rows.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(length,) + shape(w2v_dict[0])``, also when
        ``length`` is 0 or no id survives the filter.

    Raises
    ------
    ValueError
        If ``length`` is negative.
    """
    if length < 0:
        raise ValueError('length must be non-negative, got {0}'.format(length))

    vocab_size = len(w2v_dict)
    # NOTE(review): this bound also drops the highest id (len - 1), while the
    # sentence-length statistics cell filters with `z < len(full_w2v)`; confirm
    # which of the two is intended.
    # Truncate the ids *before* the lookup so no vector is fetched only to be discarded.
    kept_ids = [x for x in sentence if x + 1 < vocab_size][:length]

    pad_vec = np.zeros_like(w2v_dict[0])
    # Preallocating with zeros provides the padding and fixes the output shape.
    out = np.zeros((length,) + pad_vec.shape, dtype=pad_vec.dtype)
    for pos, word_id in enumerate(kept_ids):
        out[pos] = w2v_dict[word_id]
    return out

In [10]:
# Per-sentence count of word ids that are below the number of rows in full_w2v.
# NOTE(review): create_seq filters with `x + 1 < len(w2v_dict)`, one id stricter
# than the test used here — confirm which bound is intended.
lens = np.array([sum(1 for z in x if z < len(full_w2v)) for x in full_sentences])

In [11]:
# Median and upper percentiles of the sentence lengths computed above.
np.percentile(lens, q=[50, 75, 90, 95, 99])

array([ 10.,  14.,  19.,  24.,  33.])

In [21]:
# Fixed number of time steps per question: create_seq truncates/pads to this
# length and the input placeholders use it as their time dimension.
# 15 lies between the 75th (14) and 90th (19) percentile of the sentence
# lengths printed by the percentile cell.
p_SeqLen = 15

In [22]:
%%time
# Build one fixed-length embedding block per question, separately for each side of a pair.
data_x1 = np.array([create_seq(first, full_w2v, p_SeqLen) for first, _second in full_seqs])
data_x2 = np.array([create_seq(second, full_w2v, p_SeqLen) for _first, second in full_seqs])

Wall time: 1min 6s


In [23]:
# Labels: the `is_duplicate` column of the training CSV as a plain array.
data_y = full_data['is_duplicate'].values

In [24]:
# Split the three row-aligned arrays into a training and a validation part.
# NOTE(review): pcts presumably requests a 70% / 30% split; splitSample is defined
# in modutils — confirm whether it shuffles and whether that shuffle is seeded.
(train_x1, train_x2, train_y), (valid_x1, valid_x2, valid_y) = modutils.splitSample(
    (data_x1, data_x2, data_y),
    pcts=[0.7, 0.3],
)

In [36]:
# Hyper-parameters. p_RNN_SIZE holds one entry per stacked GRU layer;
# p_HID_SIZE is the width of the hidden dense layer.
# Alternative kept for reference: p_RNN_SIZE = [full_w2v.shape[1]]
p_RNN_SIZE = [10]
p_HID_SIZE = 10


def EncoderCell(n):
    """Return a GRU cell with `n` units and ELU activation."""
    return tf.nn.rnn_cell.GRUCell(num_units=n, activation=tf.nn.elu)


tf.reset_default_graph()

with tf.name_scope('Input'):
    # Both questions of a pair: (batch, p_SeqLen, embedding width).
    tf_in_x1 = tf.placeholder(tf.float32, shape=(None, p_SeqLen, full_w2v.shape[1]))
    tf_in_x2 = tf.placeholder(tf.float32, shape=(None, p_SeqLen, full_w2v.shape[1]))
    # One integer class label per pair. `(None,)` declares a 1-D tensor;
    # `(None)` is just `None` and would leave the rank unspecified.
    tf_in_y = tf.placeholder(tf.int32, shape=(None,))

    # Stack both questions along the batch axis so one encoder pass with
    # shared weights handles them together.
    tf_full_x = tf.concat([tf_in_x1, tf_in_x2], axis=0)

with tf.name_scope('RNN'):
    rnnEncoderCell = tf.nn.rnn_cell.MultiRNNCell([EncoderCell(s) for s in p_RNN_SIZE], state_is_tuple=True)

    # NOTE(review): no sequence_length is passed, so the GRU also steps over the
    # all-zero padding rows appended by create_seq; the final state is the state
    # after those padded steps.
    _, tf_FinState0 = tf.nn.dynamic_rnn(rnnEncoderCell, inputs=tf_full_x, dtype=tf.float32, time_major=False)
    tf_FinState = tf_FinState0[-1]  # final state of the last (top) RNN layer

with tf.name_scope('FC'):
    # Undo the batch-axis concat: first half belongs to x1, second half to x2.
    tf_FinState1, tf_FinState2 = tf.split(tf_FinState, 2)
    # Pair features built only from order-independent combinations of the two
    # encodings: mean, element-wise product and squared difference.
    tf_FinStateC = tf.concat([0.5*(tf_FinState1+tf_FinState2),
                              tf_FinState1*tf_FinState2,
                              tf.squared_difference(tf_FinState1, tf_FinState2)], axis=1)
    tf_logit0 = tf.layers.dense(tf_FinStateC, p_HID_SIZE, activation=tf.nn.elu)
    tf_logit = tf.layers.dense(tf_logit0, 2)

with tf.name_scope('Output'):
    tf_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf_in_y, logits=tf_logit))
    tf_train = tf.train.AdamOptimizer(1e-2).minimize(tf_loss)

    # Column 1 of the softmax output is the probability of label 1.
    tf_prob = tf.nn.softmax(tf_logit)

# NOTE(review): hard-coded absolute log directory; consider moving it next to
# the other path constants.
tffw = tf.summary.FileWriter('D:/Jupyter/Logs/00_A', tf.get_default_graph())
# The writer buffers events; flush so the graph definition is on disk right away.
tffw.flush()

print('Graph creation complete.')

Graph creation complete.


In [37]:
# Training schedule.
num_epochs = 50      # passes over the training set
num_steps  = 2       # optimizer steps taken on each mini-batch
batch_size = 2048
# The whole validation set is fed in one go.
valid_dict = {tf_in_x1: valid_x1, tf_in_x2: valid_x2, tf_in_y: valid_y}

with tf.Session() as tfs:
    tfs.run(tf.global_variables_initializer())

    # Validation loss before any training. From then on the loss at the start
    # of an epoch is the loss measured at the end of the previous one (no
    # variables change in between), so it is carried over rather than
    # recomputed with another full validation pass.
    l0 = tfs.run(tf_loss, feed_dict=valid_dict)

    for n in range(num_epochs):
        t0 = time.perf_counter()
        for tX1, tX2, tY in modutils.shuffleBatches((train_x1, train_x2, train_y), batchSize=batch_size):
            train_dict = {tf_in_x1: tX1, tf_in_x2: tX2, tf_in_y: tY}
            tt0 = time.perf_counter()
            tl0 = tfs.run(tf_loss, feed_dict=train_dict)
            for _step in range(num_steps):
                tfs.run(tf_train, feed_dict=train_dict)
            tl1 = tfs.run(tf_loss, feed_dict=train_dict)
            tt1 = time.perf_counter()
            # '\r' keeps the per-batch progress on a single, overwritten line.
            print('{0:.3f} -> {1:.3f}\t{2:.2f} sec'.format(tl0, tl1, tt1-tt0), end='\r')

        # One validation pass yields both the probabilities and the loss.
        valid_p, l1 = tfs.run([tf_prob, tf_loss], feed_dict=valid_dict)
        # Gini coefficient derived from ROC AUC: 2 * AUC - 1.
        gini = sklearn.metrics.roc_auc_score(valid_y, valid_p[:,1])*2-1
        accur = sklearn.metrics.accuracy_score(valid_y, 1*(valid_p[:,1]>0.5))
        t1 = time.perf_counter()

        print('\nEpoch {0}: {1:.3f} -> {2:.3f} in {3:.2f} sec, gini={4:.3f}, accur={5:.3f}'.format(n, l0, l1, t1-t0, gini, accur))
        l0 = l1
print('\nDone')

0.549 -> 0.548	2.24 sec
Epoch 0: 0.702 -> 0.540 in 364.02 sec, gini=0.555, accur=0.723
0.520 -> 0.519	2.35 sec
Epoch 1: 0.540 -> 0.521 in 361.67 sec, gini=0.602, accur=0.733
0.492 -> 0.485	2.17 sec
Epoch 2: 0.521 -> 0.498 in 381.93 sec, gini=0.635, accur=0.750
0.489 -> 0.480	2.22 sec
Epoch 3: 0.498 -> 0.498 in 373.73 sec, gini=0.646, accur=0.750
0.495 -> 0.495	2.34 sec
Epoch 4: 0.498 -> 0.488 in 357.16 sec, gini=0.657, accur=0.757
0.483 -> 0.482	1.25 sec sec
Epoch 5: 0.488 -> 0.482 in 35309.83 sec, gini=0.664, accur=0.761
0.498 -> 0.496	2.32 sec
Epoch 6: 0.482 -> 0.479 in 379.38 sec, gini=0.674, accur=0.764
0.465 -> 0.460	2.23 sec
Epoch 7: 0.479 -> 0.468 in 385.19 sec, gini=0.686, accur=0.771
0.428 -> 0.425	2.22 sec
Epoch 8: 0.468 -> 0.466 in 382.35 sec, gini=0.691, accur=0.773
0.456 -> 0.458	2.19 sec
Epoch 9: 0.466 -> 0.464 in 374.08 sec, gini=0.696, accur=0.774
0.434 -> 0.430	2.22 sec
Epoch 10: 0.464 -> 0.458 in 357.68 sec, gini=0.704, accur=0.777
0.434 -> 0.432	2.29 sec
Epoch 11: 0.

KeyboardInterrupt: 

In [None]:
# Results recorded for the configuration
#   p_RNN_SIZE = [10], p_HID_SIZE = 10:
# validation gini reached about 0.70 at epoch 10 and about 0.735 at epoch 28.