In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import modutils
import pickle
import time
import sklearn, sklearn.metrics

# Not referenced by any other cell visible in this notebook.
w2v_size = 9000
# Pickle file that the loading cell unpacks into (full_dict, full_sentences, full_w2v).
w2v_file = '../DataSets/Quora/w2v_res_180119.pickle'
# CSV read with pandas; its `is_duplicate` column is used as the label vector.
train_file = '../DataSets/Quora/train.csv'

In [2]:
%%time
# NOTE: pickle.load can execute arbitrary code while deserialising -- only point
# w2v_file at an artefact you produced yourself.
with open(w2v_file, 'rb') as f:
    (full_dict, full_sentences, full_w2v) = pickle.load(f)

# full_sentences is consumed as two equally sized halves: element i of the first
# half is paired with element i of the second half.
# (Presumably question1 / question2 of row i -- confirm against the script that wrote the pickle.)
half = len(full_sentences) // 2
full_seqs = list(zip(full_sentences[:half], full_sentences[half:]))

full_data = pd.read_csv(train_file)

# The labels taken from full_data are matched to full_seqs purely by position, so a
# length mismatch would silently pair sentences with the wrong labels.
if len(full_seqs) != len(full_data):
    raise ValueError(
        'Sentence pairs ({0}) and rows in {1} ({2}) do not line up'.format(
            len(full_seqs), train_file, len(full_data)))
print(len(full_data))

242506
Wall time: 3.75 s


In [3]:
def create_seq_raw(sentence, w2v_dict, length):
    """Turn a sequence of word ids into a fixed-length stack of embedding vectors.

    Parameters
    ----------
    sentence : iterable of int
        Word ids, used directly as indices into ``w2v_dict``.
    w2v_dict : indexable collection of vectors
        ``w2v_dict[i]`` is the vector for word id ``i``; ``w2v_dict[0]`` also
        supplies the shape and dtype of the zero padding vector.
    length : int
        Number of rows in the result.

    Returns
    -------
    numpy.ndarray
        The vectors of the in-range ids, truncated to ``length`` rows and padded
        at the end with zero vectors when fewer than ``length`` ids are in range.
    """
    vocab_size = len(w2v_dict)
    # Valid indices are 0 .. vocab_size - 1. The former bound (x + 1 < vocab_size)
    # dropped the last entry, and negative ids would have wrapped around to the end.
    res = [w2v_dict[x] for x in sentence if 0 <= x < vocab_size]
    pad = np.zeros_like(w2v_dict[0])
    return np.array(res[:length] + [pad] * max(0, length - len(res)))

def create_seq(sentence, dict_size, length):
    """Encode word ids as a fixed-length array of shifted ids.

    Every id is shifted by +1 so that 0 is free to act as the padding value.
    Shifted ids that are not strictly below ``dict_size`` are discarded. The
    surviving ids are cut to ``length`` entries, or right-padded with zeros
    when fewer than ``length`` survive.
    """
    shifted = []
    for word_id in sentence:
        token = word_id + 1
        if token < dict_size:
            shifted.append(token)

    n_pad = max(0, length - len(shifted))
    return np.array(shifted[:length] + [0] * n_pad)

In [4]:
# Per-sentence count of word ids that fall below the embedding table size.
vocab_size = len(full_w2v)
lens = np.array([sum(1 for z in x if z < vocab_size) for x in full_sentences])

In [5]:
# Distribution of in-vocabulary sentence lengths, used to pick the sequence length.
np.percentile(lens, q=[50, 75, 90, 95, 99])

array([ 10.,  14.,  19.,  24.,  33.])

In [12]:
# Fixed number of tokens per question fed to the network. The percentile output
# above puts the 95th percentile of sentence lengths at 24, so at most about 5%
# of sentences get truncated at this length.
p_SeqLen = 25

In [13]:
%%time
# Row 0 of the lookup table is an all-zero padding vector, so word id k lives in row k + 1.
real_embedinng = np.vstack([np.zeros_like(full_w2v[0]), full_w2v])

# create_seq keeps a shifted id only when it is strictly below dict_size. Passing the
# row count of the padded table (len(full_w2v) + 1) keeps every word id below
# len(full_w2v), which is the same in-vocabulary test used when computing `lens`.
# Passing len(full_w2v) instead dropped the last vocabulary entry.
embedding_rows = len(real_embedinng)
data_x1 = np.array([create_seq(q1, embedding_rows, p_SeqLen) for q1, _ in full_seqs])
data_x2 = np.array([create_seq(q2, embedding_rows, p_SeqLen) for _, q2 in full_seqs])

Wall time: 3.57 s


In [14]:
# Label vector: the `is_duplicate` column as a plain NumPy array.
data_y = full_data['is_duplicate'].values

In [15]:
# 70% / 30% split of the three aligned arrays via the project helper.
# NOTE(review): whether splitSample shuffles, and with which seed, is not visible here.
train_part, valid_part = modutils.splitSample((data_x1, data_x2, data_y), pcts=[0.7, 0.3])
train_x1, train_x2, train_y = train_part
valid_x1, valid_x2, valid_y = valid_part

In [24]:
#p_RNN_SIZE = [full_w2v.shape[1]]
p_RNN_SIZE = [60]   # units of each stacked GRU layer
p_HID_SIZE = [20]   # units of each fully connected layer after the encoder


def EncoderCell(n):
    """Build one GRU layer with `n` units (ELU activation) wrapped in output dropout.

    NOTE(review): output_keep_prob is a Python constant, so this dropout is applied in
    every session run that touches the encoder -- including the validation loss and
    the predicted probabilities. Making it switchable requires a fed flag/keep-prob
    that the training loop sets, which is outside this cell.
    """
    return tf.nn.rnn_cell.DropoutWrapper(
        tf.nn.rnn_cell.GRUCell(num_units=n, activation=tf.nn.elu),
        output_keep_prob=0.5)


tf.reset_default_graph()

with tf.name_scope('Input'):
    # Frozen embedding table (a constant, so it is not trained); row 0 is the padding vector.
    tf_embedding = tf.constant(real_embedinng)
    tf_in_x1 = tf.placeholder(tf.int32, shape=(None, p_SeqLen))
    tf_in_x2 = tf.placeholder(tf.int32, shape=(None, p_SeqLen))
    tf_in_x1r = tf.nn.embedding_lookup(tf_embedding, tf_in_x1)
    tf_in_x2r = tf.nn.embedding_lookup(tf_embedding, tf_in_x2)
    # `shape=(None)` is plain None (completely unknown shape); `(None,)` declares
    # the intended rank-1 vector of class labels.
    tf_in_y = tf.placeholder(tf.int32, shape=(None,))

    # Stack both questions along the batch axis so that a single encoder with one
    # set of weights processes them in the same pass.
    tf_full_x = tf.concat([tf_in_x1r, tf_in_x2r], axis=0)

with tf.name_scope('RNN'):
    rnnEncoderCell = tf.nn.rnn_cell.MultiRNNCell([EncoderCell(s) for s in p_RNN_SIZE], state_is_tuple=True)

    _, tf_FinState0 = tf.nn.dynamic_rnn(rnnEncoderCell, inputs=tf_full_x, dtype=tf.float32, time_major=False)
    tf_FinState = tf_FinState0[-1] #get latest layer in RNN

with tf.name_scope('FC'):
    # Undo the batch-axis stacking: first half belongs to x1, second half to x2.
    tf_FinState1, tf_FinState2 = tf.split(tf_FinState, 2)
    # Pair features built from the two encodings: mean, element-wise product and
    # squared difference (all unchanged when the two questions are swapped).
    tf_FinStateC = tf.concat([0.5*(tf_FinState1+tf_FinState2),
                              tf_FinState1*tf_FinState2,
                              tf.squared_difference(tf_FinState1, tf_FinState2)], axis=1)
    tf_hid_input = tf_FinStateC
    for sz in p_HID_SIZE:
        # NOTE(review): tf.layers.dropout defaults to training=False, so this call is an
        # identity op and no dropout is applied to the dense layers. Left as-is because
        # enabling it correctly needs a training flag fed by the training loop.
        tf_hid_input = tf.layers.dense(tf.layers.dropout(tf_hid_input), sz, activation=tf.nn.elu)
    tf_logit = tf.layers.dense(tf_hid_input, 2)

with tf.name_scope('Output'):
    tf_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf_in_y, logits=tf_logit))
    tf_train = tf.train.AdamOptimizer(1e-2).minimize(tf_loss)

    tf_prob = tf.nn.softmax(tf_logit)

tffw = tf.summary.FileWriter('D:/Jupyter/Logs/00_A', tf.get_default_graph())
# The writer buffers events; flush so the graph is on disk even if the kernel is
# interrupted before the next periodic flush.
tffw.flush()

print('Graph creation complete.')

Graph creation complete.


In [29]:
num_epochs = 50
num_steps  = 2      # optimiser steps taken on each mini-batch
batch_size = 2048
model_dir = '../Models/23Quora08RNN_v1'
valid_dict = {tf_in_x1: valid_x1, tf_in_x2: valid_x2, tf_in_y: valid_y}

tfsSaver = tf.train.Saver(max_to_keep=5)

# Saver.save fails when the parent directory of the checkpoint does not exist;
# MakeDirs is a no-op if it is already there.
tf.gfile.MakeDirs(model_dir)

with tf.Session() as tfs:
    tfs.run(tf.global_variables_initializer())
    # Validation loss before any training; afterwards each epoch starts from the
    # loss measured at the end of the previous one instead of re-running the
    # whole validation set.
    l0 = tf_loss.eval(feed_dict=valid_dict)
    for n in range(num_epochs):
        t0 = time.perf_counter()
        for tX1, tX2, tY in modutils.shuffleBatches((train_x1, train_x2, train_y), batchSize=batch_size):
            train_dict = {tf_in_x1: tX1, tf_in_x2: tX2, tf_in_y: tY}
            tt0 = time.perf_counter()
            tl0 = tf_loss.eval(feed_dict=train_dict)   # batch loss before the update
            for i in range(num_steps):
                tf_train.run(feed_dict=train_dict)
            tl1 = tf_loss.eval(feed_dict=train_dict)   # batch loss after the update
            tt1 = time.perf_counter()
            # end='\r' keeps the per-batch progress on a single console line.
            print('{0:.3f} -> {1:.3f}\t{2:.2f} sec'.format(tl0, tl1, tt1-tt0), end='\r')

        # One forward pass yields both loss and probabilities, so the reported loss
        # and the gini/accuracy describe the same evaluation of the network.
        l1, valid_p = tfs.run([tf_loss, tf_prob], feed_dict=valid_dict)
        gini = sklearn.metrics.roc_auc_score(valid_y, valid_p[:,1])*2-1
        accur = sklearn.metrics.accuracy_score(valid_y, (valid_p[:,1]>0.5).astype(int))
        t1 = time.perf_counter()

        p = tfsSaver.save(tfs, '{0}/model-{1:02d}.ckpt'.format(model_dir, n))
        print('\nModel saved at checkpoint: {0}'.format(p))
        print('Epoch {0}: {1:.3f} -> {2:.3f} in {3:.2f} sec, gini={4:.3f}, accur={5:.3f}'.format(n, l0, l1, t1-t0, gini, accur))
        l0 = l1
print('\nDone')

0.503 -> 0.492	1.49 sec
Model saved at checkpoint: ../Models/23Quora08RNN_v1/model-00.ckpt
Epoch 0: 0.697 -> 0.524 in 248.01 sec, gini=0.592, accur=0.732
0.467 -> 0.458	1.56 sec
Model saved at checkpoint: ../Models/23Quora08RNN_v1/model-01.ckpt
Epoch 1: 0.524 -> 0.476 in 253.23 sec, gini=0.675, accur=0.766
0.443 -> 0.429	1.55 sec
Model saved at checkpoint: ../Models/23Quora08RNN_v1/model-02.ckpt
Epoch 2: 0.476 -> 0.447 in 256.84 sec, gini=0.719, accur=0.784
0.420 -> 0.411	1.54 sec
Model saved at checkpoint: ../Models/23Quora08RNN_v1/model-03.ckpt
Epoch 3: 0.447 -> 0.434 in 260.19 sec, gini=0.737, accur=0.792
0.409 -> 0.398	1.45 sec
Model saved at checkpoint: ../Models/23Quora08RNN_v1/model-04.ckpt
Epoch 4: 0.434 -> 0.422 in 255.21 sec, gini=0.754, accur=0.798
0.408 -> 0.392	1.56 sec
Model saved at checkpoint: ../Models/23Quora08RNN_v1/model-05.ckpt
Epoch 5: 0.422 -> 0.416 in 254.15 sec, gini=0.762, accur=0.804
0.353 -> 0.336	1.48 sec
Model saved at checkpoint: ../Models/23Quora08RNN_v1

KeyboardInterrupt: 

In [None]:
#p_SeqLen = 15
#p_RNN_SIZE = [10]
#p_HID_SIZE = 10
#on epoch 10 - gini 70, on epoch 28 gini 73.5

#p_SeqLen = 15
#p_RNN_SIZE = [20]
#p_HID_SIZE = 20
#on epoch 10 - gini 73, on epoch 28 gini 75

#p_SeqLen = 20
#p_RNN_SIZE = [20]
#p_HID_SIZE = 20
#on epoch 10 - gini 75, on epoch 28 gini 76

#p_SeqLen = 20
#p_RNN_SIZE = [60] with 0.5 dropout
#p_HID_SIZE = 60
#on epoch 10 - gini 78, on epoch 28 gini ??

#p_SeqLen = 20
#p_RNN_SIZE = [50, 50] with 0.5 dropout
#p_HID_SIZE = 50
#on epoch 10 - gini 76, on epoch 28 gini ??

#p_SeqLen = 25
#p_RNN_SIZE = [200] with 0.25 dropout
#p_HID_SIZE = [100, 20] with 0.5 dropout
#on epoch 8 - gini 78, on epoch 9 gini 0

#p_SeqLen = 25
#p_RNN_SIZE = [40] with 0.5 dropout
#p_HID_SIZE = [60, 20] with 0.5 dropout
#on epoch 10 - gini 77.7, on epoch 28 gini 0

#p_SeqLen = 25
#p_RNN_SIZE = [60] with 0.5 dropout
#p_HID_SIZE = [20] with 0.5 dropout
#on epoch 10 - gini 77.7, on epoch 28 gini 0

In [10]:
# Smoke check: ids are shifted by one and the tail is zero-padded up to the requested length.
create_seq(sentence=[1, 4], dict_size=1000, length=10)

array([ 2.,  5.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])