In [58]:
import numpy as np
import pandas as pd
import tensorflow as tf
import modutils
import pickle
import time
import sklearn, sklearn.metrics

w2v_size = 9000
w2v_file = '../DataSets/Quora/w2v_res_180119.pickle'
train_file = '../DataSets/Quora/train.csv'

In [51]:
%%time
with open(w2v_file, 'rb') as f:
    (full_dict, full_sentences, full_w2v) = pickle.load(f)
    
full_data = pd.read_csv(train_file)
print(len(full_data))

242506
Wall time: 2.62 s


In [49]:
%%time
default_if_na = full_w2v[-1]
full_res = [[full_w2v[i] for i in s if i+1 < w2v_size] for s in full_sentences]
full_res = np.array([np.mean(s, axis=0) if len(s) > 0 else default_if_na for s in full_res])

Wall time: 27 s


In [62]:
def calc_features(s1, s2, vocab_size=1000):
    st1 = set(x for x in s1 if x > vocab_size)
    st2 = set(x for x in s2 if x > vocab_size)
    nAB = len(set.intersection(st1, st2))
    nAUB = len(set.union(st1, st2))
    rAB = nAB / nAUB if nAUB > 0 else 0
    return np.array([nAB, nAUB, rAB])

In [63]:
%%time
full1 = full_sentences[:(len(full_sentences)//2)]
full2 = full_sentences[(len(full_sentences)//2):]
f0 = np.array([calc_features(full1[i], full2[i]) for i in range(len(full1))])

Wall time: 2.78 s


In [64]:
p1 = full_res[:(len(full_res)//2)]
p2 = full_res[(len(full_res)//2):]
f1 = p1 * p2
f2 = np.square(p1-2)
f3 = 0.5*(p1 + p2)
fX = np.hstack([f1,f2,f3,f0])
fY = full_data.is_duplicate.values

In [65]:
(train_X, train_Y), (valid_X, valid_Y) = modutils.splitSample((fX, fY), pcts=[0.7,0.3])

In [66]:
tf.reset_default_graph()

tf_in_x = tf.placeholder(tf.float32, shape=(None, valid_X.shape[1]))
tf_in_y = tf.placeholder(tf.int32, shape=(None))

tf_logit = tf.layers.dense(tf_in_x, 2)
tf_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf_in_y, logits=tf_logit))
tf_train = tf.train.AdamOptimizer(1e-2).minimize(tf_loss)

tf_prob = tf.nn.softmax(tf_logit)

In [67]:
num_epochs = 50
num_steps  = 5
batch_size = 2048
valid_dict = {tf_in_x: valid_X, tf_in_y: valid_Y}

with tf.Session() as tfs:
    tfs.run(tf.global_variables_initializer())
    for n in range(num_epochs):
        t0 = time.perf_counter()
        l0 = tf_loss.eval(feed_dict=valid_dict)
        for tX, tY in modutils.shuffleBatches((train_X, train_Y), batchSize=batch_size):
            train_dict = {tf_in_x: tX, tf_in_y: tY}
            tl0 = tf_loss.eval(feed_dict=train_dict)
            for i in range(num_steps):
                tf_train.run(feed_dict=train_dict)
            tl1 = tf_loss.eval(feed_dict=train_dict)
            print('{0:.3f} -> {1:.3f}'.format(tl0, tl1), end='\r')

        valid_p = tf_prob.eval(feed_dict=valid_dict)
        gini = sklearn.metrics.roc_auc_score(valid_Y, valid_p[:,1])*2-1
        accur = sklearn.metrics.accuracy_score(valid_Y, 1*(valid_p[:,1]>0.5))
        l1 = tf_loss.eval(feed_dict=valid_dict)
        t1 = time.perf_counter()
        
        print('\nEpoch {0}: {1:.3f} -> {2:.3f} in {3:.2f} sec, gini={4:.3f}, accur={5:.3f}'.format(n, l0, l1, t1-t0, gini, accur))
print('\nDone')

0.536 -> 0.535
Epoch 0: 0.662 -> 0.541 in 10.77 sec, gini=0.544, accur=0.709
0.598 -> 0.543
Epoch 1: 0.541 -> 0.531 in 10.38 sec, gini=0.570, accur=0.714
0.501 -> 0.500
Epoch 2: 0.531 -> 0.525 in 10.48 sec, gini=0.582, accur=0.719
0.528 -> 0.526
Epoch 3: 0.525 -> 0.518 in 10.49 sec, gini=0.591, accur=0.725
0.535 -> 0.566
Epoch 4: 0.518 -> 0.553 in 10.14 sec, gini=0.597, accur=0.693
0.557 -> 0.526
Epoch 5: 0.553 -> 0.513 in 10.43 sec, gini=0.601, accur=0.728
0.542 -> 0.549
Epoch 6: 0.513 -> 0.524 in 10.39 sec, gini=0.605, accur=0.723
0.531 -> 0.519
Epoch 7: 0.524 -> 0.512 in 10.33 sec, gini=0.608, accur=0.731
0.502 -> 0.515
Epoch 8: 0.512 -> 0.528 in 10.36 sec, gini=0.611, accur=0.720
0.495 -> 0.496
Epoch 9: 0.528 -> 0.509 in 9.71 sec, gini=0.612, accur=0.732
0.533 -> 0.551
Epoch 10: 0.509 -> 0.542 in 10.21 sec, gini=0.614, accur=0.703
0.505 -> 0.503
Epoch 11: 0.542 -> 0.510 in 10.38 sec, gini=0.616, accur=0.729
0.513 -> 0.507
Epoch 12: 0.510 -> 0.514 in 10.40 sec, gini=0.618, accur=0.7

KeyboardInterrupt: 