In [58]:
import numpy as np
import pandas as pd
import tensorflow as tf
import modutils
import pickle
import time
import sklearn, sklearn.metrics

# Pickle file that the load cell unpacks into (src_data, src_vocab_size).
src_file = '../DataSets/Quora/tfidf_src_180124.pickle'
# CSV read with pandas; its `is_duplicate` column supplies the target labels.
label_file = '../DataSets/Quora/train.csv'

In [19]:
%%time
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only point src_file at a file from a trusted source.
# The pickle is unpacked as a 2-tuple: the sample data and the vocabulary size.
with open(src_file, 'rb') as f:
    (src_data, src_vocab_size) = pickle.load(f)

In [44]:
%%time
# Read the labelled CSV into a DataFrame and print its row count as a sanity check.
src_full = pd.read_csv(label_file)
print(len(src_full))

242506
Wall time: 1.4 s


In [46]:
# Target labels: the `is_duplicate` column of the CSV as an array, in file row order.
src_target = src_full.is_duplicate.values

In [48]:
# Split samples and labels together into a training part and a validation part.
# NOTE(review): pcts=[0.7,0.3] presumably means 70% train / 30% validation;
# splitSample lives in modutils -- confirm there whether it shuffles and how it is seeded.
(train_data, train_target), (valid_data, valid_target) = modutils.splitSample((src_data, src_target), pcts=[0.7,0.3])

In [63]:
def make_dense(values, size):
    """Expand sparse ``(index, value)`` pairs into a dense 1-D vector.

    Parameters
    ----------
    values : iterable
        Items whose element ``[0]`` is an integer position and whose
        element ``[1]`` is the value to store at that position.
    size : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(size,)``. Positions not mentioned in
        ``values`` stay 0. Pairs whose index lies outside ``[0, size)``
        are skipped. If an index occurs more than once, the last pair wins.
    """
    res = np.zeros(shape=(size,))
    for entry in values:
        idx = entry[0]
        # Check both ends of the range: a negative index would otherwise wrap
        # around and silently overwrite a slot counted from the end.
        if 0 <= idx < size:
            res[idx] = entry[1]
    return res

def make_features(src, vocab_size=100):
    """Build the dense feature matrix for a batch of samples.

    Each item of ``src`` is read as ``item[0]`` and ``item[1]`` (two sparse
    vectors, each densified to ``vocab_size`` entries via ``make_dense``)
    and ``item[2]`` (extra values appended unchanged).

    The returned array stacks, column-wise and in this order:
    the element-wise product of the two dense vectors, their squared
    difference, their average, and the ``item[2]`` values.
    """
    left = np.array([make_dense(item[0], size=vocab_size) for item in src])
    right = np.array([make_dense(item[1], size=vocab_size) for item in src])

    product = np.multiply(left, right)
    delta = left - right
    squared_diff = np.square(delta)
    average = (left + right) * 0.5

    extras = np.array([item[2] for item in src])
    return np.hstack([product, squared_diff, average, extras])

In [68]:
%%time
src_vocab_size = 1000 # override the vocab size loaded from the pickle; make_dense drops indices >= this value
# Dense validation feature matrix, built once up front and reused every epoch.
valid_set = make_features(valid_data, vocab_size=src_vocab_size)
# Validation labels flattened to a 1-D array.
valid_trg = np.array(valid_target).reshape(-1)

Wall time: 2.82 s


In [69]:
# Start from an empty default graph so re-running this cell does not keep
# adding ops and variables to the previous one.
tf.reset_default_graph()

# Feature batch: any number of rows, width fixed by the validation feature matrix.
tf_in_x = tf.placeholder(tf.float32, shape=(None, valid_set.shape[1]))
# Integer class labels, one per row. Note the trailing comma: `(None,)` is a
# 1-tuple (rank 1, any length), whereas `(None)` is plain `None` and would
# leave the placeholder's shape completely unspecified.
tf_in_y = tf.placeholder(tf.int32, shape=(None,))

# A single dense layer mapping the features to 2 logits per row.
tf_logit = tf.layers.dense(tf_in_x, 2)
# Mean cross-entropy between the integer labels and the logits.
tf_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf_in_y, logits=tf_logit))
tf_train = tf.train.AdamOptimizer(1e-2).minimize(tf_loss)

# Per-class probabilities; column 1 is used for the metrics in the training cell.
tf_prob = tf.nn.softmax(tf_logit)

In [70]:
# Training schedule: passes over the training data, optimizer steps applied
# to each batch, and the batch size handed to modutils.shuffleBatches.
num_epochs = 10
num_steps  = 2
batch_size = 1024
# Feed for every validation evaluation (features and labels built earlier).
valid_dict = {tf_in_x: valid_set, tf_in_y: valid_trg}

# The variables exist only inside this session; this cell writes no checkpoint.
with tf.Session() as tfs:
    tfs.run(tf.global_variables_initializer())
    for n in range(num_epochs):
        t0 = time.perf_counter()
        # Validation loss before this epoch's updates.
        l0 = tf_loss.eval(feed_dict=valid_dict)
        # NOTE(review): shuffleBatches is defined in modutils; presumably it yields
        # shuffled (data, target) batches covering the training set once -- confirm.
        for bx, by in modutils.shuffleBatches((train_data, train_target), batchSize=batch_size):
            # Features are densified per batch rather than for the whole training set.
            train_x = make_features(bx, vocab_size=src_vocab_size)
            train_y = np.array(by).reshape(-1)
            train_dict = {tf_in_x: train_x, tf_in_y: train_y}
            # Batch loss before and after the num_steps optimizer steps on this batch.
            tl0 = tf_loss.eval(feed_dict=train_dict)
            for i in range(num_steps):
                tf_train.run(feed_dict=train_dict)
            tl1 = tf_loss.eval(feed_dict=train_dict)
            # end='\r' returns to the line start, so each batch overwrites the previous progress line.
            print('{0:.3f} -> {1:.3f}'.format(tl0, tl1), end='\r')

        # End-of-epoch validation metrics, from the probability of class 1.
        valid_p = tf_prob.eval(feed_dict=valid_dict)
        # Gini coefficient derived from ROC AUC: 2 * AUC - 1.
        gini = sklearn.metrics.roc_auc_score(valid_trg, valid_p[:,1])*2-1
        # Accuracy with a fixed 0.5 decision threshold.
        accur = sklearn.metrics.accuracy_score(valid_trg, 1*(valid_p[:,1]>0.5))
        l1 = tf_loss.eval(feed_dict=valid_dict)
        # The reported epoch time includes the validation evaluations above.
        t1 = time.perf_counter()
        
        print('\nEpoch {0}: {1:.3f} -> {2:.3f} in {3:.2f} sec, gini={4:.3f}, accur={5:.3f}'.format(n, l0, l1, t1-t0, gini, accur))
print('\nDone')

0.501 -> 0.499
Epoch 0: 0.772 -> 0.487 in 13.36 sec, gini=0.656, accur=0.748
0.492 -> 0.489
Epoch 1: 0.487 -> 0.486 in 13.80 sec, gini=0.658, accur=0.750
0.465 -> 0.463
Epoch 2: 0.486 -> 0.482 in 13.63 sec, gini=0.662, accur=0.751
0.471 -> 0.469
Epoch 3: 0.482 -> 0.482 in 13.76 sec, gini=0.663, accur=0.753
0.474 -> 0.473
Epoch 4: 0.482 -> 0.483 in 14.08 sec, gini=0.664, accur=0.754
0.473 -> 0.471
Epoch 5: 0.483 -> 0.482 in 13.76 sec, gini=0.663, accur=0.751
0.472 -> 0.470
Epoch 6: 0.482 -> 0.483 in 13.81 sec, gini=0.664, accur=0.751
0.471 -> 0.467
Epoch 7: 0.483 -> 0.482 in 13.82 sec, gini=0.664, accur=0.754
0.487 -> 0.484
Epoch 8: 0.482 -> 0.481 in 13.94 sec, gini=0.665, accur=0.754
0.494 -> 0.490
Epoch 9: 0.481 -> 0.481 in 13.69 sec, gini=0.664, accur=0.753

Done


### Results
1. tfidf-1000 => 72.1% gini, 78.0% accuracy
2. tfidf-300 => 66.4% gini, 75.3% accuracy
3. tfidf-100 => 61.9% gini, 73.5% accuracy
4. virtual-tfidf-5000 => TBD (not yet measured)