In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import modutils
import pickle
import time
import sklearn, sklearn.metrics

# Input locations, relative to the notebook's working directory:
#   src_file   - pickle that is unpacked below into a (src_data, src_vocab_size) tuple
#   label_file - CSV whose `is_duplicate` column supplies the training target
src_file = '../DataSets/Quora/tfidf_src_180124.pickle'
label_file = '../DataSets/Quora/train.csv'

In [2]:
%%time
# Unpack the pickled (src_data, src_vocab_size) tuple.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# this file must only ever come from a trusted pipeline.
with open(src_file, 'rb') as f:
    (src_data, src_vocab_size) = pickle.load(f)

Wall time: 4.86 s


In [3]:
%%time
src_full = pd.read_csv(label_file)
print(len(src_full))

242506
Wall time: 891 ms


In [4]:
# Target vector: the `is_duplicate` column as a plain numpy array.
src_target = src_full['is_duplicate'].values

In [5]:
# Split the (features, target) pair into a training part and a validation part.
# NOTE(review): pcts presumably gives the 70% / 30% share of each part, and the
# split is presumably random - confirm against modutils.splitSample.
(train_data, train_target), (valid_data, valid_target) = modutils.splitSample((src_data, src_target), pcts=[0.7,0.3])

In [38]:
def make_dense(values, size):
    """Expand sparse entries into a dense vector of length ``size``.

    Each item of ``values`` is indexable: item ``[0]`` is the position and
    item ``[1]`` is the value stored at that position. Items whose position
    is ``>= size`` are dropped. When a position occurs more than once, the
    last occurrence wins. Positions that are never set stay at 0.
    """
    dense = np.zeros(shape=(size,))
    kept = (entry for entry in values if entry[0] < size)
    for entry in kept:
        dense[entry[0]] = entry[1]
    return dense

def make_features(src, vocab_size=100, include_wset=False):
    """Build the pairwise feature matrix for a batch of rows.

    Every row of ``src`` is indexable: items ``[0]`` and ``[1]`` are the sparse
    vectors of the two texts being compared (in the format ``make_dense``
    accepts) and item ``[2]`` is an extra per-row feature vector.

    Parameters
    ----------
    src : sequence
        Rows to convert; must support ``len`` and repeated iteration.
    vocab_size : int
        Length of each densified vector; sparse positions at or beyond it are
        dropped by ``make_dense``.
    include_wset : bool
        When True, item ``[2]`` of every row is appended as extra columns.
        The default (False) reproduces the original output, which never
        included those columns.

    Returns
    -------
    numpy.ndarray
        Shape ``(len(src), 3 * vocab_size)``, plus the width of item ``[2]``
        when ``include_wset`` is True. The column groups are, in order: the
        element-wise product, the squared difference and the mean of the two
        dense vectors.
    """
    n_rows = len(src)

    def densify(column):
        # The explicit reshape keeps the result 2-D even when src is empty.
        rows = [make_dense(pair[column], size=vocab_size) for pair in src]
        return np.array(rows).reshape(n_rows, vocab_size)

    p1 = densify(0)
    p2 = densify(1)
    blocks = [p1 * p2, np.square(p1 - p2), 0.5 * (p1 + p2)]
    if include_wset:
        # Built only on request: the original computed this array on every
        # call and then discarded it.
        wset = np.array([pair[2] for pair in src])
        blocks.append(wset.reshape(n_rows, -1) if n_rows else wset.reshape(0, 0))
    return np.hstack(blocks)

In [39]:
%%time
src_vocab_size = 1000 #override vocab size
valid_set = make_features(valid_data, vocab_size=src_vocab_size)
valid_trg = np.array(valid_target).reshape(-1)

Wall time: 12 s


In [40]:
tf.reset_default_graph()

tf_in_x = tf.placeholder(tf.float32, shape=(None, valid_set.shape[1]))
tf_in_y = tf.placeholder(tf.int32, shape=(None))

tf_hidden = tf.layers.dense(tf.layers.dropout(tf_in_x), 10, activation=tf.nn.elu)
tf_logit = tf.layers.dense(tf_hidden, 2)
tf_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf_in_y, logits=tf_logit))
tf_train = tf.train.AdamOptimizer(1e-2).minimize(tf_loss)

tf_prob = tf.nn.softmax(tf_logit)

In [41]:
num_epochs = 10
num_steps  = 2
batch_size = 1024
valid_dict = {tf_in_x: valid_set, tf_in_y: valid_trg}

tfsSaver = tf.train.Saver(max_to_keep=5)

with tf.Session() as tfs:
    tfs.run(tf.global_variables_initializer())
    for n in range(num_epochs):
        t0 = time.perf_counter()
        l0 = tf_loss.eval(feed_dict=valid_dict)
        for bx, by in modutils.shuffleBatches((train_data, train_target), batchSize=batch_size):
            train_x = make_features(bx, vocab_size=src_vocab_size)
            train_y = np.array(by).reshape(-1)
            train_dict = {tf_in_x: train_x, tf_in_y: train_y}
            tl0 = tf_loss.eval(feed_dict=train_dict)
            for i in range(num_steps):
                tf_train.run(feed_dict=train_dict)
            tl1 = tf_loss.eval(feed_dict=train_dict)
            print('{0:.3f} -> {1:.3f}'.format(tl0, tl1), end='\r')

        valid_p = tf_prob.eval(feed_dict=valid_dict)
        gini = sklearn.metrics.roc_auc_score(valid_trg, valid_p[:,1])*2-1
        accur = sklearn.metrics.accuracy_score(valid_trg, 1*(valid_p[:,1]>0.5))
        l1 = tf_loss.eval(feed_dict=valid_dict)
        t1 = time.perf_counter()
        p = tfsSaver.save(tfs, '../Models/23Quora08TFIDF_v1/model-{:02d}.ckpt'.format(n))
        print('\nModel saved at checkpoint: {0}'.format(p))        
        print('Epoch {0}: {1:.3f} -> {2:.3f} in {3:.2f} sec, gini={4:.3f}, accur={5:.3f}'.format(n, l0, l1, t1-t0, gini, accur))
print('\nDone')

0.411 -> 0.355
Model saved at checkpoint: ../Models/23Quora08TFIDF_v1/model-00.ckpt
Epoch 0: 0.694 -> 0.482 in 37.99 sec, gini=0.667, accur=0.757
0.591 -> 0.522
Model saved at checkpoint: ../Models/23Quora08TFIDF_v1/model-01.ckpt
Epoch 1: 0.482 -> 0.469 in 40.08 sec, gini=0.685, accur=0.764
0.451 -> 0.387
Model saved at checkpoint: ../Models/23Quora08TFIDF_v1/model-02.ckpt
Epoch 2: 0.469 -> 0.466 in 40.08 sec, gini=0.693, accur=0.766
0.282 -> 0.244
Model saved at checkpoint: ../Models/23Quora08TFIDF_v1/model-03.ckpt
Epoch 3: 0.466 -> 0.457 in 39.49 sec, gini=0.704, accur=0.772
0.478 -> 0.395
Model saved at checkpoint: ../Models/23Quora08TFIDF_v1/model-04.ckpt
Epoch 4: 0.457 -> 0.456 in 39.31 sec, gini=0.709, accur=0.774
0.449 -> 0.369
Model saved at checkpoint: ../Models/23Quora08TFIDF_v1/model-05.ckpt
Epoch 5: 0.456 -> 0.457 in 39.53 sec, gini=0.715, accur=0.772
0.398 -> 0.328
Model saved at checkpoint: ../Models/23Quora08TFIDF_v1/model-06.ckpt
Epoch 6: 0.457 -> 0.453 in 39.82 sec, gi

### Results
1. tfidf-1000 => 72.1% gini, 78.0% accuracy
2. tfidf-300 => 66.4% gini, 75.3% accuracy
3. tfidf-100 => 61.9% gini, 73.5% accuracy
4. virtual-tfidf-5000 => 

In [None]:
#tf-idf-1000 w word-set-features, no hidden
    #=> gini on Epoch 4 is 71.2 (already overfitting), gini on Epoch 9 is 71.2
#tf-idf-1000 w/o word-set-features, no hidden
    #=> gini on Epoch 4 is 65.5 (already overfitting), gini on Epoch 9 is 65.7
#tf-idf-1000 only word-set-features, no hidden
    #=> gini on Epoch 4 is 48.9 (no overfitting), gini on Epoch 9 is 49.3

#tf-idf-1000 only word-set-features, with hidden layer of 20
    #=> gini on Epoch 4 is 50.1, gini on Epoch 9 is 50.2
#tf-idf-1000 w/o word-set-features, with hidden layer of 20
    #=> gini on Epoch 4 is 71.4 (already overfitting), gini on Epoch 9 is 72.2
#tf-idf-1000 w word-set-features, with hidden layer of 20
    #=> gini on Epoch 4 is 77.1 (already overfitting), gini on Epoch 9 is 77.2
    
#tf-idf-1000 full, with two hidden layers: 40 units with dropout 0.5, then 10 units with dropout 0.5
    #=> gini on Epoch 4 is 77.8? (already overfitting), gini on Epoch 9 is ?

In [48]:
# Score every row of src_data with the epoch-7 checkpoint, collecting the
# class-1 probability and the hidden-layer activations.
data_prob = None
data_features = None
prob_chunks = []
feature_chunks = []
with tf.Session() as tfs:
    tfsSaver.restore(tfs, '../Models/23Quora08TFIDF_v1/model-{:02d}.ckpt'.format(7))
    batch_size = 1024
    total_rows = len(src_data)
    for batch_start in range(0, total_rows, batch_size):
        # Clamp the end so the progress counter never runs past the total.
        batch_end = min(batch_start + batch_size, total_rows)
        batch_dict = {tf_in_x: make_features(src_data[batch_start:batch_end], vocab_size=src_vocab_size)}
        tmp_p, tmp_features = tfs.run([tf_prob, tf_hidden], feed_dict=batch_dict)
        # Collect per-batch results and join them once after the loop; stacking
        # inside the loop re-copies everything gathered so far on every batch.
        prob_chunks.append(tmp_p[:, 1])
        feature_chunks.append(tmp_features)
        print('{}/{}   '.format(batch_end, total_rows), end='\r')
if prob_chunks:
    data_prob = np.hstack(prob_chunks)
    data_features = np.vstack(feature_chunks)
print('\nDone')

INFO:tensorflow:Restoring parameters from ../Models/23Quora08TFIDF_v1/model-07.ckpt
242688/242506   
Done


In [60]:
# Collect element 2 of every src_data row into a single array.
wset_features = np.array([row[2] for row in src_data])

In [62]:
# Attach the model outputs to the labelled frame: the predicted probability,
# then one column per hidden-layer unit, then one column per wset feature.
src_full['tfidf_prob'] = data_prob
for k, hidden_col in enumerate(data_features.T):
    src_full['tfidf_features_{:02d}'.format(k)] = hidden_col
for k, wset_col in enumerate(wset_features.T):
    src_full['wset_features_{:02d}'.format(k)] = wset_col

In [57]:
# Inspect the dimensions of the hidden-layer activations gathered for all of src_data.
data_features.shape

(242506, 10)

In [64]:
# Write the id, the predicted probability and every generated feature column to
# a new CSV, leaving out the DataFrame index.
tfidf_cols = ['tfidf_features_{:02d}'.format(k) for k in range(data_features.shape[1])]
wset_cols = ['wset_features_{:02d}'.format(k) for k in range(wset_features.shape[1])]
src_full.to_csv('../DataSets/Quora/train_tfidf.csv',
                columns=['id', 'tfidf_prob'] + tfidf_cols + wset_cols,
                index=False)

In [61]:
# Inspect the dimensions of the wset feature matrix (element 2 of every src_data row).
wset_features.shape

(242506, 3)