In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import modutils
import pickle
import time
import sklearn, sklearn.metrics, sklearn.linear_model, sklearn.ensemble

# Input CSV locations, relative to the notebook's working directory:
# the raw Quora training set plus the RNN and TF-IDF feature exports.
train_file, rnn_file, tfidf_file = (
    '../DataSets/Quora/train.csv',
    '../DataSets/Quora/train_rnn.csv',
    '../DataSets/Quora/train_tfidf.csv',
)

In [3]:
%%time
# Read the three input tables in one pass (raw questions, RNN outputs, TF-IDF outputs);
# the files are read in the same order as before.
src_train, rnn_train, tfidf_train = map(pd.read_csv, (train_file, rnn_file, tfidf_file))

Wall time: 6.36 s


In [7]:
# Seed the combined frame with the identifier, both question texts and the label.
# .copy() gives an independent frame, so columns added later do not touch src_train.
full_train = src_train.loc[:, ['id', 'question1', 'question2', 'is_duplicate']].copy()

In [9]:
# Attach each upstream model's probability column under its original name.
# NOTE(review): column assignment aligns on the row index, so this presumably relies on
# all three CSVs sharing the same row order -- confirm against how they were exported.
for prob_column, prob_source in (('rnn_prob', rnn_train), ('tfidf_prob', tfidf_train)):
    full_train[prob_column] = prob_source[prob_column]

In [12]:
# Column names of the stacked-model inputs: zero-padded, two-digit suffixes per group.
rnn_features = list(map('rnn_features_{:02d}'.format, range(20)))
tfidf_features = list(map('tfidf_features_{:02d}'.format, range(10)))
wset_features = list(map('wset_features_{:02d}'.format, range(3)))

# Full feature list in a fixed order: RNN, then TF-IDF, then word-set columns.
all_features = sum([rnn_features, tfidf_features, wset_features], [])

In [13]:
# Copy every feature column into full_train, one column at a time and in the same order
# as all_features. RNN features come from rnn_train; the TF-IDF and word-set features
# are both read from tfidf_train.
feature_sources = (
    (rnn_train, rnn_features),
    (tfidf_train, tfidf_features + wset_features),
)
for source_frame, column_names in feature_sources:
    for column in column_names:
        full_train[column] = source_frame[column]

In [19]:
# Partition the combined frame into train / validation / test parts.
# NOTE(review): splitSample lives in modutils (not shown); presumably pcts are the
# 50/30/20 share of rows per part, and it is unclear whether the split is seeded -- confirm.
data_train, data_valid, data_test = modutils.splitSample(full_train, pcts=[0.5, 0.3, 0.2])

# Plain NumPy feature matrices for each part, with columns ordered as in all_features.
Xtrain, Xvalid, Xtest = (
    part[all_features].values
    for part in (data_train, data_valid, data_test)
)

In [26]:
%%time
# Gradient-boosted trees on the stacked features: 50 boosting stages, and a node needs
# at least 50 samples before it may be split. fit() returns the fitted estimator.
fmodel = sklearn.ensemble.GradientBoostingClassifier(
    n_estimators=50,
    min_samples_split=50,
).fit(X=Xtrain, y=data_train['is_duplicate'].values)

Wall time: 54.1 s


In [30]:
# Gini coefficient (2 * ROC-AUC - 1) of the current fmodel on the train and validation
# parts, scored on the predicted probability of the positive (is_duplicate == 1) class.
gini_train, gini_valid = (
    2 * sklearn.metrics.roc_auc_score(part['is_duplicate'].values, fmodel.predict_proba(features)[:, 1]) - 1
    for part, features in ((data_train, Xtrain), (data_valid, Xvalid))
)
print('Train: {:.3f}, Valid: {:.3f}'.format(gini_train, gini_valid))

Train: 0.850, Valid: 0.847


In [47]:
%%time
# Logistic regression with scikit-learn's default settings as a linear baseline.
# Rebinds fmodel, so the evaluation cell that follows scores this model.
fmodel = sklearn.linear_model.LogisticRegression().fit(
    X=Xtrain,
    y=data_train['is_duplicate'].values,
)

Wall time: 4.01 s


In [32]:
# Gini coefficient (2 * ROC-AUC - 1) of the current fmodel on the train and validation
# parts, scored on the predicted probability of the positive (is_duplicate == 1) class.
gini_train, gini_valid = (
    2 * sklearn.metrics.roc_auc_score(part['is_duplicate'].values, fmodel.predict_proba(features)[:, 1]) - 1
    for part, features in ((data_train, Xtrain), (data_valid, Xvalid))
)
print('Train: {:.3f}, Valid: {:.3f}'.format(gini_train, gini_valid))

Train: 0.855, Valid: 0.857


In [35]:
%%time
# Small random forest: 10 trees, and a node needs at least 200 samples to be split.
# NOTE(review): no random_state is passed, so the fitted forest (and the Gini values
# reported for it) will presumably differ from run to run -- consider fixing a seed.
fmodel = sklearn.ensemble.RandomForestClassifier(
    n_estimators=10,
    min_samples_split=200,
).fit(X=Xtrain, y=data_train['is_duplicate'].values)

Wall time: 12.8 s


In [36]:
# Gini coefficient (2 * ROC-AUC - 1) of the current fmodel on the train and validation
# parts, scored on the predicted probability of the positive (is_duplicate == 1) class.
gini_train, gini_valid = (
    2 * sklearn.metrics.roc_auc_score(part['is_duplicate'].values, fmodel.predict_proba(features)[:, 1]) - 1
    for part, features in ((data_train, Xtrain), (data_valid, Xvalid))
)
print('Train: {:.3f}, Valid: {:.3f}'.format(gini_train, gini_valid))

Train: 0.880, Valid: 0.841


In [37]:
%%time
# Gradient boosting again, but with a float min_samples_split; scikit-learn documents a
# float here as a fraction of the training samples rather than an absolute count.
fmodel = sklearn.ensemble.GradientBoostingClassifier(
    n_estimators=50,
    min_samples_split=0.2,
).fit(X=Xtrain, y=data_train['is_duplicate'].values)

Wall time: 29.9 s


In [38]:
# Gini coefficient (2 * ROC-AUC - 1) of the current fmodel on the train and validation
# parts, scored on the predicted probability of the positive (is_duplicate == 1) class.
gini_train, gini_valid = (
    2 * sklearn.metrics.roc_auc_score(part['is_duplicate'].values, fmodel.predict_proba(features)[:, 1]) - 1
    for part, features in ((data_train, Xtrain), (data_valid, Xvalid))
)

print('Train: {:.3f}, Valid: {:.3f}'.format(gini_train, gini_valid))

Train: 0.847, Valid: 0.844


In [48]:
# Validation log-loss of whatever estimator fmodel currently refers to, using the
# predicted probability of the positive class.
# NOTE(review): the execution counts suggest this ran right after the logistic-regression
# cell (In [47] -> In [48]), so the recorded value likely belongs to that model rather
# than to the gradient-boosting fit directly above -- re-run top to bottom to confirm.
sklearn.metrics.log_loss(
    y_true=data_valid['is_duplicate'].values,
    y_pred=fmodel.predict_proba(Xvalid)[:, 1],
)

0.32618728787230672

In [71]:
# Widths of the hidden dense layers, in order; one entry means one hidden layer.
HID_SIZE = [20]

# Start from an empty default graph so re-running this cell does not pile up duplicate ops.
tf.reset_default_graph()

# Inputs: a float feature matrix with one column per stacked feature, and integer labels.
tf_in_x = tf.placeholder(tf.float32, shape=(None, Xtrain.shape[1]))
# `(None)` is just `None` (no trailing comma), which leaves the label shape completely
# unspecified; `(None,)` declares the intended 1-D vector of per-row class ids, so a
# wrongly shaped feed is rejected at feed time.
tf_in_y = tf.placeholder(tf.int32, shape=(None,))
# Training-mode flag, False unless fed. No op defined in this cell consumes it (there is
# no dropout or batch norm); it is kept because the training loop feeds it.
tf_in_training = tf.placeholder_with_default(False, shape=())

# Stack of ELU-activated dense layers followed by a 2-way linear output layer.
tf_hidden = tf_in_x
for sz in HID_SIZE:
    tf_hidden = tf.layers.dense(tf_hidden, sz, activation=tf.nn.elu)
tf_logit = tf.layers.dense(tf_hidden, 2)

# Mean sparse softmax cross-entropy over the batch, minimised with Adam (learning rate 1e-2).
tf_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf_in_y, logits=tf_logit))
tf_train = tf.train.AdamOptimizer(1e-2).minimize(tf_loss)

# Class probabilities; column 1 is the probability of class 1.
tf_prob = tf.nn.softmax(tf_logit)

In [75]:
# Training schedule: passes over the training data, optimizer steps taken on each
# mini-batch, and mini-batch size.
num_epochs = 10
num_steps  = 2
batch_size = 1024

# Validation labels and the feed reused for every validation-set evaluation.
Yvalid = data_valid.is_duplicate.values
valid_dict = {tf_in_x: Xvalid, tf_in_y: Yvalid}

#tfsSaver = tf.train.Saver(max_to_keep=5)

with tf.Session() as tfs:
    tfs.run(tf.global_variables_initializer())

    for epoch in range(num_epochs):
        epoch_start = time.perf_counter()
        # Validation loss before this epoch's updates.
        loss_before = tfs.run(tf_loss, feed_dict=valid_dict)

        batches = modutils.shuffleBatches((Xtrain, data_train.is_duplicate.values), batchSize=batch_size)
        for batch_x, batch_y in batches:
            batch_feed = {tf_in_x: batch_x, tf_in_y: batch_y, tf_in_training: True}
            # Batch loss before and after the optimizer steps, shown on a single
            # self-overwriting progress line (end='\r').
            batch_loss_before = tfs.run(tf_loss, feed_dict=batch_feed)
            for _ in range(num_steps):
                tfs.run(tf_train, feed_dict=batch_feed)
            batch_loss_after = tfs.run(tf_loss, feed_dict=batch_feed)
            print('{0:.3f} -> {1:.3f}'.format(batch_loss_before, batch_loss_after), end='\r')

        # End-of-epoch validation metrics: Gini (2 * AUC - 1), accuracy at a 0.5
        # threshold, and the validation loss after the updates.
        valid_p = tfs.run(tf_prob, feed_dict=valid_dict)
        gini = 2 * sklearn.metrics.roc_auc_score(Yvalid, valid_p[:, 1]) - 1
        accur = sklearn.metrics.accuracy_score(Yvalid, (valid_p[:, 1] > 0.5).astype(int))
        loss_after = tfs.run(tf_loss, feed_dict=valid_dict)
        epoch_end = time.perf_counter()
        #p = tfsSaver.save(tfs, '../Models/23Quora08TFIDF_v1/model-{:02d}.ckpt'.format(n))
        #print('\nModel saved at checkpoint: {0}'.format(p))
        print('\nEpoch {0}: {1:.3f} -> {2:.3f} in {3:.2f} sec, gini={4:.3f}, accur={5:.3f}'.format(
            epoch, loss_before, loss_after, epoch_end - epoch_start, gini, accur))

    # Final scoring while the session (and therefore the trained weights) is still open:
    # positive-class probabilities for the test part and for every row of full_train.
    test_p = tfs.run(tf_prob, feed_dict={tf_in_x: Xtest})[:, 1]
    full_p = tfs.run(tf_prob, feed_dict={tf_in_x: full_train[all_features].values})[:, 1]
    gini = 2 * sklearn.metrics.roc_auc_score(data_test.is_duplicate.values, test_p) - 1
    print('Gini on test is {}'.format(gini))
print('\nDone')

0.322 -> 0.322
Epoch 0: 1.476 -> 0.326 in 0.93 sec, gini=0.856, accur=0.849
0.361 -> 0.355
Epoch 1: 0.326 -> 0.322 in 0.89 sec, gini=0.860, accur=0.850
0.321 -> 0.314
Epoch 2: 0.322 -> 0.321 in 0.91 sec, gini=0.861, accur=0.851
0.326 -> 0.321
Epoch 3: 0.321 -> 0.321 in 0.91 sec, gini=0.862, accur=0.851
0.350 -> 0.346
Epoch 4: 0.321 -> 0.319 in 0.90 sec, gini=0.862, accur=0.853
0.333 -> 0.329
Epoch 5: 0.319 -> 0.321 in 0.89 sec, gini=0.863, accur=0.851
0.341 -> 0.338
Epoch 6: 0.321 -> 0.320 in 0.89 sec, gini=0.862, accur=0.852
0.280 -> 0.282
Epoch 7: 0.320 -> 0.319 in 0.87 sec, gini=0.863, accur=0.852
0.335 -> 0.329
Epoch 8: 0.319 -> 0.319 in 0.90 sec, gini=0.864, accur=0.853
0.317 -> 0.312
Epoch 9: 0.319 -> 0.319 in 0.90 sec, gini=0.864, accur=0.853
Gini on test is 0.8628088746235758

Done


In [84]:
# True duplicates that the network scored below 1%: the most confident misses.
# full_p holds one positive-class probability per row of full_train, in row order.
full_train.loc[(full_train['is_duplicate'] == 1) & (full_p < 0.01)]

Unnamed: 0,id,question1,question2,is_duplicate,rnn_prob,tfidf_prob,rnn_features_00,rnn_features_01,rnn_features_02,rnn_features_03,...,tfidf_features_03,tfidf_features_04,tfidf_features_05,tfidf_features_06,tfidf_features_07,tfidf_features_08,tfidf_features_09,wset_features_00,wset_features_01,wset_features_02
153,261,How do I choose a journal to publish my paper?,Where do I publish my paper?,1,0.019773,1.656670e-02,0.654659,0.568915,-0.041016,0.681478,...,-0.796005,-0.965818,-0.964944,-0.790711,0.610064,-0.938216,-0.976458,1.0,2.0,0.500000
2370,3989,I am an introvert and I look very young althou...,I'm a petite 32-year-old woman who looks very ...,1,0.173871,9.764432e-03,0.260400,0.294277,0.133852,0.309627,...,-0.196650,-0.675425,-0.819973,-0.085066,1.013045,-0.544096,-0.507437,1.0,14.0,0.071429
9759,16221,"What is the meaning of the Hindi word ""Gaandu”...","What is the meaning of word ""gandoo""?",1,0.013310,3.482355e-03,0.717907,0.414372,-0.243774,0.983385,...,-0.070473,-0.789585,-0.434269,-0.954743,-0.950331,-0.931616,2.577526,0.0,3.0,0.000000
9785,16262,"I am unable to talk to girls, leave being frie...",I am 23. I am shy and unable to talk to any gi...,1,0.186935,4.489033e-03,0.519993,0.158772,-0.077625,0.321429,...,-0.731055,-0.829145,-0.788733,-0.557552,1.041006,-0.855152,-0.881234,1.0,6.0,0.166667
13124,21784,Do CS SRM Ramapuram students get good placemen...,What are the placements and packages offered a...,1,0.001586,1.872249e-02,1.235631,1.716604,-0.046645,1.577129,...,-0.241604,0.025496,-0.178534,-0.541561,0.221222,-0.509371,0.372475,3.0,8.0,0.375000
14415,23994,Once I have found a picture on Pinterest and I...,How do I get in touch with the right person ab...,1,0.026114,1.437947e-01,0.605272,0.756290,0.114059,0.742216,...,-0.558897,0.179516,-0.808164,0.228736,0.377611,-0.092721,-0.772732,0.0,6.0,0.000000
15974,26532,What happens with the crores of donation money...,What happens to large number of gold and money...,1,0.001730,1.976665e-01,1.253506,1.365340,-0.037681,1.317445,...,-0.623127,-0.916037,1.727674,-0.680032,-0.352423,-0.739701,-0.520133,1.0,8.0,0.125000
17456,28988,What is engineering management?,What is engineering managment?,1,0.139643,1.333882e-04,0.277332,0.085829,0.030147,0.209446,...,0.156887,-0.984150,-0.355666,-0.876650,-0.999930,-0.946708,4.826382,0.0,1.0,0.000000
17974,29815,Why do Americans fear number 13?,Why are Americans frightened of the number 13?...,1,0.348741,1.884015e-03,0.223085,0.157697,0.054519,0.158991,...,-0.769923,-0.958823,-0.732575,-0.997961,1.047097,-0.959786,-0.763052,1.0,8.0,0.125000
20226,33573,"Where can I get cheap, but decent quality clot...",What are the best place for men to shop in Delhi?,1,0.014983,7.049020e-02,0.996923,0.813759,0.065375,0.655927,...,-0.881452,-0.801505,-0.763817,-0.798458,-0.300033,-0.783288,-0.118743,0.0,6.0,0.000000


In [91]:
# Pairs where the two upstream models strongly disagree:
# RNN probability below 0.1 while TF-IDF probability is above 0.9.
full_train.loc[full_train['rnn_prob'].lt(0.1) & full_train['tfidf_prob'].gt(0.9)]

Unnamed: 0,id,question1,question2,is_duplicate,rnn_prob,tfidf_prob,rnn_features_00,rnn_features_01,rnn_features_02,rnn_features_03,...,tfidf_features_03,tfidf_features_04,tfidf_features_05,tfidf_features_06,tfidf_features_07,tfidf_features_08,tfidf_features_09,wset_features_00,wset_features_01,wset_features_02
4611,7620,What should be done after persuing b. Tech in ...,What are the job prospects after masters in el...,0,0.043764,0.904880,0.670937,0.394717,-0.041930,0.223677,...,-0.514317,-0.900069,-0.027069,1.689777,-0.811713,0.237304,-0.604681,0.0,8.0,0.000000
4820,7962,Which book is best for mechanism?,Which book is the best book for static mechanics?,0,0.086372,0.936515,0.358935,0.266765,-0.031433,0.301181,...,-0.462082,-0.739884,1.200375,-0.977545,-0.998410,-0.517909,-0.973752,0.0,3.0,0.000000
6487,10688,I lost my Jio barcode how can I regenerate?,Jio bar code lost re generated?,1,0.059748,0.996641,0.652359,0.090483,-0.079630,0.095031,...,3.199735,-0.934586,1.554844,0.054821,-0.837917,0.334583,-0.757991,0.0,5.0,0.000000
8775,14541,What age is considered too old to get a PhD?,Is 32 too old to start a PhD program?,1,0.018830,0.972648,0.644460,0.509592,0.097960,0.586370,...,0.322808,3.018138,-0.314344,0.849579,-0.969648,-0.996508,-0.966800,0.0,1.0,0.000000
10056,16681,How the diode works?,How do PN junction diodes work?,0,0.050979,0.915744,0.295484,0.399704,-0.006056,0.387410,...,2.682654,2.215121,-0.987172,-0.761547,-0.792171,-0.996275,-0.996995,0.0,4.0,0.000000
10147,16832,Why was Winston Churchill never tried for war ...,Why weren't the Allies tried as possible war c...,0,0.098877,0.935639,0.279241,-0.017072,-0.017067,-0.172617,...,0.381820,-0.083087,0.199642,-0.578784,-0.672856,0.434637,-0.930498,2.0,9.0,0.222222
13038,21622,Why do so many People say Trump is going to ca...,"If Donald Trump becomes president, could World...",1,0.067947,0.978664,0.302523,0.066890,-0.205552,-0.043855,...,0.144818,-0.917301,1.068438,-0.772837,-0.663499,1.393986,-0.990774,1.0,2.0,0.500000
14313,23818,Which is the best phone within 35000 presently?,What is the best phone to buy below 35K?,1,0.055992,0.966872,0.496689,0.308688,-0.174718,0.430040,...,-0.250294,-0.997998,2.753558,-0.886330,-0.880680,0.155975,-0.745227,0.0,4.0,0.000000
16495,27410,What was the reason behind the Bhopal gas trag...,Has the Bhopal gas tragedy been solved?,0,0.050422,0.947316,0.325813,0.205290,-0.109599,0.154759,...,1.460076,1.099117,-0.901066,-0.989131,-0.860780,-0.974219,-0.970060,3.0,4.0,0.750000
17578,29175,"I have a drug test at 3 next tuesday, its frid...","Can I pass a urine drug screen test on Monday,...",1,0.076864,0.924417,0.444189,0.047185,-0.015869,-0.149709,...,-0.397368,0.918556,-0.771262,0.004017,-0.753771,0.233753,-0.870098,2.0,8.0,0.250000
