# Imports

In [1]:
import spacy
from spacy.tokens.doc import Doc
import pandas as pd
from collections import Counter
from datetime import datetime
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier

In [2]:
nlp = spacy.load('en')

In [3]:
test_loc = 'test_dump.bin'
train_loc = 'train_dump.bin'

# Read data

In [4]:
train = pd.read_csv('../data/train.csv').fillna('')
test = pd.read_csv('../data/test.csv').fillna('')
print train.shape, test.shape

(404290, 6) (2345796, 3)


In [5]:
target = list(train['is_duplicate'])

# Read training data
Elements $(2n, 2n+1)$ are the questions for pair $n$

In [6]:
docs = []
i = 0
print datetime.now().strftime('%Y-%m-%d %H:%M:%S')
with open(train_loc, 'rb') as file_:
    for byte_string in Doc.read_bytes(file_):
        if i%10000 == 0: print i, datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        docs.append(Doc(nlp.vocab).from_bytes(byte_string))
        i += 1
#         if i == 10: break
print datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(type(docs[0]))

2017-05-14 15:39:00
0 2017-05-14 15:39:00
10000 2017-05-14 15:39:03
20000 2017-05-14 15:39:04
30000 2017-05-14 15:39:05
40000 2017-05-14 15:39:06
50000 2017-05-14 15:39:07
60000 2017-05-14 15:39:08
70000 2017-05-14 15:39:09
80000 2017-05-14 15:39:11
90000 2017-05-14 15:39:13
100000 2017-05-14 15:39:14
110000 2017-05-14 15:39:15
120000 2017-05-14 15:39:18
130000 2017-05-14 15:39:19
140000 2017-05-14 15:39:21
150000 2017-05-14 15:39:22
160000 2017-05-14 15:39:24
170000 2017-05-14 15:39:24
180000 2017-05-14 15:39:25
190000 2017-05-14 15:39:26
200000 2017-05-14 15:39:27
210000 2017-05-14 15:39:28
220000 2017-05-14 15:39:29
230000 2017-05-14 15:39:31
240000 2017-05-14 15:39:32
250000 2017-05-14 15:39:33
260000 2017-05-14 15:39:35
270000 2017-05-14 15:39:36
280000 2017-05-14 15:39:37
290000 2017-05-14 15:39:38
300000 2017-05-14 15:39:40
310000 2017-05-14 15:39:41
320000 2017-05-14 15:39:43
330000 2017-05-14 15:39:44
340000 2017-05-14 15:39:45
350000 2017-05-14 15:39:46
360000 2017-05-14 15:3

In [7]:
docs[5].ents

(DNS,)

In [8]:
def covertPropn(pos):
    if pos == u'PROPN':
        return u'NOUN'
    else: return pos

In [9]:
def countQuestionPairs(docs):
    fvs = []
    for i in range(0,len(docs),2):
        if i % 100000 == 0: print i
        fv = {}
        d1 = docs[i]
        d2 = docs[i+1]

        s1 = set(list(str(e) for e in d1.ents))
        s2 = set(list(str(e) for e in d2.ents))

        sameents = len(s1 & s2)
        diffents = len(s1 ^ s2)
        fv['sameents'] = sameents
        fv['diffents'] = diffents

        s1 = set([(covertPropn(word.pos_), word.lemma_) for word in d1 if not word.is_stop])
        s2 = set([(covertPropn(word.pos_), word.lemma_ ) for word in d2 if not word.is_stop])
        diff = s1 ^ s2
        same =  s1 & s2

        samenum = len(same)
        same = Counter([t[0] for t in same])
        fv.update(same)
        fv['samenum'] = samenum

        diffnum = len(diff)
        diff = Counter([t[0] + '_d' for t in diff])
        fv['diffnum'] = diffnum
        fv.update(diff)

    #     print(d1, d2)
        fvs.append(fv)
    return fvs


In [10]:
fvs = countQuestionPairs(docs)

0
100000
200000
300000
400000
500000
600000
700000
800000


In [11]:
for i, f in enumerate(fvs[:10]):
    print f
    print docs[2 *i], docs[2 *i + 1]

{'sameents': 0, u'NOUN': 4, 'samenum': 6, 'diffents': 0, 'diffnum': 1, u'PUNCT': 1, u'VERB': 1, u'NOUN_d': 1}
What is the step by step guide to invest in share market in india? What is the step by step guide to invest in share market?
{'sameents': 2, u'NOUN': 4, 'samenum': 8, u'ADJ_d': 1, u'VERB_d': 2, 'diffents': 1, 'diffnum': 5, u'PUNCT': 4, u'NOUN_d': 2}
What is the story of Kohinoor (Koh-i-Noor) Diamond? What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?
{'sameents': 0, u'PRON_d': 1, u'NOUN': 1, 'samenum': 3, u'VERB_d': 1, 'diffents': 2, 'diffnum': 6, u'PUNCT': 1, u'VERB': 1, u'NOUN_d': 4}
How can I increase the speed of my internet connection while using a VPN? How can Internet speed be increased by hacking through DNS?
{'sameents': 0, u'ADV_d': 1, 'samenum': 1, u'ADJ_d': 1, u'VERB_d': 3, u'PUNCT_d': 2, u'NUM_d': 1, 'diffents': 1, 'diffnum': 10, u'PUNCT': 1, u'NOUN_d': 2}
Why am I mentally very lonely? How can I solve it? Find the remainder wh

In [12]:
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
X = v.fit_transform(fvs)

In [13]:
n_estimators = 20
clfs = [RandomForestClassifier(min_samples_leaf=20, min_samples_split=10, n_jobs=-1), MultinomialNB(), SGDClassifier(loss='log'), LogisticRegression()]
# clfs = [SVC(kernel='linear', probability=True)] 

for clf in clfs:
    print clf
    scores = cross_val_score(clf, X, target, cv=3, scoring='neg_log_loss')
    print "Log Loss: %0.5f" % scores.mean()

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=20,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Log Loss: -0.49835
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Log Loss: -0.96641
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
Log Loss: -0.55215
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='li

In [None]:
n_estimators = 20
clfs = [RandomForestClassifier(n_estimators= 50, min_samples_leaf=7, min_samples_split=30, n_jobs=-1, warm_start=True)]
# clfs = [SVC(kernel='linear', probability=True)] 

for clf in clfs:
    print clf
    scores = cross_val_score(clf, X, target, cv=3, scoring='neg_log_loss')
    print "Log Loss: %0.5f" % scores.mean()

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=7,
            min_samples_split=30, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=True)
Log Loss: -0.49019


# Read test questions

In [None]:
test_docs = []
i = 0
with open(test_loc, 'rb') as file_:
    for byte_string in Doc.read_bytes(file_):
        test_docs.append(Doc(nlp.vocab).from_bytes(byte_string))
        i += 1
print(type(test_docs[0]))

In [None]:
test_docs[1345288]

In [None]:
test_fvs = countQuestionPairs(test_docs)