# Imports

In [1]:
import spacy
from spacy.tokens.doc import Doc
import pandas as pd
from collections import Counter, defaultdict
from datetime import datetime
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
import nltk

In [2]:
nlp = spacy.load('en')

In [3]:
test_loc = 'test_dump.bin'
train_loc = 'train_dump.bin'
invalids = ['PUNCT', 'X', 'EOL', 'SPACE']

# Read data

In [4]:
train = pd.read_csv('../data/train.csv').fillna('')
test = pd.read_csv('../data/test.csv').fillna('')
print train.shape, test.shape

(404290, 6) (2345796, 3)


In [5]:
target = list(train['is_duplicate'])

# Read training data
Elements $(2n, 2n+1)$ are the questions for pair $n$

In [6]:
docs = []
i = 0
print datetime.now().strftime('%Y-%m-%d %H:%M:%S')
with open(train_loc, 'rb') as file_:
    for byte_string in Doc.read_bytes(file_):
        if i%10000 == 0: print i, datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        docs.append(Doc(nlp.vocab).from_bytes(byte_string))
        i += 1
#         if i == 10: break
print datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(type(docs[0]))

2017-05-16 03:35:02
0 2017-05-16 03:35:02
10000 2017-05-16 03:35:06
20000 2017-05-16 03:35:07
30000 2017-05-16 03:35:08
40000 2017-05-16 03:35:09
50000 2017-05-16 03:35:10
60000 2017-05-16 03:35:11
70000 2017-05-16 03:35:12
80000 2017-05-16 03:35:13
90000 2017-05-16 03:35:14
100000 2017-05-16 03:35:16
110000 2017-05-16 03:35:17
120000 2017-05-16 03:35:18
130000 2017-05-16 03:35:19
140000 2017-05-16 03:35:20
150000 2017-05-16 03:35:21
160000 2017-05-16 03:35:23
170000 2017-05-16 03:35:24
180000 2017-05-16 03:35:24
190000 2017-05-16 03:35:26
200000 2017-05-16 03:35:26
210000 2017-05-16 03:35:27
220000 2017-05-16 03:35:28
230000 2017-05-16 03:35:30
240000 2017-05-16 03:35:31
250000 2017-05-16 03:35:33
260000 2017-05-16 03:35:34
270000 2017-05-16 03:35:34
280000 2017-05-16 03:35:35
290000 2017-05-16 03:35:36
300000 2017-05-16 03:35:37
310000 2017-05-16 03:35:38
320000 2017-05-16 03:35:39
330000 2017-05-16 03:35:40
340000 2017-05-16 03:35:41
350000 2017-05-16 03:35:42
360000 2017-05-16 03:3

In [7]:
docs[5].ents

(DNS,)

In [8]:
def covertPropn(pos):
#     if pos == u'PROPN' or pos == u'PRON':
#         return u'NOUN'
#     if pos == u'ADP': return u'ADV'
#     else: return pos
    return pos

In [9]:
from nltk.tag import mapping
from itertools import product
from nltk.corpus import wordnet as wn
import numpy as np
from nltk.wsd import lesk
import copy

def sim(s1, s2):
    return wn.wup_similarity(s1, s2)
    
def leskify(lemma, pos, sentence):
    mapping = {'NOUN': 'n', 'ADJ':'a', 'ADV':'r', 'VERB':'v', 'PRON':'n', 'ADP':'p', 'DET':'n'}
    m = mapping[pos]
    l = lesk(sentence, lemma, m)
    return l
    
def wnSim(w1, w2):
    fillin = 0
    w1 = set([(word.lemma_, covertPropn(word.pos_)) for word in w1])
    w2 = set([(word.lemma_, covertPropn(word.pos_)) for word in w2])
    sentence1 = w1 - w2
    sentence2 = w2 - w1
    if len(sentence1) < len(sentence2): 
        t = set(sentence2)
        sentence2 = sentence1
        sentence1 = t
    sentence = w1 | w2
    if len(sentence1) == 0 or len(sentence2) == 0: return fillin
    allsyns1 = set(leskify(w[0], w[1], sentence) for w in sentence1)
    allsyns1 = set([a for a in allsyns1 if a])
    if len(allsyns1) == 0: return fillin
    ysets = [leskify(w[0], w[1], sentence) for w in sentence2]
    ysets = [y for y in ysets if y]
    if len(ysets) == 0: return fillin
    maxes = [max((sim(s1, s2) or 0) for s1, s2 in product(allsyns1, [y])) for y in ysets if y]
    return np.mean(maxes)

In [10]:
d1 = nlp(u"I ran fast to the store to buy eggs")
d2 = nlp(u'He ran after waffles quickly and happily inside the shop')
w1 = [w for w in d1 if not w.is_stop and w.pos is not 95]
w2 = [w for w in d2 if not w.is_stop and w.pos is not 95]
wnSim(w1, w2)

0.15257352941176472

In [11]:
def prepareDoc(doc):
    return [w.lemma_ for w in doc if not w.is_stop and w.pos not in invalids]

In [12]:
unigramCounts = Counter([d for doc in docs for d in prepareDoc(doc)])
bigramCounts = Counter([d for doc in docs for d in nltk.bigrams(prepareDoc(doc))])
trigramCounts = Counter([d for doc in docs for d in nltk.ngrams(prepareDoc(doc), 3)])

In [13]:
def tfidf(w, corpus):
    if w in corpus and corpus[w] is not 0: return 1/float(corpus[w])
    else: return 0
def getOverlaps(sentence1, sentence2):
    s1 = set(sentence1)
    s2 = set(sentence2)
    return s1 & s2, s1 ^ s2, s1 | s2
def tupleCounter(l):
    d = defaultdict(int) 
    for t in l:
        d[t[0]] += t[1]
    return d

In [14]:
def countQuestionPairs(docs):
    
    fvs = []
    for i in range(0,len(docs),2):
        if i % 100000 == 0: print i
        fv = defaultdict(int) 
        d1 = docs[i]
        d2 = docs[i+1]

        s1 = set(list(str(e).lower() for e in d1.ents))
        s2 = set(list(str(e).lower() for e in d2.ents))
        
        w1 = [w for w in d1 if not w.is_stop and w.pos not in invalids]
        w2 = [w for w in d2 if not w.is_stop and w.pos not in invalids]
        
        p1 = [(covertPropn(word.pos_), word.lemma_) for word in w1]
        p2 = [(covertPropn(word.pos_), word.lemma_) for word in w2]
        
        u1 = [word.lemma_ for word in w1]
        u2 = [word.lemma_ for word in w2]
        
        total = float(len(u1) + len(u2))
        sameents = s1 & s2
        diffents = s1 ^ s2
        if len(sameents) > 0:
            fv['sameents'] = np.sum([tfidf(w, unigramCounts) for w in sameents])
        if len(diffents) > 0:
            fv['diffents'] = np.sum([tfidf(w, unigramCounts) for w in diffents])
        
        psame, pdiff, ptotal = getOverlaps(p1, p2)
        usame, udiff, utotal = getOverlaps(u1, u2)
        bsame, bdiff, btotal = getOverlaps(nltk.bigrams(u1), nltk.bigrams(u2))        
        tsame, tdiff, ttotal = getOverlaps(nltk.ngrams(u1,3),nltk.ngrams(u2,3))
#         fv['total'] = total
        fv['usame'] = len(usame)
        fv['udiff'] = len(udiff)
        if len(usame) > 0:
            fv['weightedsame'] =  np.sum([tfidf(w, unigramCounts) for w in usame])
        if len(udiff) > 0:
            fv['weighteddiff'] =  np.sum([tfidf(w, unigramCounts) for w in udiff])

        fv.update((tupleCounter([(t[0], tfidf(t[1], unigramCounts)) for t in psame])))
        fv.update((tupleCounter([(t[0] + '_d', tfidf(t[1], unigramCounts)) for t in pdiff])))     
        fv['bsame'] = np.sum([tfidf(w, bigramCounts) for w in bsame])
        fv['tsame'] = np.sum([tfidf(w, trigramCounts) for w in tsame])

#         fv['wn'] = wnSim(w1, w2)
#         if i > 100: return fvs
#         print fv
        fvs.append(fv)
    return fvs

In [15]:
fvs = countQuestionPairs(docs)

0
100000
200000
300000
400000
500000
600000
700000
800000


In [16]:
def getParts(d, parts):
    out = {}
    keys = d.keys()
    for p in parts:
        if p in keys: out[p] = d[p]
    return out

In [17]:
set(fvs[0].keys()) | set(fvs[1].keys()) | set(fvs[2].keys()) |set(fvs[3].keys())

{u'ADJ_d',
 u'ADV_d',
 u'NOUN',
 u'NOUN_d',
 u'NUM_d',
 u'PRON_d',
 u'PROPN',
 u'PROPN_d',
 u'PUNCT',
 u'PUNCT_d',
 u'VERB',
 u'VERB_d',
 'bsame',
 'diffents',
 'sameents',
 'tsame',
 'udiff',
 'usame',
 'weighteddiff',
 'weightedsame'}

In [18]:
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
X = v.fit_transform(fvs)

In [21]:
clf = RandomForestClassifier(n_estimators= 50, min_samples_leaf=1, min_samples_split=8, n_jobs=-1, warm_start=True)
scores = cross_val_score(clf, X, target, cv=3, scoring='neg_log_loss')
print "Log Loss: %0.5f" % scores.mean()

Log Loss: -0.40956


In [20]:
clfs = [MultinomialNB(), SGDClassifier(loss='log'), LogisticRegression(), RandomForestClassifier(n_estimators= 50, min_samples_leaf=7, min_samples_split=30, n_jobs=-1, warm_start=True), LinearDiscriminantAnalysis(), GradientBoostingClassifier()]
# clfs = [SVC(kernel='linear', probability=True)] 

for clf in clfs:
    print clf
    scores = cross_val_score(clf, X, target, cv=3, scoring='neg_log_loss')
    print "Log Loss: %0.5f" % scores.mean()

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Log Loss: -0.68284
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
Log Loss: -0.61792
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Log Loss: -0.54409
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=7,
            min_samples_split=30, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=-1, oob_score=False, 

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
from itertools import cycle

X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=.2)
clfs = [MultinomialNB(), SGDClassifier(loss='log'), LogisticRegression(), RandomForestClassifier(n_estimators= 50, min_samples_leaf=7, min_samples_split=30, n_jobs=-1, warm_start=True), LinearDiscriminantAnalysis(), GradientBoostingClassifier()]
clfnames  = ['RandForest', 'MNB', 'SGD','LogReg', 'RandForest', 'LDA', 'GradientBoost']
colors = cycle(['darkorange','green', 'red', 'blue', 'k', 'p'])

for (clfname, clf, color) in zip(clfnames, clfs, colors):
    y_score = zip(*clf.fit(X_train, y_train).predict_proba(X_test))[1]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    rocauc = auc(fpr, tpr)
    lw = 2
    plt.plot(fpr, tpr, color=color, lw=lw, label=clfname + ' (area = %0.2f)' % rocauc)
    
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Word Count Method')
plt.legend(loc="lower right")
plt.savefig("roc_wc.pdf")
plt.show()

# Read test questions

In [None]:
test_docs = []
i = 0
with open(test_loc, 'rb') as file_:
    for byte_string in Doc.read_bytes(file_):
        test_docs.append(Doc(nlp.vocab).from_bytes(byte_string))
        i += 1
print(type(test_docs[0]))

In [None]:
test_docs[1345288]

In [None]:
test_fvs = countQuestionPairs(test_docs)