# Evaluate new Word2Vec thread

In [20]:
import numpy as np
import spacy as sy
import pandas as pd
# Load spaCy's English pipeline once; the cells below share this object.
nlp = sy.load('en')

# Fallback score: the *_wmd functions return this value when no pair of
# vectors could be compared, and summation()/pos_sum() return half of it
# when either document is empty.
maxt_ = 5000.0

In [2]:
# Paths of the serialized spaCy documents. Only train_loc is read by the
# code shown in this notebook; test_loc is presumably used elsewhere.
test_loc = 'test_dump.bin'
train_loc = 'train_dump.bin'
from datetime import datetime

from spacy.tokens.doc import Doc
from nltk.corpus import stopwords
# NLTK's English stopword list; clean() drops every word found in it.
stop = stopwords.words('english')


def clean(doc):
    """Re-parse *doc* without punctuation and English stopwords.

    Each non-punctuation token is replaced by its lemma, except tokens
    whose lemma is the '-PRON-' placeholder, which keep their surface
    text. Words present in the module-level ``stop`` list are dropped;
    the remaining words, each followed by one space, are concatenated
    and passed through ``nlp`` again.

    Returns the new document produced by ``nlp``.
    """
    kept = []
    for token in doc:
        if token.is_punct:
            continue
        lemma = token.lemma_
        word = token.text if lemma == '-PRON-' else lemma
        if word not in stop:
            kept.append(word + ' ')
    return nlp(u''.join(kept))


# Deserialize every document stored in train_loc, clean it, and keep the
# result in train_docs. A timestamp is printed every 100000 documents.
test_docs = []
train_docs = []
with open(train_loc, 'rb') as file_:
    for i, byte_string in enumerate(Doc.read_bytes(file_)):
        if i % 100000 == 0:
            # Single pre-formatted string: prints the same text whether
            # print is a statement (Python 2) or a function (Python 3).
            print("%d %s" % (i, datetime.now().strftime('%H:%M:%S')))
        train_docs.append(clean(Doc(nlp.vocab).from_bytes(byte_string)))

0 16:40:43
100000 16:41:35
200000 16:42:24
300000 16:43:08
400000 16:43:50
500000 16:44:35
600000 16:45:21
700000 16:46:06
800000 16:46:53


## Custom Word Movers
Given two sentences, for each word in sent1 find the nearest word vector in sent2 and take that distance. Sum these distances over all words in sent1, then do the same for the words in sent2 (each matched against sent1) and add both sums.

In [22]:
def my_wmd(doc1, doc2):
    """Two-way nearest-neighbour distance between two documents.

    For every item of ``doc1`` the Euclidean distance to the closest
    item vector in ``doc2`` is added to a running total; the same is
    then done from ``doc2`` towards ``doc1``.

    Args:
        doc1, doc2: iterables whose items expose a ``.vector`` array.

    Returns:
        The accumulated distance, or the module-level ``maxt_`` when no
        distance ever improved on infinity (for example when either
        document is empty).
    """
    total = 0.0
    matched = False
    # Same pass in both directions; the total is accumulated in order.
    for sources, targets in ((doc1, doc2), (doc2, doc1)):
        for src in sources:
            src_vec = src.vector
            nearest = np.inf
            for tgt in targets:
                gap = np.linalg.norm(src_vec - tgt.vector)
                if gap < nearest:
                    nearest = gap
                    matched = True
            # Stays np.inf when `targets` is empty.
            total = total + nearest
    return total if matched else maxt_

In [29]:
# Score each consecutive (even, odd) pair of train_docs with my_wmd and
# write the scores to kfwmd_1.csv. A timestamp is printed every 50000 pairs.
d = []
# zip stops at the shorter slice, so a trailing unpaired document is
# ignored, exactly like range(len(train_docs) // 2) would.
for ct, (q1, q2) in enumerate(zip(train_docs[0::2], train_docs[1::2])):
    d.append({'kfwmd': my_wmd(q1, q2)})
    if ct % 50000 == 0:
        # Pre-formatted string: identical output on Python 2 and 3.
        print("%d %s" % (ct, datetime.now().strftime('%H:%M:%S')))
features = pd.DataFrame(d)
print("yay")
features.to_csv('kfwmd_1.csv')

0 17:00:08
50000 17:01:08
100000 17:02:06
150000 17:03:02
200000 17:04:01
250000 17:05:01
300000 17:05:57
350000 17:06:57
400000 17:07:54
yay


### Entities Word Mover

In [21]:
def ent_wmd(doc1, doc2):
    """Two-way nearest-neighbour distance restricted to entities.

    For every entry of ``doc1.ents`` the Euclidean distance to the
    closest item vector of ``doc2`` is added to a running total; the
    same is then done for ``doc2.ents`` against the items of ``doc1``.

    Args:
        doc1, doc2: iterables of items with a ``.vector`` array that
            also expose an ``.ents`` sequence of items with ``.vector``.

    Returns:
        The accumulated distance, or the module-level ``maxt_`` when no
        distance ever improved on infinity (for example when neither
        document has entities).
    """
    total = 0.0
    matched = False
    # Entities of one document are matched against all items of the other.
    for entities, targets in ((doc1.ents, doc2), (doc2.ents, doc1)):
        for entity in entities:
            ent_vec = entity.vector
            nearest = np.inf
            for tgt in targets:
                gap = np.linalg.norm(ent_vec - tgt.vector)
                if gap < nearest:
                    nearest = gap
                    matched = True
            total = total + nearest
    return total if matched else maxt_

In [30]:
# Score each consecutive (even, odd) pair of train_docs with ent_wmd and
# write the scores to entwmd_1.csv. A timestamp is printed every 50000 pairs.
# NOTE(review): the column is still named 'kfwmd', as in the my_wmd cell;
# kept unchanged so existing readers of the CSV keep working.
d = []
# zip stops at the shorter slice, so a trailing unpaired document is ignored.
for ct, (q1, q2) in enumerate(zip(train_docs[0::2], train_docs[1::2])):
    d.append({'kfwmd': ent_wmd(q1, q2)})
    if ct % 50000 == 0:
        # Pre-formatted string: identical output on Python 2 and 3.
        print("%d %s" % (ct, datetime.now().strftime('%H:%M:%S')))
features = pd.DataFrame(d)
print("yay")
features.to_csv('entwmd_1.csv')

0 17:08:02
50000 17:08:06
100000 17:08:09
150000 17:08:13
200000 17:08:17
250000 17:08:20
300000 17:08:24
350000 17:08:27
400000 17:08:30
yay


### Unique word mover

In [23]:
def unique_wmd(d1, d2):
    """Two-way nearest-neighbour distance over the words unique to each doc.

    Only tokens whose text does not occur in the other document take
    part. For every such token of ``d1`` the Euclidean distance to the
    closest unique token of ``d2`` is added to a running total, and
    vice versa.

    Fixes over the previous version:
      * membership is decided on the token text, not by comparing Token
        objects that belong to different documents;
      * the surviving tokens are used directly. They used to be joined
        with no separator and re-parsed, which merged all words into one
        string before the vectors were looked up.
    Using the original tokens assumes a token's vector does not depend
    on its context -- TODO confirm for the spaCy model in use.

    Args:
        d1, d2: iterables of tokens exposing ``.text`` and ``.vector``.

    Returns:
        The accumulated distance, or the module-level ``maxt_`` when no
        pair of unique tokens could be compared.
    """
    words1 = set(tok.text for tok in d1)
    words2 = set(tok.text for tok in d2)
    only1 = [tok for tok in d1 if tok.text not in words2]
    only2 = [tok for tok in d2 if tok.text not in words1]

    t_sum = 0.0
    touched = False
    for sources, targets in ((only1, only2), (only2, only1)):
        for src in sources:
            src_vec = src.vector
            min_dist = np.inf
            for tgt in targets:
                dist = np.linalg.norm(src_vec - tgt.vector)
                if dist < min_dist:
                    touched = True
                    min_dist = dist
            t_sum = t_sum + min_dist
    return t_sum if touched else maxt_

In [32]:
# Score each consecutive (even, odd) pair of train_docs with unique_wmd and
# write the scores to unqwmd_1.csv. A timestamp is printed every 50000 pairs.
# NOTE(review): the column is still named 'kfwmd', as in the my_wmd cell;
# kept unchanged so existing readers of the CSV keep working.
d = []
# zip stops at the shorter slice, so a trailing unpaired document is ignored.
for ct, (q1, q2) in enumerate(zip(train_docs[0::2], train_docs[1::2])):
    d.append({'kfwmd': unique_wmd(q1, q2)})
    if ct % 50000 == 0:
        # Pre-formatted string: identical output on Python 2 and 3.
        print("%d %s" % (ct, datetime.now().strftime('%H:%M:%S')))
features = pd.DataFrame(d)
print("yay")
features.to_csv('unqwmd_1.csv')

0 17:10:22
50000 17:10:43
100000 17:11:01
150000 17:11:19
200000 17:11:37
250000 17:11:54
300000 17:12:12
350000 17:12:30
400000 17:12:48
yay


### Verbs and Nouns Word Mover

In [33]:
# ******** RUN CELL BELOW FIRST************
# pos_wmd is defined in the following cell, so that cell must be executed
# before this one.
# Score each consecutive (even, odd) pair of train_docs on verbs, nouns,
# proper nouns and pronouns only, and write the scores to vnwmd_1.csv.
# NOTE(review): the column is still named 'kfwmd', as in the my_wmd cell;
# kept unchanged so existing readers of the CSV keep working.
d = []
# zip stops at the shorter slice, so a trailing unpaired document is ignored.
for ct, (q1, q2) in enumerate(zip(train_docs[0::2], train_docs[1::2])):
    d.append({'kfwmd': pos_wmd(q1, q2, ['VERB', 'NOUN', 'PROPN', "PRON"])})
    if ct % 50000 == 0:
        # Pre-formatted string: identical output on Python 2 and 3.
        print("%d %s" % (ct, datetime.now().strftime('%H:%M:%S')))
features = pd.DataFrame(d)
print("yay")
features.to_csv('vnwmd_1.csv')

0 17:12:58
50000 17:13:20
100000 17:13:40
150000 17:14:01
200000 17:14:21
250000 17:14:41
300000 17:15:00
350000 17:15:19
400000 17:15:39
yay


## POS features
- Verb
- Noun
- PROPN
- PROPN + NOUN

In [25]:
def pos_wmd(d1, d2, check):
    """Two-way nearest-neighbour distance over selected parts of speech.

    Only tokens whose ``pos_`` is contained in ``check`` take part. For
    every such token of ``d1`` the Euclidean distance to the closest
    selected token of ``d2`` is added to a running total, and vice versa.

    Fix over the previous version: the selected tokens are used
    directly. They used to be joined with no separator and re-parsed,
    which merged all words into one string before the vectors were
    looked up. Using the original tokens assumes a token's vector does
    not depend on its context -- TODO confirm for the spaCy model in use.

    Args:
        d1, d2: iterables of tokens exposing ``.pos_`` and ``.vector``.
        check: container of part-of-speech tags to keep, e.g. ``['NOUN']``.

    Returns:
        The accumulated distance, or the module-level ``maxt_`` when no
        pair of selected tokens could be compared.
    """
    picked1 = [tok for tok in d1 if tok.pos_ in check]
    picked2 = [tok for tok in d2 if tok.pos_ in check]

    t_sum = 0.0
    touched = False
    for sources, targets in ((picked1, picked2), (picked2, picked1)):
        for src in sources:
            src_vec = src.vector
            min_dist = np.inf
            for tgt in targets:
                dist = np.linalg.norm(src_vec - tgt.vector)
                if dist < min_dist:
                    touched = True
                    min_dist = dist
            t_sum = t_sum + min_dist
    return t_sum if touched else maxt_

In [15]:
# Score each consecutive (even, odd) pair of train_docs with pos_wmd for
# several part-of-speech selections and write the scores to pos_wmd.csv.
# A timestamp is printed every 50000 pairs.
d = []
# zip stops at the shorter slice, so a trailing unpaired document is ignored.
for ct, (q1, q2) in enumerate(zip(train_docs[0::2], train_docs[1::2])):
    d.append({
        'noun': pos_wmd(q1, q2, ["NOUN"]),
        'verb': pos_wmd(q1, q2, ["VERB"]),
        'prop': pos_wmd(q1, q2, ["PROPN"]),
        # NOTE(review): despite its name this column holds the NOUN+PROPN
        # score, not pronouns; the key is kept for CSV compatibility.
        'pron': pos_wmd(q1, q2, ["NOUN", "PROPN"]),
    })
    if ct % 50000 == 0:
        # Pre-formatted string: identical output on Python 2 and 3.
        print("%d %s" % (ct, datetime.now().strftime('%H:%M:%S')))
features = pd.DataFrame(d)
print("yay")
features.to_csv('pos_wmd.csv')

0 11:56:59
50000 11:57:41
100000 11:58:23
150000 11:59:05
200000 11:59:48
250000 12:00:31
300000 12:01:13
350000 12:01:55
400000 12:02:37
yay


In [28]:
def summation(doc1, doc2):
    """Distance between the summed item vectors of two documents.

    The ``.vector`` of every item is added, in order, onto a
    300-element zero vector per document; the Euclidean norm of the
    difference of the two sums is returned.

    Args:
        doc1, doc2: sized iterables of items exposing a ``.vector``
            array compatible with a 300-element array.

    Returns:
        The norm of the difference, or half of the module-level
        ``maxt_`` when either document is empty.
    """
    # The builtin sum adds left to right, starting from the zero vector.
    total_1 = sum((item.vector for item in doc1), np.zeros(300,))
    total_2 = sum((item.vector for item in doc2), np.zeros(300,))
    if len(doc1) == 0 or len(doc2) == 0:
        return maxt_/2
    return np.linalg.norm(total_1 - total_2)
    
def pos_sum(d1, d2, looking):
    """Distance between summed vectors of selected parts of speech.

    Only tokens whose ``pos_`` is contained in ``looking`` take part.
    Their vectors are added onto a 300-element zero vector per document
    and the Euclidean norm of the difference of the two sums is returned.

    Fix over the previous version: the selected tokens are used
    directly. They used to be joined with no separator and re-parsed,
    which merged all words into one string before the vectors were
    looked up. Using the original tokens assumes a token's vector does
    not depend on its context -- TODO confirm for the spaCy model in use.
    The unused locals ``touched`` and ``t_sum`` were removed.

    Args:
        d1, d2: iterables of tokens exposing ``.pos_`` and a ``.vector``
            array compatible with a 300-element array.
        looking: container of part-of-speech tags to keep.

    Returns:
        The norm of the difference, or half of the module-level
        ``maxt_`` when either document has no selected token.
    """
    picked1 = [tok for tok in d1 if tok.pos_ in looking]
    picked2 = [tok for tok in d2 if tok.pos_ in looking]

    sum_a = np.zeros(300,)
    sum_b = np.zeros(300,)
    for tok in picked1:
        sum_a = sum_a + tok.vector
    for tok in picked2:
        sum_b = sum_b + tok.vector
    if len(picked1) > 0 and len(picked2) > 0:
        return np.linalg.norm(sum_a - sum_b)
    else:
        return maxt_/2

In [27]:
# Quick sanity check of summation() and my_wmd() on a few documents.
print(summation(train_docs[0], train_docs[1]))
# The inner parentheses keep the tuple-style output under Python 3 as well.
print((train_docs[0], train_docs[1]))
print(my_wmd(train_docs[100], train_docs[101]))

0.0
(step step guide invest share market india , step step guide invest share market )
18.7169675827


In [34]:
# Score each consecutive (even, odd) pair of train_docs with pos_wmd and
# pos_sum for several part-of-speech selections and write the scores to
# pos.csv. A timestamp is printed every 50000 pairs.
d = []
# zip stops at the shorter slice, so a trailing unpaired document is ignored.
for ct, (q1, q2) in enumerate(zip(train_docs[0::2], train_docs[1::2])):
    d.append({
        'nwmd': pos_wmd(q1, q2, ["NOUN"]),
        'vwmd': pos_wmd(q1, q2, ["VERB"]),
        'pwmd': pos_wmd(q1, q2, ["PROPN"]),
        'pnwmd': pos_wmd(q1, q2, ["NOUN", "PROPN"]),
        'nsum': pos_sum(q1, q2, ["NOUN"]),
        'vsum': pos_sum(q1, q2, ["VERB"]),
        'psum': pos_sum(q1, q2, ["PROPN"]),
        # Was a second 'psum' key, which silently overwrote the PROPN-only
        # value above; named 'pnsum' to mirror 'pnwmd'.
        'pnsum': pos_sum(q1, q2, ["NOUN", "PROPN"]),
    })
    if ct % 50000 == 0:
        # Pre-formatted string: identical output on Python 2 and 3.
        print("%d %s" % (ct, datetime.now().strftime('%H:%M:%S')))
features = pd.DataFrame(d)
print("yay")
features.to_csv('pos.csv')

0 17:15:42
50000 17:17:02
100000 17:18:21
150000 17:19:40
200000 17:20:59
250000 17:22:18
300000 17:23:37
350000 17:24:57
400000 17:26:16
yay
