# Evaluate new Word2Vec thread

In [1]:
import numpy as np
import spacy as sy
import pandas as pd
# Load the spaCy 'en' model once; the functions below call this `nlp`
# object to (re-)parse text.
nlp = sy.load('en')
import textacy as ty

In [2]:
# Paths of the serialized document dumps. train_loc is opened in binary mode
# by the loading loop below; test_loc is not read in the visible cells.
test_loc = 'test_dump.bin'
train_loc = 'train_dump.bin'
from datetime import datetime

from spacy.tokens.doc import Doc
from nltk.corpus import stopwords
# NLTK's English stop-word list; clean() drops any word found in it.
stop = stopwords.words('english')


def clean(doc):
    """Rebuild *doc* from its lemmas, without punctuation or stop words.

    Punctuation tokens are skipped. Every remaining token is reduced to its
    lemma, except when the lemma is the '-PRON-' placeholder, in which case
    the token's own text is kept. Words present in the module-level ``stop``
    list are dropped. The surviving words are concatenated (each followed by
    a single space) and parsed again with the module-level ``nlp`` pipeline.

    Returns whatever ``nlp`` produces for the cleaned text.
    """
    kept = []
    for token in doc:
        if token.is_punct:
            continue
        lemma = token.lemma_
        # '-PRON-' is a placeholder lemma, so fall back to the surface text.
        word = token.text if lemma == '-PRON-' else lemma
        if word in stop:
            continue
        kept.append(word + ' ')
    return nlp(u''.join(kept))


# Containers for the parsed documents; only train_docs is filled here.
test_docs = []
train_docs = []
i = 0
with open(train_loc, 'rb') as file_:
    for byte_string in Doc.read_bytes(file_):
        # Progress line (running count + wall-clock time) every 100000 docs.
        if i % 100000 == 0:
            print("%d %s" % (i, datetime.now().strftime('%H:%M:%S')))
        # Rebuild the Doc from its serialized bytes, then normalise it.
        restored = Doc(nlp.vocab).from_bytes(byte_string)
        train_docs.append(clean(restored))
        i += 1

0 11:37:27
100000 11:38:18
200000 11:39:03
300000 11:39:47
400000 11:40:31
500000 11:41:14
600000 11:42:00
700000 11:42:47
800000 11:43:35


## Custom Word Movers
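Given two sentences, for each word in sentence 1 find the nearest word vector in sentence 2 and take that distance. Sum these distances over all words in sentence 1, then do the same for every word in sentence 2 (measured against sentence 1) and add the two sums together.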

In [3]:
def my_wmd(doc1, doc2):
    """Return a symmetric nearest-neighbour distance between two documents.

    For every item of ``doc1`` the Euclidean distance (``np.linalg.norm``)
    to the closest item of ``doc2`` is added to the total; the same is then
    done for every item of ``doc2`` against ``doc1``.

    Both arguments must be re-iterable collections whose items expose a
    ``.vector`` array. If one side has items and the other is empty, each of
    those items contributes ``np.inf``. Two empty inputs give ``0.0``.
    """
    total = 0.0
    # Same procedure in both directions: doc1 -> doc2, then doc2 -> doc1.
    for sources, targets in ((doc1, doc2), (doc2, doc1)):
        target_vecs = [item.vector for item in targets]
        for item in sources:
            src_vec = item.vector
            nearest = np.inf
            for vec in target_vecs:
                gap = np.linalg.norm(src_vec - vec)
                if gap < nearest:
                    nearest = gap
            total = total + nearest
    return total

In [None]:
# Score consecutive documents (0,1), (2,3), ... with my_wmd and save the
# resulting single-column table.
d = []
ct = 0
for q1, q2 in zip(train_docs[0::2], train_docs[1::2]):
    d.append({'kfwmd': my_wmd(q1, q2)})
    # Progress line every 50000 pairs.
    if ct % 50000 == 0:
        print("%d %s" % (ct, datetime.now().strftime('%H:%M:%S')))
    ct += 1
features = pd.DataFrame(d)
print("yay")
features.to_csv('kfwmd.csv')

### Entities Word Mover

In [4]:
def ent_wmd(doc1, doc2):
    """Return the nearest-neighbour distance from each doc's entities to the
    other doc's tokens.

    For every element of ``doc1.ents`` the Euclidean distance to the closest
    item obtained by iterating ``doc2`` is added to the total; then the same
    is done for ``doc2.ents`` against the items of ``doc1``. Entities and
    items must expose a ``.vector`` array.

    An entity facing an empty document contributes ``np.inf``; a document
    without entities contributes nothing.
    """
    total = 0.0
    # (entities of one doc, tokens of the other), in both directions.
    for entities, tokens in ((doc1.ents, doc2), (doc2.ents, doc1)):
        token_vecs = [tok.vector for tok in tokens]
        for ent in entities:
            ent_vec = ent.vector
            nearest = np.inf
            for vec in token_vecs:
                gap = np.linalg.norm(ent_vec - vec)
                if gap < nearest:
                    nearest = gap
            total = total + nearest
    return total

In [None]:
# Score consecutive documents (0,1), (2,3), ... with ent_wmd and save the
# resulting single-column table.
d = []
ct = 0
for q1, q2 in zip(train_docs[0::2], train_docs[1::2]):
    d.append({'kfwmd': ent_wmd(q1, q2)})
    # Progress line every 50000 pairs.
    if ct % 50000 == 0:
        print("%d %s" % (ct, datetime.now().strftime('%H:%M:%S')))
    ct += 1
features = pd.DataFrame(d)
print("yay")
features.to_csv('entwmd.csv')

### Unique word mover

In [5]:
def unique_wmd(d1, d2):
    """Return the word-mover style distance between the words that are
    unique to each document.

    A word of ``d1`` is kept when its text does not occur anywhere in ``d2``
    (and vice versa). The kept words of each side are joined with single
    spaces, parsed again with the module-level ``nlp`` pipeline, and the two
    resulting documents are scored with ``my_wmd``.

    Args:
        d1, d2: iterables of tokens exposing ``.text``.

    Returns:
        The value of ``my_wmd`` for the two reduced documents: ``0.0`` when
        neither side has unique words, ``np.inf`` when only one side does.
    """
    words1 = [tok.text for tok in d1]
    words2 = [tok.text for tok in d2]
    # Compare by word text. The previous `x not in d2` test compared Token
    # objects against the other Doc's tokens, not the words themselves.
    seen1 = set(words1)
    seen2 = set(words2)
    only_in_1 = [word for word in words1 if word not in seen2]
    only_in_2 = [word for word in words2 if word not in seen1]
    # Join with a space: `.text` carries no trailing whitespace, so an empty
    # separator would glue all words into one string before re-parsing.
    doc1 = nlp(u' '.join(only_in_1))
    doc2 = nlp(u' '.join(only_in_2))
    return my_wmd(doc1, doc2)

In [None]:
# Score consecutive documents (0,1), (2,3), ... with unique_wmd and save the
# resulting single-column table. This cell previously called vn_wmd, so
# unqwmd.csv did not contain the unique-word feature. The 'kfwmd' column
# name is kept as-is for anything that reads the CSV.
d = []
ct = 0
for q1, q2 in zip(train_docs[0::2], train_docs[1::2]):
    d.append({'kfwmd': unique_wmd(q1, q2)})
    # Progress line every 50000 pairs.
    if ct % 50000 == 0:
        print("%d %s" % (ct, datetime.now().strftime('%H:%M:%S')))
    ct += 1
features = pd.DataFrame(d)
print("yay")
features.to_csv('unqwmd.csv')

### Verbs and Nouns Word Mover

In [6]:
def vn_wmd(d1, d2):
    """Return the word-mover style distance restricted to nouns, verbs,
    pronouns and proper nouns.

    Tokens whose ``pos_`` is one of NOUN, VERB, PRON or PROPN are kept from
    each document. Their texts are joined with single spaces, parsed again
    with the module-level ``nlp`` pipeline, and the two resulting documents
    are scored with ``my_wmd``.

    Args:
        d1, d2: iterables of tokens exposing ``.text`` and ``.pos_``.

    Returns:
        The value of ``my_wmd`` for the two reduced documents: ``0.0`` when
        neither side keeps any token, ``np.inf`` when only one side does.
    """
    looking = ('NOUN', 'VERB', 'PRON', 'PROPN')
    kept1 = [tok.text for tok in d1 if tok.pos_ in looking]
    kept2 = [tok.text for tok in d2 if tok.pos_ in looking]
    # Join with a space: `.text` carries no trailing whitespace, so an empty
    # separator would glue all words into one string before re-parsing.
    doc1 = nlp(u' '.join(kept1))
    doc2 = nlp(u' '.join(kept2))
    return my_wmd(doc1, doc2)


In [None]:
# Score consecutive documents (0,1), (2,3), ... with vn_wmd and save the
# resulting single-column table.
d = []
ct = 0
for q1, q2 in zip(train_docs[0::2], train_docs[1::2]):
    d.append({'kfwmd': vn_wmd(q1, q2)})
    # Progress line every 50000 pairs.
    if ct % 50000 == 0:
        print("%d %s" % (ct, datetime.now().strftime('%H:%M:%S')))
    ct += 1
features = pd.DataFrame(d)
print("yay")
features.to_csv('vnwmd.csv')

## POS features
- Verb
- Noun
- PROPN
- PROPN + NOUN

In [11]:
def pos_wmd(d1, d2, check):
    """Return the word-mover style distance restricted to the given
    part-of-speech tags.

    Tokens whose ``pos_`` is contained in ``check`` are kept from each
    document. Their texts are joined with single spaces, parsed again with
    the module-level ``nlp`` pipeline, and the two resulting documents are
    scored with ``my_wmd``.

    Args:
        d1, d2: iterables of tokens exposing ``.text`` and ``.pos_``.
        check: collection of tag names to keep, e.g. ``["NOUN", "PROPN"]``.

    Returns:
        The value of ``my_wmd`` for the two reduced documents: ``0.0`` when
        neither side keeps any token, ``np.inf`` when only one side does.
    """
    kept1 = [tok.text for tok in d1 if tok.pos_ in check]
    kept2 = [tok.text for tok in d2 if tok.pos_ in check]
    # Join with a space: `.text` carries no trailing whitespace, so an empty
    # separator would glue all words into one string before re-parsing.
    doc1 = nlp(u' '.join(kept1))
    doc2 = nlp(u' '.join(kept2))
    # Delegating to my_wmd fixes the first pass, which initialised `min_a`
    # but updated `max_a`. It therefore added the distance to the last token
    # of doc2 instead of the nearest one, and hit an unbound or stale value
    # when doc2 was empty.
    return my_wmd(doc1, doc2)

In [15]:
# Score consecutive documents (0,1), (2,3), ... with pos_wmd for several tag
# sets and save one column per tag set.
d = []
ct = 0
for q1, q2 in zip(train_docs[0::2], train_docs[1::2]):
    row = {}
    row['noun'] = pos_wmd(q1, q2, ["NOUN"])
    row['verb'] = pos_wmd(q1, q2, ["VERB"])
    row['prop'] = pos_wmd(q1, q2, ["PROPN"])
    # NOTE(review): the 'pron' column holds the NOUN + PROPN score.
    row['pron'] = pos_wmd(q1, q2, ["NOUN", "PROPN"])
    d.append(row)
    # Progress line every 50000 pairs.
    if ct % 50000 == 0:
        print("%d %s" % (ct, datetime.now().strftime('%H:%M:%S')))
    ct += 1
features = pd.DataFrame(d)
print("yay")
features.to_csv('pos_wmd.csv')

0 11:56:59
50000 11:57:41
100000 11:58:23
150000 11:59:05
200000 11:59:48
250000 12:00:31
300000 12:01:13
350000 12:01:55
400000 12:02:37
yay
