In [1]:
'''Build a data structure that is easier to work with than Google's entire word2vec model.

We identify the unique words in the Clinton documents (excluding stop words), look up their
word vectors in Google's model, and save those vectors in a DataFrame.
'''

import re
from os import listdir

import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from pyemd import emd
from scipy.spatial.distance import euclidean as euc
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
DIR = "keywordsExtRAKE/HillarySpeeches/"

# Names of the .txt files in DIR. os.listdir returns entries in arbitrary
# order, so sort them: later cells pick documents by position (data[0],
# data[1]) and must see the same documents on every run.
docLabels = sorted(f for f in listdir(DIR) if f.endswith('.txt'))

# One cleaned, lower-cased string per document, in the same order as docLabels.
data = []
for doc in docLabels:
    with open(DIR + doc, 'r') as d:
        text = d.read()
    # Replace everything except ASCII letters, apostrophes and periods with a
    # space; periods are kept because sentences are split on them afterwards.
    text = re.sub("[^a-z'.A-Z]", " ", text)
    data.append(text.lower())

# Break every document on '.' and every resulting fragment on whitespace:
# `sentences` is a list of token lists, one per fragment (empty fragments
# yield empty lists).
sentences = [
    fragment.split()
    for document in data
    for fragment in document.split('.')
]

In [None]:
# Load Google's word2vec vectors from the gzipped binary file into `wv`.
# NOTE(review): newer gensim releases appear to expose this loader as
# KeyedVectors.load_word2vec_format instead — confirm against the installed
# gensim version before re-running this cell.
wv = Word2Vec.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [3]:
# Stop words, one per line of SmartStoplist.txt, with surrounding whitespace removed.
mystop = []
with open('SmartStoplist.txt', 'r') as f:
    mystop.extend(line.strip() for line in f)

# Keywords for WMD to consider: every distinct token that is not a stop word,
# in order of first appearance.
# Membership is checked against sets so the scan stays linear in the number of
# tokens; `words` itself remains an ordered list.
stop_set = set(mystop)
seen = set()
words = []
for s in sentences:
    for token in s:
        if token not in stop_set and token not in seen:
            seen.add(token)
            words.append(token)

In [None]:
# Map each keyword to its vector in Google's model. Words the model does not
# contain are reported and left out of the mapping (no placeholder vector is
# created for them).
vocab_dict = {}
for w in words:
    try:
        vocab_dict[w] = wv[w]
    except KeyError:
        # Only a failed vocabulary lookup is expected here; anything else
        # should surface instead of being silently swallowed.
        print("{} is not in the model".format(w))

In [None]:
# Put the word vectors in a DataFrame (one column per word) and write it to CSV.
file_name = "word_vecs.csv"
df = pd.DataFrame(vocab_dict)
df.to_csv(file_name)

In [None]:
# Fit a CountVectorizer (English stop words removed) on the first two speeches.
d1, d2 = data[0], data[1]
vect = CountVectorizer(stop_words="english")
vect.fit([d1, d2])

In [None]:
# Symmetric matrix of Euclidean distances between the word vectors of every
# pair of terms in the vectorizer's vocabulary.
# The size is taken from the vocabulary itself rather than hard-coded, so the
# matrix always lines up with the document vectors produced by `vect`.
# NOTE(review): recent scikit-learn versions replace get_feature_names() with
# get_feature_names_out() — confirm against the installed version.
vocab = vect.get_feature_names()
n_vocab = len(vocab)

# Terms without a column in `df` have no word vector; their distances stay 0.
# NOTE(review): a distance of 0 makes such terms "free" to transport to in the
# EMD — confirm this is the intended treatment.
has_vector = [term in df.columns for term in vocab]

dis = np.zeros((n_vocab, n_vocab))
for i in range(n_vocab):
    if not has_vector[i]:
        continue
    vec_i = df[vocab[i]]
    for j in range(i + 1, n_vocab):
        if has_vector[j]:
            dis[i, j] = euc(vec_i, df[vocab[j]])

# Only the upper triangle was filled; mirror it to obtain the full matrix.
dis = dis + dis.T

In [None]:
# Term-count vectors of the two speeches, flattened to 1-D arrays for pyemd.
doc_term = vect.transform([d1, d2])
v_1 = doc_term[0].toarray().ravel()
v_2 = doc_term[1].toarray().ravel()

In [None]:
# pyemd expects double-precision arrays, so cast before doing anything else.
v_1, v_2 = (counts.astype(np.double) for counts in (v_1, v_2))

# Turn the counts into distributions that each sum to 1.
v_1 = v_1 / v_1.sum()
v_2 = v_2 / v_2.sum()

# Scale the distance matrix so that its largest entry is 1.
dis = dis.astype(np.double)
dis = dis / dis.max()

print("d(doc_1, doc_2) = {:.2f}".format(emd(v_1, v_2, dis)))