In [1]:
from os import listdir
import numpy as np
from pyemd import emd
import re
from gensim.models.word2vec import Word2Vec

In [2]:
#directory where data is stored
DIR = "speechExtract_v2/data/"

#read in file names as list of labels.
#listdir returns entries in arbitrary order, so the names are sorted to keep
#the document order (and hence sentences[0], sentences[1], ...) reproducible
#from one run or machine to the next.
docLabels = sorted(f for f in listdir(DIR) if f.endswith('.txt'))

In [3]:
#load every labelled file and keep only letters, apostrophes and periods;
#any other character is replaced by a single space.
non_text = re.compile("[^a-z'.A-Z]")
data = []
for label in docLabels:
    with open(DIR + label, 'r') as handle:
        raw = handle.read()
    data.append(non_text.sub(" ", raw))

In [4]:
#split each document on periods and each piece on whitespace, giving one
#list of words per sentence (an empty piece yields an empty list).
sentences = [piece.split() for speech in data for piece in speech.split('.')]
        

In [5]:
# Set values for various parameters; each one is passed to the Word2Vec
# constructor in the training cell under the keyword noted in backticks.
num_features = 300    # Word vector dimensionality (`size`)
min_word_count = 40   # Minimum word count (`min_count`)
num_workers = 4       # Number of threads to run in parallel (`workers`)
context = 10          # Context window size (`window`)
downsampling = 1e-3   # Downsample setting for frequent words (`sample`)

In [7]:
# Initialize and train the model (this will take some time)
# print(...) with a single argument prints the same text on Python 2 and 3,
# and matches the call form used by the other print calls in this notebook.
print("Training model...")
# Line breaks inside the call's parentheses need no backslash continuation.
# NOTE(review): `size` and `init_sims` belong to older gensim releases --
# confirm against the installed gensim version before upgrading.
model = Word2Vec(sentences, workers=num_workers,
                 size=num_features, min_count=min_word_count,
                 window=context, sample=downsampling)
#If you don't plan to train the model any further, calling
#init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

Training model...


In [8]:
# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
# NOTE(review): only the name is defined here; no visible cell actually saves
# the model (e.g. model.save(model_name)) -- confirm whether that is intended.
model_name = "300features_40minwords_10context"

In [9]:
#make an adjacency matrix for the vocabulary of our model using cosine similarity
def make_adj(model):
    """Build the pairwise word-similarity matrix for a model's vocabulary.

    Parameters
    ----------
    model : object
        Anything exposing ``vocab`` (a mapping keyed by word) and
        ``similarity(word_a, word_b)``, e.g. a trained Word2Vec model.

    Returns
    -------
    numpy.ndarray
        Array of shape (n, n), n being the vocabulary size. Entry [i][j] is
        ``model.similarity`` for the i-th and j-th word in the iteration
        order of ``model.vocab``; the diagonal is 0.0.

    Notes
    -----
    The entries are similarities, not distances.
    NOTE(review): if this matrix is used as a ground-cost matrix (e.g. for
    an earth mover's distance), confirm whether a distance such as
    ``1 - similarity`` was intended instead.
    """
    # Materialise the keys: on Python 3 dict.keys() is a view and cannot be
    # indexed, whereas a list works on both Python 2 and 3.
    words = list(model.vocab.keys())
    n = len(words)
    # Start from zeros so the diagonal (and the empty-vocabulary case) is
    # already correct.
    adj = np.zeros((n, n))
    for i in range(n):
        # Cosine similarity is symmetric, so each unordered pair is computed
        # once and mirrored across the diagonal.
        for j in range(i + 1, n):
            adj[i, j] = adj[j, i] = model.similarity(words[i], words[j])
    return adj

In [10]:
# Build the similarity matrix over the model's whole vocabulary. make_adj
# calls model.similarity for pairs of vocabulary words, so the cost grows
# quadratically with vocabulary size.
adj = make_adj(model)

In [11]:
#rebuild the first two sentences as plain strings; every word is followed
#by one space, so a non-empty sentence ends with a trailing space.
s1 = ''.join(word + ' ' for word in sentences[0])
s2 = ''.join(word + ' ' for word in sentences[1])

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

#learn a vocabulary from just these two sentences, with English stop words
#excluded via stop_words="english"
vect = CountVectorizer(stop_words="english")
vect.fit([s1, s2])
print("Features:",  ", ".join(vect.get_feature_names()))

#one count vector per sentence: each row of the transformed matrix is
#converted to a dense array and flattened to 1-D
v_1, v_2 = (row.toarray().ravel() for row in vect.transform([s1, s2]))
print(v_1,v_2)

('Features:', u'american, female, gardens, getting, legendary, lot, neighborhood, page, presidency, projectpromote, ready, spring, starting, things, tomatoes, unidentified')
(array([1, 1, 0, 1, 0, 2, 0, 1, 1, 1, 1, 0, 0, 2, 0, 1]), array([0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1]))


In [13]:
#cast each count vector to double precision and divide it by its own total,
#so the entries of each vector sum to 1
v_1 = v_1.astype(np.double)
v_1 = v_1 / v_1.sum()
v_2 = v_2.astype(np.double)
v_2 = v_2 / v_2.sum()

#rescale the similarity matrix by its largest entry
adj_n = adj / adj.max()

In [15]:
# Earth mover's distance between the two normalised word histograms, with
# adj_n supplied as the matrix of pairwise costs.
# NOTE(review): adj_n holds similarities computed over model.vocab, while
# v_1 / v_2 are indexed by the CountVectorizer features -- confirm that both
# index the same words in the same order, and that similarities (rather than
# distances) are the intended cost.
emd(v_1, v_2, adj_n)

0.6477048471347725