# Train Word2Vec

In [1]:
from gensim.models import Word2Vec
import logging
import os
import re
import time

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
def walk_dir_paths(_dir):
    """Get paths to all files."""
    for root, directories, filenames in os.walk(_dir):
        for filename in filenames:
            yield os.path.join(root,filename)

def gen_sentences(corpus_path):
    """Open paths, return generator of a list of sentences."""
    comp_sent = re.compile(r'\.\s')
    comp_word = re.compile(r'[\W_]+')
    
    paths = walk_dir_paths(corpus_path)
    for path in paths:
        try:
            with open(path) as file_open:
                text = file_open.read()
            sentences = comp_sent.split(text)
        except Exception as e:
            pass
            
        # rm junk
        for sentence in sentences:
            words = comp_word.split(sentence.lower())
            sentence = [w for w in words if w if len(w) > 2]
            if sentence:
                yield sentence

In [3]:
all_enron = os.path.expanduser('~/Downloads/word2vec_stuff/maildir')  # 517401
enron_taylor = os.path.join(all_enron, 'taylor-m')  # 13875
enron_meyers = os.path.join(all_enron, 'meyers-a')  # 1099
enron_small = os.path.join(all_enron, 'taylor-m/uk_trading')  # 20

In [4]:
# what do the sentences look like?
counter = 0
for s in gen_sentences(enron_taylor):
    counter += 1
    if counter % 10000 == 0:
        print(s)

['travel', 'schedule', 'pretty', 'tight', 'this', 'month', 'you', 'think', 'makes', 'sense', 'set', 'something', 'for', 'april', 'enron', 'north', 'america', 'corp']
['bryan', 'can', 'you', 'ask', 'mary', 'arrange', 'meeting', 'for', 'thurs', 'fri']
['alan', 'aronowitz', '2000', 'thomas', 'gros', 'enron', 'enron', 'mark', 'taylor', 'hou', 'ect', 'ect', 'todd', 'ballengee', 'enron', 'enron', 'leslie', 'reeves', 'hou', 'ect', 'ect', 'subject', 'commoditylogic', 'legal', 'docs', 'tom', 'the', 'eol', 'password', 'application', 'form', 'used', 'develop', 'the', 'commoditylogic', 'password', 'application', 'except', 'set', 'forth', 'below', 'identical', 'the', 'latest', 'version', 'the', 'eol', 'available', 'the', 'eol', 'website']
['http', 'www', 'amazon', 'com', 'your', 'account', 'thank', 'you', 'for', 'shopping', 'amazon', 'com', 'amazon', 'com', 'earth', 'biggest', 'selection', 'http', 'www', 'amazon', 'com', 'orders', 'amazon', 'com']
['message', '13841375', '1075860193430', 'javamail'

In [5]:
model = Word2Vec(sentences=list(gen_sentences(enron_meyers)), size=8, window=5, min_count=5, workers=4)

In [6]:
for s in gen_sentences(enron_taylor):
    model.train(s)  # 5.6 min 

339.45538306236267


In [7]:
model.most_similar('friday')

[('hotmail', 0.968381404876709),
 ('infrastructure', 0.9614548683166504),
 ('linda', 0.9553625583648682),
 ('thursday', 0.9542112350463867),
 ('laid', 0.9515721797943115),
 ('original', 0.9272415041923523),
 ('mmeyers', 0.925380289554596),
 ('january', 0.9249365329742432),
 ('address', 0.9219551086425781),
 ('how', 0.9138798117637634)]

In [9]:
model.vocab

{'continue': <gensim.models.word2vec.Vocab at 0x1077a2550>,
 'completed': <gensim.models.word2vec.Vocab at 0x1077a25c0>,
 'inform': <gensim.models.word2vec.Vocab at 0x1077cb1d0>,
 'fsp': <gensim.models.word2vec.Vocab at 0x1077b3940>,
 'moves': <gensim.models.word2vec.Vocab at 0x1077c63c8>,
 'seems': <gensim.models.word2vec.Vocab at 0x1077a25f8>,
 'htrinh': <gensim.models.word2vec.Vocab at 0x1077b39b0>,
 'hsalisbu': <gensim.models.word2vec.Vocab at 0x1077d6198>,
 'forster': <gensim.models.word2vec.Vocab at 0x1077a2630>,
 'bharathi': <gensim.models.word2vec.Vocab at 0x1077b39e8>,
 'brown': <gensim.models.word2vec.Vocab at 0x1077b3a20>,
 'greg': <gensim.models.word2vec.Vocab at 0x1077b3a58>,
 'name': <gensim.models.word2vec.Vocab at 0x1077a2668>,
 'many': <gensim.models.word2vec.Vocab at 0x1077b3ac8>,
 'trading': <gensim.models.word2vec.Vocab at 0x107791160>,
 'guy': <gensim.models.word2vec.Vocab at 0x1077cbeb8>,
 'else': <gensim.models.word2vec.Vocab at 0x1077b57b8>,
 'term': <gensim.mod

In [10]:
model.init_sims(replace=True)  # trim model when done training
model.save(os.path.expanduser('~/Downloads/w2v_enron_meyers_taylor.model'))