In [25]:
import gensim
import nltk

In [None]:
# Scratch cell with no execution count. The expression only looks up the
# `stopwords` attribute; its value is not assigned or used in the cells shown here.
nltk.corpus.stopwords

- **Define an iterator that yields one tokenized line (a list of words) of the document per iteration**
    - You need an iterator if the document has a lot of lines and you don't want to load them all in at once

In [2]:
class MySentences(object):
    """Re-iterable view of a text file that yields one tokenized line at a time.

    Every call to ``__iter__`` reopens the file, so the same object can be
    looped over repeatedly without holding the whole file in memory.
    """

    def __init__(self, fname):
        """Store the path of the file to read; nothing is opened yet.

        Parameters
        ----------
        fname : str
            Path of the text file, one sentence per line.
        """
        self.fname = fname

    def __iter__(self):
        """Yield each line of ``self.fname`` as a list of whitespace-separated tokens."""
        # `with` releases the file handle once the generator finishes or is closed.
        with open(self.fname) as fin:
            for line in fin:
                yield line.split()

In [3]:
# Corpus object passed to Word2Vec in the training cells. Each loop over it
# calls MySentences.__iter__, which opens the file afresh, so it can be consumed more than once.
line_iterator = MySentences('raw_sentences.txt')

- **Print the first few lines of the document**

In [4]:
# Preview the first four tokenized lines, using a separate iterator object
# from the one handed to the model.
line_iterator2 = MySentences('raw_sentences.txt')
for i, line in enumerate(line_iterator2):
    if i > 3:  # indices 0-3 have already been printed
        break
    # The parenthesised form prints the same text on Python 2 and is valid on Python 3.
    print(line)

['No', ',', 'he', 'says', 'now', '.']
['And', 'what', 'did', 'he', 'do', '?']
['The', 'money', "'s", 'there', '.']
['That', 'was', 'less', 'than', 'a', 'year', 'ago', '.']


- **Train a word2vec model**
    - `sg=1` means skip-gram is used
    - `workers=4` means use 4 cores
    - `size=100` means each word vector (the hidden layer) has 100 dimensions

In [5]:
%%timeit
# Timing run only. NOTE(review): names bound inside a %%timeit cell are presumably
# not kept in the notebook namespace, which would be why the next cell trains `model` again — confirm.
model = gensim.models.Word2Vec(sentences=line_iterator, size=100, min_count=1, sg=1, seed=42, workers=4)

1 loops, best of 3: 6.13 s per loop


In [6]:
# Train the model used by the cells below (skip-gram, 100-dimensional vectors, 4 workers).
# NOTE(review): gensim's docs indicate a fixed `seed` alone is not fully reproducible
# with more than one worker thread — confirm whether exact repeatability matters here.
model = gensim.models.Word2Vec(sentences=line_iterator, size=100, min_count=1, sg=1, seed=42, workers=4)

In [7]:
# Sanity check: nearest neighbours of 'day', shown as (word, similarity) pairs, best match first.
model.most_similar(positive=['day'])

[('night', 0.7559455633163452),
 ('week', 0.7236725091934204),
 ('game', 0.6129457950592041),
 ('year', 0.609284520149231),
 ('season', 0.6008633971214294),
 ('off', 0.5410240292549133),
 ('May', 0.492523193359375),
 ('On', 0.48334595561027527),
 ('office', 0.4796464145183563),
 ('days', 0.46643078327178955)]

- **Saving the word vectors to disk (text and binary formats)**

In [23]:
from gensim import utils
from six import iteritems
def save_sampled_word_vectors(vocab_syn0_tup, fname, binary):
    """Write a vocabulary and its vectors to ``fname``, most frequent word first.

    Parameters
    ----------
    vocab_syn0_tup : tuple
        ``(vocab, syn0)`` pair. ``vocab`` maps each word to an entry exposing
        ``.count`` and ``.index``; ``syn0`` is indexed by ``entry.index`` to get
        that word's vector and must have a two-element ``.shape``.
    fname : str
        Destination path, opened through ``utils.smart_open`` in ``'wb'`` mode.
    binary : bool
        If true, each record is the word, a space, and the raw bytes of its
        vector; otherwise the word followed by the values formatted with ``%f``
        and a trailing newline.

    Raises
    ------
    AssertionError
        If the number of vocabulary entries differs from ``syn0.shape[0]``.
    """
    words, vectors = vocab_syn0_tup
    assert len(words) == vectors.shape[0]

    with utils.smart_open(fname, 'wb') as out:
        # Header line: the two dimensions of the vector array.
        header = "%s %s\n" % vectors.shape
        out.write(utils.to_utf8(header))

        # Negated count as the sort key puts the most frequent words at the top.
        by_frequency = sorted(iteritems(words), key=lambda pair: -pair[1].count)
        for token, entry in by_frequency:
            vector = vectors[entry.index]
            if not binary:
                values = ' '.join("%f" % component for component in vector)
                out.write(utils.to_utf8("%s %s\n" % (token, values)))
            else:
                out.write(utils.to_utf8(token) + b" " + vector.tostring())

In [24]:
# Export the trained vocabulary and vectors with the custom helper, in text form (binary=False).
save_sampled_word_vectors((model.vocab, model.syn0), 'raw_sentence_word2vec_alt.txt', False)

In [8]:
# Export through gensim's own writer in binary form.
model.save_word2vec_format('raw_sentence_word2vec.bin.gz', binary=True)

In [12]:
# Text-format export through gensim's own writer; this is the file inspected with grep below.
model.save_word2vec_format('raw_sentence_word2vec.txt', binary=False)

In [13]:
# Shell out to grep. It matches any line containing the substring "day",
# so rows such as "today" appear alongside the "day" row.
!grep day raw_sentence_word2vec.txt

day -0.075769 0.426146 -0.065635 -0.202406 -0.001413 0.223627 -0.115254 -0.055489 0.090441 -0.213343 -0.188934 -0.110624 -0.240913 -0.055284 -0.037003 -0.209487 -0.407531 -0.036872 -0.112005 0.578992 0.116271 0.129044 0.240499 -0.020116 0.061565 -0.079783 0.031827 -0.063596 -0.391882 0.246715 -0.096211 0.495102 0.385679 -0.140752 0.060129 -0.197037 0.017463 -0.087697 0.082446 -0.048656 0.066171 -0.132582 0.410399 -0.169088 0.053762 -0.292791 -0.042287 0.048132 -0.051564 -0.323221 -0.269820 0.170764 -0.155793 -0.158371 0.023382 -0.185978 -0.527142 -0.011020 0.285565 -0.037519 -0.170292 -0.254911 -0.002080 -0.019666 -0.057464 0.026621 0.163694 -0.053668 -0.047269 -0.129903 -0.116750 0.117310 -0.108700 0.356108 0.075504 -0.112164 0.063923 -0.376701 -0.010347 0.059203 -0.161422 0.338747 -0.159119 0.429067 0.242076 0.098549 0.127159 -0.047817 0.200157 -0.198079 -0.309752 0.106777 -0.113941 -0.044359 -0.176312 0.370763 -0.142137 -0.319157 0.375068 0.567709
today 0.045677 -0.205757 0.120596 

In [15]:
# Vector for 'day' straight from the model; its leading values agree with the
# `day` row of the text export shown above (which is rounded to 6 decimals).
model['day']

array([-0.07576926,  0.42614639, -0.06563481, -0.20240588, -0.0014128 ,
        0.22362684, -0.11525359, -0.05548898,  0.0904412 , -0.21334273,
       -0.1889343 , -0.11062394, -0.24091266, -0.05528387, -0.03700271,
       -0.2094871 , -0.40753129, -0.03687177, -0.11200472,  0.57899189,
        0.11627138,  0.12904392,  0.24049881, -0.02011611,  0.06156468,
       -0.0797834 ,  0.03182727, -0.06359574, -0.39188227,  0.24671496,
       -0.09621136,  0.49510199,  0.38567945, -0.14075217,  0.06012853,
       -0.19703694,  0.01746314, -0.08769712,  0.08244622, -0.04865608,
        0.06617107, -0.13258238,  0.41039905, -0.16908839,  0.05376206,
       -0.29279104, -0.04228741,  0.04813201, -0.05156419, -0.32322097,
       -0.26982012,  0.17076375, -0.15579265, -0.15837137,  0.02338156,
       -0.18597765, -0.52714157, -0.01101984,  0.28556529, -0.03751927,
       -0.17029184, -0.25491059, -0.00207992, -0.019666  , -0.05746353,
        0.0266208 ,  0.16369438, -0.05366763, -0.047269  , -0.12