In [86]:
import gensim
from gensim import utils
import numpy as np
from six import iteritems
import copy
import random

- **You can load word vectors trained on Google News documents**
    - The file is 1.5GB and is not included in the repo, but you can download it
    - Note that downloading and loading it will take a while (loading alone takes ~5 mins)

In [4]:
# Download the pretrained GoogleNews vectors (a gzipped archive of about 1.5 GB) into the working directory.
!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

--2015-08-06 16:25:39--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com... 54.231.10.24
Connecting to s3.amazonaws.com|54.231.10.24|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: 'GoogleNews-vectors-negative300.bin.gz'


2015-08-06 16:36:54 (2.33 MB/s) - 'GoogleNews-vectors-negative300.bin.gz' saved [1647046227/1647046227]



In [12]:
# Load the downloaded vectors. binary=True selects the binary word2vec layout, and the
# .gz archive is passed straight to the loader without a separate decompression step.
model = gensim.models.word2vec.Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

- **You can query the most similar words to the list of words you are interested in**

In [18]:
# Nearest neighbours of 'shoe'; the result is a list of (word, score) pairs, highest score first.
model.most_similar(positive=['shoe'])

[(u'shoes', 0.7552986145019531),
 (u'footwear', 0.6774437427520752),
 (u'sneaker', 0.6722689270973206),
 (u'Shoe', 0.6581934690475464),
 (u'sandal', 0.629105806350708),
 (u'sneakers', 0.6061371564865112),
 (u'slipper', 0.5994482040405273),
 (u'sandals', 0.5673254728317261),
 (u'Shoes', 0.559785783290863),
 (u'sock', 0.5576707720756531)]

- **You can query the most similar words to the list of words you are interested in**
    - And exclude words related to concepts you are not interested in
    - `sandals` are excluded when `beach` is included as a negative

In [19]:
# Same query, but with 'beach' supplied as a negative term; compare the ranking and scores
# with the positive-only query above.
model.most_similar(positive=['shoe'], negative=['beach'])

[(u'Shoe', 0.4694807231426239),
 (u'shoes', 0.43661928176879883),
 (u'retailer_Footstar', 0.389981210231781),
 (u'orthotics', 0.3796594440937042),
 (u'footwear', 0.3790792226791382),
 (u'sneaker', 0.3684130609035492),
 (u'athletic_footwear', 0.36705636978149414),
 (u'Shoes', 0.36504432559013367),
 (u'Aokang_Group', 0.35710006952285767),
 (u'shoemaker', 0.3485073745250702)]

- **All the word vectors are stored in the attribute `syn0`**
    - There are `3,000,000` words, each represented by `300` dimensions 

In [21]:
# (number of words, dimensions per word vector)
model.syn0.shape

(3000000, 300)

- **All the words in the model are stored in the attribute `index2word`**

In [26]:
# Peek at the first ten entries of the word list only; the full list has one entry per word.
model.index2word[:10]

[u'</s>',
 u'in',
 u'for',
 u'that',
 u'is',
 u'on',
 u'##',
 u'The',
 u'with',
 u'said']

- **You can access the vector of a word by indexing the model with that word**

In [29]:
# A parenthesised single-argument print is valid both as the Python 2 statement and as the
# Python 3 function, and produces the same text in either case.
print(model['said'].shape)   # shape of a single word vector
print(model['said'][:10])    # its first ten components

(300,)
[-0.00453209 -0.0220217   0.04964008 -0.03796006 -0.02822671  0.03066005
  0.12750687 -0.07884012  0.00827335 -0.04818008]


- **The model as it stands is too big, so I am going to subsample the word vectors and write them to new files**
    - I am going to pick `30`, `3000`, `300000` and give myself the freedom to decide how long I want to wait
    - There is no built-in to sample a word2vec model, so I am going to implement one here
    - Note it is not memory-optimized and takes around 5 mins to sample 

In [79]:
def sample_word_vectors(model, n):
    """Draw a random subsample of ``n`` words (without replacement) from a model.

    Parameters
    ----------
    model : object
        Must expose ``vocab`` (a dict mapping each word to an entry object
        with an ``index`` attribute) and ``syn0`` (an array in which row
        ``entry.index`` is that word's vector).
    n : int
        Number of words to draw.

    Returns
    -------
    tuple
        ``(new_vocab, new_syn0)``. ``new_vocab`` maps every sampled word to a
        deep copy of its original entry whose ``index`` has been reassigned to
        a position in ``0 .. n-1``; the entries in ``model.vocab`` are left
        untouched. ``new_syn0`` is a new array whose row
        ``new_vocab[word].index`` is the vector of ``word``.

    Raises
    ------
    ValueError
        Propagated from ``random.sample`` when ``n`` is negative or larger
        than the vocabulary.
    """
    # list(...) hands random.sample a real sequence, which Python 3 requires
    # (dict views are not sequences); on Python 2 it is just a cheap copy.
    sampled_items = random.sample(list(model.vocab.items()), n)

    new_vocab = {}
    old_indices = []
    for new_index, (word, entry) in enumerate(sampled_items):
        # Copy before re-indexing so the source model's vocab is not mutated.
        entry_copy = copy.deepcopy(entry)
        # A word may legitimately land on the same index it had before, so no
        # "index must change" assertion is made here.
        entry_copy.index = new_index
        new_vocab[word] = entry_copy
        old_indices.append(entry.index)

    # Gather all selected rows in one indexing operation. The explicit integer
    # dtype keeps n == 0 working and yields shape (0, dim) rather than (0,).
    new_syn0 = model.syn0[np.asarray(old_indices, dtype=np.intp)]
    return new_vocab, new_syn0

In [80]:
# Seed the module-level RNG so that re-running the notebook draws from the same random
# sequence instead of producing different subsamples (and different output files) each time.
random.seed(0)
sampled_30_tup = sample_word_vectors(model, 30)
sampled_3000_tup = sample_word_vectors(model, 3000)
sampled_300000_tup = sample_word_vectors(model, 300000)

In [87]:
def save_sampled_word_vectors(vocab_syn0_tup, fname, binary):
    """Write a sampled ``(vocab, syn0)`` pair to ``fname``.

    The file starts with a header line ``"<rows> <cols>\\n"`` taken from
    ``syn0.shape``, followed by one record per word, ordered by descending
    ``count``.

    Parameters
    ----------
    vocab_syn0_tup : tuple
        ``(vocab, syn0)``, where ``vocab`` maps each word to an entry exposing
        ``index`` and ``count``, and ``syn0`` is a 2-D array whose row
        ``entry.index`` is that word's vector.
    fname : str
        Destination path, opened for binary writing via ``utils.smart_open``.
    binary : bool
        If true, each record is the UTF-8 word, a single space, and the raw
        bytes of the vector. Otherwise each record is a text line containing
        the word followed by the components formatted with ``%f``.

    Raises
    ------
    AssertionError
        If the number of vocab entries differs from the number of rows.
    """
    vocab, syn0 = vocab_syn0_tup
    assert len(vocab) == syn0.shape[0], "vocab size and number of syn0 rows differ"
    with utils.smart_open(fname, 'wb') as fout:
        # The header formatting relies on syn0 being exactly two-dimensional.
        fout.write(utils.to_utf8("%s %s\n" % syn0.shape))
        # store in sorted order: most frequent words at the top
        # (the loop variable is named `entry` so it does not shadow `vocab`)
        for word, entry in sorted(iteritems(vocab), key=lambda item: -item[1].count):
            row = syn0[entry.index]
            if binary:
                # tobytes() replaces tostring(), a deprecated alias that NumPy 2.0
                # removed; both produce the same bytes.
                # NOTE(review): bytes are written in syn0's own dtype; the loader
                # presumably expects 32-bit floats — confirm before passing other dtypes.
                fout.write(utils.to_utf8(word) + b" " + row.tobytes())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))

In [88]:
# Write each subsample in binary form (binary=True).
# NOTE(review): the .gz suffix is presumably what makes utils.smart_open compress the output — confirm.
save_sampled_word_vectors(sampled_30_tup, 'data/google_news_30.bin.gz', True)
save_sampled_word_vectors(sampled_3000_tup, 'data/google_news_3000.bin.gz', True)
save_sampled_word_vectors(sampled_300000_tup, 'data/google_news_300000.bin.gz', True)

In [90]:
# Write the same three subsamples again as plain text (binary=False).
save_sampled_word_vectors(sampled_30_tup, 'data/google_news_30.txt', False)
save_sampled_word_vectors(sampled_3000_tup, 'data/google_news_3000.txt', False)
save_sampled_word_vectors(sampled_300000_tup, 'data/google_news_300000.txt', False)

- **Test if the subsampled models can be loaded back**

In [92]:
# Reload the 30-word subsample from both on-disk formats, using the same loader as for the full model.
model_30_from_binary = gensim.models.word2vec.Word2Vec.load_word2vec_format('data/google_news_30.bin.gz', binary=True)
model_30_from_txt = gensim.models.word2vec.Word2Vec.load_word2vec_format('data/google_news_30.txt', binary=False)

In [93]:
# The reloaded matrix should hold 30 vectors of 300 dimensions each.
model_30_from_binary.syn0.shape == (30, 300)

True

In [94]:
# The reloaded vocabulary should contain exactly the 30 sampled words.
len(model_30_from_binary.vocab) == 30

True

In [97]:
# Take an arbitrary word from the reloaded vocabulary. next(iter(...)) works on both
# Python 2 and Python 3, whereas dict.iterkeys().next() exists only on Python 2.
random_word = next(iter(model_30_from_binary.vocab))
# Every component of the reloaded vector must equal the full model's vector for that word.
all(model_30_from_binary[random_word] == model[random_word])

True