In [1]:
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import gensim.downloader as api

# Download the text8 dataset and materialise it as a list of documents
dataset = api.load("text8")
data = list(dataset)

# Split the data into 2 parts. Part 2 will be used later to update the model
data_part1 = data[:1000]
data_part2 = data[1000:]

# Train Word2Vec model on part 1. Default result vector size = 100.
# min_count=0 keeps every word in the vocabulary, however rare.
model = Word2Vec(data_part1, min_count=0, workers=cpu_count())

# Get the word vector for given word.
# Vectors are read through model.wv: indexing the model object itself
# (model['topic']) is deprecated and no longer works in gensim 4.
model.wv['topic']
#> array([ 0.0512,  0.2555,  0.9393, ... ,-0.5669,  0.6737], dtype=float32)

# List the words whose vectors are closest to the vector of 'topic'
model.wv.most_similar('topic')
#> [('discussion', 0.7590423822402954),
#>  ('consensus', 0.7253159284591675),
#>  ('discussions', 0.7252693176269531),
#>  ('interpretation', 0.7196053266525269),
#>  ('viewpoint', 0.7053568959236145),
#>  ('speculation', 0.7021505832672119),
#>  ('discourse', 0.7001898884773254),
#>  ('opinions', 0.6993060111999512),
#>  ('focus', 0.6959210634231567),
#>  ('scholarly', 0.6884037256240845)]

# Save and Load Model: write the model to 'newmodel', then read it back
model.save('newmodel')
model = Word2Vec.load('newmodel')



We have trained and saved a Word2Vec model on our documents. When new data arrives, however, we will want to update the model so that it accounts for the new words.

In [7]:
# Update the model with new data.
# update=True extends the existing vocabulary with the words in data_part2
# instead of rebuilding it from scratch.
model.build_vocab(data_part2, update=True)
# Continue training on the new documents. model.epochs replaces the
# deprecated model.iter alias, which is removed in gensim 4.
model.train(data_part2, total_examples=model.corpus_count, epochs=model.epochs)
# Read the updated vector through model.wv; indexing the model directly
# (model['topic']) is deprecated.
model.wv['topic']
# array([-0.6482, -0.5468,  1.0688,  0.82  , ... , -0.8411,  0.3974], dtype=float32)

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


array([ 1.22637856e+00,  1.64273947e-01,  1.68680465e+00, -1.25953648e-02,
        1.32304585e+00,  6.36845887e-01,  1.26943934e+00, -3.14511001e-01,
        3.82891089e-01, -8.78210187e-01,  6.61116600e-01,  1.64003468e+00,
       -5.38034618e-01, -1.13696325e+00,  5.80287576e-01,  2.14783072e+00,
        2.96406895e-01,  5.31061828e-01, -2.16134548e-01, -5.19048035e-01,
       -1.93767071e-01, -5.28440952e-01, -4.06028777e-02, -1.18609929e+00,
        1.07600462e+00, -1.91383019e-01, -1.28334895e-01,  8.73369515e-01,
        5.59210293e-02,  1.33039963e+00,  7.11525744e-03,  3.79143238e-01,
       -1.22011542e+00, -1.01336217e+00,  9.41247344e-02, -8.72300506e-01,
        8.39870647e-02,  1.70967773e-01, -6.06927991e-01,  1.23837292e+00,
        1.95503819e+00, -9.15693343e-02,  1.22758114e+00, -2.14937091e+00,
        3.81040648e-02,  8.46357822e-01,  7.06874505e-02,  2.33334884e-01,
        1.04944420e+00,  9.67311978e-01,  9.15687323e-01, -9.23784554e-01,
        1.44439912e+00, -

### 16. How to extract word vectors using pre-trained Word2Vec and FastText models?
We just saw how to get the word vectors from the Word2Vec model we trained. However, gensim also lets you download state-of-the-art pretrained models through the downloader API. Let’s see how to extract the word vectors from a couple of these models.

In [None]:
import gensim.downloader as api

# Download the models
# Each api.load call fetches one pretrained embedding set by name and binds it
# to a variable that the evaluation cell below reuses.
# NOTE(review): the names suggest 300-dimensional vectors and these are
# presumably large downloads on first use — confirm before running.
fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')
word2vec_model300 = api.load('word2vec-google-news-300')
glove_model300 = api.load('glove-wiki-gigaword-300')

# Query the Word2Vec embeddings for the words most similar to 'support'.
# This returns nearest neighbours with similarity scores, not a raw vector.
word2vec_model300.most_similar('support')



In [None]:
# Score each pretrained model on the word-analogy questions in
# "questions-words.txt" (path is relative to the working directory).
# Element [0] of each result is the overall accuracy.
# The scores are gathered in one dict because a notebook cell only displays
# its final expression: as three separate bare expressions, the first two
# results would be computed and then silently discarded.
analogy_accuracy = {
    # Word2Vec accuracy
    #> 0.7401448525607863
    'word2vec': word2vec_model300.evaluate_word_analogies(analogies="questions-words.txt")[0],

    # fastText accuracy
    #> 0.8827876424099353
    'fasttext': fasttext_model300.evaluate_word_analogies(analogies="questions-words.txt")[0],

    # GloVe accuracy
    #> 0.7195422354510931
    'glove': glove_model300.evaluate_word_analogies(analogies="questions-words.txt")[0],
}
analogy_accuracy