In [3]:
from gensim.models import Word2Vec
# define training data
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
			['this', 'is', 'the', 'second', 'sentence'],
			['yet', 'another', 'sentence'],
			['one', 'more', 'sentence'],
			['and', 'the', 'final', 'sentence']]

model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, sg=1)
model.save("models/word2vec.model")

In [4]:
model = Word2Vec.load("word2vec.model")
model.train([["hello", "world"]], total_examples=1, epochs=1)


(0, 2)

In [5]:
vector = model.wv['computer']
sims = model.wv.most_similar('computer', topn=10)  # get other similar words
#print(vector)
print(sims)

[('system', 0.21617142856121063), ('survey', 0.044689204543828964), ('interface', 0.015203374437987804), ('time', 0.0019510634010657668), ('trees', -0.03284314647316933), ('human', -0.0742427185177803), ('response', -0.09324456751346588), ('graph', -0.09575346112251282), ('eps', -0.10513807088136673), ('user', -0.16909335553646088)]


In [6]:
from gensim.models import KeyedVectors

# Store just the words + their trained embeddings.
word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")

# Load .
wv = KeyedVectors.load("word2vec.wordvectors")
vector = wv['computer']  # Get numpy vector of a word
print(vector)

[-0.00515774 -0.00667028 -0.0077791   0.00831315 -0.00198292 -0.00685696
 -0.0041556   0.00514562 -0.00286997 -0.00375075  0.0016219  -0.0027771
 -0.00158482  0.0010748  -0.00297881  0.00852176  0.00391207 -0.00996176
  0.00626142 -0.00675622  0.00076966  0.00440552 -0.00510486 -0.00211128
  0.00809783 -0.00424503 -0.00763848  0.00926061 -0.00215612 -0.00472081
  0.00857329  0.00428458  0.0043261   0.00928722 -0.00845554  0.00525685
  0.00203994  0.0041895   0.00169839  0.00446543  0.00448759  0.0061063
 -0.00320303 -0.00457706 -0.00042664  0.00253447 -0.00326412  0.00605948
  0.00415534  0.00776685  0.00257002  0.00811904 -0.00138761  0.00808028
  0.0037181  -0.00804967 -0.00393476 -0.0024726   0.00489447 -0.00087241
 -0.00283173  0.00783599  0.00932561 -0.0016154  -0.00516075 -0.00470313
 -0.00484746 -0.00960562  0.00137242 -0.00422615  0.00252744  0.00561612
 -0.00406709 -0.00959937  0.00154715 -0.00670207  0.0024959  -0.00378173
  0.00708048  0.00064041  0.00356198 -0.00273993 -0.0

In [7]:
import gensim.downloader
# Show all available models in gensim-data
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [50]:
# Download the "glove-twitter-25" embeddings
model = gensim.downloader.load('word2vec-google-news-300')

In [51]:
model.most_similar('twitter')

[('Twitter', 0.8908904194831848),
 ('Twitter.com', 0.7536780834197998),
 ('tweet', 0.7431626319885254),
 ('tweeting', 0.7161932587623596),
 ('tweeted', 0.7137226462364197),
 ('facebook', 0.6988551616668701),
 ('tweets', 0.6974530816078186),
 ('Tweeted', 0.6950210928916931),
 ('Tweet', 0.6875007152557373),
 ('Tweeting', 0.6845167279243469)]

In [52]:
model.most_similar('obama')

[('mccain', 0.7319012880325317),
 ('hillary', 0.7284600138664246),
 ('obamas', 0.7229632139205933),
 ('george_bush', 0.7205674648284912),
 ('barack_obama', 0.7045838832855225),
 ('palin', 0.7043113708496094),
 ('clinton', 0.6934447884559631),
 ('clintons', 0.6816835403442383),
 ('sarah_palin', 0.6815143823623657),
 ('john_mccain', 0.6800707578659058)]

<img src="analogia.png">

In [53]:
result = model.most_similar(positive=['woman', 'king'], negative=['man'])
print(result[0])

('queen', 0.7118193507194519)


In [26]:
def analogy(x1, x2, y1):
    result = model.most_similar(positive=[y1, x2], negative=[x1])
    return result[0][0]

In [27]:
analogy('good', 'fantastic', 'bad')

'horrible'

In [30]:
analogy('japan', 'france', 'japanese')

'french'

In [31]:
analogy('japan', 'german', 'japanese')

'swedish'

In [32]:
analogy('japan', 'canada', 'japanese')

'canadian'

In [60]:
# Which one is the odd one out in this list?
model.doesnt_match(["cat","dog","france"])

'france'