In [4]:
import os 
import tensorflow as tf 
import tensorflow_datasets as tfds 
import gensim.downloader as api
from gensim.models import Word2Vec

# Create Embedding with Text8

In [8]:
# Sanity check: the gensim downloader must return non-empty metadata for 'text8'.
info = api.info('text8')
assert len(info) > 0

# Load the corpus through the downloader and train Word2Vec on it,
# leaving every hyperparameter at the library default.
dataset = api.load('text8')
model = Word2Vec(sentences=dataset)

# Persist the full trained model so later cells can reload it without retraining.
# NOTE(review): the leading '~' is handed to gensim verbatim. Whether it ends up
# as the home directory depends on the I/O layers underneath -- confirm. Any
# change to this path must be mirrored wherever the model is loaded back.
model.save('~/.keras/models/text8-word2vec.bin')

# Exploring Text8 Embedding

In [10]:
from gensim.models import KeyedVectors

def print_most_similar(word_conf_pairs, k):
    """Print the first ``k`` (word, score) pairs, one per line.

    Each line is formatted as the score with three decimals, a space, then
    the word. A final ``...`` line is printed when the input holds more
    pairs than were shown.

    Args:
        word_conf_pairs: Sized iterable of ``(word, score)`` pairs, such as
            the list returned by a ``most_similar`` query.
        k: Maximum number of pairs to print. Zero or negative values print
            no pairs at all.
    """
    # Clamp so a non-positive k shows nothing instead of leaking the first pair.
    limit = max(k, 0)
    for i, (word, conf) in enumerate(word_conf_pairs):
        # Check the limit *before* printing; checking afterwards always
        # emitted at least one line, even for k == 0.
        if i >= limit:
            break
        print("{:.3f} {:s}".format(conf, word))

    # Signal truncation only when some pairs were actually left out.
    if limit < len(word_conf_pairs):
        print('...')

In [11]:
# Reload the trained model from disk. The file was written with Word2Vec.save(),
# and the next line reads `.wv`, so load it with the matching Word2Vec.load()
# rather than relying on KeyedVectors.load() happening to return a full model.
# NOTE(review): the '~' in the path is passed through verbatim, exactly as in
# the cell that saves the model -- keep the two path strings in sync.
model = Word2Vec.load('~/.keras/models/text8-word2vec.bin')
word_vectors = model.wv

# get words in the vocabulary
# NOTE(review): `.vocab` is the gensim 3.x accessor; gensim 4 replaced it with
# `key_to_index` -- confirm which gensim version this notebook targets.
words = word_vectors.vocab.keys()
print([x for i, x in enumerate(words) if i < 10])
assert 'king' in words

# Show the five nearest neighbours of 'king' with their similarity scores.
print('# words similar to king')
print_most_similar(word_vectors.most_similar('king'), 5)

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
# words similar to king
0.735 prince
0.707 throne
0.706 queen
0.684 kings
0.675 emperor
...


In [13]:
# Analogy by vector arithmetic: 'france' and 'berlin' count positively,
# 'paris' negatively; only the single best match is shown.
print('# vector arithmetic with words (cosine similarity)')
print('# france + berlin - paris = ?')
analogy_neighbours = word_vectors.most_similar(
    positive=['france', 'berlin'],
    negative=['paris'],
)
print_most_similar(analogy_neighbours, 1)

# vector arithmetic with words (cosine similarity)
# france + berlin - paris = ?
0.807 germany
...


In [14]:
# Ask the embedding which of the four words does not belong with the rest.
print('# find odd one out')
print('# [hindus, parsis, singapore, christians]')
odd_one_out = word_vectors.doesnt_match(['hindus', 'parsis', 'singapore', 'christians'])
print(odd_one_out)

# find odd one out
# [hindus, parsis, singapore, christians]
singapore


In [19]:
# Score 'man' against a handful of other words (and against itself).
print('# similarity between words')
reference_word = 'man'
for word in ['woman', 'dog', 'whale', 'tree', 'men', 'man']:
    score = word_vectors.similarity(reference_word, word)
    print('similarity({:s}, {:s}) = {:.3f}'.format(reference_word, word, score))

# similarity between words
similarity(man, woman) = 0.767
similarity(man, dog) = 0.404
similarity(man, whale) = 0.280
similarity(man, tree) = 0.302
similarity(man, men) = 0.495
similarity(man, man) = 1.000


In [20]:
# Index the keyed vectors by word to fetch its embedding, then report the shape.
vec_song = word_vectors['song']
vec_song_shape = vec_song.shape
print('\n# output vector obtained directly, shape: ', vec_song_shape)


# output vector obtained directly, shape:  (100,)


In [22]:
# Fetch the vector for 'song' through word_vec(); use_norm=True requests the
# normalized form of the vector. The printed label is kept exactly as written.
vec_song_2 = word_vectors.word_vec('song', use_norm=True)
vec_song_2_shape = vec_song_2.shape
print('\n# output vector obtained directly, shape: ', vec_song_2_shape)


# output vector obtained directly, shape:  (100,)
