#### source: https://github.com/adventuresinML/adventures-in-ml-code/blob/master/gensim_word2vec.py

In [None]:
# import sys
# !{sys.executable} -m pip install numpy pandas matplotlib scikit-learn seaborn
# !{sys.executable} -m pip install --upgrade gensim

In [1]:
import logging
import collections
import os
import zipfile

import numpy as np
import gensim
import tensorflow as tf

from gensim.models import word2vec
from tensorflow.keras import Sequential
from tensorflow.keras.preprocessing  import sequence
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.layers import Input, Dense, Embedding, Dot, Reshape, GlobalAveragePooling1D
from tensorflow.keras.models import Model

from urllib import request

vector_dim = 300

def maybe_download(filename, url, expected_bytes):
    """Fetch `filename` from `url` if it is missing, then verify its size.

    The download (from ``url + filename``) only happens when no local file
    named `filename` exists. The local filename is returned when the file
    size equals `expected_bytes`; otherwise the actual size is printed and
    an ``Exception`` is raised.
    """
    already_present = os.path.exists(filename)
    if not already_present:
        filename, _ = request.urlretrieve(url + filename, filename)

    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        # Show the size we actually got to help diagnose a bad download.
        print(actual_bytes)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')

    print('Found and verified', filename)
    return filename

# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words.

    The member's content is decoded as UTF-8 and split on whitespace, so the
    returned list contains ``str`` words rather than ``bytes`` objects.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        # ZipFile.read returns bytes; without decoding, every "word" would be
        # a bytes object and would never compare equal to a str token.
        text = archive.read(first_member).decode('utf-8')
    return text.split()

def convert_data_to_index(string_data, wv):
    """Translate the known words of `string_data` into vocabulary indices.

    A word is kept only when ``word in wv`` is true; its index is read from
    ``wv.vocab[word].index``. Unknown words are dropped, so the result can
    be shorter than the input. Order is preserved.
    """
    return [wv.vocab[word].index for word in string_data if word in wv]

def collect_data():
    """Download the text8 archive if necessary and extract it.

    The archive is fetched (and size-checked) through `maybe_download`. It is
    extracted into the current working directory unless a path equal to the
    archive name without its extension already exists.

    Returns the filename of the zip archive (not of the extracted file).
    """
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', url, 31344016)
    # str.strip('.zip') would remove any of the characters '.', 'z', 'i', 'p'
    # from both ends of the name, not the suffix; splitext drops exactly the
    # extension.
    extracted_name = os.path.splitext(filename)[0]
    if not os.path.exists(extracted_name):
        # Context manager so the archive handle is closed after extraction.
        with zipfile.ZipFile(filename) as archive:
            archive.extractall()
    return filename

In [2]:
def train(model_name="word2vec-gensim.model"):
    """Train a gensim Word2Vec model on the text8 corpus and save it.

    The corpus is obtained through `collect_data`. Training uses 10
    iterations, a minimum word count of 10, `vector_dim`-sized vectors and 4
    workers, with INFO-level logging enabled. As a quick check, the
    first four corpus words and the first four converted indices are printed.

    Args:
        model_name: path the trained model is saved to.

    Returns:
        The trained ``Word2Vec`` model.
    """
    filename = collect_data()

    # The extracted corpus is the archive name without its extension.
    # str.strip('.zip') strips a character set from both ends rather than the
    # suffix, so use splitext instead.
    corpus_path = os.path.splitext(filename)[0]
    sentences = word2vec.Text8Corpus(corpus_path)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # Use the module-level vector_dim (300) so the trained vectors always have
    # the width that create_embedding_matrix allocates.
    model = word2vec.Word2Vec(sentences, iter=10, min_count=10, size=vector_dim, workers=4)

    str_data = read_data(filename)
    index_data = convert_data_to_index(str_data, model.wv)
    print(str_data[:4], index_data[:4])

    model.save(model_name)
    return model

In [9]:
def create_embedding_matrix(model):
    """Copy the word vectors of `model` into a dense NumPy matrix.

    The matrix has one row per entry of ``model.wv.vocab`` and `vector_dim`
    columns; row ``i`` receives the vector of ``model.wv.index2word[i]``.
    The result is meant to be loaded into the TensorFlow and Keras models.
    """
    keyed_vectors = model.wv
    n_words = len(keyed_vectors.vocab)
    matrix = np.zeros((n_words, vector_dim))
    for row in range(n_words):
        vector = keyed_vectors[keyed_vectors.index2word[row]]
        if vector is None:
            # No vector available: leave this row as zeros.
            continue
        matrix[row] = vector
    return matrix

def keras_model(embedding_matrix, wv):
    """Print nearest neighbours of random frequent words using a Keras model.

    A two-input Keras model looks both word indices up in a shared embedding
    layer initialised from `embedding_matrix` and outputs their normalized
    dot product (cosine similarity). For 16 indices drawn without
    replacement from the first 100 vocabulary positions, every vocabulary
    word is scored one pair at a time and the 8 best matches are printed.

    Args:
        embedding_matrix: 2-D array; its shape gives the embedding layer's
            input and output dimensions.
        wv: word-vector object providing ``index2word`` and ``vocab``.
    """
    valid_size = 16  # Random set of words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    top_k = 8  # number of nearest neighbors
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)

    # input words - in this case we do sample by sample evaluations of the similarity
    valid_input = Input((1,), dtype='int32')
    other_input = Input((1,), dtype='int32')

    # setup the embedding layer
    embeddings = Embedding(input_dim=embedding_matrix.shape[0], output_dim=embedding_matrix.shape[1], weights=[embedding_matrix])
    embedded_a = embeddings(valid_input)
    embedded_b = embeddings(other_input)
    cosine = Dot(name="Cosine-Similarity", axes=2, normalize=True)([embedded_a, embedded_b])

    # create the Keras model
    k_model = Model(inputs=[valid_input, other_input], outputs=cosine)

    def get_similarity(valid_word_idx, vocab_size):
        """Return the similarity of `valid_word_idx` to every vocabulary index."""
        similarities = np.zeros((vocab_size,))
        in_arr1 = np.zeros((1,))
        in_arr2 = np.zeros((1,))
        in_arr1[0] = valid_word_idx
        for i in range(vocab_size):
            in_arr2[0] = i
            out = k_model.predict_on_batch([in_arr1, in_arr2])
            # The prediction comes back wrapped in an array; pull out the
            # single value explicitly, because NumPy deprecated the implicit
            # conversion of arrays with ndim > 0 to scalars.
            similarities[i] = np.asarray(out).reshape(-1)[0]
        return similarities

    # now run the model and get the closest words to the valid examples
    vocab_size = len(wv.vocab)
    for i in range(valid_size):
        # Distinct names here so the Keras tensors above are not shadowed.
        query_word = wv.index2word[valid_examples[i]]
        scores = get_similarity(valid_examples[i], vocab_size)
        # Rank 0 is skipped - presumably the query word itself.
        nearest = (-scores).argsort()[1:top_k + 1]
        log_str = 'Nearest to %s:' % query_word
        for k in range(top_k):
            close_word = wv.index2word[nearest[k]]
            log_str = '%s %s,' % (log_str, close_word)
        print(log_str)

def tensorflow_model(embedding_matrix, wv):
    """Print nearest neighbours of random frequent words using TensorFlow ops.

    `embedding_matrix` is held in a non-trainable variable and its rows are
    scaled to unit length, so one matrix product yields the cosine
    similarity between each sampled word and every vocabulary word. For 16
    indices drawn without replacement from the first 100 vocabulary
    positions, the 8 best matches are printed.
    """
    valid_size = 16  # Random set of words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    top_k = 8  # number of nearest neighbors
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # The embeddings are frozen: nothing here is meant to update them.
    embedding = tf.Variable(initial_value=tf.constant(embedding_matrix), trainable=False)

    # Cosine similarity: normalize each row, then take dot products.
    row_norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
    normalized_embeddings = embedding / row_norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    sim = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True).numpy()

    # For each sampled word, list its closest words.
    for row in range(valid_size):
        query_word = wv.index2word[valid_examples[row]]
        # Rank 0 is skipped - presumably the query word itself.
        nearest = (-sim[row, :]).argsort()[1:top_k + 1]
        close_words = [wv.index2word[nearest[k]] for k in range(top_k)]
        log_str = 'Nearest to %s:' % query_word
        for close_word in close_words:
            log_str = '%s %s,' % (log_str, close_word)
        print(log_str)

In [11]:
""" Test Keras Model """
training = False
# Retrain from scratch when `training` is set; otherwise reuse the saved model.
model = train() if training else word2vec.Word2Vec.load("word2vec-gensim.model")

# Build the dense matrix and run the Keras nearest-neighbour demo.
embedding_matrix = create_embedding_matrix(model)
keras_model(embedding_matrix, model.wv)

Nearest to first: second, last, youngest, fourth, fifth, earliest, sixth, next,
Nearest to from: forcibly, therefrom, via, onto, aravalli, across, back, gradually,
Nearest to some: many, certain, various, numerous, several, those, few, these,
Nearest to no: nothing, none, little, neither, any, hardly, nobody, whatever,
Nearest to d: b, politician, theodore, sar, malan, harpsichordist, swimmer, playwright,
Nearest to system: systems, filesystem, scheme, network, framework, mechanism, program, apparatus,
Nearest to nine: eight, seven, six, one, four, five, three, october,
Nearest to this: it, which, that, the, a, what, itself, whatever,
Nearest to while: whilst, although, though, whereas, however, thus, but, nevertheless,
Nearest to over: across, nearly, around, approximately, throughout, intervening, ago, during,
Nearest to used: employed, applied, utilized, utilised, invoked, uses, preferred, useful,
Nearest to th: fifteenth, fourteenth, sixteenth, seventeenth, eleventh, nd, eighteenth

In [18]:
""" Test Tensorflow Model """
training = False
# Retrain from scratch when `training` is set; otherwise reuse the saved model.
model = train() if training else word2vec.Word2Vec.load("word2vec-gensim.model")

# Build the dense matrix and run the TensorFlow nearest-neighbour demo.
embedding_matrix = create_embedding_matrix(model)
tensorflow_model(embedding_matrix, model.wv)

Nearest to american: canadian, carolina, gary, australian, cuban, america, dakota, bryan,
Nearest to this: it, which, that, the, a, what, itself, whatever,
Nearest to people: persons, residents, citizens, individuals, jews, inhabitants, americans, albanians,
Nearest to united: federated, reorganisation, isambard, polities, micronesia, axumite, oecs, warring,
Nearest to up: ablaze, ups, off, alight, down, aside, contours, indentation,
Nearest to s: sixties, his, seventies, vought, janet, stanley, christine, ted,
Nearest to would: might, will, could, should, shall, must, did, seemed,
Nearest to most: fairly, extremely, more, highly, less, particularly, vitally, quite,
Nearest to will: would, must, should, shall, might, could, can, may,
Nearest to called: termed, referred, valpara, dubbed, named, known, abbreviated, spelled,
Nearest to have: having, has, had, possess, exhibit, ve, contain, imply,
Nearest to was: is, were, became, had, fell, remained, came, remains,
Nearest to that: which,

In [None]:
""" Test Gensim """
training = False
# Retrain from scratch when `training` is set; otherwise reuse the saved model.
model = train() if training else word2vec.Word2Vec.load("word2vec-gensim.model")

# word vector of "the"
print(model.wv['the'])

# the three words at the head of the vocabulary (most common)
print(model.wv.index2word[0], model.wv.index2word[1], model.wv.index2word[2])

# the three words at the tail of the vocabulary (least common)
vocab_size = len(model.wv.vocab)
print(
    model.wv.index2word[vocab_size - 1],
    model.wv.index2word[vocab_size - 2],
    model.wv.index2word[vocab_size - 3],
)

# vocabulary index of "of" (the 2nd most common word)
print('Index of "of" is: {}'.format(model.wv.vocab['of'].index))

# pairwise similarities between a few words
print(
    model.wv.similarity('woman', 'man'),
    model.wv.similarity('man', 'elephant'),
)

# which word is the odd one out?
print(model.wv.doesnt_match("green blue red zebra".split()))