In [None]:
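# Uncomment to install the notebook's dependencies into the running kernel's environment.
# import sys
# !{sys.executable} -m pip install numpy pandas matplotlib scikit-learn seaborn tensorflow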

In [1]:
import collections
import os
import zipfile

import numpy as np
import tensorflow as tf

from tensorflow.keras.preprocessing  import sequence
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.layers import Input, Dense, Embedding, Dot, Reshape
from tensorflow.keras.models import Model

from urllib import request

def maybe_download(filename, url, expected_bytes):
    """Fetch `filename` from `url` unless it already exists, then check its size.

    Args:
        filename: Local path of the file; it is also appended to `url` to
            form the download address.
        url: Base URL that `filename` is appended to.
        expected_bytes: Exact size in bytes the file on disk must have.

    Returns:
        The local path of the verified file.

    Raises:
        Exception: If the size on disk differs from `expected_bytes`.
    """
    already_present = os.path.exists(filename)
    if not already_present:
        # urlretrieve returns (local_path, headers); only the path is needed.
        filename = request.urlretrieve(url + filename, filename)[0]

    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        # Show the size that was actually found before failing.
        print(actual_bytes)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')

    print('Found and verified', filename)
    return filename

# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words.

    Args:
        filename: Path to a zip archive containing at least one member.

    Returns:
        The whitespace-separated tokens of the first archive member, with
        the member's bytes decoded as UTF-8.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        # ZipFile.read returns bytes, so decode them directly rather than
        # going through a TensorFlow compatibility helper.
        return archive.read(first_member).decode('utf-8').split()

def build_dataset(words, n_words):
    """Encode a token sequence as integer ids over a capped vocabulary.

    The vocabulary holds the placeholder 'UNK' at id 0, followed by the
    `n_words - 1` most frequent tokens in the order returned by
    `collections.Counter.most_common`, e.g.
    `Counter('abracadabra').most_common(3)` gives
    `[('a', 5), ('b', 2), ('r', 2)]`. Tokens outside the vocabulary are
    encoded as 0.

    Args:
        words: Sequence of tokens to encode.
        n_words: Vocabulary size, including the 'UNK' placeholder.

    Returns:
        A tuple `(data, count, dictionary, reversed_dictionary)`:
        data is the list of ids, one per input token; count is a list whose
        first entry is `['UNK', <number of out-of-vocabulary tokens>]`
        followed by `(token, frequency)` tuples; dictionary maps token to
        id; reversed_dictionary maps id to token.
    """
    unk_entry = ['UNK', -1]  # the -1 placeholder is overwritten below
    count = [unk_entry]
    count.extend(collections.Counter(words).most_common(n_words - 1))

    # Ids are handed out in the order of `count`, so 'UNK' gets id 0.
    dictionary = {}
    for token, _freq in count:
        dictionary[token] = len(dictionary)

    data = []
    unk_count = 0
    for token in words:
        token_id = dictionary.get(token)
        if token_id is None:
            token_id = 0  # dictionary['UNK']
            unk_count += 1
        data.append(token_id)
    unk_entry[1] = unk_count

    reversed_dictionary = {token_id: token for token, token_id in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

def collect_data(vocabulary_size=10000):
    """Download the text8 archive if needed and encode it as word ids.

    Args:
        vocabulary_size: Forwarded to `build_dataset` as its `n_words`.

    Returns:
        The `(data, count, dictionary, reverse_dictionary)` tuple produced
        by `build_dataset`.
    """
    # 31344016 is the byte size maybe_download requires the archive to have.
    archive_path = maybe_download('text8.zip', 'http://mattmahoney.net/dc/', 31344016)
    tokens = read_data(archive_path)
    print(tokens[:10])
    encoded, counts, word_to_id, id_to_word = build_dataset(tokens, vocabulary_size)
    del tokens  # Hint to reduce memory.
    return encoded, counts, word_to_id, id_to_word

In [2]:
# Vocabulary cap; reused by the skip-gram and model cells further down.
vocab_size = 10000
data, count, dictionary, reverse_dictionary = collect_data(vocab_size)
# Peek at the first few encoded word ids.
print(data[:10])

Found and verified text8.zip
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
[5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]


In [None]:
from tensorflow.python.client import device_lib
def get_available_devices():
    """Return the `name` of every device listed by `device_lib.list_local_devices()`."""
    device_names = []
    for device in device_lib.list_local_devices():
        device_names.append(device.name)
    return device_names

# Report which devices TensorFlow can see before pinning the work below to the CPU.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print(get_available_devices())

with tf.device('/device:CPU:0'):
    window_size = 3     # Passed to skipgrams() as its window_size.
    vector_dim = 300    # Embedding width used when the model is built in the next cell.
    # NOTE: despite the name, the training loop in the next cell uses this as the
    # number of single-sample training iterations, not passes over the data.
    epochs = 200000

    valid_size = 16     # Random set of words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    # Draw valid_size distinct word ids below valid_window. build_dataset assigns
    # ids in most_common order, so low ids are the frequent words (id 0 is 'UNK').
    # No seed is set, so the chosen ids differ between runs.
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)

    # presumably a per-id sampling probability table used to subsample frequent
    # words -- see the Keras make_sampling_table docs to confirm.
    sampling_table = sequence.make_sampling_table(vocab_size)
    # couples holds (target id, context id) pairs and labels holds one training
    # target per pair; presumably 1 for true neighbours and 0 for sampled
    # negatives -- confirm against the Keras skipgrams docs.
    couples, labels = skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)
    # Split the pairs into two parallel int32 arrays, one per model input.
    word_target, word_context = zip(*couples)
    word_target  = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")

    # Quick look at the first few pairs and their labels.
    print(couples[:10], labels[:10])

Num GPUs Available:  0
['/device:CPU:0', '/device:XLA_CPU:0']


In [None]:
with tf.device('/device:CPU:0'):
    # Two inputs per sample, each a single value: the id of the target word and
    # the id of a candidate context word.
    input_target  = Input((1,))
    input_context = Input((1,))

    # One embedding layer is applied to both inputs, so target and context words
    # share the same vocab_size x vector_dim table.
    # NOTE(review): newer Keras releases reportedly deprecate `input_length` --
    # confirm against the installed version.
    embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
    target  = embedding(input_target)
    # Reshape each looked-up vector to (vector_dim, 1) so Dot can contract over axis 1.
    target  = Reshape((vector_dim, 1))(target)
    context = embedding(input_context)
    context = Reshape((vector_dim, 1))(context)

    # setup a cosine similarity operation which will be output in a secondary model
    # normalize: Whether to L2-normalize samples along the dot product axis before taking the dot product.
    # If set to True, then the output of the dot product is the cosine proximity between the two samples.
    similarity  = Dot(name="Cosine-Similarity", axes=1, normalize=True)([target, context])

    # now perform the dot product operation to get a similarity measure
    dot_product = Dot(name="Dot-Product", axes=1)([target, context])
    # Flatten the dot product to one value per sample before the output layer.
    dot_product = Reshape((1,))(dot_product)

    # add the sigmoid output layer
    output = Dense(1, activation='sigmoid')(dot_product)

    # create the primary training model
    model = Model([input_target, input_context], output)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    # create a secondary validation model to run our similarity checks during training
    # It is built from the same inputs and embedding layer as `model`, so it reads
    # the embedding weights that training updates; it is never compiled or trained.
    validation_model = Model([input_target, input_context], similarity)

    class SimilarityCallback:
        """Print the nearest neighbours of each validation word.

        Reads `valid_size`, `valid_examples`, `reverse_dictionary`,
        `vocab_size` and `validation_model` from the enclosing scope.
        """

        def run_sim(self):
            """Print, for every validation word, its `top_k` most similar words."""
            top_k = 8  # number of nearest neighbors
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                sim = self._get_sim(valid_examples[i])
                # Rank by descending similarity; position 0 is skipped because
                # the query word itself is expected to rank first.
                nearest = (-sim).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)

        @staticmethod
        def _get_sim(valid_word_idx):
            """Return the similarity of `valid_word_idx` to every vocabulary id.

            Args:
                valid_word_idx: Id of the query word.

            Returns:
                A float64 array of shape `(vocab_size,)` whose entry `i` is
                the `validation_model` output for the pair
                `(valid_word_idx, i)`.
            """
            # Score the query against the whole vocabulary in ONE batched call
            # instead of vocab_size single-pair calls. This also avoids
            # assigning a multi-dimensional model output to a scalar slot.
            query_ids = np.full((vocab_size, 1), valid_word_idx, dtype=np.float64)
            candidate_ids = np.arange(vocab_size, dtype=np.float64).reshape(vocab_size, 1)
            out = validation_model.predict_on_batch([query_ids, candidate_ids])
            # Flatten the per-pair outputs to one score per vocabulary id.
            return np.asarray(out, dtype=np.float64).reshape(vocab_size)
    sim_cb = SimilarityCallback()

    # Single-element buffers reused on every step: target id, context id, label.
    arr_1 = np.zeros((1,))
    arr_2 = np.zeros((1,))
    arr_3 = np.zeros((1,))
    # NOTE: `epochs` counts single-sample training steps here, not passes over the data.
    for cnt in range(epochs):
        # randint's upper bound is exclusive, so pass len(labels) itself;
        # len(labels) - 1 would make the last training pair unreachable.
        idx = np.random.randint(0, len(labels))
        arr_1[0,] = word_target[idx]
        arr_2[0,] = word_context[idx]
        arr_3[0,] = labels[idx]
        loss = model.train_on_batch([arr_1, arr_2], arr_3)
        if cnt % 100 == 0:
            print("Iteration {}, loss={}".format(cnt, loss))
        # Periodically print nearest neighbours (including once before any
        # training, at cnt == 0) to watch the embedding evolve.
        if cnt % 10000 == 0:
            sim_cb.run_sim()