#### source: https://github.com/adventuresinML/adventures-in-ml-code/blob/master/keras_word2vec.py

In [None]:
# import sys
# !{sys.executable} -m pip install numpy pandas matplotlib scikit-learn seaborn

In [1]:
import collections
import os
import zipfile

import numpy as np
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.preprocessing  import sequence
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.layers import Input, Dense, Embedding, Dot, Reshape, GlobalAveragePooling1D
from tensorflow.keras.models import Model

from urllib import request

def maybe_download(filename, url, expected_bytes):
    """Fetch ``url + filename`` unless it already exists locally, then size-check it.

    Args:
        filename: Local path of the file; also appended to ``url`` to form
            the download address.
        url: Base address the file name is appended to.
        expected_bytes: Exact size in bytes the local file must have.

    Returns:
        The path of the verified local file.

    Raises:
        Exception: If the size on disk differs from ``expected_bytes``.
    """
    already_present = os.path.exists(filename)
    if not already_present:
        filename, _ = request.urlretrieve(url + filename, filename)

    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        # Show the size that was actually found before reporting the failure.
        print(actual_bytes)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')

    print('Found and verified', filename)
    return filename

# Read the data into a list of strings.
def read_data(filename):
    """Return the whitespace-separated words of the first member of a zip archive.

    Args:
        filename: Path of the zip archive to read.

    Returns:
        A list of ``str`` tokens obtained by decoding the first archive member
        as UTF-8 and splitting it on whitespace.

    Raises:
        IndexError: If the archive has no members.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        # Plain bytes.decode is enough here; TensorFlow is not needed just to
        # turn the raw bytes into text.
        return archive.read(first_member).decode('utf-8').split()

def build_dataset(words, n_words):
    """Encode a word sequence as integer ids over a frequency-limited vocabulary.

    Args:
        words: Sequence of word strings (iterated more than once).
        n_words: Vocabulary size including the reserved 'UNK' entry.

    Returns:
        A 4-tuple ``(data, count, dictionary, reversed_dictionary)``:
        ``data`` is the list of ids for ``words`` (0 for out-of-vocabulary
        words), ``count`` is ``['UNK', unk_count]`` followed by the
        ``(word, frequency)`` pairs of the ``n_words - 1`` most common words,
        ``dictionary`` maps word to id and ``reversed_dictionary`` maps id
        back to word.
    """
    # Entry 0 is reserved for unknown words; its count is patched in below.
    counts = [['UNK', -1]]
    # Counter.most_common(k) yields the k most frequent items, e.g.
    # Counter('abracadabra').most_common(3) -> [('a', 5), ('r', 2), ('b', 2)]
    counts.extend(collections.Counter(words).most_common(n_words - 1))

    dictionary = {}
    for token, _ in counts:
        dictionary[token] = len(dictionary)

    data = []
    unk_count = 0
    for token in words:
        token_id = dictionary.get(token)
        if token_id is None:
            token_id = 0  # dictionary['UNK']
            unk_count += 1
        data.append(token_id)

    counts[0][1] = unk_count
    reversed_dictionary = {token_id: token for token, token_id in dictionary.items()}
    return data, counts, dictionary, reversed_dictionary

def collect_data(vocabulary_size=10000):
    """Download the text8 archive if needed and encode it as word ids.

    Args:
        vocabulary_size: Vocabulary size handed to ``build_dataset``.

    Returns:
        The ``(data, count, dictionary, reverse_dictionary)`` tuple produced
        by ``build_dataset`` for the words read from the archive.
    """
    archive_path = maybe_download('text8.zip', 'http://mattmahoney.net/dc/', 31344016)
    words = read_data(archive_path)
    print(words[:10])
    # Returning straight away drops the only reference to the raw word list,
    # so it can be reclaimed once the encoded dataset has been built.
    return build_dataset(words, vocabulary_size)

In [2]:
# Vocabulary size passed through to build_dataset; id 0 is the 'UNK' entry.
vocab_size = 10000
# Download/verify text8 and encode it: `data` is the corpus as a list of word
# ids, `dictionary` maps word -> id and `reverse_dictionary` maps id -> word.
data, count, dictionary, reverse_dictionary = collect_data(vocabulary_size=vocab_size)
# Show the first ten encoded words as a quick sanity check.
print(data[:10])

Found and verified text8.zip
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
[5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]


In [3]:
from tensorflow.python.client import device_lib
def get_available_devices():
    """Return the ``name`` of every device listed by ``device_lib.list_local_devices()``."""
    return [device.name for device in device_lib.list_local_devices()]

# Report the compute devices TensorFlow can see before building anything.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print(get_available_devices())

# NOTE: a `with` block does not open a new scope, so every name assigned below
# remains a module-level variable that later cells read.
with tf.device('/device:CPU:0'):
    window_size = 3      # passed to skipgrams() as window_size
    vector_dim = 300     # embedding size used when the model is built
    epochs = 200000      # number of iterations of the training loop (one sample per iteration)

    valid_size = 16     # Random set of words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    # valid_size distinct ids drawn from range(valid_window).
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)

    # Build the (target, context) training pairs and their labels from the
    # encoded corpus; the sampling table is handed to skipgrams() unchanged.
    sampling_table  = sequence.make_sampling_table(vocab_size)
    couples, labels = skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)
    # Split the pairs into two parallel int32 arrays: targets and contexts.
    word_target, word_context = zip(*couples)
    word_target  = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")

    # Peek at the first few pairs and labels.
    print(couples[:10], labels[:10])

Num GPUs Available:  0
['/device:CPU:0', '/device:XLA_CPU:0']
[[2707, 399], [1886, 672], [9860, 7703], [1128, 143], [129, 356], [1550, 6], [5, 202], [1680, 1], [5291, 6], [1007, 1005]] [0, 1, 0, 1, 1, 1, 1, 1, 1, 1]


In [13]:
# Build the training model and a second model that shares its inputs and
# embedding layer but outputs a cosine similarity instead of a probability.
with tf.device('/device:CPU:0'):
    # create some input variables
    # (each input carries a single value per sample: one word id)
    input_target  = Input((1,))
    input_context = Input((1,))

    # One embedding layer, applied to both inputs, so target and context words
    # share the same weight matrix.
    embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
    
    # Look up each word's vector and reshape it to (vector_dim, 1) so the Dot
    # layers below can contract over axis 1.
    target  = embedding(input_target)
    target  = Reshape((vector_dim, 1))(target)
    
    context = embedding(input_context)
    context = Reshape((vector_dim, 1))(context)

    # setup a cosine similarity operation which will be output in a secondary model
    # normalize: Whether to L2-normalize samples along the dot product axis before taking the dot product.
    # If set to True, then the output of the dot product is the cosine proximity between the two samples.
    similarity  = Dot(name="Cosine-Similarity", axes=1, normalize=True)([target, context])

    # now perform the dot product operation to get a similarity measure
    dot_product = Dot(name="Dot-Product", axes=1)([target, context])
    dot_product = Reshape((1,))(dot_product)

    # add the sigmoid output layer
    output = Dense(1, activation='sigmoid')(dot_product)

    # create the primary training model
    model = Model([input_target, input_context], output)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    # create a secondary validation model to run our similarity checks during training
    # (built from the same input and embedding tensors as `model`, so it uses
    # the embedding weights that training updates)
    validation_model = Model([input_target, input_context], similarity)

    class SimilarityCallback:
        """Print the nearest neighbours of the validation words.

        Relies on the module-level names ``valid_size``, ``valid_examples``,
        ``reverse_dictionary``, ``vocab_size`` and ``validation_model``.
        """

        def run_sim(self):
            """Print, for each validation word, its ``top_k`` most similar words."""
            top_k = 8  # number of nearest neighbors
            for word_idx in valid_examples[:valid_size]:
                valid_word = reverse_dictionary[word_idx]
                sim = self._get_sim(word_idx)
                # Rank by descending similarity; position 0 is skipped because
                # it is expected to be the word itself (cosine similarity 1).
                nearest = (-sim).argsort()[1:top_k + 1]
                neighbours = ''.join(
                    ' %s,' % reverse_dictionary[idx] for idx in nearest)
                print('Nearest to %s:%s' % (valid_word, neighbours))

        @staticmethod
        def _get_sim(valid_word_idx):
            """Return the similarity of one word to every word in the vocabulary.

            Args:
                valid_word_idx: Id of the word to compare against the vocabulary.

            Returns:
                A float64 array of shape ``(vocab_size,)`` whose entry ``i`` is
                the ``validation_model`` output for ``(valid_word_idx, i)``.
            """
            # Score the word against the whole vocabulary in one batch instead
            # of issuing a separate predict call per vocabulary entry.
            targets = np.full((vocab_size, 1), valid_word_idx, dtype=np.float64)
            contexts = np.arange(vocab_size, dtype=np.float64).reshape(-1, 1)
            out = validation_model.predict_on_batch([targets, contexts])
            # The model yields one value per row (presumably with singleton
            # trailing axes); flatten explicitly rather than relying on
            # NumPy's implicit array-to-scalar conversion.
            return np.asarray(out, dtype=np.float64).reshape(vocab_size)
    sim_cb = SimilarityCallback()

    # Reusable single-element buffers: every step feeds exactly one
    # (target, context, label) sample to the model.
    arr_1 = np.zeros((1,))
    arr_2 = np.zeros((1,))
    arr_3 = np.zeros((1,))
    # NOTE: `epochs` counts single-sample training steps, not passes over the data.
    for cnt in range(epochs):
        # randint's upper bound is exclusive, so pass len(labels) itself;
        # len(labels) - 1 could never pick the last sample and fails outright
        # when there is only one sample.
        idx = np.random.randint(0, len(labels))
        arr_1[0,] = word_target[idx]
        arr_2[0,] = word_context[idx]
        arr_3[0,] = labels[idx]
        loss = model.train_on_batch([arr_1, arr_2], arr_3)
        # Log the loss every 100 steps and the nearest neighbours every 10000.
        if cnt % 100 == 0:
            print("Iteration {}, loss={}".format(cnt, loss))
        if cnt % 10000 == 0:
            sim_cb.run_sim()

Iteration 0, loss=0.6930942535400391
Nearest to he: royal, shipping, harm, revenues, taiwan, falkland, klan, carlos,
Nearest to its: capitals, chronology, midway, order, banned, penguin, dinosaur, pacific,
Nearest to no: intentions, principal, structured, flat, nicholas, most, extend, maintained,
Nearest to states: consequences, probability, rely, accept, main, supporter, hunting, gives,
Nearest to new: opposition, louisiana, money, wilderness, ron, poles, tract, manx,
Nearest to most: cicero, bios, jungle, rivals, slang, announces, victory, anchor,
Nearest to history: bow, locations, humor, tips, achieving, classics, coaches, karate,
Nearest to than: decree, defender, mann, meetings, offence, kidney, repair, string,
Nearest to only: concrete, situation, sweet, driven, laureate, jane, pay, murders,
Nearest to use: mr, lover, abu, interestingly, carnegie, male, miss, revolution,
Nearest to after: pronouns, sand, patrick, clients, amateur, barrier, surname, periods,
Nearest to this: biol

Iteration 13600, loss=0.6763207912445068
Iteration 13700, loss=0.684822142124176
Iteration 13800, loss=0.7205579280853271
Iteration 13900, loss=0.7096936702728271
Iteration 14000, loss=0.7049404978752136
Iteration 14100, loss=0.6798832416534424
Iteration 14200, loss=0.7217393517494202
Iteration 14300, loss=0.6728202700614929
Iteration 14400, loss=0.6735886335372925
Iteration 14500, loss=0.6629728674888611
Iteration 14600, loss=0.673408031463623
Iteration 14700, loss=0.7014875411987305
Iteration 14800, loss=0.6548048257827759
Iteration 14900, loss=0.7199817299842834
Iteration 15000, loss=0.6712591648101807
Iteration 15100, loss=0.6569392681121826
Iteration 15200, loss=0.7153827548027039
Iteration 15300, loss=0.7247666120529175
Iteration 15400, loss=0.696858823299408
Iteration 15500, loss=0.6765132546424866
Iteration 15600, loss=0.679037868976593
Iteration 15700, loss=0.6683524250984192
Iteration 15800, loss=0.6885229349136353
Iteration 15900, loss=0.7051867246627808
Iteration 16000, los

Nearest to its: nuremberg, winter, elementary, gray, italy, mythological, weekend, capitals,
Nearest to no: minutes, intentions, couples, lowercase, changes, religious, rounded, finalist,
Nearest to states: supporter, main, frame, consequences, brandenburg, colleges, erasmus, monetary,
Nearest to new: catalyst, governing, opposition, mall, peculiar, hybrid, money, tract,
Nearest to most: shoot, jungle, uses, sure, render, cdot, documented, social,
Nearest to history: construct, sensitive, bow, month, scriptures, locations, published, exist,
Nearest to than: labor, moldova, offence, revolutionary, decree, anxiety, intensity, allocation,
Nearest to only: au, prefixes, engineer, tiles, echo, reduced, heterosexual, close,
Nearest to use: braille, initially, sparked, database, ed, afrikaans, seattle, male,
Nearest to after: demons, permit, slide, mao, careers, ethiopia, equivalence, surname,
Nearest to this: eighteenth, fallacy, completed, automatic, fascist, entitled, comments, dolphin,
Ne

Iteration 43300, loss=0.6648409962654114
Iteration 43400, loss=0.699459433555603
Iteration 43500, loss=0.6356370449066162
Iteration 43600, loss=0.700016438961029
Iteration 43700, loss=0.664323091506958
Iteration 43800, loss=0.6726238131523132
Iteration 43900, loss=0.6708515286445618
Iteration 44000, loss=0.6929710507392883
Iteration 44100, loss=0.681389570236206
Iteration 44200, loss=0.6693665981292725
Iteration 44300, loss=0.7102120518684387
Iteration 44400, loss=0.6916794776916504
Iteration 44500, loss=0.7028466463088989
Iteration 44600, loss=0.683856725692749
Iteration 44700, loss=0.7026638388633728
Iteration 44800, loss=0.6788061261177063
Iteration 44900, loss=0.6671589016914368
Iteration 45000, loss=0.731164276599884
Iteration 45100, loss=0.6577784419059753
Iteration 45200, loss=0.682064414024353
Iteration 45300, loss=0.6747230291366577
Iteration 45400, loss=0.6693756580352783
Iteration 45500, loss=0.6637236475944519
Iteration 45600, loss=0.7118690609931946
Iteration 45700, loss=0

Iteration 59900, loss=0.6932313442230225
Iteration 60000, loss=0.7156658172607422
Nearest to he: qualities, execution, harm, kilometres, alabama, sets, samples, better,
Nearest to its: differently, arising, elementary, shooter, gray, nuremberg, coke, italy,
Nearest to no: structured, intentions, operate, rounded, ghana, perfect, geology, minority,
Nearest to states: frame, main, supporter, horse, colleges, inn, prepare, bulgarian,
Nearest to new: catalyst, exposed, depicted, deity, policies, borrowed, tract, health,
Nearest to most: quickly, jungle, terrain, designed, maine, hellenic, character, copper,
Nearest to history: exist, month, construct, camp, locations, sensitive, men, achieving,
Nearest to than: upset, morning, navigation, decree, labor, bolsheviks, valuable, guatemala,
Nearest to only: au, alternative, prefixes, diversity, engineer, hosts, drug, apartment,
Nearest to use: braille, initially, trip, male, sign, massacre, lover, stanley,
Nearest to after: above, necessary, ha

Iteration 73100, loss=0.66172194480896
Iteration 73200, loss=0.5004945993423462
Iteration 73300, loss=0.6990578174591064
Iteration 73400, loss=0.6742755770683289
Iteration 73500, loss=0.655043363571167
Iteration 73600, loss=0.7057384252548218
Iteration 73700, loss=0.7593495845794678
Iteration 73800, loss=0.6632805466651917
Iteration 73900, loss=0.6346601247787476
Iteration 74000, loss=0.6916735172271729
Iteration 74100, loss=0.6835635304450989
Iteration 74200, loss=0.713634729385376
Iteration 74300, loss=0.6079975366592407
Iteration 74400, loss=0.5449078679084778
Iteration 74500, loss=0.7220137715339661
Iteration 74600, loss=0.7357004880905151
Iteration 74700, loss=0.7564908266067505
Iteration 74800, loss=0.6802000403404236
Iteration 74900, loss=0.7070143818855286
Iteration 75000, loss=0.6446618437767029
Iteration 75100, loss=0.6870734691619873
Iteration 75200, loss=0.6860631704330444
Iteration 75300, loss=0.6668860912322998
Iteration 75400, loss=0.6694068312644958
Iteration 75500, los

Iteration 89700, loss=0.6758556962013245
Iteration 89800, loss=0.6855605840682983
Iteration 89900, loss=0.6854436993598938
Iteration 90000, loss=0.7092702984809875
Nearest to he: qualities, execution, origins, samples, volcanic, harm, cuisine, toll,
Nearest to its: arising, coke, differently, reaching, shooter, base, implement, celebration,
Nearest to no: theater, perfect, ground, architects, intentions, opponents, cellular, understanding,
Nearest to states: frame, descartes, teacher, ignorance, instead, just, supporter, sharp,
Nearest to new: catalyst, exposed, likely, depicted, showed, deity, tract, hybrid,
Nearest to most: quickly, reinforced, babylonian, garrison, hellenic, forgotten, terrain, spelling,
Nearest to history: achieving, exist, mix, favor, it, locations, attack, published,
Nearest to than: upset, telegraph, emissions, pit, cone, navigation, labor, valuable,
Nearest to only: prefixes, au, alternative, impressed, diversity, uses, order, copper,
Nearest to use: braille, a

Iteration 102700, loss=0.6160600185394287
Iteration 102800, loss=0.31516557931900024
Iteration 102900, loss=0.7134352326393127
Iteration 103000, loss=0.18726655840873718
Iteration 103100, loss=0.6553347110748291
Iteration 103200, loss=0.6654918193817139
Iteration 103300, loss=0.6623881459236145
Iteration 103400, loss=0.6455118656158447
Iteration 103500, loss=0.6976489424705505
Iteration 103600, loss=0.6878712773323059
Iteration 103700, loss=0.692064642906189
Iteration 103800, loss=0.6430392861366272
Iteration 103900, loss=0.26082906126976013
Iteration 104000, loss=0.6769164204597473
Iteration 104100, loss=0.6980845928192139
Iteration 104200, loss=0.6254203915596008
Iteration 104300, loss=0.6740819215774536
Iteration 104400, loss=0.6288440227508545
Iteration 104500, loss=0.664371907711029
Iteration 104600, loss=0.4717402160167694
Iteration 104700, loss=0.6088972091674805
Iteration 104800, loss=0.719887375831604
Iteration 104900, loss=0.775134801864624
Iteration 105000, loss=0.5078234672

Iteration 118900, loss=0.6072080731391907
Iteration 119000, loss=0.5598318576812744
Iteration 119100, loss=0.6730732917785645
Iteration 119200, loss=0.186274453997612
Iteration 119300, loss=0.6949883699417114
Iteration 119400, loss=0.6497392058372498
Iteration 119500, loss=0.6887871623039246
Iteration 119600, loss=0.5746380686759949
Iteration 119700, loss=0.7312875390052795
Iteration 119800, loss=0.6267072558403015
Iteration 119900, loss=0.763350784778595
Iteration 120000, loss=0.6053125858306885
Nearest to he: qualities, execution, samples, cuisine, harm, as, volcanic, elderly,
Nearest to its: s, as, reaching, coke, extends, implement, differently, arising,
Nearest to no: prompted, ground, understanding, theater, rooms, handbook, architects, geology,
Nearest to states: frame, consequences, descartes, inn, just, rest, speech, study,
Nearest to new: exposed, buried, catalyst, propaganda, ideas, tract, flowers, northwest,
Nearest to most: forgotten, reinforced, customer, corporation, ger

Iteration 132000, loss=0.6951497793197632
Iteration 132100, loss=0.79877108335495
Iteration 132200, loss=0.5621479153633118
Iteration 132300, loss=0.6612776517868042
Iteration 132400, loss=0.7653810381889343
Iteration 132500, loss=0.6460592746734619
Iteration 132600, loss=0.6525232195854187
Iteration 132700, loss=0.5296056270599365
Iteration 132800, loss=1.0853116512298584
Iteration 132900, loss=0.9797559976577759
Iteration 133000, loss=0.6671390533447266
Iteration 133100, loss=0.5982178449630737
Iteration 133200, loss=0.6433555483818054
Iteration 133300, loss=0.7431102395057678
Iteration 133400, loss=0.534487247467041
Iteration 133500, loss=0.703582763671875
Iteration 133600, loss=0.5067705512046814
Iteration 133700, loss=0.8687747716903687
Iteration 133800, loss=0.6943864226341248
Iteration 133900, loss=0.5553178787231445
Iteration 134000, loss=3.1699395179748535
Iteration 134100, loss=0.6546741724014282
Iteration 134200, loss=0.6085860729217529
Iteration 134300, loss=0.6968834400177

Iteration 148400, loss=0.6728002429008484
Iteration 148500, loss=0.7033297419548035
Iteration 148600, loss=0.591569721698761
Iteration 148700, loss=0.10758670419454575
Iteration 148800, loss=0.6526429653167725
Iteration 148900, loss=0.7003350853919983
Iteration 149000, loss=0.5339766144752502
Iteration 149100, loss=0.6343881487846375
Iteration 149200, loss=0.6653134226799011
Iteration 149300, loss=0.8442911505699158
Iteration 149400, loss=0.9942781329154968
Iteration 149500, loss=0.00022368499776348472
Iteration 149600, loss=0.6777206063270569
Iteration 149700, loss=0.8373792171478271
Iteration 149800, loss=0.6390858292579651
Iteration 149900, loss=0.5689640641212463
Iteration 150000, loss=0.5757935643196106
Nearest to he: qualities, execution, as, samples, rounds, elderly, header, fantastic,
Nearest to its: s, the, as, cameras, and, is, rated, was,
Nearest to no: proof, ground, email, the, prompted, couples, der, understanding,
Nearest to states: frame, palestinian, conditions, additi

Iteration 161800, loss=1.0925744771957397
Iteration 161900, loss=0.5604814887046814
Iteration 162000, loss=3.4154813289642334
Iteration 162100, loss=0.6660451889038086
Iteration 162200, loss=2.6587789058685303
Iteration 162300, loss=0.664470911026001
Iteration 162400, loss=1.2357183694839478
Iteration 162500, loss=0.7583786249160767
Iteration 162600, loss=0.5483333468437195
Iteration 162700, loss=0.8009257912635803
Iteration 162800, loss=0.7411982417106628
Iteration 162900, loss=0.6576951742172241
Iteration 163000, loss=0.0016213125782087445
Iteration 163100, loss=0.576993465423584
Iteration 163200, loss=0.03440719470381737
Iteration 163300, loss=0.9501850605010986
Iteration 163400, loss=0.7453412413597107
Iteration 163500, loss=0.553935170173645
Iteration 163600, loss=0.8294133543968201
Iteration 163700, loss=0.6777171492576599
Iteration 163800, loss=0.5805372595787048
Iteration 163900, loss=0.7109599113464355
Iteration 164000, loss=2.2940776348114014
Iteration 164100, loss=0.77947092

Iteration 178300, loss=0.5482428073883057
Iteration 178400, loss=0.6179858446121216
Iteration 178500, loss=0.6157529354095459
Iteration 178600, loss=0.9188893437385559
Iteration 178700, loss=0.6175139546394348
Iteration 178800, loss=0.6544832587242126
Iteration 178900, loss=0.5328007936477661
Iteration 179000, loss=0.6008332371711731
Iteration 179100, loss=0.9747010469436646
Iteration 179200, loss=0.7980356812477112
Iteration 179300, loss=0.8263567090034485
Iteration 179400, loss=0.7898591756820679
Iteration 179500, loss=0.574756383895874
Iteration 179600, loss=1.0465075969696045
Iteration 179700, loss=0.956778883934021
Iteration 179800, loss=0.5607296824455261
Iteration 179900, loss=0.6678834557533264
Iteration 180000, loss=0.6725419163703918
Nearest to he: as, qualities, execution, to, in, is, they, the,
Nearest to its: the, s, as, and, of, was, a, is,
Nearest to no: the, counties, are, proof, ground, email, and, continuing,
Nearest to states: introduced, addition, designated, condit

Iteration 191800, loss=0.5684466361999512
Iteration 191900, loss=0.8942978978157043
Iteration 192000, loss=0.8611772060394287
Iteration 192100, loss=0.38570141792297363
Iteration 192200, loss=0.9342044591903687
Iteration 192300, loss=0.07454065978527069
Iteration 192400, loss=0.0038240947760641575
Iteration 192500, loss=0.7795902490615845
Iteration 192600, loss=0.0006619805353693664
Iteration 192700, loss=0.9100363850593567
Iteration 192800, loss=0.860859215259552
Iteration 192900, loss=0.5071946382522583
Iteration 193000, loss=0.6599355936050415
Iteration 193100, loss=0.5126049518585205
Iteration 193200, loss=0.022657886147499084
Iteration 193300, loss=0.5156424641609192
Iteration 193400, loss=0.986146092414856
Iteration 193500, loss=0.6952467560768127
Iteration 193600, loss=0.6922338008880615
Iteration 193700, loss=0.41350483894348145
Iteration 193800, loss=1.2423057556152344
Iteration 193900, loss=0.6042304635047913
Iteration 194000, loss=0.00937556941062212
Iteration 194100, loss=0