In [1]:
# Loading libraries
import sys, os, json, numpy

# Loading local library
rootPath = os.path.realpath(os.path.dirname(os.getcwd()))
libraryPath = os.path.join(rootPath, 'src', 'library')
sys.path.append(libraryPath)


import pandas


word2index = {}
index2word = {}

def loadVocabFromSequence(vocabPath):
    vocabReader = pandas.read_csv(vocabPath, header=None)
    count = 1
    for (index, sequence) in vocabReader.iterrows():
        words = sequence[1].split()
        for word in words:
            if word not in word2index:
                word2index[word] = count
                index2word[count] = word
                count += 1
    
    return
    

def transferSequence(path):
    numberSequences = []
    vocabReader = pandas.read_csv(vocabPath, header=None)
    for (index, sequence) in vocabReader.iterrows():
        words = sequence[1].split()
        currentSequence = []
        for word in words:
            currentSequence.append(word2index[word])
        numberSequences.append(currentSequence)
        
    return numberSequences
    
vocabPath = os.path.join(rootPath, 'data', 'bhot', 'vocab.csv')
sequencePath = vocabPath = os.path.join(rootPath, 'data', 'bhot', 'train.csv')
loadVocabFromSequence(sequencePath)

numberSequences = transferSequence(sequencePath)
print(numberSequences)
allVocabSize = len(word2index)
    

[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], [14, 10, 5, 15, 16], [17, 18, 15, 19, 16, 20, 14, 21, 9, 10, 5], [22, 15, 4, 8, 5], [23, 9, 24, 15], [9, 25, 26, 6], [10, 27, 28, 29, 30, 9, 31, 32, 33, 34], [35, 36, 37, 38, 39, 28, 40, 41, 42], [43, 44, 45, 32], [6, 31, 46, 47, 48], [49, 9, 4, 50, 51, 52, 53], [54, 55, 31, 56, 57, 58, 26, 50, 51], [59, 60, 30, 38, 61, 62, 63, 64, 65], [66, 67, 68, 22, 29, 30, 9, 69, 4, 70, 71], [70, 72, 14, 73, 74], [9, 75, 76, 77, 78], [77, 79, 20, 80, 81, 82, 9, 15, 14], [77, 81, 83, 84], [85, 81, 9, 77], [77, 81, 20, 86, 87, 88], [77, 81, 20, 86, 87, 89, 90], [4, 9, 76], [9, 77, 73], [91, 4, 92], [93, 77, 94, 41, 95, 96], [97, 9, 98, 99, 100, 81, 84, 101, 88, 102, 103], [77, 104, 87, 105, 89], [99, 106, 98], [107, 81], [108, 109, 23, 9], [20, 110], [100, 16, 111, 112, 14, 83, 113, 9], [114, 115, 116, 69, 117, 118, 91, 4, 77, 92, 12], [77, 102, 103, 20, 83, 84, 119, 120, 121, 122], [121, 119, 102, 103, 123, 124, 125, 126, 127, 128], [4, 70, 129, 102, 10

In [2]:
import tensorflow as tf

from tensorflow.keras.models import Model

from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Reshape
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.preprocessing import sequence

import urllib
import collections
import os
import zipfile

import numpy as np
#https://raw.githubusercontent.com/adventuresinML/adventures-in-ml-code/master/keras_word2vec.py


In [3]:
def maybe_download(filename, url, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename


# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data


def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

def collect_data(vocabulary_size=10000):
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', url, 31344016)
    vocabulary = read_data(filename)
    print(vocabulary[:7])
    data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                                vocabulary_size)
    del vocabulary  # Hint to reduce memory.
    return data, count, dictionary, reverse_dictionary

vocab_size = 10000
data, count, dictionary, reverse_dictionary = collect_data(vocabulary_size=vocab_size)





Found and verified text8.zip
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse']


In [4]:
window_size = 3
vector_dim = 300
epochs = 1

valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

sampling_table = sequence.make_sampling_table(vocab_size)

print(sampling_table)


[0.00315225 0.00315225 0.00547597 0.00741556 0.00912817 0.01068435
 0.01212381 0.01347162 0.01474487 0.0159558  0.0171136  0.01822533
 0.01929662 0.02033198 0.02133515 0.02230924 0.02325687 0.02418031
 0.02508148 0.02596208 0.02682359 0.02766731 0.02849441 0.02930593
 0.03010279 0.03088585 0.03165585 0.0324135  0.03315943 0.0338942
 0.03461837 0.03533241 0.03603678 0.0367319  0.03741815 0.03809591
 0.0387655  0.03942724 0.04008143 0.04072834 0.04136824 0.04200136
 0.04262794 0.0432482  0.04386234 0.04447055 0.04507302 0.04566992
 0.04626142 0.04684768 0.04742884 0.04800505 0.04857644 0.04914315
 0.04970529 0.05026299 0.05081636 0.0513655  0.05191052 0.05245153
 0.05298861 0.05352186 0.05405136 0.05457721 0.05509948 0.05561824
 0.05613359 0.05664558 0.05715429 0.05765979 0.05816214 0.05866141
 0.05915765 0.05965093 0.06014131 0.06062883 0.06111355 0.06159553
 0.06207481 0.06255144 0.06302548 0.06349696 0.06396593 0.06443243
 0.0648965  0.0653582  0.06581754 0.06627458 0.06672936 0.06718

In [5]:
couples, labels = tf.keras.preprocessing.sequence.skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

print(couples[:10])
print('-----')
print(labels[:10])
print('-----')
print(word_target[:10])
print('-----')
print(word_context[:10])
print('-----')




[[18, 243], [70, 51], [51, 54], [4, 115], [390, 2], [272, 442], [325, 63], [3, 477], [278, 21], [89, 166]]
-----
[0, 1, 1, 0, 1, 0, 1, 0, 0, 0]
-----
[ 18  70  51   4 390 272 325   3 278  89]
-----
[243  51  54 115   2 442  63 477  21 166]
-----


In [6]:
# create some input variables
input_target = Input((1,))
input_context = Input((1,))
print(input_target)
print(input_context)

Tensor("input_1:0", shape=(?, 1), dtype=float32)
Tensor("input_2:0", shape=(?, 1), dtype=float32)


In [7]:
embedding = Embedding(vocab_size, vector_dim,  name='embedding')
print(embedding)
target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)
print(target)
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)
print(context)

<tensorflow.python.keras.layers.embeddings.Embedding object at 0x7f1eb893def0>
Tensor("reshape/Reshape:0", shape=(?, 300, 1), dtype=float32)
Tensor("reshape_1/Reshape:0", shape=(?, 300, 1), dtype=float32)


In [8]:
# setup a cosine similarity operation which will be output in a secondary model
similarity = concatenate([target, context], axis=0)

# now perform the dot product operation to get a similarity measure
dot_product = concatenate([target, context], axis=1)
dot_product = Reshape((1,))(dot_product)


# add the sigmoid output layer
output = Dense(1, activation='sigmoid')(dot_product)

# create the primary training model
model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='rmsprop')

# create a secondary validation model to run our similarity checks during training
validation_model = Model(inputs=[input_target, input_context], outputs=similarity)


In [9]:

class SimilarityCallback:
    def run_sim(self):
        for i in range(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            top_k = 8  # number of nearest neighbors
            sim = self._get_sim(valid_examples[i])
            nearest = (-sim).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            for k in range(top_k):
                close_word = reverse_dictionary[nearest[k]]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        sim = np.zeros((vocab_size,))
        in_arr1 = np.zeros((1,))
        in_arr2 = np.zeros((1,))
        in_arr1[0,] = valid_word_idx
        for i in range(vocab_size):
            in_arr2[0,] = i
            out = validation_model.predict_on_batch([in_arr1, in_arr2])
            sim[i] = out
        return sim
    
sim_cb = SimilarityCallback()

arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))

for cnt in range(epochs):
    idx = np.random.randint(0, len(labels)-1)
    arr_1[0,] = word_target[idx]
    arr_2[0,] = word_context[idx]
    arr_3[0,] = labels[idx]
    
    loss = model.train_on_batch([arr_1, arr_2], arr_3)
    
    if cnt % 100 == 0:
        print("Iteration {}, loss={}".format(cnt, loss))
    if cnt % 10000 == 0:
        sim_cb.run_sim()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


InvalidArgumentError: Input to reshape is a tensor with 600 values, but the requested shape has 1
	 [[{{node reshape_2/Reshape}} = Reshape[T=DT_FLOAT, Tshape=DT_INT32, _class=["loc:@training/RMSprop/gradients/reshape_2/Reshape_grad/Reshape"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](concatenate_1/concat, reshape_2/Reshape/shape)]]