In [160]:
import tensorflow as tf

In [62]:
SENTENCES = ["machine learning engineers can build great data models",
             "the more data you have the better your model",
             "these predictions sound right, but it is all about your data",
             "your data can provide great value"
            ]

In [152]:
from collections import Counter
import json
import numpy as np
class Vocabulary:
    
    def __init__(self, vocabulary, wordFrequencyFilePath):
        self.vocabulary = vocabulary
        self.BAG_OF_WORDS_FILE_FULL_PATH = wordFrequencyFilePath
        self.input_word_index = {}
        self.reverse_input_word_index = {}
        
        self.input_word_index["START"] = 1
        self.input_word_index["UNKOWN"] = -1
        self.MaxSentenceLength = None
        
    def PrepareVocabulary(self,reviews):
        self._prepare_Bag_of_Words_File(reviews)
        self._create_Vocab_Indexes()
        
        self.MaxSentenceLength = max([len(txt.split(" ")) for txt in reviews])
      
    def Get_Top_Words(self, number_words = None):
        if number_words == None:
            number_words = self.vocabulary
        
        chars = json.loads(open(self.BAG_OF_WORDS_FILE_FULL_PATH).read())
        counter = Counter(chars)
        most_popular_words = {key for key, _value in counter.most_common(number_words)}
        return most_popular_words
    
    def _prepare_Bag_of_Words_File(self,reviews):
        counter = Counter()    
        for s in reviews:
            counter.update(s.split(" "))
            
        with open(self.BAG_OF_WORDS_FILE_FULL_PATH, 'w') as output_file:
            output_file.write(json.dumps(counter))
                 
    def _create_Vocab_Indexes(self):
        INPUT_WORDS = self.Get_Top_Words(self.vocabulary)

        #word to int
        #self.input_word_index = dict(
        #    [(word, i) for i, word in enumerate(INPUT_WORDS)])
        for i, word in enumerate(INPUT_WORDS):
            self.input_word_index[word] = i
        
        #int to word
        #self.reverse_input_word_index = dict(
        #    (i, word) for word, i in self.input_word_index.items())
        for word, i in self.input_word_index.items():
            self.reverse_input_word_index[i] = word

        #self.input_word_index = input_word_index
        #self.reverse_input_word_index = reverse_input_word_index
        #seralize.dump(config.DATA_FOLDER_PATH+"input_word_index.p",input_word_index)
        #seralize.dump(config.DATA_FOLDER_PATH+"reverse_input_word_index.p",reverse_input_word_index)
        
        
    def _word_to_One_Hot_Vector(self, word):
        vector = np.zeros(self.vocabulary)
        vector[vocab.input_word_index[word]] = 1
        return vector
        
    def TransformSentencesToId(self, sentences):
        vectors = []
        for r in sentences:
            words = r.split(" ")
            vector = np.zeros(len(words))

            for t, word in enumerate(words):
                if word in self.input_word_index:
                    vector[t] = self.input_word_index[word]
                else:
                    pass
                    #vector[t] = 2 #unk
            vectors.append(vector)
            
        return vectors
    
    def ReverseTransformSentencesToId(self, sentences):
        vectors = []
        for r in sentences:
            words = r.split(" ")
            vector = np.zeros(len(words))

            for t, word in enumerate(words):
                if word in self.input_word_index:
                    vector[t] = self.input_word_index[word]
                else:
                    pass
                    #vector[t] = 2 #unk
            vectors.append(vector)
            
        return vectors
    
    
    def Get_SkipGram_Target_Words(self, sentences, WINDOW_SIZE = 5):
        SKIP_GRAM_INPUT_WORD_LIST = []
        
        for sentence in sentences:
            sentence_tokenized = sentence.split(" ")
            
            for index, target_word in enumerate(sentence_tokenized):
                FROM_INDEX = max(index-WINDOW_SIZE,0)
                TO_INDEX = min(index+1+WINDOW_SIZE,len(sentence_tokenized))

                for contextWord in sentence_tokenized[FROM_INDEX:TO_INDEX]:
                    if contextWord != target_word:
                        SKIP_GRAM_INPUT_WORD_LIST.append((target_word,contextWord))
                    
        return SKIP_GRAM_INPUT_WORD_LIST
    
    
    def Get_SkipGram_Target_Words_OneHotEncoded_XY(self, sentences, WINDOW_SIZE = 5):
        Skip_Gram_Target_Words = self.Get_SkipGram_Target_Words(sentences, WINDOW_SIZE)
        
        X,Y = [],[]
        
        for target_word, context_word in Skip_Gram_Target_Words:
            X.append(self._word_to_One_Hot_Vector(target_word))
            Y.append(self._word_to_One_Hot_Vector(context_word))
            
        return np.asarray(X), np.asarray(Y)

In [159]:
VOCABULARY_SIZE = 26
vocab = Vocabulary(VOCABULARY_SIZE,"words.vocab")
vocab.PrepareVocabulary(SENTENCES)
vocab.Get_Top_Words(5)
print("Vocabulary of {0} words".format(len(vocab.Get_Top_Words())))

Vocabulary of 26 words


In [154]:
SENTENCES[0]

'machine learning engineers can build great data models'

In [155]:
Skip_Gram_Target_Words = vocab.Get_SkipGram_Target_Words(SENTENCES, WINDOW_SIZE=3)

In [156]:
for target, context in Skip_Gram_Target_Words:
    print(target)

machine
machine
machine
learning
learning
learning
learning
engineers
engineers
engineers
engineers
engineers
can
can
can
can
can
can
build
build
build
build
build
build
great
great
great
great
great
data
data
data
data
models
models
models
the
the
the
more
more
more
more
data
data
data
data
data
you
you
you
you
you
you
have
have
have
have
have
have
the
the
the
the
the
the
better
better
better
better
better
your
your
your
your
model
model
model
these
these
these
predictions
predictions
predictions
predictions
sound
sound
sound
sound
sound
right,
right,
right,
right,
right,
right,
but
but
but
but
but
but
it
it
it
it
it
it
is
is
is
is
is
is
all
all
all
all
all
all
about
about
about
about
about
your
your
your
your
data
data
data
your
your
your
data
data
data
data
can
can
can
can
can
provide
provide
provide
provide
provide
great
great
great
great
value
value
value


In [157]:
X_train, Y_train = vocab.Get_SkipGram_Target_Words_OneHotEncoded_XY(SENTENCES,2)

In [158]:
print(X_train.shape)
print(Y_train.shape)

(112, 26)
(112, 26)


In [None]:
VOCABULARY_SIZE

In [169]:
EMBEDDING_DIM = 5

# Inputs
X = tf.placeholder("float", shape=[None, VOCABULARY_SIZE])
y = tf.placeholder("float", shape=[None, VOCABULARY_SIZE])

# Dictionary of Weights and Biases
weights = {
  'W1': tf.Variable(tf.random_normal([VOCABULARY_SIZE, EMBEDDING_DIM])),
  'W2': tf.Variable(tf.random_normal([EMBEDDING_DIM, VOCABULARY_SIZE])),
}

biases = {
  'b1': tf.Variable(tf.random_normal([EMBEDDING_DIM])),
  'b2': tf.Variable(tf.random_normal([VOCABULARY_SIZE])),
}


# Model Forward Propagation step
def forward_propagation(x):
    hidden_1 = tf.add(tf.matmul(x, weights['W1']), biases['b1'])   
    out_layer = tf.add(tf.matmul(hidden_1, weights['W2']), biases['b2'])
    
    softmax_out = tf.nn.softmax(out_layer)    
    return softmax_out

#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=yhat))
#optimizer = tf.train.GradientDescentOptimizer(learning_rate)

yhat = forward_propagation(X)
ypredict = tf.argmax(yhat, axis=1)

# Backward propagation
learning_rate = 0.2
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=yhat))
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
#optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

train_op = optimizer.minimize(cost)

In [184]:
# Initializing the variables
init = tf.global_variables_initializer()

from datetime import datetime
startTime = datetime.now()

with tf.Session() as sess:
    sess.run(init)
    
    #writer.add_graph(sess.graph)
    #EPOCHS
    for epoch in range(1000):
        #Stochasting Gradient Descent
        for i in range(len(X_train)):
            #cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y * tf.log(yhat), reduction_indices=[1]))
            summary = sess.run(train_op, feed_dict={X: X_train[i: i + 1], y: Y_train[i: i + 1]})
        
        if epoch % 50 == 0:
            train_accuracy = np.mean(np.argmax(Y_train, axis=1) == sess.run(ypredict, feed_dict={X: X_train, y: Y_train}))
            #cross_entropy_loss_val = sess.run(cross_entropy_loss, feed_dict={X: X_train, y: Y_train})
            
            print("Epoch = %d, train accuracy = %.2f%%" % (epoch + 1, 100. * train_accuracy))
            #print("Epoch = %d, train accuracy = %.2f%%, train accuracy = %.2f%%" % (epoch + 1, 100. * train_accuracy, cross_entropy_loss_val))

    sess.close()
print("Time taken:", datetime.now() - startTime)

Epoch = 1, train accuracy = 6.25%
Epoch = 51, train accuracy = 21.43%
Epoch = 101, train accuracy = 21.43%
Epoch = 151, train accuracy = 23.21%
Epoch = 201, train accuracy = 24.11%
Epoch = 251, train accuracy = 24.11%
Epoch = 301, train accuracy = 24.11%
Epoch = 351, train accuracy = 24.11%
Epoch = 401, train accuracy = 24.11%
Epoch = 451, train accuracy = 25.00%
Epoch = 501, train accuracy = 25.00%
Epoch = 551, train accuracy = 25.00%
Epoch = 601, train accuracy = 25.00%


KeyboardInterrupt: 