In [2]:
from keras.preprocessing import sequence
import numpy as np
from collections import Counter
import itertools
import time
import pickle
import sets
import keras
from random import shuffle
from keras.models import model_from_json
from keras.callbacks import ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Read the corpus: one document per line, tokens separated by tabs.
documents = []
with open("data/silva/freshwater/glove_input_freshwater.txt") as corpus_file:
    for line in corpus_file:
        documents.append(line.strip().split("\t"))

# Flatten all documents into one token list and count occurrences.
unlisted = [token for sublist in documents for token in sublist]
freq = Counter(unlisted)
vocab_size = len(freq)

# Assign integer ids by descending frequency. Ids start at 1, so id 0 is
# never assigned and the largest id equals vocab_size.
dictionary = {}
reverse_dictionary = {}
for i, (token, _count) in enumerate(freq.most_common(), start=1):
    dictionary[token] = i
    reverse_dictionary[i] = token

# Persist both mappings. The context managers make sure the files are
# flushed and closed even if pickling fails part-way through.
with open("dictionary.obj", "wb") as dict_file:
    pickle.dump(dictionary, dict_file)

with open("reverse_dictionary.obj", "wb") as revdict_file:
    pickle.dump(reverse_dictionary, revdict_file)

In [6]:
# NOTE(security): pickle.load can execute arbitrary code. Only load files
# that were produced by a trusted pipeline.

# Positive examples and their labels.
with open("contexts/pos/couples_0.obj", "rb") as couples_pos_file:
    couples_pos = pickle.load(couples_pos_file)

with open("contexts/pos/labels_0.obj", "rb") as labels_pos_file:
    labels_pos = pickle.load(labels_pos_file)

# Negative examples are split across 100 numbered shard files (0..99).
# extend() appends in place, avoiding a fresh copy of the accumulated
# list on every iteration.
couples_neg = []
labels_neg = []
for i in range(100):
    with open("contexts/neg/couples_" + str(i) + ".obj", "rb") as couples_neg_file:
        couples_neg.extend(pickle.load(couples_neg_file))

    with open("contexts/neg/labels_" + str(i) + ".obj", "rb") as labels_neg_file:
        labels_neg.extend(pickle.load(labels_neg_file))

In [8]:
# Positives come first, then negatives. Labels are concatenated in the same
# order so that labels_use[k] belongs to couples_use[k].
labels_use = labels_pos + labels_neg
couples_use = couples_pos + couples_neg

# Transpose the sequence of (target, context) entries into two parallel tuples.
word_target, word_context = zip(*couples_use)

In [9]:
from keras.layers import Input, Dense, Embedding, Reshape, Dot
from keras.models import Model

# Dimensionality of the learned embedding vectors.
vector_dim = 500

# Each sample carries one integer word id for the target and one for the context.
input_target = Input((1,))
input_context = Input((1,))

# Word ids are assigned from 1 up to vocab_size, so the table needs
# vocab_size + 1 rows for the largest id to be a valid lookup.
# NOTE(review): assumes the pickled couples use these same ids - confirm.
embedding = Embedding(vocab_size + 1, vector_dim, input_length=1, name="embedding")

# Both inputs share the same embedding table. Names are set on the layer
# constructors: extra keyword arguments given when *calling* a layer are
# forwarded to its call() method, which for Embedding takes only the inputs.
target = embedding(input_target)
target = Reshape((vector_dim, 1), name="target_embed")(target)

context = embedding(input_context)
context = Reshape((vector_dim, 1), name="context_embed")(context)

# Cosine similarity: L2-normalise along the embedding axis (axis 1; axis 0
# is the batch) and take the dot product, giving one scalar per sample.
similarity = Dot(axes=1, normalize=True)([target, context])
similarity = Reshape((1,))(similarity)

# Unnormalised dot product of the two embeddings, used as the training logit.
dot_product = Dot(axes=1, normalize=False)([target, context])
dot_product = Reshape((1,))(dot_product)

# Sigmoid on top of the dot product: probability that the pair is a true context pair.
output = Dense(1, activation='sigmoid')(dot_product)

model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss="binary_crossentropy", optimizer='rmsprop')

# Shares all weights with `model`; used only to read out cosine similarities.
validation_model = Model(inputs=[input_target, input_context], outputs=similarity)


In [24]:
# Word ids of the probe words used for the nearest-neighbour check.
# Ids are frequency ranks, so these are the five most frequent tokens.
valid_examples = [1, 2, 3, 4, 5]

# Derived from the list so the two can never drift apart.
valid_size = len(valid_examples)

# Last expression of the cell: display the first probe word as a sanity check.
reverse_dictionary[valid_examples[0]]

In [None]:
# reverse_dictionary is keyed from 1 (the most frequent token); 0 is not a
# valid key, so look up the first two entries by their actual ids.
valid_word = reverse_dictionary[1]
close_word = reverse_dictionary[2]


In [25]:
class SimilarityCallback:
    """Print the nearest neighbours of the probe words using `validation_model`.

    Relies on the notebook globals `valid_examples`, `valid_size`,
    `reverse_dictionary`, `vocab_size` and `validation_model`.
    """

    def run_sim(self):
        """Print, for each probe word, the `top_k` most similar vocabulary words."""
        top_k = 8  # number of nearest neighbors
        for i in range(valid_size):
            probe_idx = valid_examples[i]
            valid_word = reverse_dictionary[probe_idx]
            sim = self._get_sim(probe_idx)

            # Walk the candidates from most to least similar. Skip the probe
            # word itself and any id without a vocabulary entry (id 0 is
            # scored by _get_sim but reverse_dictionary keys start at 1).
            nearest = []
            for candidate in (-sim).argsort():
                candidate = int(candidate)
                if candidate == probe_idx or candidate not in reverse_dictionary:
                    continue
                nearest.append(candidate)
                if len(nearest) == top_k:
                    break

            log_str = 'Nearest to %s:' % valid_word
            for candidate in nearest:
                close_word = reverse_dictionary[candidate]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        """Return an array of length `vocab_size` whose entry `i` is the
        `validation_model` output for the pair (`valid_word_idx`, `i`)."""
        sim = np.zeros((vocab_size,))
        in_arr1 = np.zeros((1,))
        in_arr2 = np.zeros((1,))
        # The probe id is the same for every candidate, so set it once.
        in_arr1[0,] = valid_word_idx
        for i in range(vocab_size):
            in_arr2[0,] = i
            out = validation_model.predict_on_batch([in_arr1, in_arr2])
            # The model returns a one-element array; extract the scalar
            # explicitly (item() raises if there is more than one value).
            sim[i] = np.asarray(out).item()
        return sim
sim_cb = SimilarityCallback()

In [46]:
from math import ceil
import os

# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)

# Make sure the checkpoint directory exists before the first save.
os.makedirs("weights", exist_ok=True)

# Number of single-sample training steps (not passes over the data).
epochs = 60
arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))
for cnt in range(epochs):
    # randint's upper bound is exclusive, so len(labels_use) makes every
    # sample, including the last one, eligible.
    idx = np.random.randint(0, len(labels_use))
    arr_1[0,] = word_target[idx]
    arr_2[0,] = word_context[idx]
    arr_3[0,] = labels_use[idx]
    loss = model.train_on_batch([arr_1, arr_2], arr_3)
    if cnt % 10 == 0:
        # serialize weights to HDF5; the loss is rounded up to 4 decimals
        # for the file name and the progress line.
        loss = ceil(loss * 10000) / 10000
        model.save_weights("weights/epoch{}-loss{}".format(cnt, loss) + ".h5")
        #print("Saved model to disk")
        print("epoch{}-loss{}".format(cnt, loss))
    #if cnt % 10000 == 0:
     #   sim_cb.run_sim()

epoch0-loss0.324
epoch10-loss0.313
epoch20-loss0.312
epoch30-loss0.322
epoch40-loss0.321
epoch50-loss0.304


In [40]:

# Relative path, matching the "weights/" directory used by the manual loop.
filepath = "weights/epoch{epoch:04d}-loss{loss:.3f}.hdf5"
# The monitored quantity is a loss, so "best" means lowest: mode must be 'min'.
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')

# The model has two inputs (target id, context id), so x is a list of two
# parallel arrays rather than the list of pairs.
model.fit(
    x=[np.array(word_target), np.array(word_context)],
    y=np.array(labels_use),
    batch_size=1,
    epochs=10,
    verbose=1,
    callbacks=[checkpoint],
)

KeyboardInterrupt: 

In [15]:
# Display the model object's repr to confirm it is defined in the kernel.
model

<keras.engine.training.Model at 0x22a40377ac8>