In [3]:
# -*- coding: utf-8 -*-

%load_ext autoreload
%matplotlib inline
%autoreload 2

from __future__ import absolute_import

import sys
sys.path.insert(0, '../')

import os

# Cap the thread pools used by the native math back-ends. These are set
# before numpy/theano are imported further down in this cell.
os.environ['MKL_NUM_THREADS'] = '4'
os.environ['GOTO_NUM_THREADS'] = '4'
os.environ['OMP_NUM_THREADS'] = '4'

# Theano parses THEANO_FLAGS once, while the package is being imported.
# Assigning the variable after `import theano` has no effect on the running
# configuration, so the assignment has to come before the import.
os.environ['THEANO_FLAGS'] = 'device=cpu,blas.ldflags=-lblas -lgfortran'
# os.environ["THEANO_FLAGS"] = "mode=FAST_RUN,optimizer=fast_compile,device=gpu0,floatX=float32"

import theano
theano.config.openmp = True

import numpy as np
import theano.tensor as T
import six.moves.cPickle as pickle
import os, re, json

from keras.preprocessing import sequence, text
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils, generic_utils
from keras.models import Sequential
from keras.layers.core import Layer,Merge,Reshape,Dense,Dropout
from keras.layers.embeddings import Embedding
from keras import activations, initializations
from six.moves import range
from six.moves import zip

from utils_pack.utils import pickle_in,pickle_out,ensure_dir


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# --- Hyper-parameters --------------------------------------------------------
max_features = 300  # vocabulary size given to the tokenizer and both Embedding layers
skip_top = 0        # number of leading embedding rows zeroed out before evaluation
nb_epoch = 10       # number of passes over the corpus in the training loop
dim_proj = 256      # dimensionality of the learned embeddings

# --- Run switches ------------------------------------------------------------
save = True             # persist the tokenizer / model to save_dir
load_model = False      # start training from a previously pickled model
load_tokenizer = False  # reuse a pickled tokenizer instead of fitting a new one
train_model = True      # run the training loop at all
model_name = "model_1"

# --- Artefact locations ------------------------------------------------------
save_dir = os.path.join("../models", "food_terms_embedding_models", model_name)
model_load_fname = "food2vec_model.pkl"
# Was "foo2vec_model.pkl": with that spelling a saved model could never be
# found again under model_load_fname.
model_save_fname = "food2vec_model.pkl"
tokenizer_fname = "food_tokenizer.pkl"
ensure_dir(save_dir)

# dataset_recipe_plus_ingredient_flat
data_path = os.path.join("../", "datasets", "flat_dataset_for_word_embeddings.pkl")

In [9]:
# def text_generator(path=data_path):

#     recipes = pickle_in(path)
#     recipes_flat =[]
#     for i, recipe in enumerate(recipes):
#         recipe_flat = " ".join([item for sublist in recipe for item in sublist])
#         recipes_flat.append(recipe_flat.encode('utf-8'))
#         print recipe_flat

#     for j,recipe in enumerate(recipes_flat):
#         if j % 10000 == 0:
#             print(j)
#         yield recipe

def text_generator(path=data_path):
    """Yield each recipe stored in the pickle at ``path`` as a single string.

    Every item of a recipe is UTF-8 encoded and the encoded items are joined
    with single spaces. The running index is printed for every 100th recipe
    as a simple progress indicator.
    """
    for index, tokens in enumerate(pickle_in(path)):
        line = " ".join([token.encode("utf-8") for token in tokens])
        if index % 100 == 0:
            print(index)
        yield line
        
def my_filter():
    """Return the filter string handed to the tokenizer: a lone newline."""
    return "\n"

In [14]:
# model management
# Either restore a previously fitted tokenizer or fit a fresh one on the corpus.
tokenizer_path = os.path.join(save_dir, tokenizer_fname)
if load_tokenizer:
    print('Load tokenizer...')
    # The cPickle shim is bound as `pickle` by the import cell; the bare name
    # `six` is never imported, so `six.moves.cPickle.load` raised NameError.
    # NOTE: unpickling executes arbitrary code - only load files you created.
    with open(tokenizer_path, 'rb') as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)
else:
    print("Fit tokenizer...")
    # Keep the `max_features` most frequent tokens; split on spaces and strip
    # only the characters returned by my_filter().
    tokenizer = text.Tokenizer(nb_words=max_features, filters=my_filter(), split=" ")
    tokenizer.fit_on_texts(text_generator())
    if save:
        print("Save tokenizer...")
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        pickle_out(tokenizer_path, tokenizer)

Fit tokenizer...
0
100
200
300
400
500
600
700
800
900
Save tokenizer...


In [11]:
def _embedding_branch():
    """Return a Sequential holding one Embedding layer for a single token index."""
    branch = Sequential()
    branch.add(Embedding(max_features, dim_proj, input_length=1, init='uniform'))
    return branch


print('Build model...')
# Two independent embedding tables: one for the pivot word, one for its context.
word = _embedding_branch()
context = _embedding_branch()

# Combine both branches with a dot product and flatten the result to one value
# per (word, context) pair; trained with MSE against the skip-gram labels.
model = Sequential()
model.add(Merge([word, context], mode='dot', dot_axes=2))
model.add(Reshape((1,), input_shape=(1,1)))
# model.add(Dense(1024))
model.compile(loss='mse', optimizer='rmsprop')

Build model...


In [12]:
# training process
def _save_word_model():
    """Pickle the `word` embedding branch to save_dir/model_save_fname."""
    print("Saving model...")
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    pickle_out(os.path.join(save_dir, model_save_fname), word)


if train_model:
    if load_model:
        print('Load model...')
        # NOTE: unpickling executes arbitrary code - only load files you created.
        # NOTE(review): the save step pickles `word`, while this assigns the
        # loaded object to `model`; confirm the file really holds the merged model.
        with open(os.path.join(save_dir, model_load_fname), 'rb') as model_file:
            model = pickle.load(model_file)

    # NOTE(review): computed but not passed to skipgrams() below, so no
    # frequency subsampling is applied - confirm whether that is intended.
    sampling_table = sequence.make_sampling_table(max_features)

    for e in range(nb_epoch):
        print('-'*40)
        print('Epoch', e)
        print('-'*40)

        progbar = generic_utils.Progbar(tokenizer.document_count)
        samples_seen = 0
        losses = []

        for i, seq in enumerate(tokenizer.texts_to_sequences(text_generator())):
            # get skipgram couples for one text in the dataset
            couples, labels = sequence.skipgrams(seq, max_features, window_size=50, negative_samples=10.)
            if couples:
                # one gradient update per text: column 0 holds the pivot word
                # indices, column 1 the context word indices
                X = np.array(couples, dtype="int32")
                X1 = X[:,0]
                X2 = X[:,1]
                Y = np.array(labels, dtype="int32")

                loss = model.train_on_batch([X1,X2], Y)
                losses.append(loss)
                # report the mean loss of the last 100 updates, then reset
                if len(losses) % 100 == 0:
                    progbar.update(i, values=[("loss", np.mean(losses))])
                    losses = []
                samples_seen += len(labels)

        # checkpoint after every epoch so an interrupted run keeps its progress
        if save:
            _save_word_model()

        print('Samples seen:', samples_seen)
    print("Training completed!")

    # final save; also covers the nb_epoch == 0 case where no checkpoint ran
    if save:
        _save_word_model()

----------------------------------------
('Epoch', 0)
----------------------------------------
0
100
200
300
400
500
600
700
800
900

KeyboardInterrupt: 

In [60]:
# recover the embedding weights trained with skipgram:
# Pull the embedding matrix out of the first layer of the `word` branch.
embedding_layer = word.layers[0]
weights = embedding_layer.get_weights()[0]

# Blank out the first `skip_top` rows, then normalise the matrix.
weights[:skip_top] = np.zeros((skip_top, dim_proj))
norm_weights = np_utils.normalize(weights)

# Forward (word -> index) and inverse (index -> word) vocabulary lookups.
word_index = tokenizer.word_index
reverse_word_index = {index: token for token, index in word_index.items()}


It's test time!


In [61]:
def embed_word(w):
    """Return the normalised embedding row of word ``w``.

    Returns None when the word is missing from ``word_index``, maps to a
    falsy index, or its index lies outside ``[skip_top, max_features)``.
    """
    idx = word_index.get(w)
    if idx and idx >= skip_top and idx < max_features:
        return norm_weights[idx]
    return None

def closest_to_point(point, nb_closest=10):
    """Return the ``nb_closest`` vocabulary entries scoring highest against ``point``.

    The score of a row is its dot product with ``point``. The result is a list
    of ``(word, score)`` tuples in descending score order; ``word`` is None for
    row indices that have no entry in ``reverse_word_index``.
    """
    scores = np.dot(norm_weights, point)
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    return [(reverse_word_index.get(row), score) for row, score in ranked[:nb_closest]]

def closest_to_word(w, nb_closest=10):
    """Return the ``nb_closest`` nearest neighbours of word ``w``.

    Yields an empty list when ``w`` is not in ``word_index``, maps to a falsy
    index, or its index lies outside ``[skip_top, max_features)``.
    """
    idx = word_index.get(w)
    if idx and idx >= skip_top and idx < max_features:
        return closest_to_point(norm_weights[idx].T, nb_closest)
    return []
def index_to_word(i):
    """Return the vocabulary word stored at index ``i``.

    The previous body was a copy of ``closest_to_word``: it overwrote ``i``
    using the undefined name ``w`` and referenced the undefined
    ``nb_closest``, so it could not work. It now performs the reverse lookup
    its name describes.

    Returns None when ``i`` is falsy, lies outside
    ``[skip_top, max_features)``, or has no entry in ``reverse_word_index``.
    """
    if (not i) or (i < skip_top) or (i >= max_features):
        return None
    return reverse_word_index.get(i)

In [62]:
''' the results in comments below were for:
    5.8M HN comments
    dim_proj = 256
    nb_epoch = 2
    optimizer = rmsprop
    loss = mse
    max_features = 50000
    skip_top = 100
    negative_samples = 1.
    window_size = 4
    and frequency subsampling of factor 10e-5. 
'''

# Probe words whose nearest neighbours are printed below.
words = ["kg", "1", "vermelh"]

for w in words:
    res = closest_to_word(w)
    # header line for this word, followed by one (neighbour, score) per line
    print('====', w)
    for r in res:
        print(r)

('====', 'kg')
('kg', 1.0)
('sal', 0.79152799)
('alho', 0.78716141)
('bem', 0.78396344)
('quente.', 0.78109741)
('2', 0.7783398)
('cozinh', 0.77300835)
('sirv', 0.77262282)
('fic', 0.77066565)
('reserv', 0.76827341)
('====', '1')
('1', 0.99999994)
('de', 0.93757033)
('2', 0.8990761)
('sal', 0.87212569)
('3', 0.86868668)
('1/2', 0.86316025)
('em', 0.84614706)
('colher', 0.83985615)
('mistur', 0.83951569)
('4', 0.8302359)
('====', 'vermelh')
