In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, '../')

import cPickle as pickle

import os
import json
from copy import copy

from itertools import groupby,chain,tee,izip,islice
from collections import Iterable,Counter
from operator import itemgetter 

import numpy as np
from random import shuffle

import re

import nltk
from nltk.stem import RSLPStemmer,SnowballStemmer
from nltk import PunktSentenceTokenizer,FreqDist
from nltk.corpus import stopwords
from nltk import UnigramTagger,BigramTagger

import gensim
from gensim.models import Word2Vec,Phrases

from glove import Glove,Corpus
from word_embeddings_utils import GloveExtended

import nlpnet

In [2]:
recipes_input_filepath = os.path.join("../datasets","quantity_detection_recipe_dataset.pkl")
in_s = open(recipes_input_filepath, 'rb')
dataset_recipes = pickle.load(in_s)

ingredients_input_filepath = os.path.join("../datasets","quantity_detection_ingredient_dataset.pkl")
in_s = open(ingredients_input_filepath, 'rb')
dataset_ingredients = pickle.load(in_s)

In [3]:
print len(dataset_recipes),dataset_recipes[:5][0]
print len(dataset_ingredients),dataset_ingredients[:5]

5244 [u'Coloque', u'o', u'piment\xe3o', u'lavado', u'sobre', u'a', u'boca', u'do', u'fog\xe3o', u'e', u'deixe', u'a', u'casca', u'queimar', u'bem.']
7895 [['number', u'g', u'lingui\xe7a', u'fina'], ['number', u'piment\xe3o', u'vermelho', u'grande'], ['number', u'cebolas', u'grandes'], ['number', u'colher', u'de', u'sopa', u'de', u'a\xe7\xfacar'], ['number', u'Azeite', u'o', u'quanto', u'baste']]


In [21]:
ingredients_super_flat = [word for sentence in dataset_ingredients for word in sentence]
BigramModel = Phrases(ingredients_super_flat)
ingredients_bigrams = list(BigramModel[dataset_ingredients])

In [None]:
bigram_counter = Counter()
for key in BigramModel.vocab.keys():
    if len(key.split("_")) > 1:
        bigram_counter[key] += BigramModel.vocab[key]
 
for key, counts in bigram_counter.most_common(20):
    print '{0: <20} {1}'.format(key.encode("utf-8"), counts)

In [23]:
print ingredients_bigrams[:100]

[[u'number', u'g', u'lingui\xe7a', u'fina'], [u'number', u'piment\xe3o', u'vermelho', u'grande'], [u'number', u'cebolas', u'grandes'], [u'number', u'colher', u'de', u'sopa', u'de', u'a\xe7\xfacar'], [u'number', u'Azeite', u'o', u'quanto', u'baste'], [u'number', u'Sal', u'e', u'pimenta', u'a', u'gosto'], [u'number', u'x\xedcaras', u'de', u'ch\xe1', u'de', u'sobras', u'de', u'arroz'], [u'number', u'embalagem', u'de', u'requeij\xe3o'], [u'number', u'abobrinha', u'italiana', u'ralada'], [u'number', u'ovos'], [u'number', u'colheres', u'de', u'sopa', u'de', u'azeite'], [u'number', u'cebola', u'picada'], [u'number', u'cenoura', u'ralada'], [u'number', u'Sal', u'e', u'pimenta', u'a', u'gosto'], [u'number', u'Salsa', u'picada', u'a', u'gosto'], [u'number', u'Queijo', u'parmes\xe3o', u'ralado', u'o', u'quanto', u'baste'], [u'number', u'de', u'capeletti', u'de', u'frango'], [u'number', u'kg', u'de', u'peito', u'de', u'frango', u'temperado', u'cozido', u'e', u'desfiado'], [u'number', u'cebola', u'

In [5]:
PosTaggerPt= nlpnet.POSTagger('../../pretrained_models/nlp/pos-pt/', language='pt')

In [10]:
SRLTaggerPt = nlpnet.SRLTagger('../../pretrained_models/nlp/srl-pt/', language='pt')

In [9]:
recipe_example = " ".join(dataset_recipes[:5][0])
pos_tagged_recipe_example = PosTaggerPt.tag(recipe_example)
print recipe_example
print tagged_recipe_example

Coloque o pimentão lavado sobre a boca do fogão e deixe a casca queimar bem.
[[(u'Coloque', u'V'), (u'o', u'ART'), (u'piment\xe3o', u'N'), (u'lavado', u'PCP'), (u'sobre', u'PREP'), (u'a', u'ART'), (u'boca', u'N'), (u'do', u'PREP+ART'), (u'fog\xe3o', u'N'), (u'e', u'KC'), (u'deixe', u'V'), (u'a', u'ART'), (u'casca', u'N'), (u'queimar', u'V'), (u'bem', u'ADV'), (u'.', u'PU')]]


In [13]:
sent = SRLTaggerPt.tag(recipe_example)[0]
print sent.arg_structures

[(u'Coloque', {u'A1': [u'o', u'piment\xe3o', u'lavado', u'sobre', u'a', u'boca', u'do', u'fog\xe3o'], u'V': [u'Coloque']}), (u'deixe', {u'A2': [u'a', u'casca'], u'AM-MNR': [u'queimar', u'bem'], u'V': [u'deixe']}), (u'queimar', {u'AM-MNR': [u'bem'], u'V': [u'queimar']})]


In [111]:
dataset = dataset_ingredients+dataset_recipes

In [112]:
corpus_quantity = Corpus() 
corpus_quantity.fit(dataset, window=1)

In [113]:
quantity_model = GloveExtended(no_components=50, learning_rate=0.0001) 
quantity_model.fit(corpus_quantity.matrix, epochs=1000, no_threads=4, verbose=False)

quantity_model.add_dictionary(corpus_quantity.dictionary)
quantity_model.most_similar("number",number=100)

[(u'-', 0.83939947124164938),
 (u'\xbd', 0.83263768532396887),
 (u'gram', 0.82187636441527601),
 (u'cop', 0.81925752011298048),
 (u'kg', 0.79661556845890835),
 (u'sal', 0.78721414667800083),
 (u'lat', 0.76406385245338859),
 (u'colh', 0.70208549368797157),
 (u'x\xedc', 0.70157768037901869),
 (u'ovos', 0.6915606790561446),
 (u'cebol', 0.66155030521267777),
 (u'piment\xe3', 0.65732325918517276),
 (u'azeit', 0.65100126852155793),
 (u'ml', 0.65092944219440174),
 (u'piment', 0.64201975968115699),
 (u'g', 0.63912377556693079),
 (u'dent', 0.62678802923894272),
 (u'cub', 0.6158073227412485),
 (u'litr', 0.59729974297700605),
 (u'temper', 0.58758848997725077),
 (u'tomat', 0.58446700231098847),
 (u'folh', 0.57832087041975533),
 (u'xic', 0.57685047602165784),
 (u'queij', 0.55894716079932205),
 (u'(miojo)', 0.54196991354248214),
 (u'fat', 0.53141783146130339),
 (u'gr.', 0.51927054235099024),
 (u'farinh', 0.51882519635791258),
 (u'ch\xe1', 0.51847990534419863),
 (u'leit', 0.51833594433429542),
 (u'gr

In [120]:
print quantity_model.most_similar("number",number=100)[6][0]

lat


In [121]:
corpus_time = Corpus() 
corpus_time.fit(dataset, window=10)

In [122]:
time_model = GloveExtended(no_components=100, learning_rate=0.0001) 
time_model.fit(corpus_time.matrix, epochs=1000, no_threads=4, verbose=False)

time_model.add_dictionary(corpus_time.dictionary)
time_model.most_similar("time",number=100)

[(u'deix', 0.95298017435120375),
 (u'cozinh', 0.93463576611616639),
 (u'coloqu', 0.89765717794407007),
 (u'minutos.', 0.89673666683107978),
 (u'fog', 0.88625581124055663),
 (u'\xe1gu', 0.88592309103092826),
 (u'lev', 0.87854671398454287),
 (u'junt', 0.87492069627761715),
 (u'acrescent', 0.87058073005157732),
 (u'panel', 0.86169029436174982),
 (u'carn', 0.8565783446145746),
 (u'minut', 0.85457349357506496),
 (u'mistur', 0.8516953032239708),
 (u'bem', 0.84912971750919064),
 (u'molh', 0.84708073858668365),
 (u'refog', 0.84657238213647867),
 (u'mex', 0.82463444088579396),
 (u'forn', 0.80161977251532834),
 (u'cald', 0.79916286259200819),
 (u'temper', 0.79593581918852019),
 (u'\xf3le', 0.7867173086686301),
 (u'leit', 0.78636616278325655),
 (u'adicion', 0.78365290137116872),
 (u'frit', 0.78296887030253148),
 (u'x\xedc', 0.77525710433883843),
 (u'cebol', 0.7735839285800119),
 (u'fic', 0.77351601839022766),
 (u'pouc', 0.77293078968772411),
 (u'mass', 0.75778476094245828),
 (u'-', 0.757404215939