In [15]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, '../')

import cPickle as pickle

import os
import json
from copy import copy

from itertools import groupby,chain,tee,izip,islice
from collections import Iterable,Counter
from operator import itemgetter 

import numpy as np
from random import shuffle

import re

import nltk
from nltk.stem import RSLPStemmer,SnowballStemmer
from nltk import PunktSentenceTokenizer,FreqDist
from nltk.corpus import stopwords
from nltk import UnigramTagger,BigramTagger

import gensim
from gensim.models import Word2Vec,Phrases

from glove import Glove,Corpus
from word_embeddings_utils import GloveExtended
from utils import pickle_in,pickle_out

import nlpnet

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [72]:
# Locations of the two pickled datasets, relative to the notebook directory.
size_detection_dataset_input_filepath = os.path.join(
    "../datasets", "size_quantity_detection_dataset.pkl")
time_detection_dataset_input_filepath = os.path.join(
    "../datasets", "time_quantity_detection_dataset.pkl")

# Load both datasets through the project's `pickle_in` helper. As the samples
# printed further down show, each dataset is a list of token lists.
dataset_size = pickle_in(size_detection_dataset_input_filepath)
dataset_time = pickle_in(time_detection_dataset_input_filepath)

A quick sneak peek at the size detection dataset shows that, when a quantity unit is present, it usually comes right after the synthetic "number" token, for instance 'g' (gram) or 'embalag' (box). However, many ingredient entries have no quantity unit descriptor at all, which could cause trouble.

In [25]:
# Show the first 20 tokenised ingredient entries of the size dataset.
print(dataset_size[:20])

[['number', u'g', u'linguic', u'fin'], ['number', u'piment\xe3', u'vermelh', u'grand'], ['number', u'cebol', u'grand'], ['number', u'colh', u'sop', u'a\xe7\xfac'], ['number', u'azeit', u'quant', u'bast'], ['number', u'sal', u'piment', u'gost'], ['number', u'x\xedc', u'ch\xe1', u'sobr', u'arroz'], ['number', u'embalag', u'requeij\xe3'], ['number', u'abobrinh', u'italian', u'ral'], ['number', u'ovos'], ['number', u'colh', u'sop', u'azeit'], ['number', u'cebol', u'pic'], ['number', u'cenour', u'ral'], ['number', u'sal', u'piment', u'gost'], ['number', u'sals', u'pic', u'gost'], ['number', u'queij', u'parmes\xe3', u'ral', u'quant', u'bast'], ['number', u'capelett', u'frang'], ['number', u'kg', u'peit', u'frang', u'temper', u'coz', u'desfi'], ['number', u'cebol', u'ral'], ['number', u'tomat', u'picadinh']]


The time detection dataset looks like this:

In [27]:
# Show the first 20 tokenised instruction sentences of the time dataset.
print(dataset_time[:20])

[[u'coloqu', u'piment\xe3', u'lav', u'sobr', u'boc', u'fog\xe3', u'deix', u'casc', u'queim', u'bem.'], [u'enrol', u'piment\xe3', u'queim', u'film', u'pl\xe1stic', u'reserve.'], [u'lev', u'fog', u'cebol', u'cort', u'fin', u'fat', u'uniformes,', u'pouquinh', u'azeit', u'sal,', u'deix', u'cozinh', u'fog', u'baix', u'amolec', u'bem.'], [u'acrescent', u'a\xe7\xfac', u'cozinh', u'dour', u'bem.'], [u'junt', u'ent\xe3', u'linguic', u'cort', u'rodel', u'deix', u'fritando.'], [u'enquant', u'linguic', u'frita,', u'volt', u'piment\xe3o.'], [u'desembrulh', u'retir', u'casc', u'queimadas.'], [u'abra', u'piment\xe3o,', u'descart', u'sement', u'cort', u'tir', u'simil', u'cebolas.'], [u'quand', u'linguic', u'suficient', u'frit', u'dourada,'], [u'coloqu', u'piment\xe3', u'junt', u'mistura,', u'prov', u'temper', u'pronto.'], [u'sirv', u'p\xe3', u'com', u'palit', u'aperitivo.'], [u'rend', 'time', u'+', u'por\xe7\xf5es.'], [u'em', u'panel', u'aquec', u'azeite,', u'dour', u'cebol', u'refog', u'uns', u'minut

Now, I am going to assume that tokens that appear in both the time and the size detection datasets are neither size quantities nor time quantities, since recipes usually don't have those redundancies.
Let's filter them out of our datasets.

Training a GloVe word embedding for size quantity detection leads to the following:

In [73]:
# Build the co-occurrence corpus for the size-quantity dataset.
# window=1: presumably only directly adjacent tokens are counted as co-occurring,
# which fits the observation that the unit usually sits right after the "number"
# token -- TODO confirm against glove-python's Corpus.fit.
corpus_quantity = Corpus() 
corpus_quantity.fit(dataset_size, window=1)

In [81]:
# Train an embedding with 100 components on the size-quantity co-occurrence
# matrix: 50 epochs, 4 threads, no progress output.
quantity_model = GloveExtended(
    no_components=100,
    learning_rate=0.1)
quantity_model.fit(
    corpus_quantity.matrix,
    epochs=50,
    no_threads=4,
    verbose=False)

# Attach the corpus' token dictionary to the model before querying it by token.
# NOTE(review): assumes GloveExtended keeps glove-python's
# add_dictionary / most_similar behaviour -- confirm in word_embeddings_utils.
quantity_model.add_dictionary(corpus_quantity.dictionary)

# Display the 50 entries most similar to the synthetic "number" token; the cell
# output is a list of (token, similarity) pairs.
quantity_model.most_similar("number", number=50)

[(u'gramas)', 0.96015643411324747),
 (u'g)', 0.95283309838405073),
 (u'gem', 0.93709072047249931),
 (u'bif', 0.90325127619706913),
 (u'espig', 0.89906845929814483),
 (u'gr', 0.8848908770304873),
 (u'piment\xf5', 0.88030365455301907),
 (u'ram', 0.87083760930605003),
 (u'clar', 0.83111686086328063),
 (u'*', 0.82471210571726594),
 (u'-', 0.82416811009722535),
 (u'mioj', 0.80911397677127173),
 (u'fil\xe9s', 0.79493378323661978),
 (u'oregan', 0.79253657874182615),
 (u'grs.', 0.78519189025405267),
 (u'vidr', 0.7795644956943889),
 (u'gr.', 0.77794779485645493),
 (u'banan', 0.77621702885450206),
 (u'par', 0.76152082161272849),
 (u'lagart', 0.75655904074719704),
 (u'ma\xe7\xe3s', 0.7564210697384276),
 (u'alumini', 0.74022919918950381),
 (u'costel', 0.7318445776450051),
 (u'abobrinh', 0.72235875276785544),
 (u'champanh', 0.7209697406450668),
 (u'xic', 0.71614795211140259),
 (u'mac', 0.71053189603488798),
 (u'american', 0.69662345896512712),
 (u'vermelhos,', 0.69573035383364279),
 (u'ml.', 0.6923

We can see that not everything returned is a size quantity: there are also ingredient words such as 'piment' (pepper) or 'cebol' (onion).

In [60]:
# Build the co-occurrence corpus for the time-quantity dataset.
# window=3 is wider than the window=1 used for the size corpus; presumably this
# is because time units are not always directly adjacent to the "time" token
# -- TODO confirm the intent.
corpus_time = Corpus() 
corpus_time.fit(dataset_time, window=3)

In [82]:
# Train an embedding with 100 components on the time-quantity co-occurrence
# matrix: 10 epochs, 4 threads, no progress output.
# NOTE(review): this uses 10 epochs where the size model uses 50, and the
# similarities shown in the output are all close to 1 -- check whether the
# model is sufficiently trained.
time_model = GloveExtended(
    no_components=100,
    learning_rate=0.1)
time_model.fit(
    corpus_time.matrix,
    epochs=10,
    no_threads=4,
    verbose=False)

# Attach the corpus' token dictionary to the model before querying it by token.
time_model.add_dictionary(corpus_time.dictionary)

# Display the 100 entries most similar to the synthetic "time" token; the cell
# output is a list of (token, similarity) pairs.
time_model.most_similar("time", number=100)

[(u'cerc', 0.99918856256396904),
 (u'minutos,', 0.99909796204584733),
 (u'descans', 0.99905329900624273),
 (u'uns', 0.9989386675003028),
 (u'horas.', 0.99853088885428498),
 (u'minutos.', 0.99847069236574526),
 (u'asse', 0.99791560319114814),
 (u'baix', 0.99781587765221358),
 (u'pot\xeanc', 0.99770131552444485),
 (u'por\xe7\xf5es.', 0.99745918252204835),
 (u'adicion', 0.99718600673857538),
 (u'rend', 0.99694340658199165),
 (u'pr\xe9-aquec', 0.99668322187832981),
 (u'dourar.', 0.99656006392365726),
 (u'rendimento:', 0.99654173956653447),
 (u'lev', 0.99618151437653379),
 (u'cubr', 0.99613998953185745),
 (u'fiqu', 0.99606069049340318),
 (u'tom', 0.99604159198826792),
 (u'minutos).', 0.9957816825418756),
 (u'despej', 0.99540384438344665),
 (u'marin', 0.99528416746278126),
 (u'coloc', 0.9952403081281388),
 (u'c', 0.99509767395373738),
 (u'restant', 0.99449449955938718),
 (u'm\xednim', 0.99441907631019344),
 (u'assar', 0.99390663373221078),
 (u'ret\xe2ngul', 0.99362096376850018),
 (u'por\xe7\

In [92]:
# Reduce each (token, similarity) pair returned by most_similar to its token.
# The "detection models" are therefore plain token lists: the 100 nearest
# neighbours of "number" and the 10 nearest neighbours of "time".
size_detection_model = [
    token
    for token, _similarity in quantity_model.most_similar("number", number=100)
]
time_detection_model = [
    token
    for token, _similarity in time_model.most_similar("time", number=10)
]

# Persist both token lists with the project's `pickle_out(path, obj)` helper.
pickle_out(
    os.path.join("../models", "quantity_detection_models", "size_detection.pkl"),
    size_detection_model)
# NOTE(review): "timee_detection.pkl" looks like a typo for "time_detection.pkl".
# The spelling is kept as-is because other code may load this exact path;
# rename it together with every reader of the file.
pickle_out(
    os.path.join("../models", "quantity_detection_models", "timee_detection.pkl"),
    time_detection_model)