In [2]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, '../')

import cPickle as pickle

import os
import json
from copy import copy

from itertools import groupby,chain,tee,izip,islice
from collections import Iterable,Counter
from operator import itemgetter 

import numpy as np
from random import shuffle

import re

import nltk
from nltk.stem import RSLPStemmer,SnowballStemmer
from nltk import PunktSentenceTokenizer,FreqDist
from nltk.corpus import stopwords
from nltk import UnigramTagger,BigramTagger

from utils_pack.utils import pickle_in,pickle_out

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Read and clean data from crawlers 

In [3]:
def _parse_item_line(line):
    """Decode one line of the crawler dump into a JSON item.

    The dump is read one line at a time, each line holding a single item
    wrapped in JSON-array punctuation (a leading "[", a trailing "," or "]").
    That punctuation is removed by content rather than by fixed-width slicing,
    so blank lines, "\r\n" endings, a lone "[" / "]" line, and a one-item
    "[{...}]" file are all handled.

    Returns the decoded item, or None when the line carries no item.
    """
    line = line.strip()
    if line.startswith("["):
        line = line[1:]
    if line.endswith("]"):
        line = line[:-1]
    line = line.rstrip(",").strip()
    if not line:
        return None
    return json.loads(line)


filepath = os.path.join("../crawlers","cozinhabrasileira","cozinhabrasileira","spiders","items3.json")
data = []
with open(filepath) as json_file:
    for line in json_file:
        item = _parse_item_line(line)
        if item is not None:
            data.append(item)

titles = []
ingredients = []
recipes = []
for entry in data:
    # Keep only complete items: a missing or empty field would break the
    # tokenization below (e.g. title[0] on an empty list).
    if not (entry.get('title') and entry.get('ingredients') and entry.get('recipe')):
        continue

    titles.append(entry['title'])
    ingredients.append(entry['ingredients'])
    recipes.append(entry['recipe'])

# Each ingredient line becomes a list of whitespace-separated tokens.
ingredients_cleaned = [[line.split() for line in ing] for ing in ingredients]

# Only the first element of each title entry is tokenized.
titles_tokens = [title[0].split() for title in titles]

# Tokenize each recipe sentence once and drop sentences that are empty/blank.
recipes_cleaned = [[tokens for tokens in (sentence.split() for sentence in rec) if tokens]
                   for rec in recipes]

# Materialized as a list so it can be pickled and iterated more than once.
final_dataset = list(zip(titles,titles_tokens,ingredients_cleaned,recipes_cleaned))

The full dataset is just a Python zipped list of titles, tokenized titles, tokenized ingredients, and tokenized recipes.

In [5]:
# Write the zipped dataset to disk; the next cell reads it back from this same path.
output_filepath = os.path.join(
    "../datasets",
    "full_dataset_cozinhabrasileira.pkl",
)
pickle_out(output_filepath, final_dataset)

In [6]:
# Reload the dataset written by the previous cell.
# NOTE(review): pickle_in is a project helper; presumably it unpickles the file,
# so only point it at files you produced yourself.
input_filepath = os.path.join(
    "../datasets",
    "full_dataset_cozinhabrasileira.pkl",
)
dataset = pickle_in(input_filepath)

# Transpose the per-recipe records back into four parallel sequences
# (zip(*...) yields tuples, not lists).
(titles,
 titles_tokens,
 ingredients_cleaned,
 recipes_cleaned) = zip(*dataset)

### Text preprocessing

With a low number of data points, stemming and dropping stop words are important to increase token frequencies in the dataset.

In [7]:
# Snowball stemmer configured for Portuguese.
stemmer_Snow = SnowballStemmer("portuguese")

# Portuguese stop-word list from the NLTK corpus.
# NOTE: this rebinds `stopwords`, which the import cell bound to the
# nltk.corpus reader; from here on the name refers to the word list.
stopwords = nltk.corpus.stopwords.words('portuguese')

In [9]:
# One membership test is made per token, so use a set instead of scanning the list.
stopword_set = set(stopwords)


def _stem_tokens(tokens):
    """Drop stop words from `tokens` and return the stems of the remaining words.

    The stop-word check is done on the lowercased token: the NLTK stop-word
    list is lowercase (TODO confirm for the installed corpus version), so a
    case-sensitive test lets capitalized stop words through (title words,
    sentence-initial words).
    """
    return [stemmer_Snow.stem(word) for word in tokens
            if word.lower() not in stopword_set]


# Titles are flat token lists; recipes and ingredients are lists of token lists.
titles_stemmed = [_stem_tokens(title) for title in titles_tokens]
recipes_stemmed = [[_stem_tokens(sentence) for sentence in recipe]
                   for recipe in recipes_cleaned]
ingredients_stemmed = [[_stem_tokens(sentence) for sentence in recipe]
                       for recipe in ingredients_cleaned]

In [17]:
# Per-recipe records: (raw title, stemmed title, stemmed recipe, stemmed ingredients).
full_dataset_list = zip(titles, titles_stemmed, recipes_stemmed, ingredients_stemmed)
list_path = os.path.join("../datasets", "full_dataset_list.pkl")
pickle_out(list_path, full_dataset_list)

# Same records keyed by the first element of the raw title entry.
# If two recipes share that key, the later one replaces the earlier one.
full_dataset_dict = {
    raw_title[0]: {'title': title_stems, 'recipe': recipe_stems, 'ingredient': ingredient_stems}
    for raw_title, title_stems, recipe_stems, ingredient_stems in full_dataset_list
}
dict_path = os.path.join("../datasets", "full_dataset_dict.pkl")
pickle_out(dict_path, full_dataset_dict)

Let's generate a flat corpus for word-embedding training based on recipes, ingredients, and titles.

In [22]:
# One flat token list per recipe: recipe tokens, then ingredient tokens,
# then the stemmed title tokens.
flat_dataset = [
    list(chain.from_iterable(recipe_sentences))
    + list(chain.from_iterable(ingredient_lines))
    + title_stems
    for raw_title, title_stems, recipe_sentences, ingredient_lines in full_dataset_list
]

flat_corpus_path = os.path.join("../datasets", "flat_dataset_for_word_embeddings.pkl")
pickle_out(flat_corpus_path, flat_dataset)

To detect tokens that act as quantity units (as in "pinch of salt" or "glass of water"), I will substitute every token containing a digit in the ingredients dataset with the word "number" and then train the model to find the words that co-occur with "number" in the ingredient dataset. Similarly, I will substitute every token containing a digit in the recipe dataset with the word "time" to try to find time units. Quantities are usually not specified within the recipe text, since they are already listed in the ingredients.

In [24]:
def contains_number(inputString):
    """Return True if at least one character of `inputString` is a digit, else False."""
    for char in inputString:
        if char.isdigit():
            return True
    return False

In [34]:
# Replace every token that contains a digit with the placeholder "number".
size_quant_detect_ingredient_dataset = [
    [["number" if contains_number(word) else word for word in ingredient_line]
     for ingredient_line in recipe]
    for recipe in ingredients_stemmed
]

# Flatten to one token list per ingredient line. Lines without any "number"
# token get one prepended, so every line contains the placeholder.
# The loop variable is deliberately NOT called `ingredients`: that name holds
# the raw ingredient lists built in the loading cell and must not be overwritten.
flat_quant_detect_ingredient_dataset = []
for recipe in size_quant_detect_ingredient_dataset:
    for ingredient_line in recipe:
        if "number" not in ingredient_line:
            ingredient_line = ["number"] + ingredient_line
        flat_quant_detect_ingredient_dataset.append(ingredient_line)

pickle_out(os.path.join("../datasets","size_quantity_detection_dataset.pkl"),flat_quant_detect_ingredient_dataset)

In [35]:
# Replace every token that contains a digit with the placeholder "time".
time_quant_detect_recipe_dataset = [
    [["time" if contains_number(word) else word for word in sentence]
     for sentence in recipe]
    for recipe in recipes_stemmed
]

# Flatten to one token list per recipe sentence. The items here are recipe
# sentences, so they are named as such; the previous loop variable
# `ingredients` was misleading and overwrote the module-level list of that name.
flat_quant_detect_recipe_dataset = [
    sentence
    for recipe in time_quant_detect_recipe_dataset
    for sentence in recipe
]
pickle_out(os.path.join("../datasets","time_quantity_detection_dataset.pkl"),flat_quant_detect_recipe_dataset)