# Parsing and munging

In [1]:
from ingredient_parser.en import parse
import re
import ndjson

In [2]:
def rparse(i):
    """Strip parenthetical notes and trailing preparation text from an ingredient.

    Two kinds of text are removed from ``i``:

    * every ``(...)`` group together with the whitespace around it; the group
      is replaced by a single space so the neighbouring words stay separated
      ("cups (9 oz) almonds" -> "cups almonds");
    * everything from the first remaining comma to the end of the string.

    Leading and trailing whitespace is stripped from the result.

    Because the cut is made at the first comma, a name that itself contains a
    comma is truncated ("skinless, boneless chicken" -> "skinless").
    """
    # [^()]* cannot cross another parenthesis, so two separate groups never
    # swallow the text between them. Repeating until nothing changes peels
    # nested groups from the inside out.
    paren_group = re.compile(r"\s*\([^()]*\)\s*")
    cleaned, replaced = paren_group.subn(" ", i)
    while replaced:
        cleaned, replaced = paren_group.subn(" ", cleaned)
    # Drop from the first comma to the end of the string.
    return re.sub(r",.*$", "", cleaned).strip()

In [3]:
# Sample ingredient lines used to spot-check the name extraction
# (parse followed by rparse) further down in the notebook.
test_ingredients = [
      "1 pound skinless, boneless chicken breast halves - cut into 1 inch cubes",
      "1 pound andouille sausage, sliced",
      "1 28 ounce can diced tomatoes with juice",
      "1 large onion, chopped",
      "1 large green bell pepper, chopped",
      "1 cup chopped celery",
      "1 cup chicken broth",
      "2 teaspoons dried oregano",
      "2 teaspoons dried parsley",
      "2 teaspoons Cajun seasoning",
      "1 teaspoon cayenne pepper",
      "1/2 teaspoon dried thyme",
      "1 pound frozen cooked shrimp without tails",
      "2 cups (about 9 1/2 ounces) whole almonds, toasted"
]

In [4]:
# Short codes of the recipe sources; each one selects an input file and names
# the two output files derived from it.
files = ['ar', 'epi', 'fn']

for file_name in files:
    # ndjson is UTF-8 by specification, so decode it explicitly instead of
    # relying on the platform's default encoding.
    with open(f'data/recipes_raw_ndjson/recipes_raw_nosource_{file_name}.json', encoding='utf-8') as f:
        data = ndjson.load(f)

    # gather ingredient lists, parse, and write to a new file.
    # One inner list per recipe; each entry is the cleaned, lower-cased name.
    data_ingredients = [
        [rparse(parse(ingredient)['name']).lower() for ingredient in recipe['ingredients']]
        for recipe in data
    ]

    # One recipe per line, ingredient names separated by commas. Written as
    # UTF-8 so non-ASCII characters cannot fail on a narrower default codec.
    with open(f'data/recipes_raw_ndjson/ingredients_{file_name}.txt', 'w', encoding='utf-8') as f:
        for ingredients in data_ingredients:
            f.write(','.join(ingredients))
            f.write('\n')

    # gather instructions and write to a new file
    data_instructions = [recipe['instructions'] for recipe in data]

    # One recipe's instructions per write, each followed by a newline.
    with open(f'data/recipes_raw_ndjson/instructions_{file_name}.txt', 'w', encoding='utf-8') as f:
        for instruction in data_instructions:
            f.write(instruction)
            f.write('\n')

In [5]:
# Spot-check: extract and clean the name of every sample ingredient line.
[rparse(parse(sample)['name']) for sample in test_ingredients]

['skinless',
 'andouille sausage',
 'can diced tomatoes with juice',
 'large onion',
 'large green bell pepper',
 'chopped celery',
 'chicken broth',
 'dried oregano',
 'dried parsley',
 'Cajun seasoning',
 'cayenne pepper',
 'dried thyme',
 'frozen cooked shrimp without tails',
 'whole almonds']

# Text Mining

In [72]:
# open python and nltk packages needed for processing
import nltk
import re

# Input for the text-mining cells.
# NOTE(review): the parsing section writes instructions_ar/epi/fn.txt, but no
# visible cell creates instructions_all.txt — presumably the three files are
# concatenated outside this notebook; confirm before a fresh Run All.
filepath = 'data/recipes_raw_ndjson/instructions_all.txt'

In [73]:
# Read the whole instructions file into memory. The context manager closes
# the handle even if read() raises.
with open(filepath, 'r') as f:
    filetext = f.read()

In [74]:
# Tokenize the full text with NLTK's default word tokenizer; punctuation marks
# come out as separate tokens (see the sample printed below).
filetokens = nltk.word_tokenize(filetext)

In [75]:
# Total number of tokens produced by the tokenizer.
print(len(filetokens))

25069841


In [76]:
# Peek at the first 30 tokens (original casing).
print(filetokens[:30])

['Place', 'the', 'chicken', ',', 'butter', ',', 'soup', ',', 'and', 'onion', 'in', 'a', 'slow', 'cooker', ',', 'and', 'fill', 'with', 'enough', 'water', 'to', 'cover', 'Cover', ',', 'and', 'cook', 'for', '5', 'to', '6']


In [77]:
# Case-fold every token so that upper- and lower-case spellings of a word
# are counted as the same word.
filewords = list(map(str.lower, filetokens))

# Show the token count and a sample of the lower-cased tokens.
print("Display first 30 words from file:")
print(len(filewords))
print(filewords[:30])

Display first 30 words from file:
25069841
['place', 'the', 'chicken', ',', 'butter', ',', 'soup', ',', 'and', 'onion', 'in', 'a', 'slow', 'cooker', ',', 'and', 'fill', 'with', 'enough', 'water', 'to', 'cover', 'cover', ',', 'and', 'cook', 'for', '5', 'to', '6']


In [78]:
# Read the stop word file. The context manager closes the handle even if
# read() raises.
with open('data/smart.english.stop', 'r') as fstop:
    stoptext = fstop.read()

stopwords = nltk.word_tokenize(stoptext)
# Also treat these size / heat-level words as stop words.
stopwords.extend(['small', 'medium', 'large', 'medium-high', 'high', 'low'])
print ("Display first 50 Stopwords:")
print (stopwords[:50])

Display first 50 Stopwords:
["'s", 'a', "a's", 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', "aren't", 'around', 'as', 'aside', 'ask', 'asking']


In [79]:
def alpha_filter(w):
    """Return True when ``w`` is non-empty and has no lowercase a-z letter.

    Tokens made up only of digits, punctuation or other characters outside
    a-z (uppercase letters included) yield True; any token containing at
    least one lowercase letter, and the empty string, yield False.
    """
    return re.match('^[^a-z]+$', w) is not None

In [80]:
# setup to process bigrams
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()

# Count adjacent word pairs over the lower-cased tokens.
finder = BigramCollocationFinder.from_words(filewords)
# choose to use both the non-alpha word filter and a stopwords filter
finder.apply_word_filter(alpha_filter)
# `stopwords` is a list, so `w in stopwords` scans it linearly for every word
# checked; a frozenset gives the same answers with constant-time lookups.
stopword_set = frozenset(stopwords)
finder.apply_word_filter(lambda w: w in stopword_set)

In [81]:
# Score every remaining bigram by raw frequency and show the first 20
# entries of the resulting ranking.
scored = finder.score_ngrams(bigram_measures.raw_freq)
print()
print("Bigrams from file with top 20 frequencies")
for item in scored[:20]:
    print(item)


Bigrams from file with top 20 frequencies
(('preheat', 'oven'), 0.0012645871986184515)
(('olive', 'oil'), 0.0012578460310139182)
(('baking', 'sheet'), 0.0011720058376118142)
(('golden', 'brown'), 0.0008317962606942741)
(('room', 'temperature'), 0.000818114482656671)
(('stirring', 'occasionally'), 0.0007419672107214401)
(('lemon', 'juice'), 0.0007202678309766703)
(('baking', 'dish'), 0.000595655951707073)
(('reduce', 'heat'), 0.0005919463150962944)
(('food', 'processor'), 0.0005651810875066977)
(('preheated', 'oven'), 0.00048719894154893126)
(('plastic', 'wrap'), 0.0004710440724374758)
(('degrees', 'f.'), 0.00044543561325339077)
(('brown', 'sugar'), 0.0004264486559767172)
(('black', 'pepper'), 0.00037977903409917917)
(('cool', 'completely'), 0.0003744738548601086)
(('teaspoon', 'salt'), 0.0003715619895634759)
(('paper', 'towels'), 0.00035744143730309256)
(('electric', 'mixer'), 0.0003572419944745561)
(('baking', 'powder'), 0.0003493041698988039)


In [82]:
# Score by pointwise mutual information and show the first 20 entries of
# the ranking. A minimum-frequency filter of 5 is applied first so that
# rare pairs do not dominate the PMI scores.
# Note: apply_freq_filter is called on the shared `finder`, so any scoring
# done after this cell works on the filtered bigram set.
finder.apply_freq_filter(5)
scored = finder.score_ngrams(bigram_measures.pmi)

print("\nBigrams from file with top 20 mutual information scores")
for item in scored[:20]:
    print(item)


Bigrams from file with top 20 mutual information scores
(('alois', 'lageder'), 22.25752141554633)
(('beau', 'monde'), 22.25752141554633)
(('bedford', 'thompson'), 22.25752141554633)
(('coyote', 'bait'), 22.25752141554633)
(('finnan', 'haddie'), 22.25752141554633)
(('nitrous', 'oxide'), 22.25752141554633)
(('raymond', 'hom'), 22.25752141554633)
(('sabor', 'italia'), 22.25752141554633)
(('tara', 'donne'), 22.25752141554633)
(('bonny', 'doon'), 21.994487009712536)
(('fen', 'szu'), 21.994487009712536)
(('tic', 'tac'), 21.994487009712536)
(('tung', 'ku'), 21.994487009712536)
(('kung', 'pao'), 21.994487009712532)
(('gale', 'gand'), 21.77209458837609)
(('kari', 'patta'), 21.77209458837609)
(('tomric', 'plastics'), 21.77209458837609)
(('hahn', 'estates'), 21.772094588376085)
(('nobu-style', 'saikyo'), 21.772094588376085)
(('anita', 'calero'), 21.57944951043369)


From Wikipedia:

> Ras el hanout or rass el hanout is a spice mix found in varying forms in Tunisia, Algeria, and Morocco. The name in Arabic means "head of the shop" and implies a mixture of the best spices the seller has to offer.

In [59]:
# Wrap the original (case-preserving) tokens in an nltk.Text and show the
# word "bake" together with its surrounding context.
text = nltk.Text(filetokens)
text.concordance("bake")

Displaying 25 of 55466 matches:
a loaf . Place on top of the ketchup Bake in preheated oven for 1 hour or unti
 large spoonfuls onto ungreased pans Bake for about 10 minutes in the preheate
 . Top with the buttery bread crumbs Bake , uncovered , about 25 minutes or un
. Pour batter into prepared loaf pan Bake in preheated oven for 60 to 65 minut
 rimmed baking sheet to catch spills Bake in the preheated oven until bubbling
ned . Pour batter into prepared pans Bake for 40 to 60 minutes , or until test
nch apart on ungreased cookie sheets Bake 6 to 8 minutes in preheated oven . C
lended . Pour into the prepared pans Bake for about 50 minutes in the preheate
okies should be about 3 inches apart Bake for 15 to 17 minutes in the preheate
ish and sprinkle with Cheddar cheese Bake in preheated oven until cheese is me
 sprinkle over muffins before baking Bake for 20 to 25 minutes in the preheate
oonfuls onto ungreased cookie sheets Bake for 10 to 12 minutes in the preheate
e the foil does not 

In [32]:
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

# compare how the two stemmers work on a small portion of the tokens.
# Only the first 200 stems are displayed, so stem just that slice instead of
# running both stemmers over the entire corpus.
# NOTE(review): the `crime*` names look like leftovers from another analysis;
# they are kept so any later cell that refers to them still finds them, but
# they now hold 200 stems rather than one per corpus token.
sample_tokens = filetokens[:200]

crimePstem = [porter.stem(t) for t in sample_tokens]
print(crimePstem[:200])

crimeLstem = [lancaster.stem(t) for t in sample_tokens]
print(crimeLstem[:200])


['place', 'the', 'chicken', ',', 'butter', ',', 'soup', ',', 'and', 'onion', 'in', 'a', 'slow', 'cooker', ',', 'and', 'fill', 'with', 'enough', 'water', 'to', 'cover', 'cover', ',', 'and', 'cook', 'for', '5', 'to', '6', 'hour', 'on', 'high', '.', 'about', '30', 'minut', 'befor', 'serv', ',', 'place', 'the', 'torn', 'biscuit', 'dough', 'in', 'the', 'slow', 'cooker', '.', 'cook', 'until', 'the', 'dough', 'is', 'no', 'longer', 'raw', 'in', 'the', 'center', '.', 'in', 'a', 'slow', 'cooker', ',', 'mix', 'cream', 'of', 'mushroom', 'soup', ',', 'dri', 'onion', 'soup', 'mix', 'and', 'water', '.', 'place', 'pot', 'roast', 'in', 'slow', 'cooker', 'and', 'coat', 'with', 'soup', 'mixtur', 'cook', 'on', 'high', 'set', 'for', '3', 'to', '4', 'hour', ',', 'or', 'on', 'low', 'set', 'for', '8', 'to', '9', 'hour', '.', 'preheat', 'oven', 'to', '350', 'degre', 'f', '(', '175', 'degre', 'c', ')', '.', 'lightli', 'greas', 'a', '5x9', 'inch', 'loaf', 'pan', 'press', 'the', 'brown', 'sugar', 'in', 'the', 'bo

In [33]:
# Lemmatize every token with the WordNet lemmatizer (default arguments) and
# display the first 200 results.
wnl = nltk.WordNetLemmatizer()
crimeLemma = list(map(wnl.lemmatize, filetokens))
print(crimeLemma[:200])

['Place', 'the', 'chicken', ',', 'butter', ',', 'soup', ',', 'and', 'onion', 'in', 'a', 'slow', 'cooker', ',', 'and', 'fill', 'with', 'enough', 'water', 'to', 'cover', 'Cover', ',', 'and', 'cook', 'for', '5', 'to', '6', 'hour', 'on', 'High', '.', 'About', '30', 'minute', 'before', 'serving', ',', 'place', 'the', 'torn', 'biscuit', 'dough', 'in', 'the', 'slow', 'cooker', '.', 'Cook', 'until', 'the', 'dough', 'is', 'no', 'longer', 'raw', 'in', 'the', 'center', '.', 'In', 'a', 'slow', 'cooker', ',', 'mix', 'cream', 'of', 'mushroom', 'soup', ',', 'dry', 'onion', 'soup', 'mix', 'and', 'water', '.', 'Place', 'pot', 'roast', 'in', 'slow', 'cooker', 'and', 'coat', 'with', 'soup', 'mixture', 'Cook', 'on', 'High', 'setting', 'for', '3', 'to', '4', 'hour', ',', 'or', 'on', 'Low', 'setting', 'for', '8', 'to', '9', 'hour', '.', 'Preheat', 'oven', 'to', '350', 'degree', 'F', '(', '175', 'degree', 'C', ')', '.', 'Lightly', 'grease', 'a', '5x9', 'inch', 'loaf', 'pan', 'Press', 'the', 'brown', 'sugar',