In [2]:
import pandas as pd
import numpy as np

### Recipe Database

- vectorized operations are quite useful in cleaning up messy real-world data
- Our goal will be to parse the recipe data into ingredient lists to quickly find a recipe based on some ingredients we have on hand.

In [3]:
# Reading json format file
try:
    recipes = pd.read_json("data/recipeitems-latest.json")
except ValueError as e:
    print("Value Error:", e)

Value Error: Trailing data


- Trailing Data Error is due to using a file in which each line is itself a valid JSON, but the full file is not. Let’s check if this interpretation is true:

In [89]:
#it's true that each line is a valid json
with open("data/recipeitems-latest.json")as f:
    line = f.readline()
pd.read_json(line).shape

(2, 12)

In [13]:
# Constructing the string representation containing all these JSON entries
with open("data/recipeitems-latest.json")as f:
    data = (line.strip() for line in f)
    data_json = "[{0}]".format(",".join(data)) #Reformat so each line is the element of a list
recipes = pd.read_json(data_json)
recipes.head()

Unnamed: 0,_id,cookTime,creator,dateModified,datePublished,description,image,ingredients,name,prepTime,recipeCategory,recipeInstructions,recipeYield,source,totalTime,ts,url
0,{'$oid': '5160756b96cc62079cc2db15'},PT30M,,,2013-03-11,"Late Saturday afternoon, after Marlboro Man ha...",http://static.thepioneerwoman.com/cooking/file...,Biscuits\n3 cups All-purpose Flour\n2 Tablespo...,Drop Biscuits and Sausage Gravy,PT10M,,,12,thepioneerwoman,,{'$date': 1365276011104},http://thepioneerwoman.com/cooking/2013/03/dro...
1,{'$oid': '5160756d96cc62079cc2db16'},PT20M,,,2013-03-13,"When I was growing up, I participated in my Ep...",http://static.thepioneerwoman.com/cooking/file...,12 whole Dinner Rolls Or Small Sandwich Buns (...,Hot Roast Beef Sandwiches,PT20M,,,12,thepioneerwoman,,{'$date': 1365276013902},http://thepioneerwoman.com/cooking/2013/03/hot...
2,{'$oid': '5160756f96cc6207a37ff777'},,,,2013-01-07,A beauty of a carrot salad - tricked out with ...,http://www.101cookbooks.com/mt-static/images/f...,Dressing:\n1 tablespoon cumin seeds\n1/3 cup /...,Morrocan Carrot and Chickpea Salad,PT15M,,,,101cookbooks,,{'$date': 1365276015332},http://www.101cookbooks.com/archives/moroccan-...
3,{'$oid': '5160757096cc62079cc2db17'},PT15M,,,2013-03-18,It's Monday! It's a brand new week! The birds ...,http://static.thepioneerwoman.com/cooking/file...,Biscuits\n3 cups All-purpose Flour\n2 Tablespo...,Mixed Berry Shortcake,PT15M,,,8,thepioneerwoman,,{'$date': 1365276016700},http://thepioneerwoman.com/cooking/2013/03/mix...
4,{'$oid': '5160757496cc6207a37ff778'},,,,2013-01-20,A simple breakfast bowl made with Greek yogurt...,http://www.101cookbooks.com/mt-static/images/f...,For each bowl: \na big dollop of Greek yogurt\...,Pomegranate Yogurt Bowl,PT5M,,,Serves 1.,101cookbooks,,{'$date': 1365276020318},http://www.101cookbooks.com/archives/pomegrana...


In [14]:
recipes.shape

(173278, 17)

In [15]:
#take a look one row
recipes.iloc[0]

_id                                {'$oid': '5160756b96cc62079cc2db15'}
cookTime                                                          PT30M
creator                                                             NaN
dateModified                                                        NaN
datePublished                                                2013-03-11
description           Late Saturday afternoon, after Marlboro Man ha...
image                 http://static.thepioneerwoman.com/cooking/file...
ingredients           Biscuits\n3 cups All-purpose Flour\n2 Tablespo...
name                                    Drop Biscuits and Sausage Gravy
prepTime                                                          PT10M
recipeCategory                                                      NaN
recipeInstructions                                                  NaN
recipeYield                                                          12
source                                                  thepione

- It's in a messy form.In particular ingredients is in string format

In [25]:
#Extract the information about ingredients
recipes.ingredients[0]

'Biscuits\n3 cups All-purpose Flour\n2 Tablespoons Baking Powder\n1/2 teaspoon Salt\n1-1/2 stick (3/4 Cup) Cold Butter, Cut Into Pieces\n1-1/4 cup Butermilk\n SAUSAGE GRAVY\n1 pound Breakfast Sausage, Hot Or Mild\n1/3 cup All-purpose Flour\n4 cups Whole Milk\n1/2 teaspoon Seasoned Salt\n2 teaspoons Black Pepper, More To Taste'

In [28]:
recipes.ingredients.str.len().describe()

count    173278.000000
mean        244.617926
std         146.705285
min           0.000000
25%         147.000000
50%         221.000000
75%         314.000000
max        9067.000000
Name: ingredients, dtype: float64

- The ingredient lists average 250 characters long, with a minimum of 0 and a maximum of nearly 10,000 characters!

In [31]:
np.argmax(recipes.ingredients.str.len())

135598

In [40]:
#Viewing which recipe has the longest ingredient list
recipes.name[np.argmax(recipes.ingredients.str.len())]

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  return getattr(obj, method)(*args, **kwds)


'Carrot Pineapple Spice &amp; Brownie Layer Cake with Whipped Cream &amp; Cream Cheese Frosting and Marzipan Carrots'

In [65]:
#Viewing how many of the recipes are for breakfast food
recipes.description.str.contains('[Bb]reakfast').sum()

3524

In [66]:
#Viewing how many of the recipes list cinnamon as an ingredient
recipes.ingredients.str.contains('[Cc]innamon').sum()

10526

In [78]:
#Can even look to see whether any recipes misspell the ingredient as “cinamon”
recipes.ingredients.str.contains('[Cc]inamon').sum()

11

- So, This is the type of essential data exploration that is possible with pandas string tools.
- This is Data Munging process, meaning cleaning messy datasets.

#### A simple recipe recommender

- given a list of ingredients, find a recipe that uses all those ingredients.

In [80]:
import re

spice_list = ['salt', 'pepper', 'oregano', 'sage', 'parsley',
'rosemary', 'tarragon', 'thyme', 'paprika', 'cumin']

spice_df = pd.DataFrame(dict((spice, recipes.ingredients.str.contains(spice, re.IGNORECASE)) for spice in spice_list))
spice_df.head()

Unnamed: 0,salt,pepper,oregano,sage,parsley,rosemary,tarragon,thyme,paprika,cumin
0,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,True,True,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False


In [82]:
#finding a recipe that uses parsley, paprika, and tarragon.
selection = spice_df.query("parsley & paprika & tarragon")
len(selection)

10

In [83]:
selection.head()

Unnamed: 0,salt,pepper,oregano,sage,parsley,rosemary,tarragon,thyme,paprika,cumin
2069,False,True,False,False,True,False,True,False,True,False
74964,False,False,False,False,True,False,True,False,True,False
93768,True,True,False,True,True,False,True,False,True,False
113926,True,True,False,False,True,False,True,False,True,False
137686,True,True,False,False,True,False,True,False,True,False


In [85]:
recipes.name[selection.index]

2069      All cremat with a Little Gem, dandelion and wa...
74964                         Lobster with Thermidor butter
93768      Burton's Southern Fried Chicken with White Gravy
113926                     Mijo's Slow Cooker Shredded Beef
137686                     Asparagus Soup with Poached Eggs
140530                                 Fried Oyster Po’boys
158475                Lamb shank tagine with herb tabbouleh
158486                 Southern fried chicken in buttermilk
163175            Fried Chicken Sliders with Pickles + Slaw
165243                        Bar Tartine Cauliflower Salad
Name: name, dtype: object