# Kaggle Recipes Data Exploratory Data Analysis

In [1]:
import numpy as np
import pandas as pd

import ujson

import os
try:
    this_dir = os.path.dirname(os.path.realpath(__file__))
except:
    this_dir = os.getcwd()

In [2]:
# load data
with open(os.path.join(this_dir, 'full_format_recipes.json')) as f:
    recipes = ujson.load(f)
rdf = pd.DataFrame(recipes)
    
meta = pd.read_csv(os.path.join(this_dir, 'epi_r.csv'))

In [3]:
rdf.head()

Unnamed: 0,calories,categories,date,desc,directions,fat,ingredients,protein,rating,sodium,title
0,426.0,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",2006-09-01T04:00:00.000Z,,"[1. Place the stock, lentils, celery, carrot, ...",7.0,"[4 cups low-sodium vegetable or chicken stock,...",30.0,2.5,559.0,"Lentil, Apple, and Turkey Wrap"
1,403.0,"[Food Processor, Onion, Pork, Bake, Bastille D...",2004-08-20T04:00:00.000Z,This uses the same ingredients found in boudin...,[Combine first 9 ingredients in heavy medium s...,23.0,"[1 1/2 cups whipping cream, 2 medium onions, c...",18.0,4.375,1439.0,Boudin Blanc Terrine with Red Onion Confit
2,165.0,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",2004-08-20T04:00:00.000Z,,[In a large heavy saucepan cook diced fennel a...,7.0,"[1 fennel bulb (sometimes called anise), stalk...",6.0,3.75,165.0,Potato and Fennel Soup Hodge
3,,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",2009-03-27T04:00:00.000Z,The Sicilian-style tomato sauce has tons of Me...,[Heat oil in heavy large skillet over medium-h...,,"[2 tablespoons extra-virgin olive oil, 1 cup c...",,5.0,,Mahi-Mahi in Tomato Olive Sauce
4,547.0,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",2004-08-20T04:00:00.000Z,,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,"[1 12-ounce package frozen spinach soufflé, th...",20.0,3.125,452.0,Spinach Noodle Casserole


In [4]:
meta.head()

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Mahi-Mahi in Tomato Olive Sauce,5.0,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Challenge 1: Ingredient Standardization

* Ingredients show up in lots of different ways within the recipes

### Standardization is Nontrivial

In [5]:
from collections import Counter
ings = []

key = 'ingredients'
import re

def standardize_ing(ing):
    ing = ing.strip()
    ing = ing.replace('0123456789/', '')
    ing = re.sub('\s+', ' ', ing)
    return ing

for r in recipes:
    if key not in r:
        continue
    for ing in r[key]:
        ings.append(ing)

count_ings = Counter(ings)
count_ings_std = Counter([standardize_ing(ing) for ing in ings])

#### How Many Ingredients Are There?

In [6]:
print('total unique ingredients: {}'.format(len(count_ings)))
print('total unique standardized ingredients: {}'.format(len(count_ings_std)))

total unique ingredients: 83465
total unique standardized ingredients: 83427


In [7]:
print('total number of recipes: {}'.format(len(rdf)))

total number of recipes: 20130


### Example: Lettuce Ingredients:
* Let's pick out every ingredient with 'lettuce' in the name

In [8]:
lings = []
search_str = 'lettuce'
key = 'ingredients'
for r in recipes:
    if key not in r:
        continue
    for ing in r[key]:
        if search_str in ing.lower():
            lings.append(ing)

In [9]:
print("First 10 lettuce variations:")
for ling in lings[:10]:
    print('  {}'.format(ling))

print()
print('Total: {} lettuce variations'.format(len(lings)))

First 10 lettuce variations:
  1/2 head Bibb lettuce
  Lettuce leaves
  6 lettuce leaves
  Accompaniments: butter lettuce or other soft-leaf lettuce; thinly sliced garlic; packaged kimchi*; steamed white rice
  Accompaniments: sandwich bread; lettuce leaves; sliced tomato
  1 head red leaf lettuce
  2/3 cup chopped red onion Romaine or butter lettuce leaves
  Accompaniments: diced avocado; crema; queso fresco; thinly sliced iceberg or romaine lettuce; chopped white onion; sliced radishes; fried tortilla strips or chips; lime wedges; dried oregano; dried hot red-pepper flakes
  1 small head green or red leaf lettuce
  Accompaniments: lettuce; sliced tomato

Total: 436 lettuce variations


#### Example Issues:
* Quantity Terms (e.g.: "2/3 cup", "1 small head", "1 _____ leaves")
* Preparation Terms (e.g.: "chopped", "dried")
* "or" clauses (e.g.: "butter, red leaf, or romaine lettuce")
* Other terms (e.g.: "Accompaniments")


## Metadata Exploration
* Recipes have reasonable metadata category frequencies

In [10]:
cols = meta.columns
cs = cols[6:]
print('num extra meta cols: {}'.format(len(cs)))

num extra meta cols: 674


In [11]:
cat_freqs = meta[cs].mean(axis=0)

In [12]:
pd.options.display.max_rows = 999

cat_freqs.sort_values(ascending=False)[:40]

bon appétit          0.466537
peanut free          0.418412
soy free             0.403351
tree nut free        0.351287
vegetarian           0.341412
gourmet              0.331538
kosher               0.307949
pescatarian          0.301317
quick & easy         0.267903
wheat/gluten-free    0.244664
bake                 0.220078
summer               0.207012
dessert              0.178187
dairy free           0.159884
side                 0.157141
no sugar added       0.156194
winter               0.154548
fall                 0.150359
dinner               0.134899
sugar conscious      0.122980
healthy              0.117245
kidney friendly      0.115350
onion                0.111610
tomato               0.106723
vegetable            0.104079
sauté                0.101935
milk/cream           0.099491
fruit                0.097646
vegan                0.092310
kid-friendly         0.089318
egg                  0.088171
spring               0.085528
herb                 0.083832
garlic    

## Challenge 2: Matching Ingredients to Nutrition Info
* Can get nutrition info:
  * https://ndb.nal.usda.gov/ndb/
  * https://catalog.data.gov/dataset/composition-of-foods-raw-processed-prepared-usda-national-nutrient-database-for-standard-r
  * https://catalog.data.gov/dataset/mypyramid-food-raw-data-f9ed6
  
Example ndb data:

In [14]:
nut_prods = pd.read_csv(os.path.join(this_dir, 'BFPD', 'Products.csv'))
nut_prods.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,NDB_Number,long_name,data_source,gtin_upc,manufacturer,date_modified,date_available,ingredients_english
0,45001524,MOCHI ICE CREAM BONBONS,LI,19022128593,"G. T. Japan, Inc.",2017-11-15 19:19:38,2017-11-15 19:19:38,"ICE CREAM INGREDIENTS: MILK, CREAM, SUGAR, STR..."
1,45001528,CHIPOTLE BARBECUE SAUCE,LI,5051379043735,FRESH & EASY,2018-04-26 17:23:31,2018-04-26 17:23:31,"WATER, SUGAR, TOMATO PASTE, MOLASSES, DISTILLE..."
2,45001529,HOT & SPICY BARBECUE SAUCE,LI,5051379009434,FRESH & EASY,2018-04-26 18:17:37,2018-04-26 18:17:37,"SUGAR, WATER, DISTILLED VINEGAR, TOMATO PASTE,..."
3,45001530,BARBECUE SAUCE,LI,5051379019969,FRESH & EASY,2018-04-26 17:24:00,2018-04-26 17:24:00,"TOMATO PUREE (WATER, TOMATO PASTE), SUGAR, DIS..."
4,45001531,BARBECUE SAUCE,LI,5051379009526,FRESH & EASY,2018-04-26 17:47:41,2018-04-26 17:47:41,"SUGAR, DISTILLED VINEGAR, WATER, TOMATO PASTE,..."


### Outstanding Questions:
* How to match parsed ingredient names to nutrition info
* How to determine basic ingredients vs. brand names