Comprehensive look (version 2) at the ingredients in JSON and in .csv ingredient seznam.
The output is a new JSON with added ingredients_individually column. 

In [1]:
import pandas as pd
import json
import numpy as np
import csv

In [2]:
from tqdm import tqdm
import inflect
from unidecode import unidecode

In [3]:
recipes_filename = "../../data/full_format_recipes.json"

In [4]:
df = pd.read_json(recipes_filename)
len(df)

20130

In [5]:
df.head(5)

Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,sodium
0,"[1. Place the stock, lentils, celery, carrot, ...",7.0,2006-09-01 04:00:00+00:00,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",426.0,,30.0,2.5,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock,...",559.0
1,[Combine first 9 ingredients in heavy medium s...,23.0,2004-08-20 04:00:00+00:00,"[Food Processor, Onion, Pork, Bake, Bastille D...",403.0,This uses the same ingredients found in boudin...,18.0,4.375,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, c...",1439.0
2,[In a large heavy saucepan cook diced fennel a...,7.0,2004-08-20 04:00:00+00:00,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",165.0,,6.0,3.75,Potato and Fennel Soup Hodge,"[1 fennel bulb (sometimes called anise), stalk...",165.0
3,[Heat oil in heavy large skillet over medium-h...,,2009-03-27 04:00:00+00:00,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",,The Sicilian-style tomato sauce has tons of Me...,,5.0,Mahi-Mahi in Tomato Olive Sauce,"[2 tablespoons extra-virgin olive oil, 1 cup c...",
4,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,2004-08-20 04:00:00+00:00,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",547.0,,20.0,3.125,Spinach Noodle Casserole,"[1 12-ounce package frozen spinach soufflé, th...",452.0


In [6]:
null_values_indexes = df[df['ingredients'].isnull() == True].index
null_values_indexes

Int64Index([ 1076,  1135,  1907,  5146,  5424,  5558,  7607,  7768,  7881,
             8177,  9590, 10085, 11224, 13206, 13944, 14684, 16210, 16903,
            19547],
           dtype='int64')

In [7]:
df = df.drop(index = null_values_indexes).reset_index(drop=True) #drop indexes with ingredient being nan

# JSON Ingredients

I took the first n ingredients from JSON (*ingredients*) and list of .csv ingredients (in *seznam*). 

I added space before and after each element in JSON ingredients, replaced (, \*, (, ), - and ;) all with space before and after each character, lowercased ingredient and converted to latin characters (e. g. crème fraîche to creme fraiche).

I added space before and after each element in seznam. The seznam was provided with plural form (being an output of last observations). The spaces are important because of ability to distinguish between e. g. *teaspoon* and *tea* (we want to find only *tea*).

Then I sorted the seznam based on length and alphabetically, for assuring that the longest found seznam element in json ingredient would be found.

In [8]:
p = inflect.engine() #plural forms engine

In [9]:
#first_n = 1000
first_n = len(df['ingredients'])
ingredients = [x[:] for x in df['ingredients'][:first_n]]
ingredients_individually = [x[:] for x in df['ingredients'][:first_n]]

for i, ingredient in enumerate(tqdm(ingredients)):
    for j, row_ingredient in enumerate(ingredient):
        ingredients[i][j] = ' ' + ingredients[i][j] + ' '
        ingredients[i][j] = ingredients[i][j].replace(",", " , ")
        ingredients[i][j] = ingredients[i][j].replace("*", " * ")
        ingredients[i][j] = ingredients[i][j].replace("(", " ( ")
        ingredients[i][j] = ingredients[i][j].replace(")", " ) ")
        ingredients[i][j] = ingredients[i][j].replace("-", " - ")
        ingredients[i][j] = ingredients[i][j].replace(";", " ; ")
        ingredients[i][j] = ingredients[i][j].lower()
        ingredients[i][j] = unidecode(ingredients[i][j])
        ingredients_individually[i][j] = 'UNKNOWN'

100%|█████████████████████████████████████████████████████████████████████████| 20111/20111 [00:01<00:00, 17574.83it/s]


In [10]:
ingredients[0]

[' 4 cups low - sodium vegetable or chicken stock ',
 ' 1 cup dried brown lentils ',
 ' 1/2 cup dried french green lentils ',
 ' 2 stalks celery ,  chopped ',
 ' 1 large carrot ,  peeled and chopped ',
 ' 1 sprig fresh thyme ',
 ' 1 teaspoon kosher salt ',
 ' 1 medium tomato ,  cored ,  seeded ,  and diced ',
 ' 1 small fuji apple ,  cored and diced ',
 ' 1 tablespoon freshly squeezed lemon juice ',
 ' 2 teaspoons extra - virgin olive oil ',
 ' freshly ground black pepper to taste ',
 ' 3 sheets whole - wheat lavash ,  cut in half crosswise ,  or 6  ( 12 - inch )  flour tortillas ',
 ' 3/4 pound turkey breast ,  thinly sliced ',
 ' 1/2 head bibb lettuce ']

In [11]:
#with open('../../data/seznam_all.csv', 'r') as f:
with open('../../data/seznam_noadj.csv', 'r') as f:
    reader = csv.reader(f)
    seznam = list(reader)
seznam = [''.join(x) for x in seznam] #make list of strings from list of lists
seznam = [x.strip() for x in seznam] #remove spaces before and after
seznam = [unidecode(x) for x in seznam] #remove diacritics
seznam = [' {0} '.format(elem) for elem in seznam] #add a space before and after a word
seznam = seznam[1:] #drop column name

#for i in range(len(seznam)):
#    seznam.append(p.plural(seznam[i]))
        
seznam.sort() # sorts normally by alphabetical order
seznam.sort(key=len, reverse=False)  # sorts by ascending length

len(seznam)  

2102

In [12]:
seznam[:10]

[' TVP ',
 ' ahi ',
 ' ale ',
 ' bow ',
 ' bun ',
 ' cod ',
 ' dal ',
 ' egg ',
 ' fat ',
 ' fat ']

In [13]:
for i, ingredient in enumerate(tqdm(ingredients)):
    for j, row_ingredient in enumerate(ingredient):
        for polozka in seznam:
            if polozka in row_ingredient:
                ingredients_individually[i][j] = polozka.strip()

100%|███████████████████████████████████████████████████████████████████████████| 20111/20111 [01:05<00:00, 304.83it/s]


### First check

In [14]:
ingredients[0]

[' 4 cups low - sodium vegetable or chicken stock ',
 ' 1 cup dried brown lentils ',
 ' 1/2 cup dried french green lentils ',
 ' 2 stalks celery ,  chopped ',
 ' 1 large carrot ,  peeled and chopped ',
 ' 1 sprig fresh thyme ',
 ' 1 teaspoon kosher salt ',
 ' 1 medium tomato ,  cored ,  seeded ,  and diced ',
 ' 1 small fuji apple ,  cored and diced ',
 ' 1 tablespoon freshly squeezed lemon juice ',
 ' 2 teaspoons extra - virgin olive oil ',
 ' freshly ground black pepper to taste ',
 ' 3 sheets whole - wheat lavash ,  cut in half crosswise ,  or 6  ( 12 - inch )  flour tortillas ',
 ' 3/4 pound turkey breast ,  thinly sliced ',
 ' 1/2 head bibb lettuce ']

In [15]:
ingredients_individually[0]

['chicken stock',
 'lentils',
 'lentils',
 'celery',
 'carrot',
 'thyme',
 'kosher salt',
 'tomato',
 'apple',
 'lemon juice',
 'olive oil',
 'pepper',
 'flour tortillas',
 'turkey breast',
 'bibb lettuce']

### New ingredients_individually column

In [16]:
df['ingredients_individually'] = np.nan
for j in tqdm(range(len(ingredients))):
    df['ingredients_individually'][j] = ingredients_individually[j]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
100%|██████████████████████████████████████████████████████████████████████████| 20111/20111 [00:04<00:00, 4456.63it/s]


In [17]:
df.head()

Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,sodium,ingredients_individually
0,"[1. Place the stock, lentils, celery, carrot, ...",7.0,2006-09-01 04:00:00+00:00,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",426.0,,30.0,2.5,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock,...",559.0,"[chicken stock, lentils, lentils, celery, carr..."
1,[Combine first 9 ingredients in heavy medium s...,23.0,2004-08-20 04:00:00+00:00,"[Food Processor, Onion, Pork, Bake, Bastille D...",403.0,This uses the same ingredients found in boudin...,18.0,4.375,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, c...",1439.0,"[cream, onions, salt, bay leaves, cloves, garl..."
2,[In a large heavy saucepan cook diced fennel a...,7.0,2004-08-20 04:00:00+00:00,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",165.0,,6.0,3.75,Potato and Fennel Soup Hodge,"[1 fennel bulb (sometimes called anise), stalk...",165.0,"[fennel bulb, onion, butter, potatoes, chicken..."
3,[Heat oil in heavy large skillet over medium-h...,,2009-03-27 04:00:00+00:00,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",,The Sicilian-style tomato sauce has tons of Me...,,5.0,Mahi-Mahi in Tomato Olive Sauce,"[2 tablespoons extra-virgin olive oil, 1 cup c...",,"[olive oil, onion, wine, anchovy paste, tomato..."
4,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,2004-08-20 04:00:00+00:00,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",547.0,,20.0,3.125,Spinach Noodle Casserole,"[1 12-ounce package frozen spinach soufflé, th...",452.0,"[spinach, egg noodles, cream, pesto sauce, nut..."


### Find most frequent ingredients
This list will be fed to search engine

In [19]:
all_ingredients_list = []
for i in tqdm(range(len(df['ingredients_individually']))):
    all_ingredients_list = all_ingredients_list + df['ingredients_individually'][i]
    #all ingredients from seznam used in all recipes
all_ingredients = np.array(all_ingredients_list)
len(all_ingredients)

100%|███████████████████████████████████████████████████████████████████████████| 20111/20111 [00:58<00:00, 341.51it/s]


199030

In [20]:
unique, counts = np.unique(all_ingredients, return_counts=True)
unique_df = pd.DataFrame(data = zip(unique, counts), columns = ['Unique', 'Counts'])
unique_df = unique_df.sort_values(by = 'Counts', ascending = False)
print(len(unique))

1914


In [21]:
unique_df.head(50)

Unnamed: 0,Unique,Counts
1661,sugar,8086
1091,olive oil,7176
207,butter,6545
1490,salt,6510
1210,pepper,4483
1862,water,3916
1095,onion,3858
696,garlic cloves,3379
668,flour,3362
527,cream,3358


In [22]:
n_most_frequent = 100
most_frequent_ingredients = unique_df['Unique'][:n_most_frequent].values.astype(str)
most_frequent_ingredients

array(['sugar', 'olive oil', 'butter', 'salt', 'pepper', 'water', 'onion',
       'garlic cloves', 'flour', 'cream', 'lemon juice', 'kosher salt',
       'UNKNOWN', 'garlic', 'vegetable oil', 'eggs', 'onions', 'ginger',
       'parsley', 'milk', 'chicken broth', 'wine', 'cinnamon',
       'lime juice', 'tomatoes', 'vanilla extract', 'thyme', 'cilantro',
       'wine vinegar', 'potatoes', 'powder', 'bell pepper', 'egg',
       'carrots', 'soy sauce', 'dijon mustard', 'honey', 'chocolate',
       'shallots', 'cumin', 'celery', 'orange juice', 'egg yolks',
       'rosemary', 'mayonnaise', 'oregano', 'lemon', 'vanilla', 'almonds',
       'lemon peel', 'chives', 'shallot', 'yogurt', 'nutmeg',
       'parmesan cheese', 'scallions', 'basil', 'parsley leaves',
       'cornstarch', 'bacon', 'lemon zest', 'peppercorns', 'soda',
       'balsamic vinegar', 'cloves', 'pepper flakes', 'jalapeno',
       'paprika', 'shrimp', 'granulated sugar', 'mushrooms', 'beans',
       'bay leaf', 'plum tomatoes'

### Find and delete UNKNOWN ingredient

The same (or very similar) applies for all other 2957 UNKNOWN entries. UNKNOWN sample:

In [18]:
for i, ingredient in enumerate(tqdm(df['ingredients_individually'][:50])):
    for j, row_ingredient in enumerate(ingredient):
        if 'UNKNOWN' in row_ingredient:           
            print("Recipe: {}, Row: {}, \n full name: {}, \n individually: {} \n"
                  .format(i, j, df['ingredients'][i][j], df['ingredients_individually'][i][j]))

  0%|                                                                                           | 0/50 [00:00<?, ?it/s]

Recipe: 9, Row: 15, 
 full name: Equipment: 4 (16-ounce) wide jars or containers with lids, 
 individually: UNKNOWN 

Recipe: 17, Row: 14, 
 full name: Special equipment: a wide 5- to 6-qt heavy pot with a tight-fitting lid, 
 individually: UNKNOWN 

Recipe: 20, Row: 11, 
 full name: *Available at some Asian markets and by mail order from Ethnic Grocer (800-523-1961)., 
 individually: UNKNOWN 

Recipe: 22, Row: 0, 
 full name: 4 15x15-inch squares parchment paper, 
 individually: UNKNOWN 

Recipe: 31, Row: 3, 
 full name: a 4- to 6-cup jar with a tight-fitting lid, 
 individually: UNKNOWN 

Recipe: 43, Row: 8, 
 full name: *available at specialty produce markets, 
 individually: UNKNOWN 



100%|████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 1786.80it/s]


In [19]:
df['ingredients_individually'][372]

['coconut milk', 'yogurt', 'mint leaves', 'coconut', 'UNKNOWN', 'UNKNOWN']

In [29]:
for i, ingredient in enumerate(tqdm(df['ingredients_individually'])):
    for j, row_ingredient in enumerate(ingredient):
        if 'UNKNOWN' in row_ingredient:
            del df['ingredients_individually'][i][j] #delete row in ingredient, where was no match
            #print("Recipe: {}, Row: {}, \n full name: {}, \n individually: {} \n"
            #      .format(i, j, df['ingredients'][i][j], df['ingredients_individually'][i][j]))

100%|████████████████████████████████████████████████████████████████████████| 20111/20111 [00:00<00:00, 268296.61it/s]


In [30]:
df['ingredients_individually'][372]

['coconut milk', 'yogurt', 'mint leaves', 'coconut']

In [31]:
df['ingredients'][19924]

['1 pound fresh linguine',
 '2 tablespoons olive oil',
 '10 tablespoons (1 1/4 sticks) butter, divided',
 '1/2 cup all purpose flour',
 '4 cups warm milk',
 'Dash of ground nutmeg',
 '1 teaspoon white truffle powder* or truffle flour,* or truffle oil to taste',
 '*Available at specialty foods stores and Italian markets, and online from buonitalia.com.',
 '**Available in the Asian foods section of some supermarkets and at Asian markets.',
 '1/2 cup panko (Japanese breadcrumbs)**',
 '*Available at specialty foods stores and Italian markets, and online from buonitalia.com.',
 '**Available in the Asian foods section of some supermarkets and at Asian markets.']

In [32]:
df['ingredients_individually'][19924]

['linguine',
 'olive oil',
 'butter',
 'flour',
 'milk',
 'nutmeg',
 'truffle oil',
 'breadcrumbs']

In [33]:
for i, ingredient in enumerate(tqdm(df['ingredients_individually'])):
    for j, row_ingredient in enumerate(ingredient):
        if 'UNKNOWN' in row_ingredient:           
            print("Recipe: {}, Row: {}, \n full name: {}, \n individually: {} \n"
                  .format(i, j, df['ingredients'][i][j], df['ingredients_individually'][i][j]))
            #there is no UNKNOWN in the df anymore

100%|████████████████████████████████████████████████████████████████████████| 20111/20111 [00:00<00:00, 170428.37it/s]


### Final checks

In [34]:
import random
#i = random.randint(0, len(df))
i = random.randint(0, len(ingredients))

print(df['ingredients'][i], '\n')
print(df['ingredients_individually'][i])

['1 large navel orange with skin', '7 cups water, divided', '1/2 cup sugar', '2 cinnamon sticks, divided', '1/4 cup (packed) golden brown sugar', '1 tablespoon dark rum', '2 teaspoons apple cider vinegar', '1/8 teaspoon ground allspice', '1/2 cup fresh or frozen cranberries'] 

['orange', 'water', 'sugar', 'cinnamon sticks', 'sugar', 'rum', 'apple cider vinegar', 'allspice', 'cranberries']


In [35]:
import random
i = random.randint(0, len(df))
i = random.randint(0, len(ingredients))

print(df['ingredients'][i], '\n')
print(df['ingredients_individually'][i])

['1/2 cup mayonnaise', '1 tablespoon finely chopped canned chipotle chiles in adobo plus 2 teaspoons adobo sauce (from can)', '1/4 teaspoon fresh lemon juice', '1/2 lb sea scallops, tough muscle removed from side of each if necessary', '1 large egg white', '1 tablespoon chopped shallot', '1/2 teaspoon salt', '1/8 teaspoon black pepper', '1/3 cup chilled heavy cream', '1/2 lb shrimp in shell, peeled, deveined, and cut into 1/4-inch pieces', '1 cup plain fine dry bread crumbs', 'About 1/2 cup vegetable oil', 'Accompaniment: lemon wedges'] 

['mayonnaise', 'chipotle chiles', 'lemon juice', 'scallops', 'egg', 'shallot', 'salt', 'pepper', 'cream', 'shrimp', 'bread crumbs', 'vegetable oil', 'lemon']


In [36]:
import random
#i = random.randint(0, len(df))
i = random.randint(0, len(ingredients))

print(df['ingredients'][i], '\n')
print(df['ingredients_individually'][i])

['3 tablespoons soy sauce', '2 tablespoons seasoned rice vinegar', '1 1/2 tablespoons medium-dry Sherry', '3/4 pound flank steak', '1 large garlic clove, minced', '2 teaspoons grated peeled fresh gingerroot', '1/2 pound mushrooms', '3 scallions', '1 small cucumber', '2 tablespoons vegetable oil', '1 cup fresh bean sprouts', '2 tablespoons water', '2 cups packed spinach leaves, washed well and spun dry'] 

['soy sauce', 'rice vinegar', 'sherry', 'flank steak', 'garlic', 'gingerroot', 'mushrooms', 'scallions', 'cucumber', 'vegetable oil', 'sprouts', 'water', 'spinach leaves']


## Save dataframe with column of found ingredients

In [38]:
recipes_filename_parsed_ingredients = "../../data/full_format_recipes_parsed_ingredients.json"
df.to_json(recipes_filename_parsed_ingredients)