In [27]:
import json
from pprint import pprint
import numpy as np
import pandas as pd

## Epicurious

In [28]:
# load Epicurious Kaggle data
with open('epirecipes/full_format_recipes.json', 'r') as f:
    epicurious = json.loads(f.read())

# check an entry
pprint(epicurious[0])

{'calories': 426.0,
 'categories': ['Sandwich',
                'Bean',
                'Fruit',
                'Tomato',
                'turkey',
                'Vegetable',
                'Kid-Friendly',
                'Apple',
                'Lentil',
                'Lettuce',
                'Cookie'],
 'date': '2006-09-01T04:00:00.000Z',
 'desc': None,
 'directions': ['1. Place the stock, lentils, celery, carrot, thyme, and salt '
                'in a medium saucepan and bring to a boil. Reduce heat to low '
                'and simmer until the lentils are tender, about 30 minutes, '
                'depending on the lentils. (If they begin to dry out, add '
                'water as needed.) Remove and discard the thyme. Drain and '
                'transfer the mixture to a bowl; let cool.',
                '2. Fold in the tomato, apple, lemon juice, and olive oil. '
                'Season with the pepper.',
                '3. To assemble a wrap, place 1 lavash sheet 

In [29]:
type(epicurious)

list

In [30]:
# get it into a dataframe, check columns
epidf = pd.DataFrame(epicurious)
epidf.columns

Index(['directions', 'fat', 'date', 'categories', 'calories', 'desc',
       'protein', 'rating', 'title', 'ingredients', 'sodium'],
      dtype='object')

In [31]:
# only keep title, ingredients, and directions renamed to method
epidf = epidf[['title', 'directions', 'ingredients']]
epidf.rename(columns={'directions': 'method'}, inplace=True)

# join method to single string
epidf['method'] = epidf['method'].str.join(' ')

# check it out
print(epidf.info())
epidf.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20130 entries, 0 to 20129
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        20111 non-null  object
 1   method       20111 non-null  object
 2   ingredients  20111 non-null  object
dtypes: object(3)
memory usage: 471.9+ KB
None


Unnamed: 0,title,method,ingredients
0,"Lentil, Apple, and Turkey Wrap","1. Place the stock, lentils, celery, carrot, t...","[4 cups low-sodium vegetable or chicken stock,..."
1,Boudin Blanc Terrine with Red Onion Confit,Combine first 9 ingredients in heavy medium sa...,"[1 1/2 cups whipping cream, 2 medium onions, c..."
2,Potato and Fennel Soup Hodge,In a large heavy saucepan cook diced fennel an...,"[1 fennel bulb (sometimes called anise), stalk..."
3,Mahi-Mahi in Tomato Olive Sauce,Heat oil in heavy large skillet over medium-hi...,"[2 tablespoons extra-virgin olive oil, 1 cup c..."
4,Spinach Noodle Casserole,Preheat oven to 350°F. Lightly grease 8x8x2-in...,"[1 12-ounce package frozen spinach soufflé, th..."


## Recipe Box

Each json file from the Recipe Box undergoes the same treatment:
- to dataframe
- keep/reorder important data
- rename instructions/directions to method
- remove newline characters from method

#### Also from Epicurious

In [32]:
with open('recipes_raw/recipes_raw_nosource_epi.json', 'r') as f:
    epicurious = json.loads(f.read())

In [33]:
epirb = pd.DataFrame([x for x in epicurious.values()])
epirb = epirb[['title', 'instructions', 'ingredients']]
epirb.rename(columns={'instructions': 'method'}, inplace=True)
epirb['method'] = epirb['method'].replace(r'\n', '')

#### from AllRecipes

In [34]:
with open('recipes_raw/recipes_raw_nosource_ar.json', 'r') as f:
    allrecipes = json.loads(f.read())

In [35]:
arrb = pd.DataFrame([x for x in allrecipes.values()])
arrb = arrb[['title', 'instructions', 'ingredients']]
arrb.rename(columns={'instructions': 'method'}, inplace=True)
arrb['method'] = arrb['method'].replace(r'\n', '')

#### from Food Network

In [36]:
with open('recipes_raw/recipes_raw_nosource_fn.json', 'r') as f:
    fnet = json.loads(f.read())

In [37]:
fnrb = pd.DataFrame([x for x in fnet.values()])
fnrb = fnrb[['title', 'instructions', 'ingredients']]
fnrb.rename(columns={'instructions': 'method'}, inplace=True)
fnrb['method'] = fnrb['method'].replace(r'\n', '')

#### Combine Recipe Box Dataframes

In [38]:
rbdf = pd.concat([epirb, arrb, fnrb])
rbdf.head()

Unnamed: 0,title,method,ingredients
0,Christmas Eggnog,"Beat the egg whites until stiff, gradually add...","[12 egg whites, 12 egg yolks, 1 1/2 cups sugar..."
1,"Veal, Carrot and Chestnut Ragoût",Preheat oven to 400°F. Using small sharp knife...,"[18 fresh chestnuts, 2 1/2 pounds veal stew me..."
2,Caramelized Bread Pudding with Chocolate and C...,Preheat the oven to 350°F. Spread the softened...,"[2 tablespoons unsalted butter, softened, 4 or..."
3,Sherried Stilton and Green Peppercorn Spread,"In a food processor blend the Stilton, the cre...","[3/4 pound Stilton, crumbled (about 3 cups) an..."
4,Almond-Chocolate Macaroons,Position rack in center of oven and preheat to...,"[2 cups (about 9 1/2 ounces) whole almonds, to..."


## Recipe 1M Dataset

In [39]:
def val_extractor(list_of_dicts, key='text'):
    '''returns values from a list of dicts as a list of strings for the Recipe1M dataset'''
    return [x.get(key) for x in list_of_dicts]

def recipe_formatter(dictionary_entry):
    '''formats recipes from Recipe 1M to Recipe Box structure'''
    dictionary_entry['ingredients'] = val_extractor(dictionary_entry['ingredients'])
    dictionary_entry['instructions'] = val_extractor(dictionary_entry['instructions'])
    return dictionary_entry

In [40]:
with open('recipe1M/layer1.json', 'r') as f:
    r1M = json.loads(f.read())

In [41]:
# restructure to match other dataframes
for i in r1M:
    recipe_formatter(i)

In [42]:
rmdf = pd.DataFrame(r1M)
rmdf = rmdf[['title', 'url', 'instructions', 'ingredients']]
rmdf.rename(columns={'instructions': 'method'}, inplace=True)

# join method to single string
rmdf['method'] = rmdf['method'].str.join(' ')

print(rmdf.info())
rmdf.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1029720 entries, 0 to 1029719
Data columns (total 4 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   title        1029720 non-null  object
 1   url          1029720 non-null  object
 2   method       1029720 non-null  object
 3   ingredients  1029720 non-null  object
dtypes: object(4)
memory usage: 31.4+ MB
None


Unnamed: 0,title,url,method,ingredients
0,Worlds Best Mac and Cheese,http://www.epicurious.com/recipes/food/views/-...,Preheat the oven to 350 F. Butter or oil an 8-...,"[6 ounces penne, 2 cups Beechers Flagship Chee..."
1,Dilly Macaroni Salad Recipe,http://cookeatshare.com/recipes/dilly-macaroni...,Cook macaroni according to package directions;...,"[1 c. elbow macaroni, 1 c. cubed American chee..."
2,Gazpacho,http://www.foodnetwork.com/recipes/gazpacho1.html,Add the tomatoes to a food processor with a pi...,"[8 tomatoes, quartered, Kosher salt, 1 red oni..."
3,Crunchy Onion Potato Bake,http://www.food.com/recipe/crunchy-onion-potat...,Preheat oven to 350 degrees Fahrenheit. Spray ...,"[2 12 cups milk, 1 12 cups water, 14 cup butte..."
4,Cool 'n Easy Creamy Watermelon Pie,http://www.food.com/recipe/cool-n-easy-creamy-...,Dissolve Jello in boiling water. Allow to cool...,"[1 (3 ounce) package watermelon gelatin, 14 cu..."


## Concatenate Dataframes

In [43]:
recipes = pd.concat([epidf, rbdf, rmdf])

print(recipes.info())
recipes.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1175014 entries, 0 to 1029719
Data columns (total 4 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   title        1174426 non-null  object
 1   method       1174304 non-null  object
 2   ingredients  1174478 non-null  object
 3   url          1029720 non-null  object
dtypes: object(4)
memory usage: 44.8+ MB
None


Unnamed: 0,title,method,ingredients,url
0,"Lentil, Apple, and Turkey Wrap","1. Place the stock, lentils, celery, carrot, t...","[4 cups low-sodium vegetable or chicken stock,...",
1,Boudin Blanc Terrine with Red Onion Confit,Combine first 9 ingredients in heavy medium sa...,"[1 1/2 cups whipping cream, 2 medium onions, c...",
2,Potato and Fennel Soup Hodge,In a large heavy saucepan cook diced fennel an...,"[1 fennel bulb (sometimes called anise), stalk...",
3,Mahi-Mahi in Tomato Olive Sauce,Heat oil in heavy large skillet over medium-hi...,"[2 tablespoons extra-virgin olive oil, 1 cup c...",
4,Spinach Noodle Casserole,Preheat oven to 350°F. Lightly grease 8x8x2-in...,"[1 12-ounce package frozen spinach soufflé, th...",


## Drop Nulls from title, method, and ingredients

In [44]:
recipes.isnull().sum()

title             588
method            710
ingredients       536
url            145294
dtype: int64

In [45]:
# some values are empty strings, not explicitly designated automatically as nulls until output and input cycle
recipes.replace('', np.nan, inplace=True)
recipes.isnull().sum()

title             588
method            746
ingredients       536
url            145294
dtype: int64

In [46]:
# drop nulls
recipes.dropna(subset=['title', 'method', 'ingredients'], inplace=True)
recipes.isnull().sum()

title               0
method              0
ingredients         0
url            144548
dtype: int64

## Save to CSV

In [47]:
# !mkdir data
recipes.to_csv('data/dirty_recipes.csv', index=False)