### Data preprocessing

We will take the receipe json file created from data scraping (recipe_scrapper.py)
- Load the json file into pandas dataframe
- Data cleaning is performed on the unstructured data
- After the data has been structures in the way we need it. I have pickeled the dataframes for use in other modules

In [1]:
import os
import json
import pickle
import time 

from PIL import Image
import io
import urllib.request

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize 

from functools import reduce

from tqdm import tqdm
tqdm.pandas()   # this is for progress_apply()

  from pandas import Panel


In [2]:
#Caution: 
""" As I was appending json list in the scrapper script. You have to watch out for },{ and replace it with , 
    To get a better understanding check the save_recipes method of the scrapper
    Since the json file is too big, text editors will crash. Use visual studio and replace the strings and also take care of
    the , at the last"""
with open('recipes_epicurious.json') as f:
    data = json.load(f)

In [3]:
list(data.items())[0]

('http://www.epicurious.com/recipes/food/views/okonomiyaki-as-you-like-it-pancakes-with-bonito-flakes',
 {'title': 'Okonomiyaki (“As You Like It”) Pancakes With Bonito Flakes ',
  'ingredients': ['1 ½ cups (180 g) all-purpose flour',
   '1 teaspoon baking powder',
   '¼ teaspoon sea salt',
   '1 large egg, beaten',
   '1¼ cups (296 ml) whole milk or milk of your choice',
   '8 ounces (230 g) cabbage, thinly sliced',
   '2 scallions, white and light green parts chopped',
   '½ yellow, green, or red bell pepper, thinly sliced',
   '4 tablespoons vegetable oil',
   '8 ounces (230 g) boneless chicken, shrimp, crab, or sukiyaki-style beef or pork, cut into ½-inch (12 mm) pieces',
   '2 tablespoons mayonnaise',
   '2 tablespoon or more homemade or store-bought Tonkatsu Sauce',
   '½ cup (4 g) bonito flakes',
   '½ cup (4 g) crumbled nori'],
  'instructions': 'Whisk together the flour, baking powder, and salt in a small bowl.In a medium bowl, whisk the egg and milk. Add the flour mixture and 

In [4]:
# Collect the keys of (top-level) dictionary
dict_keys = [each for each in data.keys()]

# Create a dataframe for dictionaryb-keys
df_dictkeys = pd.DataFrame(dict_keys, columns=['url'])

In [5]:
#Normalize semi-structured JSON data into a flat table. json_normalize returns a dataframe.
data = [json_normalize(data[each]) for each in dict_keys]

In [6]:
df = pd.DataFrame(data[0])
for i in data[1:]:
    df = df.append(i)

In [7]:
df.tail()

Unnamed: 0,title,ingredients,instructions,picture_link,tags
0,Minted Asparagus Frittata,"[4 large eggs, 1/3 cup ricotta cheese, 1 1/2 t...",Preheat broiler. Whisk first 5 ingredients and...,,"[Egg, Herb, Vegetable, Vegetarian, Quick & Eas..."
0,Bourbon Whipped Cream,"[1 liquid cup heavy cream, cold, 1/2 teaspoon ...","In a large mixer bowl, place all the ingredien...",,"[Condiment/Spread, Bourbon, Milk/Cream, Mixer,..."
0,Scallop and Bacon Chowder,"[1 cup (packed) fresh Italian parsley, 3/4 cup...","Blend parsley, oil, and salt in blender until ...",,"[Soup/Stew, Pork, Potato, Shellfish, Appetizer..."
0,Cranberry Walnut Tart,"[Sweet pastry dough, 3 large eggs, 2/3 cup pac...",Roll out dough into a 13-inch round (1/8 inch ...,,"[Berry, Nut, Dessert, Bake, Thanksgiving, Cran..."
0,Grilled Asian-Style Salmon with Cabbage and Mi...,"[1 cup (packed) fresh mint leaves, 2 tablespoo...",Prepare barbecue (medium-high heat). Thinly sl...,,"[Ginger, Low/No Sugar, Mint, Salmon, Summer, G..."


In [8]:
#merging the dataframes across index
df = pd.DataFrame(np.c_[df, df_dictkeys], 
                   columns = df.columns.tolist() + df_dictkeys.columns.tolist())

In [9]:
df.tail()

Unnamed: 0,title,ingredients,instructions,picture_link,tags,url
35967,Minted Asparagus Frittata,"[4 large eggs, 1/3 cup ricotta cheese, 1 1/2 t...",Preheat broiler. Whisk first 5 ingredients and...,,"[Egg, Herb, Vegetable, Vegetarian, Quick & Eas...",http://www.epicurious.com/recipes/food/views/m...
35968,Bourbon Whipped Cream,"[1 liquid cup heavy cream, cold, 1/2 teaspoon ...","In a large mixer bowl, place all the ingredien...",,"[Condiment/Spread, Bourbon, Milk/Cream, Mixer,...",http://www.epicurious.com/recipes/food/views/b...
35969,Scallop and Bacon Chowder,"[1 cup (packed) fresh Italian parsley, 3/4 cup...","Blend parsley, oil, and salt in blender until ...",,"[Soup/Stew, Pork, Potato, Shellfish, Appetizer...",http://www.epicurious.com/recipes/food/views/s...
35970,Cranberry Walnut Tart,"[Sweet pastry dough, 3 large eggs, 2/3 cup pac...",Roll out dough into a 13-inch round (1/8 inch ...,,"[Berry, Nut, Dessert, Bake, Thanksgiving, Cran...",http://www.epicurious.com/recipes/food/views/c...
35971,Grilled Asian-Style Salmon with Cabbage and Mi...,"[1 cup (packed) fresh mint leaves, 2 tablespoo...",Prepare barbecue (medium-high heat). Thinly sl...,,"[Ginger, Low/No Sugar, Mint, Salmon, Summer, G...",http://www.epicurious.com/recipes/food/views/g...


#### Titles cleaning

In [10]:
#cleaning


# Replace None with zeros (0), so we can locate those easily
df.fillna(value=0, inplace=True)

# Fix the index
df.reset_index(inplace=True, drop=True)
               
# Remove observations where its all zeros
df = df[df.title!=0]

# Remove observations where it has no title
df= df[df.title != '']

In [11]:
# Create column that describes the number of words in title
# progress_apply is supported by tqdm for pandas
# I used it because the data is huge and knowning about the progress made sense for the use case.
df['title_num'] = df['title'].progress_apply(lambda sent: len(sent.strip().split(' ')))

100%|████████████████████████████████████████████████████████████████████████| 35965/35965 [00:00<00:00, 274485.95it/s]


In [12]:

# Sort by title length
df.sort_values(by='title_num', ascending=False).head(3)

Unnamed: 0,title,ingredients,instructions,picture_link,tags,url,title_num
26267,Old-Fashioned Carrot Cake with Maple&#045cream...,[],,0,[],http://www.epicurious.com/recipes/food/views/o...,15014
29351,Smoked Salmon with Mustard&#045dill Sauce </h1...,[],,0,[],http://www.epicurious.com/recipes/food/views/s...,10853
27768,Patatine e Carciofi Arrosto\n r...,"[1 tablespoon finely grated fresh lemon zest, ...",In a bowl stir together gremolata ingredients....,0,"[Herb, Potato, Side, Roast, Passover, Vegetari...",http://www.epicurious.com/recipes/food/views/p...,23


In [13]:
# Replace ''\n' with a space
df['title']= df['title'].str.replace('\n', ' ')
# Split multiple spaces 
df['title'] = df['title'].progress_apply(lambda sent: sent.strip().split(' '))

100%|████████████████████████████████████████████████████████████████████████| 35965/35965 [00:00<00:00, 235027.09it/s]


In [14]:
def stripChar(name):
    '''function returns name without trailing characters in listChars
    Input:
        name      - name to strip
        listChars - list of characters (e.g., ['*','+','-'])
    '''
    listChars = ['*','-','+',',','.']
    # Use RECURSION to strip trailing characters in listChars (*, -, +)
    if len(name) ==0:
        newname=name
    elif name[-1] in listChars:
        newname = name[:-1]
        newname = stripChar(newname)
    else:
        newname = name
    return newname

In [15]:

# Filter out spaces from the list 
df['title'] = df['title'].progress_apply(lambda sent: [stripChar(w) for w in sent ])
# Rejoining title
df['title'] = df['title'].progress_apply(lambda words: ' '.join(words))
# split based on space ' '
df['title_num'] = df['title'].progress_apply(lambda sent: len(sent.split(' ')))

100%|████████████████████████████████████████████████████████████████████████| 35965/35965 [00:00<00:00, 161629.48it/s]
100%|████████████████████████████████████████████████████████████████████████| 35965/35965 [00:00<00:00, 370681.62it/s]
100%|████████████████████████████████████████████████████████████████████████| 35965/35965 [00:00<00:00, 255938.13it/s]


### Collect images based on picture_link column

In [16]:
# Check which observations have images?

dfpic = df[(df.picture_link !=np.NaN)].reset_index(drop=True)
print('Only', dfpic.shape[0], 'recipes actually have photos\n')

Only 35965 recipes actually have photos



In [17]:
# Function to display the images
def showPicture(df_in, index):
    '''Function returns image, given dataframe and index
    Input:
        df_in .  - dataframe to access
        index    - index of dataframe to show
    '''
    if df_in[df_in.index==index].picture_link.values[0] !=0:
        fname= df_in[df_in.index==index].picture_link.values[0].split('/')[-1]
        with open(f'../data/images/epicurious_images/{fname}', 'rb') as fin:
            image_file = io.BytesIO(fin.read())
        output = Image.open(image_file)
        print(f'{df_in[df_in.index==index].title.values[0]}')
    else:
        with open(f'../data/images/epicurious_images/empty.jpg', 'rb') as fin:
            image_file = io.BytesIO(fin.read())
        output = Image.open(image_file)
        print('No image for this recipe')
    return output

In [18]:
# Create index as id
df = df.reset_index()

# rename this index into `id`
df.rename(columns = {'index':'id'}, inplace=True)

In [19]:
df.head()

Unnamed: 0,id,title,ingredients,instructions,picture_link,tags,url,title_num
0,0,Okonomiyaki (“As You Like It”) Pancakes With B...,"[1 ½ cups (180 g) all-purpose flour, 1 teaspoo...","Whisk together the flour, baking powder, and s...",https://assets.epicurious.com/photos/5dfbeb3a8...,"[Pancake, Dinner, Breakfast, Cabbage, Green On...",http://www.epicurious.com/recipes/food/views/o...,9
1,1,Paneer Butter Masala,"[Canola oil, 1 pound hard paneer, cut into ¾-i...",Put a tablespoon of oil into a large lidded fr...,https://assets.epicurious.com/photos/5df7c9efa...,"[cookbooks, Vegetarian, Ginger, Garlic, Tomato...",http://www.epicurious.com/recipes/food/views/p...,3
2,2,Pesto Pasta Frittata,"[8 large eggs, ¼ cup whole milk or cream, ½ cu...",Set a rack in the upper third of the oven and ...,https://assets.epicurious.com/photos/5dfa5448a...,"[cookbooks, Frittata, Egg, Milk/Cream, Parmesa...",http://www.epicurious.com/recipes/food/views/p...,3
3,3,Salmon Confit with Lime Juniper and Fennel,"[2 limes, ½ teaspoon fine sea salt, plus more ...",Heat the oven to 325°F. Finely grate the zest ...,https://assets.epicurious.com/photos/5dfa62b17...,"[cookbooks, Salmon, Olive Oil, Lime, Fennel, T...",http://www.epicurious.com/recipes/food/views/s...,7
4,4,Coconut-Braised Chickpeas with Sweet Potatoes ...,"[½ cup slivered almonds, 2 tablespoons coconut...","Toast the almonds in a small, dry skillet set ...",https://assets.epicurious.com/photos/5dfa5857e...,"[cookbooks, Curry, Coconut]",http://www.epicurious.com/recipes/food/views/c...,7


In [20]:

# Create subset of df_INGREDIENTS for exploratory analysis 
df_title = df[['id','title','title_num','picture_link']]
df_ingredients = df[['id','title','ingredients', 'tags',
                     'picture_link','instructions','url']]



In [21]:
# Pickle sections
with open('df_epi_title.pkl','wb') as fout:
    pickle.dump(df_title, fout)

with open('df_epi_ingred.pkl','wb') as fout:
    pickle.dump(df_ingredients, fout)
    
with open('df_epi_cleaner.pkl','wb') as fout:
    pickle.dump(df, fout)