## Pre-processing of Recipe data
- Load the JSON file into a pandas DataFrame

Import Libraries

In [None]:
import os
import json
import pickle
import time 

from PIL import Image
import io
import urllib.request

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize 

from functools import reduce

from tqdm import tqdm
tqdm.pandas()

In [None]:
# Load the raw recipe JSON. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default encoding.
with open('../data/recipes_raw_epi.json', encoding='utf-8') as fin:
    data = json.load(fin)

In [None]:
# Top-level keys of the loaded dictionary, in their original order
dict_keys = list(data)

# One-column dataframe ('url') holding those dictionary keys
df_dictkeys = pd.DataFrame({'url': dict_keys})

In [None]:
# Convert the (dictionary of dictionaries) into a (list of dataframes), one per
# key and in the same order as dict_keys, so each record can be reached by
# position.
# `pd.json_normalize` is the public entry point; the `pandas.io.json` alias was
# deprecated in pandas 1.0.
data = [pd.json_normalize(data[key]) for key in dict_keys]

In [None]:
# Stack the per-record dataframes into one dataframe with a single concat.
# `DataFrame.append` was removed in pandas 2.0, and calling it in a loop
# re-copied the accumulated frame on every iteration (quadratic time).
# Row order and the per-frame index labels are the same as with append.
df = pd.concat(data)

In [None]:
# Attach the dictionary keys as a trailing 'url' column.
# The two frames are joined positionally (row i of df with key i), and the
# result is rebuilt as a new dataframe with a fresh 0..n-1 index.
df = pd.DataFrame(
    np.c_[df, df_dictkeys],
    columns=[*df.columns, *df_dictkeys.columns],
)

### Do some cleaning

In [None]:
# Use 0 as the placeholder for missing values so they are easy to locate,
# and renumber the rows 0..n-1
df = df.fillna(value=0).reset_index(drop=True)

# Keep only rows with a usable title: drop rows whose title is the 0
# placeholder (this covers all-zero rows) and rows whose title is empty
df = df[(df.title != 0) & (df.title != '')]

### Clean titles

In [None]:
# Word count per title: trim the title, split on single spaces, count the pieces
df['title_numWords'] = df['title'].progress_apply(
    lambda title: len(title.strip().split(' '))
)

In [None]:
# Show the three rows with the most words in their title
df.sort_values(by='title_numWords', ascending=False).iloc[:3]

In [None]:
# Remove the two rows whose titles were not parsed correctly (spotted in the
# longest-title listing above).
# NOTE(review): the earlier comment named indices 25963 and 29045, but the
# code drops 25974 and 29056 — confirm which labels are intended.
for bad_label in (25974, 29056):
    df.drop(index=bad_label, inplace=True)

In [None]:
# Turn embedded newlines into spaces, then trim each title and break it into
# a list of tokens on single spaces
df['title'] = (
    df['title']
    .str.replace('\n', ' ')
    .progress_apply(lambda title: title.strip().split(' '))
)

In [None]:
# Function to remove punctuation
def stripChar(name, listChars=None):
    '''function returns name without trailing characters in listChars
    Input:
        name      - string to strip
        listChars - optional list of single characters to remove from the
                    end of name (e.g., ['*','+','-']);
                    defaults to ['*','-','+',',','.']
    Output:
        name with the whole trailing run of those characters removed;
        characters elsewhere in the string are left untouched
    '''
    if listChars is None:
        listChars = ['*','-','+',',','.']
    # rstrip removes the entire trailing run in one pass, so a long run of
    # punctuation cannot exhaust the recursion limit the way a recursive
    # one-character-at-a-time strip would.
    return name.rstrip(''.join(listChars))

In [None]:
# Strip trailing punctuation from every token, then discard tokens that end up
# empty (from repeated spaces or punctuation-only tokens such as '-').
# Without this filter, rejoining leaves double spaces in the title, inflates
# the word count, and turns punctuation-only titles into ' ' instead of '',
# so the later empty-title check misses them.
df['title'] = df['title'].progress_apply(
    lambda sent: [stripped for stripped in map(stripChar, sent) if stripped])
# Rejoin the cleaned tokens into a single-spaced title
df['title'] = df['title'].progress_apply(lambda words: ' '.join(words))
# Recompute the number of words in the cleaned title (split on space ' ')
df['title_numWords'] = df['title'].progress_apply(lambda sent: len(sent.split(' ')))

In [None]:
# For now, keep only "recipes" whose calories value is not 0, then renumber rows
df = df.loc[df['calories'] != 0].reset_index(drop=True)

In [None]:
# Drop recipes whose title is empty after cleaning, then renumber the rows.
# `drop(..., inplace=True)` returns None, so chaining `.reset_index` onto it
# raised AttributeError; use the returned frames and rebind `df` instead.
print(df.shape)
indexEmpty = df[df.title==''].index.tolist()
df = df.drop(index=indexEmpty).reset_index(drop=True)
print(df.shape)

### Collect Images based on the 'picture_link' column

In [None]:
# Subset of recipes that have a picture link (0 marks a missing link)

dfpic = df.loc[df['picture_link'] != 0].reset_index(drop=True)
print('Only', len(dfpic), 'recipes actually have photos\n')

In [None]:
# Function to display the images
def showPicture(df_in, index):
    '''Function returns the image for one recipe and prints a caption
    Input:
        df_in    - dataframe to access (uses `picture_link` and `title`)
        index    - index label of the row to show
    Output:
        image opened from ../data/images/epicurious_images/; the file is
        named after the last path segment of picture_link, or empty.jpg
        when picture_link is 0
    '''
    row = df_in[df_in.index == index]
    link = row.picture_link.values[0]
    has_photo = link != 0

    # File name is the last path segment of the link; fall back to placeholder
    fname = link.split('/')[-1] if has_photo else 'empty.jpg'

    # Read the whole file into memory so the image does not depend on an
    # open file handle once the `with` block has closed it
    with open(f'../data/images/epicurious_images/{fname}', 'rb') as fin:
        image_file = io.BytesIO(fin.read())
    output = Image.open(image_file)

    if has_photo:
        print(f'{row.title.values[0]}')
    else:
        print('No image for this recipe')
    return output

In [None]:
# Show the picture (or the placeholder image) for the recipe at index 526
showPicture(df_in=df, index=526)

### Pickle dataframes
- Add `id` as a recipe identifier

In [None]:
# Move the current row index into a regular column and name that column `id`
df = df.reset_index().rename(columns={'index': 'id'})

In [None]:
# Column subsets for exploratory analysis: one for titles, one for ingredients
df_title = df[['id', 'title', 'title_numWords', 'picture_link']]
df_ingredients = df[
    ['id', 'title', 'ingredients', 'tag_ingredient', 'picture_link']
]

# Pickle the two subsets, then the full cleaned dataframe
for pkl_path, frame in [
    ('../data/df_epi_title.pkl', df_title),
    ('../data/df_epi_ingred.pkl', df_ingredients),
    ('../data/df_epi_cleaner.pkl', df),
]:
    with open(pkl_path, 'wb') as fout:
        pickle.dump(frame, fout)