In [None]:
import os
import json
import pickle

import itertools
import string
import unicodedata
from collections import OrderedDict

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import inflect

import pandas as pd


### general stopwords and stopwords specific to recipes ###

In [None]:
# General English stop words from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Recipe-specific stop words, loaded from a local pickle file.
# NOTE(review): pickle.load can run arbitrary code while unpickling; only use a trusted file.
with open('ingredient_stops.pickle', 'rb') as f:
    ingredient_stops = pickle.load(f)

### for stemming and punctuation removal

In [None]:
# Translation table that deletes every character in string.punctuation.
table = str.maketrans('', '', string.punctuation)
# Porter stemmer instance used when cleaning instruction text.
porter = PorterStemmer()

### for removing duplicate recipes

In [None]:
def sort_dict_list(list_to_be_sorted, sort_by):
    """Return a new list of the given dicts ordered by the value under ``sort_by``.

    The input is left untouched; entries with equal keys keep their relative order.
    """
    def _sort_key(entry):
        return entry[sort_by]

    ordered = list(list_to_be_sorted)
    ordered.sort(key=_sort_key)
    return ordered

### staple foods - names modified for compatibility

In [None]:
# Staple-food table; the 'AbbrvName' column supplies the names that
# clean_ingredients substitutes for any ingredient containing one of them.
df_staples = pd.read_csv('staples_tagged_singular.csv')
food_staples = df_staples['AbbrvName']

## functions to clean ingredients and instructions

In [None]:
def remove_accents(input_str):
    """Return ``input_str`` with accents removed.

    The text is decomposed with NFKD normalization and every combining
    character produced by that decomposition is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    kept = []
    for ch in decomposed:
        if unicodedata.combining(ch):
            continue
        kept.append(ch)
    return u"".join(kept)

In [None]:
deplural = inflect.engine()

def clean_ingredients(ingredient):
    """Reduce one raw ingredient string to a short, normalized name.

    Steps: tokenize, keep alphabetic tokens, lowercase them, drop general and
    recipe-specific stop words, convert words to singular form, apply a few
    hand-picked substitutions, and finally replace the result with the first
    entry of ``food_staples`` that it contains, if any.

    Parameters
    ----------
    ingredient : str
        Raw ingredient text.

    Returns
    -------
    str
        The cleaned ingredient name; an empty string when nothing remains.
    """

    ## tokenize and remove stop words ##
    words = [w.lower() for w in word_tokenize(ingredient) if w.isalpha()]
    words = [w for w in words if w not in stop_words and w not in ingredient_stops]

    ## words to singular form ##
    # Iterate with enumerate so the list is indexed by position; indexing it
    # with the word itself raises TypeError. singular_noun gives a falsy
    # result when it has no singular form to offer, in which case the word
    # is kept unchanged.
    for idx, word in enumerate(words):
        singular = deplural.singular_noun(word)
        if singular:
            words[idx] = singular

    ## adjust niche cases ##
    # Substring replacements, applied in this order to every word.
    niche_cases = (
        ('stock', 'broth'),
        ('steak', 'beef'),
        ('chuck', 'beef'),
        ('crawfish', 'crayfish'),
    )
    for old, new in niche_cases:
        words = [x.replace(old, new) for x in words]

    ingredient_clean = ' '.join(words)

    ## adjust additional niche cases ##
    ingredient_clean = ingredient_clean.replace('game hen', 'hen')
    ingredient_clean = ingredient_clean.replace('salt pepper', '')
    # Removing 'salt pepper' can leave stray or doubled spaces; normalize them
    # so otherwise-equal names compare equal and whitespace-only results
    # become the empty string.
    ingredient_clean = ' '.join(ingredient_clean.split())

    ## call staple foods consistently ##
    for staple in food_staples:
        if staple in ingredient_clean:
            ingredient_clean = staple
            break

    return ingredient_clean


In [None]:
def clean_instructions(doc):
    """Turn instruction text into a list of stemmed, lowercase word tokens.

    Tokens are stripped of punctuation; anything that is then not purely
    alphabetic, or that is a stop word, is discarded, and the rest is
    Porter-stemmed. An AttributeError raised along the way (for example when
    ``doc`` has no ``lower`` method) yields an empty list.
    """
    try:
        stemmed = []
        for token in word_tokenize(doc.lower()):
            bare = token.translate(table)
            if not bare.isalpha() or bare in stop_words:
                continue
            stemmed.append(porter.stem(bare))
        return stemmed
    except AttributeError:
        return []

## clean spoonacular recipes 

In [None]:
# Load the spoonacular recipes. The encoding is given explicitly (as the
# eightportions loader does) so decoding does not depend on the platform default.
with open('spoonacular_recipes.json', encoding = 'utf-8') as infile:
    recipe_list = json.load(infile)

In [None]:
# Build one (name, ingredients, instructions) row per spoonacular recipe.
ingrds_instr = []
for recipe in recipe_list:
    ingrds_in = recipe['ingredient_names']

    ## clean every name, drop empty results, keep only the first instance of each ingredient
    cleaned_ingr_list = list(OrderedDict.fromkeys(
        cleaned for cleaned in map(clean_ingredients, ingrds_in) if cleaned
    ))

    instructions_doc = clean_instructions(' '.join(recipe['instructions']))
    ingrds_instr.append((recipe['name'], cleaned_ingr_list, instructions_doc))

df_spoon = pd.DataFrame(ingrds_instr, columns = ['names','ingredients', 'instructions'])

## clean eightportions recipes

In [None]:
# Placeholder: replace with the directory that holds the eightportions JSON files.
path = '''path to eightportions recipes in json format'''
# Keep only the .json entries found in that directory.
recipe_lists = [entry for entry in os.listdir(path) if entry.endswith('.json')]

In [None]:
# Build (name, ingredients, instructions) rows from every eightportions JSON file.
recipe_attrib = []
# The loop variable is deliberately not named `recipe_list`: that global holds the
# spoonacular recipes, and rebinding it to a file name here would break any
# re-run of the cell that iterates over it.
for recipe_file in recipe_lists:
    # os.path.join works whether or not `path` ends with a separator.
    with open(os.path.join(path, recipe_file), encoding = 'utf-8') as infile:
        recipes = json.load(infile)
    for recipe in recipes.values():
        # Skip records missing any of the three required fields. Only the
        # lookups are guarded, so a KeyError raised inside the cleaning
        # functions is not silently swallowed.
        try:
            name = recipe['title']
            raw_instructions = recipe['instructions']
            raw_ingredients = recipe['ingredients']
        except KeyError:
            continue

        instructions_out = clean_instructions(raw_instructions)
        ingredients_out = [clean_ingredients(i) for i in raw_ingredients]

        # Recipes whose instructions clean down to nothing are dropped.
        if instructions_out:
            recipe_attrib.append((name, ingredients_out, instructions_out))


df_eightportion = pd.DataFrame(recipe_attrib, columns = ['names', 'ingredients', 'instructions'])


In [None]:
# Per recipe: keep the first occurrence of each ingredient, then drop empty names.
unique_ingrds_list = [
    [name for name in OrderedDict.fromkeys(ingrds) if name]
    for ingrds in df_eightportion['ingredients']
]
df_eightportion['ingredients'] = unique_ingrds_list

### concatenate dataframes and pickle

In [None]:
# Stack both sources (eightportions first) and keep the first row for each recipe name.
df_tot = pd.concat([df_eightportion, df_spoon]).drop_duplicates(subset = 'names')

In [None]:
# Saving is switched off: the dump statements below sit inside a bare string
# literal, so running this cell writes nothing. Remove the surrounding triple
# quotes to write df_tot to 'compiled_recipes_total.pickle'.
'''
with open('compiled_recipes_total.pickle', 'wb') as f:
    pickle.dump(df_tot, f)
'''