In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
import statistics
import nltk
import re
from scipy import stats

In [2]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.regexp import RegexpStemmer
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

In [3]:
pd.set_option('max_colwidth', 400)

In [4]:
def and_split(ingredient_list):
    """
    Inputs: List of ingredients (with possible "and" or "&")
    Output: List with ingredient list updated to remove "and" or "&" and list as separate items
    
    example: and_split(['turkey','green pepper','salt & freshly ground black pepper'])
            ['turkey', 'green pepper', 'salt', 'freshly ground black pepper']
    """
    return list(np.concatenate([re.split(' and | & ', ingredient) \
                                for ingredient in ingredient_list]).flat)

In [5]:
def make_singular(ingredient_list):
    """
    Inputs: List of ingredients 
    Output: List with ingredient list updated to make everything singular
    
    example:make_singular(["carrots", "eggs","milk"])
            returns ['carrot', 'egg', 'milk']
    """
    #edge case for couscous needs to be checked
    #stemmer = SnowballStemmer("english")
    stemmer = RegexpStemmer('s$|ies$') 
    return [stemmer.stem(word) for word in ingredient_list]

In [105]:
def remove_adj(ingredient_list):
    """
    Inputs: List of ingredients 
    Output: List with ingredient list updated to remove extra adjectives:
            ['fresh', 'ground', 'dried', 'all-purpose']
    example:remove_adj(['fresh ground pepper', 'fresh parsley', 'dried parsley','all-purpose flour'])
            returns ['pepper', 'parsley', 'parsley', 'flour']    
    """
    
    adj_list =  "|".join(['fresh','chopped','canned','baby' 'raw','frozen','whole',\
                          'ground','granulated', 'dried', 'all-purpose','granulated',\
                          'unsalted','salted', 'extra virgin', 'ripe'])
    return [re.sub(adj_list,'', ingredient).strip() for ingredient in ingredient_list]    

In [110]:
def check_subsitutions(sub_list, ingredient):
    new_ingredient = ingredient
    for word in sub_list:
        check = re.search(word, ingredient, re.IGNORECASE)
        new_ingredient = check.group(1) if check else new_ingredient
    return new_ingredient

def substitutions(ingredient_list):
    """
    Inputs: List of ingredients 
    Output: List with ingredient with common subsitituions 
    example: substitutions(['garlic cloves', 'herb and salt spice mix',  'low fat firm tofu', 'whole milk' ])
            returns ['garlic', 'spice', 'tofu', 'milk']    
    """
    sub_list = ['(garlic)', '(flour)','(yogurt)','(spice)', '(oil)','(tofu)', '(milk)', '(vinegar)',\
                '(butter)', '(rice)', '(cheese)', '(juice)', '(syrup)', '(soup)','(tortilla)', '(yam)']
    return [check_subsitutions(sub_list, ingredient) for ingredient in ingredient_list]
    #return [ingredient.extract(pat = r'("garlic")') for ingredient in ingredient_list]

In [8]:
recipes = pd.read_csv('data/large_data/RAW_recipes.csv')

In [9]:
recipes['tags'] = recipes['tags'].apply(literal_eval)

In [10]:
tag_dist = recipes['tags'].explode().value_counts()
tag_list = recipes['tags'].explode().unique()

In [111]:
recipes['mod_ingredients'] = recipes['ingredients'].apply(literal_eval).apply(and_split).apply(make_singular).apply(substitutions).apply(remove_adj)

In [112]:
full_ingredient_list = recipes['mod_ingredients'].explode()
ingredient_freq = full_ingredient_list.value_counts()

In [40]:
ingredient_freq = ingredient_freq.loc[ingredient_freq > 100]

In [41]:
ingredient_freq  = ingredient_freq/sum(ingredient_freq)

In [115]:
ingredient_freq.loc[np.abs(stats.zscore(ingredient_freq)) > 4]
#ingredient_freq.quantile(.50)

salt               107808
oil                 86014
butter              78229
cheese              76161
garlic              72043
flour               58220
egg                 51065
sugar               50789
onion               49394
juice               45049
pepper              44754
milk                43424
water               34927
black pepper        25270
vinegar             19744
cinnamon            19424
brown sugar         18660
baking powder       17504
parsley             15930
rice                15372
tomatoe             14467
baking soda         14099
carrot              14025
vanilla             13315
sour cream          11796
ginger              11569
green onion         11421
cumin               10402
vanilla extract     10271
oregano             10200
honey                9898
soup                 9815
Name: mod_ingredients, dtype: int64

In [103]:
ingredient_freq.loc['peanut']

1384

In [116]:
ingredient_freq.filter(like="tomato", axis = 0)

tomatoe                         14467
tomato sauce                     4480
tomato paste                     4335
diced tomatoe                    3462
cherry tomatoe                   1651
                                ...  
reduced-calorie tomato sauce        1
diced tomato spaghetti sauce        1
sugar-free tomato puree             1
ro*tel original tomato              1
hunt's diced tomatoe                1
Name: mod_ingredients, Length: 214, dtype: int64

In [28]:
full_ingredient_list[-100:]

alpine lace 97% reduced-fat cooked ham       4.710083e-07
shimeji mushroom                             4.710083e-07
salt-free seasoned  breadcrumb               4.710083e-07
pickled mackerel                             4.710083e-07
pineapple orange drink mix                   4.710083e-07
                                                 ...     
spearmint candy leaf                         4.710083e-07
fat-free blueberry muffin mix                4.710083e-07
gelato                                       4.710083e-07
moong dahl                                   4.710083e-07
brown sugar flavored hot wheat cereal mix    4.710083e-07
Name: mod_ingredients, Length: 100, dtype: float64

In [None]:
recipes["veg"] = [("vegan" in tag_list) or ("vegetarian" in tag_list) for tag_list in recipes["tags"]]

In [None]:
veg_recipes = recipes[recipes["veg"]].copy()

In [None]:
interactions = pd.read_csv('data/large_data/RAW_interactions.csv')

In [None]:
veg_recipes['ratings'] = [list(interactions[interactions['recipe_id'] == rec_id]['rating'])  for rec_id in veg_recipes['id'] ]

In [None]:
veg_recipes['avg_rating'] = veg_recipes['ratings'].map(statistics.mean)

In [None]:
veg_recipes['n_ratings'] = veg_recipes['ratings'].map(len)

In [None]:
veg_recipes[veg_recipes['n_ratings'] > 10].describe()

In [None]:
veg_recipes['mod_ingredients'] = veg_recipes['ingredients'].apply(literal_eval).apply(and_split)

In [None]:
veg_recipes[['mod_ingredients']]
#veg_recipes[['ingredients']]

In [None]:
#search_phrase = input()

In [None]:
search_phrase = "butternut squash soup"
search_words = search_phrase.split(" ")
search_words = "|".join("^{}$".format(word) for word in search_words)

In [None]:
recipe_list = veg_recipes.loc[veg_recipes['name'].str.contains(search_phrase, case=False)]

In [None]:
recipe_list.describe()

In [None]:
common_ingredients = recipe_list['mod_ingredients'].explode().value_counts()

In [None]:
common_ingredients[:20]

In [None]:
common_ingredients.filter(like = "pepper", axis = 0)

In [None]:
veg_recipes['ingredients']