In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
import statistics
import nltk
import re

In [124]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.regexp import RegexpStemmer
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

In [15]:
pd.set_option('max_colwidth', 400)

In [86]:
def and_split(ingredient_list):
    """
    Inputs: List of ingredients (with possible "and" or "&")
    Output: List with ingredient list updated to remove "and" or "&" and list as separate items
    
    example: and_split(['turkey','green pepper','salt & freshly ground black pepper'])
            ['turkey', 'green pepper', 'salt', 'freshly ground black pepper']
    """
    return list(np.concatenate([re.split(' and | & ', ingredient) \
                                for ingredient in ingredient_list]).flat)

In [122]:
def make_singular(ingredient_list):
    """
    Inputs: List of ingredients 
    Output: List with ingredient list updated to make everything singular
    
    example:make_singular(["carrots", "eggs","milk"])
            returns ['carrot', 'egg', 'milk']
    """
    #stemmer = SnowballStemmer("english")
    stemmer = RegexpStemmer('s$|ies$') 
    return [stemmer.stem(word) for word in ingredient_list]

In [128]:
def remove_adj(ingredient_list):
    """
    Inputs: List of ingredients 
    Output: List with ingredient list updated to remove extra adjectives:
            ['fresh', 'ground', 'dried', 'all-purpose']
    example:remove_adj(['fresh ground pepper', 'fresh parsley', 'dried parsley','all-purpose flour'])
            returns ['pepper', 'parsley', 'parsley', 'flour']    
    """
    
    adj_list =  "|".join(['fresh', 'ground', 'dried', 'all-purpose', 'clove', 'unsalted','salted', 'extra virgin'])
    return [re.sub(adj_list,'', ingredient).strip() for ingredient in ingredient_list]

In [2]:
recipes = pd.read_csv('data/large_data/RAW_recipes.csv')

In [3]:
recipes['tags'] = recipes['tags'].apply(literal_eval)

In [4]:
tag_dist = recipes['tags'].explode().value_counts()
tag_list = recipes['tags'].explode().unique()

In [133]:
recipes['mod_ingredients'] = recipes['ingredients'].apply(literal_eval).apply(and_split).apply(make_singular).apply(remove_adj)
full_ingredient_list = recipes['mod_ingredients'].explode().value_counts(normalize = True)

In [135]:
sum(full_ingredient_list)

0.9999999999999017

In [134]:
full_ingredient_list[:50]

salt                    0.050778
butter                  0.030281
garlic                  0.026058
egg                     0.024052
onion                   0.023234
pepper                  0.021067
sugar                   0.020976
flour                   0.020689
olive oil               0.019097
water                   0.016451
milk                    0.012145
black pepper            0.011902
lemon juice             0.009205
cinnamon                0.009149
brown sugar             0.008787
baking powder           0.008245
parsley                 0.007503
parmesan cheese         0.007169
baking soda             0.006641
carrot                  0.006584
vegetable oil           0.006553
vanilla                 0.006271
tomatoe                 0.005985
sour cream              0.005556
ginger                  0.005449
green onion             0.005369
garlic powder           0.005129
cumin                   0.004899
vanilla extract         0.004838
oregano                 0.004804
oil       

In [83]:
text = nltk.word_tokenize("fresh ground black pepper")
nltk.pos_tag(text)

[('fresh', 'JJ'), ('ground', 'NN'), ('black', 'JJ'), ('pepper', 'NN')]

In [5]:
recipes["veg"] = [("vegan" in tag_list) or ("vegetarian" in tag_list) for tag_list in recipes["tags"]]

In [36]:
veg_recipes = recipes[recipes["veg"]].copy()

In [7]:
interactions = pd.read_csv('data/large_data/RAW_interactions.csv')

In [37]:
veg_recipes['ratings'] = [list(interactions[interactions['recipe_id'] == rec_id]['rating'])  for rec_id in veg_recipes['id'] ]

In [38]:
veg_recipes['avg_rating'] = veg_recipes['ratings'].map(statistics.mean)

In [39]:
veg_recipes['n_ratings'] = veg_recipes['ratings'].map(len)

In [40]:
veg_recipes[veg_recipes['n_ratings'] > 10].describe()

Unnamed: 0,id,minutes,contributor_id,n_steps,n_ingredients,avg_rating,n_ratings
count,3128.0,3128.0,3128.0,3128.0,3128.0,3128.0,3128.0
mean,142169.992967,79.190217,162461.2,8.831841,8.501918,4.460244,28.839834
std,110971.657708,500.792949,216336.8,5.544758,3.650987,0.411377,43.603142
min,62.0,0.0,27.0,1.0,2.0,1.764706,11.0
25%,50842.75,17.0,37449.0,5.0,6.0,4.266667,13.0
50%,117433.5,31.0,71324.0,8.0,8.0,4.538462,17.0
75%,204854.75,58.5,209747.0,11.0,11.0,4.75,28.0
max,512584.0,20190.0,2222923.0,82.0,25.0,5.0,1220.0


In [58]:
veg_recipes['mod_ingredients'] = veg_recipes['ingredients'].apply(literal_eval).apply(and_split)

In [59]:
veg_recipes[['mod_ingredients']]
#veg_recipes[['ingredients']]

Unnamed: 0,mod_ingredients
0,"[winter squash, mexican seasoning, mixed spice, honey, butter, olive oil, salt]"
4,"[tomato juice, apple cider vinegar, sugar, salt, pepper, clove oil, cinnamon oil, dry mustard]"
6,"[fennel seeds, green olives, ripe olives, garlic, peppercorn, orange rind, orange juice, red chile, extra virgin olive oil]"
16,"[egg roll wrap, whole green chilies, cheese, cornstarch, oil]"
19,"[canola oil, onion, garlic, cauliflower, potatoes, vegetable bouillon cubes, water, salt free herb, spice seasoning mix, ground coriander, great northern bean, salt, pepper, broccoli floret, escarole, green peas, red bell pepper, fresh herb]"
...,...
231610,"[white bread flour, instant yeast, sugar, salt, water]"
231613,"[sorrel, butter, sugar, raisins]"
231620,"[milk, butter, dry yeast, water, salt, sugar, flour]"
231625,"[butter, onions, flour, salt, vinegar]"


In [60]:
search_phrase = "butternut squash soup"
search_words = search_phrase.split(" ")
search_words = "|".join("^{}$".format(word) for word in search_words)

In [61]:
recipe_list = veg_recipes.loc[veg_recipes['name'].str.contains(search_phrase, case=False)]

In [62]:
recipe_list.describe()

Unnamed: 0,id,minutes,contributor_id,n_steps,n_ingredients,avg_rating,n_ratings
count,23.0,23.0,23.0,23.0,23.0,23.0,23.0
mean,238846.217391,62.913043,446365.9,12.26087,11.217391,4.304743,4.26087
std,142403.200255,38.257397,545859.1,6.091853,3.789417,0.821696,6.565872
min,11636.0,9.0,3288.0,5.0,6.0,2.333333,1.0
25%,157662.0,35.0,87028.5,7.5,7.5,4.0,1.0
50%,226765.0,48.0,239897.0,12.0,11.0,4.714286,2.0
75%,313188.5,90.0,557543.5,15.0,13.5,5.0,4.0
max,490345.0,170.0,2462143.0,27.0,21.0,5.0,31.0


In [67]:
common_ingredients = recipe_list['mod_ingredients'].explode().value_counts()

In [69]:
common_ingredients[:20]

butternut squash    21
salt                14
vegetable stock      9
olive oil            9
garlic cloves        8
onion                7
water                7
pepper               7
butter               7
celery               5
carrots              5
coconut milk         5
curry powder         4
vegetable broth      4
nutmeg               4
ground coriander     3
onions               3
garlic               3
ground cumin         3
black beans          2
Name: mod_ingredients, dtype: int64

In [70]:
common_ingredients.filter(like = "pepper", axis = 0)

pepper                         7
red bell pepper                2
black pepper                   2
freshly ground black pepper    1
ground pepper                  1
ground red pepper              1
red pepper flakes              1
white pepper                   1
yellow bell pepper             1
scotch bonnet pepper           1
dry crushed red pepper         1
chili pepper                   1
cayenne pepper                 1
Name: mod_ingredients, dtype: int64

In [33]:
veg_recipes['ingredients']

0                                                                                                                                                                                             [['winter squash', 'mexican seasoning', 'mixed spice', 'honey', 'butter', 'olive oil', 'salt']]
4                                                                                                                                                                            [['tomato juice', 'apple cider vinegar', 'sugar', 'salt', 'pepper', 'clove oil', 'cinnamon oil', 'dry mustard']]
6                                                                                                                                             [['fennel seeds', 'green olives', 'ripe olives', 'garlic', 'peppercorn', 'orange rind', 'orange juice', 'red chile', 'extra virgin olive oil']]
16                                                                                                                                            