In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
import statistics
import nltk
import re
from scipy import stats
from venn import venn
import matplotlib
import squarify  
import inflect
%matplotlib inline


In [2]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.regexp import RegexpStemmer
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

In [3]:
pd.set_option('max_colwidth', 400)

In [4]:
def and_split(ingredient_list):
    """
    Inputs: List of ingredients (with possible "and" or "&")
    Output: List with ingredient list updated to remove "and" or "&" and list as separate items
    
    example: and_split(['turkey','green pepper','salt & freshly ground black pepper'])
            ['turkey', 'green pepper', 'salt', 'freshly ground black pepper']
    """
    return list(np.concatenate([re.split(' and | & ', ingredient) \
                                for ingredient in ingredient_list]).flat)

In [34]:
def make_singular(ingredient_list):
    """
    Inputs: List of ingredients 
    Output: List with ingredient list updated to make everything singular
    
    example:make_singular(["carrots", "eggs","milk"])
            returns ['carrot', 'egg', 'milk']
    """
    #edge case for couscous needs to be checked, berries, tomatoes
    #stemmer = SnowballStemmer("english")
    #stemmer = RegexpStemmer('s$|ies$') 
    p = inflect.engine()
    return [p.singular_noun(word) if p.singular_noun(word) else word for word in ingredient_list]

In [41]:
def remove_adj(ingredient_list):
    """
    Inputs: List of ingredients 
    Output: List with ingredient list updated to remove extra adjectives:
            ['fresh', 'ground', 'dried', 'all-purpose']
    example:remove_adj(['fresh ground pepper', 'fresh parsley', 'dried parsley','all-purpose flour'])
            returns ['pepper', 'parsley', 'parsley', 'flour']    
    """
    
    adj_list =  "|".join(['fresh','chopped','canned','baby' 'raw','frozen','whole', 'stewed',\
                          'ground','granulated', 'dried', 'all-purpose','granulated',\
                          'unsalted','salted', 'extra virgin', 'ripe','sauce','diced', 'crushed'])
    return [re.sub(adj_list,'', ingredient).strip() for ingredient in ingredient_list]    

In [7]:
def check_subsitutions(sub_list, ingredient):
    new_ingredient = ingredient
    for word in sub_list:
        check = re.search(word, ingredient, re.IGNORECASE)
        new_ingredient = check.group(1) if check else new_ingredient
    return new_ingredient

def substitutions(ingredient_list):
    """
    Inputs: List of ingredients 
    Output: List with ingredient with common subsitituions 
    example: substitutions(['garlic cloves', 'herb and salt spice mix',  'low fat firm tofu', 'whole milk' ])
            returns ['garlic', 'spice', 'tofu', 'milk']    
    """
    sub_list = ['(garlic)', '(flour)','(yogurt)','(spice)', '(oil)','(tofu)', '(milk)', '(vinegar)',\
                '(butter)', '(rice)', '(cheese)', '(juice)', '(syrup)', '(soup)','(tortilla)', '(yam)']
    return [check_subsitutions(sub_list, ingredient) for ingredient in ingredient_list]
    #return [ingredient.extract(pat = r'("garlic")') for ingredient in ingredient_list]

In [8]:
recipes = pd.read_csv('data/large_data/RAW_recipes.csv')

In [9]:
recipes['tags'] = recipes['tags'].apply(literal_eval)

In [10]:
tag_dist = recipes['tags'].explode().value_counts()
tag_list = recipes['tags'].explode().unique()

In [42]:
recipes['mod_ingredients'] = recipes['ingredients'].apply(literal_eval).apply(and_split).apply(make_singular).apply(substitutions).apply(remove_adj)

In [43]:
def get_ingredient_frequency(recipes):
    full_ingredient_list = recipes['mod_ingredients'].explode()
    ingredient_freq = full_ingredient_list.value_counts()
    ingredient_freq = ingredient_freq.loc[ingredient_freq > 100]
    ingredient_freq  = ingredient_freq/sum(ingredient_freq)
    return ingredient_freq

In [44]:
ingredient_freq = get_ingredient_frequency(recipes)

In [14]:
ingredient_freq.to_csv('data/ingredient_freq.csv')

In [45]:
ingredient_freq.loc[np.abs(stats.zscore(ingredient_freq)) > 4]
#ingredient_freq.quantile(.50)

salt      0.053574
oil       0.042744
butter    0.038875
cheese    0.037848
garlic    0.035801
flour     0.028932
egg       0.025376
sugar     0.025239
onion     0.024821
juice     0.022387
pepper    0.022264
milk      0.021579
water     0.017357
Name: mod_ingredients, dtype: float64

In [16]:
ingredient_freq.loc['peanut']

0.0006646137927260607

In [46]:
ingredient_freq.filter(like="tomato", axis = 0)

tomato                       0.012358
tomato paste                 0.002154
cherry tomato                0.000820
plum tomato                  0.000739
roma tomato                  0.000541
sun- tomato                  0.000500
stewed tomato                0.000441
tomato puree                 0.000352
grape tomato                 0.000288
rotel tomato                 0.000206
tomato ketchup               0.000092
tomato with green chilies    0.000082
green tomato                 0.000071
italian tomato               0.000069
italian plum tomato          0.000068
italian-style  tomato        0.000064
Name: mod_ingredients, dtype: float64

In [49]:
full_ingredient_list[:100]

0             winter squash
0         mexican seasoning
0                     spice
0                     honey
0                    butter
              ...          
9               baking soda
9                      salt
10    berry cranberry sauce
10               sour cream
10     prepared horseradish
Name: mod_ingredients, Length: 100, dtype: object

In [19]:
recipes["veg"] = [("vegan" in tag_list) or ("vegetarian" in tag_list) for tag_list in recipes["tags"]]

In [20]:
interactions = pd.read_csv('data/large_data/RAW_interactions.csv')

In [21]:
recipes['ratings'] = [list(interactions[interactions['recipe_id'] == rec_id]['rating'])  for rec_id in recipes['id'] ]

In [22]:
recipes['n_ratings'] = recipes['ratings'].map(len)

In [23]:
recipes['avg_rating'] = recipes['ratings'].map(statistics.mean)

In [None]:
recipes.dropna(subset = ['name'], inplace = True)

In [25]:
recipes.to_csv('data/large_data/recipes.csv', index = False)

In [None]:
veg_recipes = recipes[recipes["veg"]].copy()

In [None]:
veg_recipes['ratings'] = [list(interactions[interactions['recipe_id'] == rec_id]['rating'])  for rec_id in veg_recipes['id'] ]

In [None]:
veg_recipes['avg_rating'] = veg_recipes['ratings'].map(statistics.mean)

In [None]:
veg_recipes['n_ratings'] = veg_recipes['ratings'].map(len)

In [None]:
veg_recipes[veg_recipes['n_ratings'] > 10].describe()

In [None]:
veg_recipes['mod_ingredients'] = veg_recipes['ingredients'].apply(literal_eval).apply(and_split)

In [None]:
veg_recipes[['mod_ingredients']]
#veg_recipes[['ingredients']]

In [None]:
search_phrase = "butternut squash soup"
search_words = search_phrase.split(" ")
search_words = "|".join("^{}$".format(word) for word in search_words)

In [None]:
recipe_list = veg_recipes.loc[veg_recipes['name'].str.contains(search_phrase, case=False)].copy()

In [None]:
recipe_list.describe()

In [None]:
common_ingredients = recipe_list['mod_ingredients'].explode().value_counts()

In [None]:
common_ingredients.filter(like = "pepper", axis = 0)

In [None]:
veg_recipes['ingredients']

In [None]:
musicians = {
    "Members of The Beatles": {"Paul McCartney", "John Lennon", "George Harrison", "Ringo Starr"},
    "Guitarists": {"John Lennon", "George Harrison", "Jimi Hendrix", "Eric Clapton", "Carlos Santana"},
    "Played at Woodstock": {"Jimi Hendrix", "Carlos Santana", "Keith Moon"}
}
venn(musicians)

In [None]:
recipe_list['ingredient_score'] = recipe_list['mod_ingredients'].apply(calc_ingredient_ratings)

In [None]:
def calc_ingredient_ratings(ingredient_list, ingredient_freq= ingredient_freq):
    #print(ingredient_list) 
    rating = 0
    for ingredient in ingredient_list:
        try:
            freq = ingredient_freq.loc[ingredient]
        except:
            freq = 0
        rating += freq
    return rating
    
calc_ingredient_ratings(['salt','butter','pepper','shoe'])

In [None]:
recipe_list.describe()

In [None]:
recipe_list['overall_score'] = 1/recipe_list['minutes'] + recipe_list['avg_rating'] + recipe_list['ingredient_score']*100 - recipe_list['n_steps']/10 - recipe_list['n_ingredients']/10

In [None]:
#recipe_list.sort_values(by = 'overall_score', ascending=False)
recipe_list.nlargest(5, 'overall_score')

In [None]:
def show_top_recipes(recipe_list):
    html = 'https://www.food.com/recipe/'
    common_ingredients = recipe_list['mod_ingredients'].explode().value_counts()
    cmap = matplotlib.cm.Blues
    norm = matplotlib.colors.Normalize(vmin=min(common_ingredients), vmax=max(common_ingredients))
    colors = [cmap(norm(value)) for value in common_ingredients]
    squarify.plot(sizes=common_ingredients[:10], label=common_ingredients.index[:10], alpha=.5, text_kwargs={"wrap": True})
    plt.axis('off')
    plt.show()
    top_5 = recipe_list.nlargest(5, 'overall_score').copy()
    top_5['recipe_link'] = [f'https://www.food.com/recipe/{ing_id}' for ing_id in top_5['id']]
    return top_5[['name','id','overall_score','recipe_link']]


In [35]:
make_singular(['christmas','milk','trees','couscous'])

['christma', 'milk', 'tree', 'couscou']