In [5]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import nltk, string 
import lda
import matplotlib.pyplot as plt

from pymongo import MongoClient
from textblob import TextBlob
from textblob import Word
from nltk.corpus import stopwords
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation as LDA
from sklearn.metrics.pairwise import pairwise_distances
from sklearn import metrics
from sklearn.cluster import KMeans, DBSCAN
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from nltk.tokenize import word_tokenize
from collections import defaultdict
from gensim import corpora, models, similarities, matutils
from gensim.models import Word2Vec
import re
import spacy
from spacy.en import English
from spacy.symbols import LEMMA, TAG, POS, ORTH

%matplotlib inline


def set_first_element(df, column):
    """This is for preprocessing - 
    grabbing the first element in a list or setting empty lists to NaN"""
    count = len(df[column])
    for x in range(count):
        try:
            df[column].iloc[x] = df[column].iloc[x][0]
        except:
            df[column].iloc[x] = np.nan

def letters_only(word):
    return word.isalpha()

def lemma(words):    
    sent = []
    token = nlp(words)
    for word in token:
        sent.append(word.lemma_)
    return(" ".join(sent))

def get_nouns_adjs(words):
    """This pulls out any nouns + adjs in the words"""
    nouns_adjs = []
    tags = ['NN', 'NNP', 'NNPS', 'NNS', 'JJ', 'JJS']
    doc = nlp(words)
    for word in doc:
        if word.tag_ in tags and word.text not in stop and letters_only(word.text):
            nouns_adjs.append(word.text)
    return nouns_adjs

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()
    
def get_cook_type(text):
    cooktype = ['bak', 'grill', 'simmer', 'smok', 'fry', 'saut', 'stir-fry', 'broil', 
                'microwave', 'blanch', 'boil', 'steam']
    cookt = ['baking', 'grilling', 'simmering', 'smoking', 'frying', 'sautéing', 'stir-frying', 'broiling', 
            'microwaving', 'blanching', 'boiling', 'steaming']
    
    methods = ['any']
    for index, x in enumerate(cooktype):
        if re.search(x, text.lower().strip()):
            methods.append(cookt[index])
    if len(methods) > 1:
        return " ".join(methods)
    return 'no_cook'

def calc_pmi(df, a, b):
    count = df.shape[0]
    a_prob = df.groupby(a)[a].sum()[1]/count
    b_prob = df.groupby(b)[b].sum()[1]/count
    ab_prob = df.groupby([a, b])[b].sum()[1][1]/count
    pmi = np.log(ab_prob/(a_prob*b_prob))
    return pmi

Using TensorFlow backend.


In [2]:
parser = English()
nlp = spacy.load('en')

In [None]:
client = MongoClient()
db = client.recipes
posts = db.pagesv3

posts.count()

In [None]:
df_recipes = pd.DataFrame(list(posts.find()))
df_recipes.head()

In [None]:
df_recipes = df_recipes.drop('_id', 1)

In [4]:
len(df_recipes['title'][0])

65

In [None]:
set_first_element(df_recipes, 'title')

In [None]:
set_first_element(df_recipes, 'image')

In [None]:
set_first_element(df_recipes, 'time')

In [5]:
df_recipes.tail()

Unnamed: 0,image,ingredients_list,instructions,tags_list,time,title,url,ing_str,nouns,lem,trigrams
5607,http://www.seriouseats.com/recipes/assets_c/20...,"[4 thin, square slices white bread, 1 1/2 tabl...",Turn on the oven to 500°F. Butter each slice ...,"[cheese, Cook the Book, CTB: Alone in the Kitc...",,Cook the Book: Il Tost (Grilled Ham and Cheese),http://www.seriouseats.com/recipes/2007/08/il-...,"4 thin, square slices white bread 1 1/2 tables...","[white, bread, butter, import, italian, fontin...","4 thin , square slice white bread 1 1/2 tables...",white bread butter import italian fontina_chee...
5608,http://www.seriouseats.com/recipes/assets_c/20...,"[12 sage leaves, 2 tablespoons olive oil, 4 sl...",Roughly chop the sage and toss it into a small...,"[Dinner Tonight, grilled cheese, sage]",,Dinner Tonight: Sage Grilled Cheese Recipe,http://www.seriouseats.com/recipes/2007/08/din...,12 sage leaves 2 tablespoons olive oil 4 slice...,"[sage, leaf, olive, oil, rustic, bread, fontin...",12 sage leaf 2 tablespoon olive oil 4 slice of...,sage_leaf olive_oil rustic_bread fontina_chees...
5609,http://www.seriouseats.com/recipes/assets_c/20...,[1 French baguette or 4 submarine rolls Tuna S...,Preheat oven to 375°F Split the baguette open...,"[Cook the Book, cookbooks, CTB: Summer Shack, ...",,Cook the Book: Jersey Shore Tuna Sub,http://www.seriouseats.com/recipes/2007/06/coo...,1 french baguette or 4 submarine rolls tuna sa...,"[french, baguette, submarine, roll, tuna, sala...",1 french baguette or 4 submarine roll tuna sal...,french_baguette submarine roll tuna salad pick...
5610,http://www.seriouseats.com/recipes/assets_c/20...,"[4 New Englandstyle hot dog buns , 4 tablespo...",Heat a lO-inch skillet over medium heat. Sprea...,"[Cook the Book, cookbooks, CTB: Summer Shack, ...",,Cook the Book: Classic Maine Lobster Rolls,http://www.seriouseats.com/recipes/2007/06/cla...,4 new englandstyle hot dog buns 4 tablespoon...,"[new, hot, dog, bun, unsalted, butter, boston,...",4 new englandstyle hot dog bun 4 tablespoon...,new hot dog bun unsalted_butter boston bibb le...
5611,http://www.seriouseats.com/recipes/assets_c/20...,[],,"[Cook the Book, cookbooks, cooking tips, CTB: ...",,Cook the Book: Grilled Cheese Sandwich,http://www.seriouseats.com/recipes/2007/05/coo...,,[],,


In [None]:
df_recipes['ing_str'] = df_recipes['ingredients_list'].apply(lambda x: " ".join(x))

In [None]:
df_recipes['ing_str']

In [None]:
df_recipes['ing_str'] = df_recipes['ing_str'].apply(lambda x: x.lower())

In [222]:
stop = stopwords.words('english')
stop += ['teaspoon', 'tablespoon', 'cup', 'medium', 'large', 'torn',
         'pound', 'serving', 'note', 'pinch', 'piece', 'cut', 'head', 
         'leaves', 'ground', 'tender', 'split', 'lengthwise', 'water', 
         'warm', 'whole', 'diced', 'bowl',  'hand', 'recipe', 'ounce', 
         'gram', 'dice', 'head', 'g',  'mince', 'slice', 'drain', 'chill', 
         'floret', 'half', 'quarter', 'homemade', 'garnish', 'brand', 
         'inch', 'thick', 'warm', 'cold', 'diamond crystal', 'clean',
         'plenty', 'chunk', 'total', 'optional', 'bowl', 'thin', 
         'julienned', 'bias', 'sturdy', 'fresh', 'store', 'round', 
         'knob', 'cube', 'cubed', 'necessary', 'reserve', 'mixed', 
         'shredded', 'size', 'disk', 'stalk', 'package', 'direction',
         'able', 'acceptable', 'accompaniment', 'high_quality', 'handful',
         'additional', 'small', 'inner', 'intact', 'trim', 'square', 
         'scale','quart', 'halves', 'oz', 'day', 'old', 'possible', 'part', 
         'accord','gutt', 'wedge', 'bunch', 'slice', 'grate', 'microplane', 'ripe',
         'core', 'branch', 'debone', 'moist', 'edible', 'thaw', 'freeze', 
         'box','melt', 'organic', 'grind', 'fire', 'unpeel', 'removed', 
         'herb', 'knife', 'pith', 'remove', 'paper', 'section', 'pit', 
         'strip', 'scrub', 'hard_boil', 'scant', 'roast', 'chop', 'available', 
         'fry', 'excess', 'room', 'temperature', 'enough', 'pot', 'deep', 
         'frying', 'scald', 'rinse', 'dozen', 'dry', 'hour', 'least', 'raw', 
         'ne', 'much', 'kg', 'devein', 'peel', 'pounds', 'zha', 'zippered', 'abut', 
         'ans', 'cook', 'crunch', 'crunchy', 'cover', 'al', 'alt', 'alternative',
        'anything', 'appropriate', 'artisanal', 'approx', 'approximate', 'area',
        'assemble', 'assembly', 'assorted', 'assort', 'assymmetrical', 'attach',
        'attachment', 'available', 'br', 'kilogram', 'liter', 'milimeter', 
         'tb', 'tbs', 'tbsp', 'mililiter', 'natural', 'need', 'use', 'volume', 'tsp', 'salt', 
         'kosher', 'packed', 'dust', 'stick', 'stem', 'hole', 'grater', 'buy', 'end', 'serve',
        'separate', 'divide', 'length', 'long', 'lenght', 'spear', 'whichev']




In [107]:
print(get_nouns_adjs(df_recipes['ing_str'][2368]))

['wheat', 'bran', 'raisins', 'brandy', 'buttermilk', 'brown', 'sugar', 'vegetable', 'oil', 'egg', 'egg', 'white', 'flour', 'wheat', 'flour', 'baking', 'powder', 'baking', 'soda']


A little bit of time travel here - I noticed that some of the nouns had either spelling errors or improper lemmas due to the technical nature of content (ex. zucchinis should be stemmed to zucchini). This is to correct for that

In [108]:
nlp.tokenizer.add_special_case(u'zucchinis', [{ORTH: u'zucchini', LEMMA: u'zucchini', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'anchos', [{ORTH: u'ancho', LEMMA: u'ancho', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'filet', [{ORTH: u'fillet', LEMMA: u'fillet', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'annato', [{ORTH: u'annatto', LEMMA: u'annatto', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'baguett', [{ORTH: u'baguette', LEMMA: u'baguette', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'boneles', [{ORTH: u'boneless', LEMMA: u'boneless', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'boneneless', [{ORTH: u'boneless', LEMMA: u'boneless', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'bonesless', [{ORTH: u'boneless', LEMMA: u'boneless', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'bouillion', [{ORTH: u'bouillon', LEMMA: u'bouillon', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'bonesless', [{ORTH: u'boneless', LEMMA: u'boneless', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'brussel', [{ORTH: u'brussels', LEMMA: u'brussels', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'cannelini', [{ORTH: u'cannellini', LEMMA: u'cannellini', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'cardamon', [{ORTH: u'cardamom', LEMMA: u'cardamom', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'cardmom', [{ORTH: u'cardamom', LEMMA: u'cardamom', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'cashews', [{ORTH: u'cashew', LEMMA: u'cashew', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'chickpeas', [{ORTH: u'chickpea', LEMMA: u'chickpea', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'chiles', [{ORTH: u'chile', LEMMA: u'chile', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'chilie', [{ORTH: u'chile', LEMMA: u'chile', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'chilis', [{ORTH: u'chile', LEMMA: u'chile', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'chillis', [{ORTH: u'chile', LEMMA: u'chile', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'cilanto', [{ORTH: u'cilantro', LEMMA: u'cilantro', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'cobs', [{ORTH: u'cob', LEMMA: u'cob', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'dakon', [{ORTH: u'daikon', LEMMA: u'daikon', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'fresnos', [{ORTH: u'fresno', LEMMA: u'fresno', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'habenero', [{ORTH: u'habanero', LEMMA: u'habanero', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'harisa', [{ORTH: u'harissa', LEMMA: u'harissa', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'jalapenos', [{ORTH: u'jalapeno', LEMMA: u'jalapeno', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'jalepeno', [{ORTH: u'jalapeno', LEMMA: u'jalapeno', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'jalalpenos', [{ORTH: u'jalapeno', LEMMA: u'jalapeno', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'marzanos', [{ORTH: u'marzano', LEMMA: u'marzano', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'matzos', [{ORTH: u'matzo', LEMMA: u'matzo', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'nibs', [{ORTH: u'nib', LEMMA: u'nib', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'oats', [{ORTH: u'oat', LEMMA: u'oat', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'oreos', [{ORTH: u'oreo', LEMMA: u'oreo', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'oxtails', [{ORTH: u'oxtail', LEMMA: u'oxtail', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'parmigianno', [{ORTH: u'parmigiano', LEMMA: u'parmigiano', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'parmiggiano', [{ORTH: u'parmigiano', LEMMA: u'parmigiano', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'parmagiano', [{ORTH: u'parmigiano', LEMMA: u'parmigiano', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'parmigano', [{ORTH: u'parmigiano', LEMMA: u'parmigiano', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'peper', [{ORTH: u'pepper', LEMMA: u'pepper', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'pepperonicinis', [{ORTH: u'pepperonicini', LEMMA: u'pepperonicini', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'pitas', [{ORTH: u'pita', LEMMA: u'pita', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'ribs', [{ORTH: u'rib', LEMMA: u'rib', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'salsas', [{ORTH: u'salsa', LEMMA: u'salsa', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'shallots', [{ORTH: u'shallot', LEMMA: u'shallot', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'solids', [{ORTH: u'solid', LEMMA: u'solid', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'sprigs', [{ORTH: u'sprig', LEMMA: u'sprig', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'tomatillos', [{ORTH: u'tomatillo', LEMMA: u'tomatillo', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'tortillas', [{ORTH: u'tortilla', LEMMA: u'tortilla', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'tostadas', [{ORTH: u'tostada', LEMMA: u'tostada', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'unsalted', [{ORTH: u'unsalt', LEMMA: u'unsalt', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'vietnamese', [{ORTH: u'vietnam', LEMMA: u'vietnam', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'yams', [{ORTH: u'yam', LEMMA: u'yam', TAG: u'NN'}])
nlp.tokenizer.add_special_case(u'yolks', [{ORTH: u'yolk', LEMMA: u'yolk', TAG: u'NN'}])

In [185]:
df_recipes['lem'] = df_recipes['ing_str'].apply(lambda x: lemma(x))
df_recipes['nouns'] = df_recipes['lem'].apply(lambda x: get_nouns_adjs(x))

In [186]:
nphrases = Phrases(df_recipes['nouns'], threshold = .1)

In [187]:
bigram = Phraser(nphrases)

In [188]:
print(set(bigram[df_recipes['nouns'][141]]))

{'fennel_bulb', 'extra_virgin', 'juice_lemon', 'salmon_fillet', 'bay_leaf', 'shallot_shallot', 'onion_celery', 'olive_oil', 'full_fat', 'center', 'sprig_thyme', 'yogurt', 'boneless_skinless', 'dill', 'coriander_seed', 'black_pepper', 'leek'}


In [189]:
trigram = Phrases(bigram[df_recipes['nouns']])

In [190]:
print(trigram[df_recipes['nouns'][2353]])

['pie_crust', 'favorite', 'butter_onion', 'leek', 'hanger_steak', 'parsnip', 'potato', 'carrot', 'sprig_thyme', 'black_pepper', 'egg']




In [191]:
df_recipes['trigrams'] = df_recipes['nouns'].apply(lambda x: " ".join(set(trigram[x])))



In [192]:
df_recipes.head()


Unnamed: 0,image,ingredients_list,instructions,tags_list,time,title,url,ing_str,nouns,lem,trigrams,method
0,http://www.seriouseats.com/recipes/assets_c/20...,"[1/2 cup rice vinegar (120ml), 1/2 cup mirin (...","In a small saucepan, combine soy sauce, vineg...","[asian, chicken, dip, grill, grilling with vin...",45 minutes,Sweet-and-Sour Grilled Chicken Skewers (Yakito...,http://www.seriouseats.com/recipes/2017/08/chi...,1/2 cup rice vinegar (120ml) 1/2 cup mirin (12...,"[rice, vinegar, mirin, soy, sauce, sugar, bone...",1/2 cup rice vinegar ( 120ml ) 1/2 cup mirin (...,kosho thigh togarashi sauce yuzu boneless_skin...,grilling
1,http://www.seriouseats.com/recipes/assets_c/20...,"[1 large tomato, cored, seeded, and cut into 1...",Season tomatoes with 1 teaspoon salt and toss ...,"[brazilian, fish, grilling, south american, vi...",1 hour,Grilled Whole Fish With Molho à Campanha (Braz...,http://www.seriouseats.com/recipes/2017/08/gri...,"1 large tomato, cored, seeded, and cut into 1/...","[tomato, seed, red, onion, green, bell, pepper...","1 large tomato , core , seed , and cut into 1/...",oil fish parsley_leaf flat_leaf white_wine tro...,grilling
2,http://www.seriouseats.com/recipes/assets_c/20...,"[12 ounces freshly ground beef (340g), prefera...","In a medium bowl, gently mix together beef wit...","[beef, burger, grilling, olive, summer, vinegar]",30 minutes,Grilled Black-Olive Burger With Japanese Vineg...,http://www.seriouseats.com/recipes/2017/08/bla...,"12 ounces freshly ground beef (340g), preferab...","[beef, fat, japanese, black, vinegar, chinese,...","12 ounce freshly grind beef ( 340 g ) , prefer...",black beef_fat japanese black_olive brioche ch...,grilling
3,http://www.seriouseats.com/recipes/assets_c/20...,[1 large head cauliflower (about 1 1/2 pounds;...,Preheat oven to 450°F (230°C) and place rack i...,"[almond, cauliflower, indian, raisin, spice, v...",1 hour 15 minutes,Creamy Almond Mughlai Cauliflower Recipe,http://www.seriouseats.com/recipes/2017/08/cre...,1 large head cauliflower (about 1 1/2 pounds; ...,"[cauliflower, vegetable, oil, black, pepper, y...",1 large head cauliflower ( about 1 1/2 pound ;...,yellow_onion chili_flake golden_raisin sliver_...,baking simmering sautéing
4,http://www.seriouseats.com/recipes/assets_c/20...,"[1/2 cup (120ml) cane vinegar, preferably , b...","In a medium saucepan, combine vinegar, soy sau...","[asian, easy, filipino, grill, grilling, grill...",1 hour,Adobo-Marinated Grilled Pork Chops Recipe,http://www.seriouseats.com/recipes/2017/07/ado...,"1/2 cup (120ml) cane vinegar, preferably bra...","[cane, vinegar, soy, sauce, clove, garlic, bla...","1/2 cup ( 120ml ) cane vinegar , preferably ...",clove_garlic sauce bay_leaf cane sticky bone_p...,grilling boiling


Extracting out cooking methods to potentially use as a filtering system for the user. First let's see how many recipes don't have instructions or ingredients, which we would want to get rid of. 

In [18]:
df_recipes[df_recipes['instructions'] == ""].count()

image               44
ingredients_list    50
instructions        50
tags_list           50
time                 0
title               25
url                 50
ing_str             50
nouns               50
lem                 50
trigrams            50
dtype: int64

In [19]:
df_recipes = df_recipes.drop(df_recipes[df_recipes['instructions'] == ""].index).reset_index()

In [117]:
df_recipes = df_recipes.drop(df_recipes[df_recipes['trigrams'] == ""].index)

In [50]:
df_recipes = pd.read_pickle('data/df_recipes')

In [51]:
df_recipes['method'] = df_recipes['instructions'].apply(lambda x: get_cook_type(x))

In [52]:
df_recipes.head()

Unnamed: 0,image,ingredients_list,instructions,tags_list,time,title,url,ing_str,nouns,lem,trigrams,method
0,http://www.seriouseats.com/recipes/assets_c/20...,"[1/2 cup rice vinegar (120ml), 1/2 cup mirin (...","In a small saucepan, combine soy sauce, vineg...","[asian, chicken, dip, grill, grilling with vin...",45 minutes,Sweet-and-Sour Grilled Chicken Skewers (Yakito...,http://www.seriouseats.com/recipes/2017/08/chi...,1/2 cup rice vinegar (120ml) 1/2 cup mirin (12...,"[rice, vinegar, mirin, soy, sauce, sugar, bone...",1/2 cup rice vinegar ( 120ml ) 1/2 cup mirin (...,kosho thigh togarashi sauce yuzu boneless_skin...,grilling
1,http://www.seriouseats.com/recipes/assets_c/20...,"[1 large tomato, cored, seeded, and cut into 1...",Season tomatoes with 1 teaspoon salt and toss ...,"[brazilian, fish, grilling, south american, vi...",1 hour,Grilled Whole Fish With Molho à Campanha (Braz...,http://www.seriouseats.com/recipes/2017/08/gri...,"1 large tomato, cored, seeded, and cut into 1/...","[tomato, seed, red, onion, green, bell, pepper...","1 large tomato , core , seed , and cut into 1/...",oil fish parsley_leaf flat_leaf white_wine tro...,grilling
2,http://www.seriouseats.com/recipes/assets_c/20...,"[12 ounces freshly ground beef (340g), prefera...","In a medium bowl, gently mix together beef wit...","[beef, burger, grilling, olive, summer, vinegar]",30 minutes,Grilled Black-Olive Burger With Japanese Vineg...,http://www.seriouseats.com/recipes/2017/08/bla...,"12 ounces freshly ground beef (340g), preferab...","[beef, fat, japanese, black, vinegar, chinese,...","12 ounce freshly grind beef ( 340 g ) , prefer...",black beef_fat japanese black_olive brioche ch...,grilling
3,http://www.seriouseats.com/recipes/assets_c/20...,[1 large head cauliflower (about 1 1/2 pounds;...,Preheat oven to 450°F (230°C) and place rack i...,"[almond, cauliflower, indian, raisin, spice, v...",1 hour 15 minutes,Creamy Almond Mughlai Cauliflower Recipe,http://www.seriouseats.com/recipes/2017/08/cre...,1 large head cauliflower (about 1 1/2 pounds; ...,"[cauliflower, vegetable, oil, black, pepper, y...",1 large head cauliflower ( about 1 1/2 pound ;...,yellow_onion chili_flake golden_raisin sliver_...,baking simmering sautéing
4,http://www.seriouseats.com/recipes/assets_c/20...,"[1/2 cup (120ml) cane vinegar, preferably , b...","In a medium saucepan, combine vinegar, soy sau...","[asian, easy, filipino, grill, grilling, grill...",1 hour,Adobo-Marinated Grilled Pork Chops Recipe,http://www.seriouseats.com/recipes/2017/07/ado...,"1/2 cup (120ml) cane vinegar, preferably bra...","[cane, vinegar, soy, sauce, clove, garlic, bla...","1/2 cup ( 120ml ) cane vinegar , preferably ...",clove_garlic sauce bay_leaf cane sticky bone_p...,grilling boiling


In [193]:
df = df_recipes.drop(['instructions', 'tags_list', 'ing_str', 'nouns', 'lem'], 1).reset_index()

In [194]:
df = df.drop('index', 1)

In [143]:
df_recipes.to_pickle('df_recipes')

In [195]:
df.to_pickle('df')

#### Testing out Word2Vec for the ingredients to see similarities. Maybe I can use this in an ingredient recommender

In [197]:
cv = CountVectorizer(strip_accents = 'ascii', stop_words = stop, 
                     lowercase = False)

In [198]:
cv.fit(df['trigrams'])
ingCV = cv.transform(df['trigrams']).transpose()

In [199]:
corpus = matutils.Sparse2Corpus(ingCV)

In [200]:
id2word = dict((v,k) for k, v in cv.vocabulary_.items())

In [201]:
list(id2word.items())[0]

(0, 'acacia')

In [202]:
texts = [[word for word in document.split()] for document in df['trigrams']]

In [203]:
model = Word2Vec(texts, size = 100, window = 5, 
                               min_count = 1, workers = 2, sg = 1)

In [204]:
model.most_similar(['maple', 'bacon', 'flour'], topn = 10)

[('milk_heavy', 0.9873088002204895),
 ('stone', 0.9866076707839966),
 ('golden_raisin', 0.9846048355102539),
 ('whip_cream', 0.9842187166213989),
 ('hazelnut', 0.9840455055236816),
 ('zest_orange', 0.9830451011657715),
 ('crystal', 0.9824039340019226),
 ('butter_cool', 0.9814016819000244),
 ('sheet', 0.9807629585266113),
 ('unbleach', 0.9806784987449646)]

This works well - I'll table it for later

In [46]:
joblib.dump(model, 'wordvec')

['wordvec']

#### Let's do some topic modeling for the ingredients

In [205]:
tf = TfidfVectorizer(strip_accents = 'ascii', stop_words = stop, 
                     lowercase = False)

In [206]:
tfmodel = tf.fit(df['trigrams'])

In [207]:
joblib.dump(tfmodel, 'tfmodel')

['tfmodel']

In [208]:
tfvec = tfmodel.transform(df['trigrams'])

In [209]:
joblib.dump(tfvec, 'tfvec')

['tfvec']

In [210]:
n_components = 10

In [211]:
nmf = NMF(n_components = n_components)
nmf.fit(tfvec)
nmf_tf = nmf.transform(tfvec)

In [212]:
print_top_words(nmf, tfmodel.get_feature_names(), 10)

Topic #0: olive_oil extra_virgin clove_garlic black_pepper juice_lemon flake red_pepper garlic_clove parmesan_cheese drizzle
Topic #1: unsalt_butter purpose_flour bake_powder bake_soda granulate_sugar sugar_egg vanilla_extract brown_sugar baking_powder light_brown
Topic #2: vegetable_oil lime cilantro cilantro_leaf juice_lime lime_juice avocado seed white_onion tomato
Topic #3: black_pepper low_sodium chicken_stock bay_leaf sprig_thyme chicken_broth heavy_cream unsalt_butter white_wine onion
Topic #4: egg butter bacon scallion black_pepper cream cheddar_cheese flour milk vegetable_oil
Topic #5: vinegar apple_cider red_wine worcestershire_sauce brown_sugar ketchup yellow_mustard honey mayonnaise white_wine
Topic #6: pepper bell parsley green onion virgin_olive tomato olive_oil red oil
Topic #7: lemon lemon_juice lemon_zest granulate_sugar egg_yolk zest_juice dijon_mustard garlic_clove juice_zest orange
Topic #8: soy_sauce oil ginger scallion toast_sesame rice_vinegar sesame_oil sauce sh

These look pretty basic - let's expand out

In [213]:
nmf15 = NMF(n_components = 15)
nmf15.fit(tfvec)
nmf_tf15 = nmf.transform(tfvec)

In [214]:
print_top_words(nmf15, tfmodel.get_feature_names(), 10)

Topic #0: olive_oil extra_virgin juice_lemon garlic_clove drizzle black_pepper basil_leaf balsamic_vinegar sherry_vinegar shallot
Topic #1: purpose_flour bake_powder bake_soda sugar_egg baking_powder buttermilk brown_sugar vanilla_extract extract butter_egg
Topic #2: soy_sauce ginger scallion toast_sesame rice_vinegar sesame_oil sauce shaox chinese cornstarch
Topic #3: low_sodium chicken_stock bay_leaf chicken_broth heavy_cream sprig_thyme white_wine black_pepper gold_potato yellow_onion
Topic #4: egg scallion vegetable_oil nutmeg butter_sugar wash bacon salting puff_pastry cream
Topic #5: pepper bell green parsley onion red egg_black tomato olive_oil crush_red
Topic #6: lemon lemon_juice lemon_zest egg_yolk granulate_sugar zest_juice dijon_mustard juice_zest garlic_clove orange
Topic #7: lime cilantro cilantro_leaf vegetable_oil lime_juice juice_lime avocado seed white_onion corn_tortilla
Topic #8: oil virgin_olive vegetable_canola vinegar_extra lemon_extra pepper_extra white_wine can

It looks like things cluster around specific cuisines (Topic 2: Chinese, Topic 7: Mexican, Topic 13: Italian). I can use this for the model filtering later 

**Getting a prediction:**

In [148]:
test = ['kale', 'tomato', 'bread', 'lettuce']

In [161]:
tfvec = joblib.load('data/tfvec')
tfmodel = joblib.load('data/tfmodel')
df = pd.read_pickle('data/df')

In [151]:
testtf = tfmodel.transform(test)

In [162]:
#where i'd do the filtering

df_baking = df[df['method'].str.contains('any')]

In [163]:
similarities = []
for x in df_baking.index:
    similarities.append(np.linalg.norm(pairwise_distances(testtf, tfvec[x], metric = 'cosine')))

In [164]:
df_baking = df_baking.reset_index()

In [165]:
food_sims = sorted([(v,i) for i,v in enumerate(similarities)])[0:5]
index = [m[1] for m in food_sims]
recommendation = df_baking.iloc[index]
print (recommendation.title)

1972                     Sunday Brunch: Eggy Bread Recipe
2802                             Raw Tomato Coulis Recipe
3607    Dinner Tonight: Italian Kale and Farro Soup Re...
3752    Dinner Tonight: Shrimp and Deviled-Egg Salad R...
564                    Ultra-Smashed Cheeseburgers Recipe
Name: title, dtype: object


In [14]:
results_dict = recommendation.to_dict(orient='index')

In [36]:
df = df.drop(df[df['image'].isnull()].index)
df = df.drop(df[df['time'].isnull()].index)

In [43]:
df1[df1['image'].isnull()]

Unnamed: 0,image,ingredients_list,time,title,url,trigrams,method


Dataframe looks good - let's deploy

In [None]:
df.to_pickle('df')

#### Future work: Ingredient Network Analysis, Needed Ingredients