In [416]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import nltk, string 
import lda

from pymongo import MongoClient
from textblob import TextBlob
from textblob import Word
from nltk.corpus import stopwords
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation as LDA
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.cluster import KMeans
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from nltk.tokenize import word_tokenize
from collections import defaultdict
from gensim import corpora, models, similarities, matutils
from gensim.models import Word2Vec
import re
import spacy
from spacy.en import English

%matplotlib inline


def set_first_element(df, column):
    """This is for preprocessing - 
    grabbing the first element in a list or setting empty lists to NaN"""
    count = len(df[column])
    for x in range(count):
        try:
            df[column].iloc[x] = df[column].iloc[x][0]
        except:
            df[column].iloc[x] = np.nan

def letters_only(word):
    return word.isalpha()

def lemma(words):    
    sent = []
    token = nlp(words)
    for word in token:
        sent.append(word.lemma_)
    return(" ".join(sent))

def get_nouns_adjs(words):
    """This pulls out any nouns + adjs in the words"""
    nouns_adjs = []
    tags = ['NN', 'NNP', 'NNPS', 'NNS', 'JJ', 'JJS']
    doc = nlp(words)
    for word in doc:
        if word.tag_ in tags and word.text not in stop and letters_only(word.text):
            nouns_adjs.append(word.text)
    return nouns_adjs

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [None]:
parser = English()
nlp = spacy.load('en')

In [2]:
client = MongoClient()
db = client.recipes
posts = db.pagesv3

posts.count()

5612

In [331]:
df_recipes = pd.DataFrame(list(posts.find()))
df_recipes.head()

Unnamed: 0,_id,image,ingredients_list,instructions,tags_list,time,title,url
0,59a2335867669764a626b740,[http://www.seriouseats.com/recipes/assets_c/2...,"[1/2 cup rice vinegar (120ml), 1/2 cup mirin (...","In a small saucepan, combine soy sauce, vineg...","[asian, chicken, dip, grill, grilling with vin...","[45 minutes, 45 minutes, plus 8 hours marinati...",[Sweet-and-Sour Grilled Chicken Skewers (Yakit...,http://www.seriouseats.com/recipes/2017/08/chi...
1,59a2335867669764a626b742,[http://www.seriouseats.com/recipes/assets_c/2...,"[1 large tomato, cored, seeded, and cut into 1...",Season tomatoes with 1 teaspoon salt and toss ...,"[brazilian, fish, grilling, south american, vi...","[1 hour, 1 hour, , , or carving fork]",[Grilled Whole Fish With Molho à Campanha (Bra...,http://www.seriouseats.com/recipes/2017/08/gri...
2,59a2335867669764a626b744,[http://www.seriouseats.com/recipes/assets_c/2...,"[12 ounces freshly ground beef (340g), prefera...","In a medium bowl, gently mix together beef wit...","[beef, burger, grilling, olive, summer, vinegar]","[30 minutes, 1 hour 30 minutes, , ]",[Grilled Black-Olive Burger With Japanese Vine...,http://www.seriouseats.com/recipes/2017/08/bla...
3,59a2335967669764a626b746,[http://www.seriouseats.com/recipes/assets_c/2...,[1 large head cauliflower (about 1 1/2 pounds;...,Preheat oven to 450°F (230°C) and place rack i...,"[almond, cauliflower, indian, raisin, spice, v...","[1 hour 15 minutes, 1 hour 15 minutes, , , or ]",[Creamy Almond Mughlai Cauliflower Recipe],http://www.seriouseats.com/recipes/2017/08/cre...
4,59a2335967669764a626b748,[http://www.seriouseats.com/recipes/assets_c/2...,"[1/2 cup (120ml) cane vinegar, preferably , b...","In a medium saucepan, combine vinegar, soy sau...","[asian, easy, filipino, grill, grilling, grill...","[1 hour, 1 hour plus 8 hours marination time, , ]",[Adobo-Marinated Grilled Pork Chops Recipe],http://www.seriouseats.com/recipes/2017/07/ado...


In [332]:
df_recipes.tail()

Unnamed: 0,_id,image,ingredients_list,instructions,tags_list,time,title,url
5607,59a2410b67669764a626efa5,[http://www.seriouseats.com/recipes/assets_c/2...,"[4 thin, square slices white bread, 1 1/2 tabl...",Turn on the oven to 500°F. Butter each slice ...,"[cheese, Cook the Book, CTB: Alone in the Kitc...",[],[Cook the Book: Il Tost (Grilled Ham and Cheese)],http://www.seriouseats.com/recipes/2007/08/il-...
5608,59a2410b67669764a626efa7,[http://www.seriouseats.com/recipes/assets_c/2...,"[12 sage leaves, 2 tablespoons olive oil, 4 sl...",Roughly chop the sage and toss it into a small...,"[Dinner Tonight, grilled cheese, sage]",[],[Dinner Tonight: Sage Grilled Cheese Recipe],http://www.seriouseats.com/recipes/2007/08/din...
5609,59a2410c67669764a626efa9,[http://www.seriouseats.com/recipes/assets_c/2...,[1 French baguette or 4 submarine rolls Tuna S...,Preheat oven to 375°F Split the baguette open...,"[Cook the Book, cookbooks, CTB: Summer Shack, ...",[],[Cook the Book: Jersey Shore Tuna Sub],http://www.seriouseats.com/recipes/2007/06/coo...
5610,59a2410c67669764a626efab,[http://www.seriouseats.com/recipes/assets_c/2...,"[4 New Englandstyle hot dog buns , 4 tablespo...",Heat a lO-inch skillet over medium heat. Sprea...,"[Cook the Book, cookbooks, CTB: Summer Shack, ...",[],[Cook the Book: Classic Maine Lobster Rolls],http://www.seriouseats.com/recipes/2007/06/cla...
5611,59a2410c67669764a626efae,[http://www.seriouseats.com/recipes/assets_c/2...,[],,"[Cook the Book, cookbooks, cooking tips, CTB: ...",[],[Cook the Book: Grilled Cheese Sandwich],http://www.seriouseats.com/recipes/2007/05/coo...


In [333]:
df_recipes = df_recipes.drop('_id', 1)

In [334]:
len(df_recipes['title'][0])

1

In [335]:
set_first_element(df_recipes, 'title')

In [336]:
set_first_element(df_recipes, 'image')

In [337]:
set_first_element(df_recipes, 'time')

In [338]:
df_recipes.tail()

Unnamed: 0,image,ingredients_list,instructions,tags_list,time,title,url
5607,http://www.seriouseats.com/recipes/assets_c/20...,"[4 thin, square slices white bread, 1 1/2 tabl...",Turn on the oven to 500°F. Butter each slice ...,"[cheese, Cook the Book, CTB: Alone in the Kitc...",,Cook the Book: Il Tost (Grilled Ham and Cheese),http://www.seriouseats.com/recipes/2007/08/il-...
5608,http://www.seriouseats.com/recipes/assets_c/20...,"[12 sage leaves, 2 tablespoons olive oil, 4 sl...",Roughly chop the sage and toss it into a small...,"[Dinner Tonight, grilled cheese, sage]",,Dinner Tonight: Sage Grilled Cheese Recipe,http://www.seriouseats.com/recipes/2007/08/din...
5609,http://www.seriouseats.com/recipes/assets_c/20...,[1 French baguette or 4 submarine rolls Tuna S...,Preheat oven to 375°F Split the baguette open...,"[Cook the Book, cookbooks, CTB: Summer Shack, ...",,Cook the Book: Jersey Shore Tuna Sub,http://www.seriouseats.com/recipes/2007/06/coo...
5610,http://www.seriouseats.com/recipes/assets_c/20...,"[4 New Englandstyle hot dog buns , 4 tablespo...",Heat a lO-inch skillet over medium heat. Sprea...,"[Cook the Book, cookbooks, CTB: Summer Shack, ...",,Cook the Book: Classic Maine Lobster Rolls,http://www.seriouseats.com/recipes/2007/06/cla...
5611,http://www.seriouseats.com/recipes/assets_c/20...,[],,"[Cook the Book, cookbooks, cooking tips, CTB: ...",,Cook the Book: Grilled Cheese Sandwich,http://www.seriouseats.com/recipes/2007/05/coo...


In [339]:
df_recipes['ing_str'] = df_recipes['ingredients_list'].apply(lambda x: " ".join(x))

In [340]:
df_recipes['ing_str']

0       1/2 cup rice vinegar (120ml) 1/2 cup mirin (12...
1       1 large tomato, cored, seeded, and cut into 1/...
2       12 ounces freshly ground beef (340g), preferab...
3       1 large head cauliflower (about 1 1/2 pounds; ...
4       1/2 cup (120ml) cane vinegar, preferably   bra...
5       1/2 cup (100g) sugar 1 tablespoon (15g) Diamon...
6       8 ounces (225g) asparagus,  , stalks peeled if...
7       1/2 pound string beans (225g), cut on a sharp ...
8       2 medium yellow onions (about 12 ounces; 340g)...
9       8 ounces (225g) bleached honeycomb tripe, cut ...
10      Vegetable, canola, or peanut oil, for frying a...
11      Peanut, vegetable, or canola oil, for frying 1...
12      2 tablespoons (30ml) extra-virgin olive oil, p...
13      1 teaspoon baking soda 1 1/2 pounds (680g) thi...
14      2 teaspoons paprika 1/2 teaspoon dried oregano...
15      1 pound (450g) red kidney beans  Kosher salt 1...
16      1 pound (450g) lump blue crabmeat, picked over...
17      12 oun

In [341]:
df_recipes['ing_str'] = df_recipes['ing_str'].apply(lambda x: x.lower())

In [None]:
df_recipes

In [464]:
stop = stopwords.words('English')
stop += ['teaspoon', 'tablespoon', 'cup', 'medium', 'large', 'torn',
         'pound', 'serving', 'note', 'pinch', 'salt', 'black pepper', 
         'piece', 'cut', 'head', 'leaves', 'ground', 'tender', 
         'split', 'lengthwise', 'water', 'warm', 'whole', 'diced'
         'bowl', 'kosher', 'hand', 'recipe', 'ounce', 'gram', 'dice', 
         'head', 'g', 'sea salt', 'mince', 'slice', 'drain', 'chill',
        'floret', 'half', 'quarter', 'homemade', 'garnish', 'brand', 
         'inch', 'thick', 'warm', 'cold', 'diamond crystal', 'clean',
         'plenty', 'chunk', 'total', 'optional', 'bowl', 'thin', 
         'julienned', 'bias', 'sturdy', 'fresh', 'store', 'round', 
         'knob', 'cube', 'cubed', 'necessary', 'reserve', 'mixed', 
         'shredded', 'size', 'disk', 'stalk', 'package', 'direction',
        'able', 'acceptable', 'accompaniment', 'high_quality', 'handful',
         'additional', 'small', 'inner', 'intact', 'trim', 'square', 
         'scale','quart', 'halves', 'oz', 'day', 'old', 'possible', 'part', 'accord',
        'gutt', 'wedge', 'bunch', 'slice', 'grate', 'microplane', 'ripe',
        'core', 'branch', 'debone', 'moist', 'edible', 'thaw', 'freeze', 
         'box','melt', 'organic', 'grind', 'fire', 'unpeel', 'removed', 
         'herb', 'knife', 'pith', 'remove', 'paper', 'section', 'pit', 
         'strip', 'scrub', 'hard_boil']




In [465]:
print(get_nouns_adjs(df_recipes['ing_str'][2368]))

['wheat', 'bran', 'raisins', 'brandy', 'buttermilk', 'brown', 'sugar', 'vegetable', 'oil', 'egg', 'egg', 'white', 'flour', 'wheat', 'flour', 'baking', 'powder', 'baking', 'soda']


In [466]:
df_recipes['lem'] = df_recipes['ing_str'].apply(lambda x: lemma(x))

In [467]:
df_recipes['nouns'] = df_recipes['lem'].apply(lambda x: get_nouns_adjs(x))

In [468]:
nphrases = Phrases(df_recipes['nouns'])

In [469]:
bigram = Phraser(nphrases)

In [470]:
print(bigram[df_recipes['nouns'][5014]])

['tomato', 'rough', 'cucumber', 'peel', 'seed', 'rough', 'red', 'onion', 'peel', 'rough', 'green', 'red_bell', 'pepper', 'seed', 'rough', 'clove_garlic', 'peel', 'white', 'sandwich', 'french_italian', 'bread_crust', 'rough', 'extra_virgin', 'olive_oil', 'serve', 'sherry_vinegar', 'serve', 'chive', 'black', 'pepper']


In [471]:
trigram = Phrases(bigram[df_recipes['nouns']])

In [472]:
print(trigram[df_recipes['nouns'][5123]])

['chicken_stock', 'egg', 'semolina', 'parmesan_cheese', 'nutmeg', 'black_pepper', 'baby_spinach']




In [473]:
df_recipes['trigrams'] = df_recipes['nouns'].apply(lambda x: " ".join(trigram[x]))



In [474]:
df_recipes.head()

Unnamed: 0,image,ingredients_list,instructions,tags_list,time,title,url,ing_str,nouns,lem,trigrams
0,http://www.seriouseats.com/recipes/assets_c/20...,"[1/2 cup rice vinegar (120ml), 1/2 cup mirin (...","In a small saucepan, combine soy sauce, vineg...","[asian, chicken, dip, grill, grilling with vin...",45 minutes,Sweet-and-Sour Grilled Chicken Skewers (Yakito...,http://www.seriouseats.com/recipes/2017/08/chi...,1/2 cup rice vinegar (120ml) 1/2 cup mirin (12...,"[rice, vinegar, mirin, soy, sauce, sugar, bone...",1/2 cup rice vinegar ( 120ml ) 1/2 cup mirin (...,rice_vinegar mirin soy_sauce sugar boneless_sk...
1,http://www.seriouseats.com/recipes/assets_c/20...,"[1 large tomato, cored, seeded, and cut into 1...",Season tomatoes with 1 teaspoon salt and toss ...,"[brazilian, fish, grilling, south american, vi...",1 hour,Grilled Whole Fish With Molho à Campanha (Braz...,http://www.seriouseats.com/recipes/2017/08/gri...,"1 large tomato, cored, seeded, and cut into 1/...","[tomato, seed, red, onion, diced, green, bell,...","1 large tomato , core , seed , and cut into 1/...",tomato seed red onion diced green bell pepper ...
2,http://www.seriouseats.com/recipes/assets_c/20...,"[12 ounces freshly ground beef (340g), prefera...","In a medium bowl, gently mix together beef wit...","[beef, burger, grilling, olive, summer, vinegar]",30 minutes,Grilled Black-Olive Burger With Japanese Vineg...,http://www.seriouseats.com/recipes/2017/08/bla...,"12 ounces freshly ground beef (340g), preferab...","[beef, fat, japanese, black, vinegar, chinese,...","12 ounce freshly grind beef ( 340 g ) , prefer...",beef fat japanese black vinegar chinese vinega...
3,http://www.seriouseats.com/recipes/assets_c/20...,[1 large head cauliflower (about 1 1/2 pounds;...,Preheat oven to 450°F (230°C) and place rack i...,"[almond, cauliflower, indian, raisin, spice, v...",1 hour 15 minutes,Creamy Almond Mughlai Cauliflower Recipe,http://www.seriouseats.com/recipes/2017/08/cre...,1 large head cauliflower (about 1 1/2 pounds; ...,"[cauliflower, vegetable, oil, black, pepper, y...",1 large head cauliflower ( about 1 1/2 pound ;...,cauliflower vegetable_oil black_pepper yellow_...
4,http://www.seriouseats.com/recipes/assets_c/20...,"[1/2 cup (120ml) cane vinegar, preferably , b...","In a medium saucepan, combine vinegar, soy sau...","[asian, easy, filipino, grill, grilling, grill...",1 hour,Adobo-Marinated Grilled Pork Chops Recipe,http://www.seriouseats.com/recipes/2017/07/ado...,"1/2 cup (120ml) cane vinegar, preferably bra...","[cane, vinegar, soy, sauce, clove, garlic, bla...","1/2 cup ( 120ml ) cane vinegar , preferably ...",cane vinegar soy_sauce clove_garlic black_pepp...


In [475]:
cv = CountVectorizer(strip_accents = 'ascii', stop_words = stop, 
                     lowercase = False)

In [476]:
cv.fit(df_recipes['trigrams'])
ingCV = cv.transform(df_recipes['trigrams']).transpose()



In [477]:
corpus = matutils.Sparse2Corpus(ingCV)



In [478]:
id2word = dict((v,k) for k, v in cv.vocabulary_.items())



In [479]:
list(id2word.items())[0]



(2929, 'rice_vinegar')

In [480]:
lda = models.LdaModel(corpus=corpus, 
                      num_topics=10, id2word=id2word, passes=10)

In [481]:
lda.print_topics(num_words = 7)


[(0,
  '0.038*"black_pepper" + 0.033*"olive_oil" + 0.029*"dry" + 0.026*"onion" + 0.024*"clove_garlic" + 0.016*"parsley" + 0.016*"red"'),
 (1,
  '0.038*"egg" + 0.032*"black_pepper" + 0.031*"butter" + 0.028*"milk" + 0.026*"cheese" + 0.024*"bacon" + 0.019*"white"'),
 (2,
  '0.029*"lemon" + 0.029*"juice" + 0.021*"juice_lemon" + 0.019*"apple" + 0.019*"peel" + 0.017*"sugar" + 0.016*"granulate_sugar"'),
 (3,
  '0.056*"powder" + 0.043*"vinegar" + 0.031*"sugar" + 0.029*"pepper" + 0.021*"apple_cider" + 0.020*"garlic" + 0.018*"cayenne"'),
 (4,
  '0.050*"olive_oil" + 0.041*"extra_virgin" + 0.026*"cheese" + 0.023*"black_pepper" + 0.018*"basil_leaf" + 0.013*"mozzarella" + 0.013*"drizzle"'),
 (5,
  '0.067*"pepper" + 0.042*"red" + 0.037*"onion" + 0.029*"olive_oil" + 0.024*"chop" + 0.023*"clove_garlic" + 0.021*"tomato"'),
 (6,
  '0.028*"white" + 0.027*"soy_sauce" + 0.025*"ginger" + 0.021*"scallion" + 0.019*"sugar" + 0.019*"peel" + 0.018*"oil"'),
 (7,
  '0.097*"sugar" + 0.045*"egg" + 0.044*"purpose_flou

In [482]:
texts = [[word for word in document.split()] for document in df_recipes['trigrams']]

In [483]:
model = Word2Vec(texts, size = 100, window = 5, 
                               min_count = 1, workers = 2, sg = 1)

In [484]:
model.most_similar(['cilantro', 'spinach', 'pepper', 'egg', 'bacon'], topn = 10)


[('crusty_roll', 0.9489260911941528),
 ('saltine', 0.9464064836502075),
 ('culantro', 0.9456775784492493),
 ('potato_scrub', 0.9447228908538818),
 ('bulgarian', 0.9439395070075989),
 ('couscous', 0.9433668851852417),
 ('snip', 0.941195011138916),
 ('hard_boil', 0.9408521056175232),
 ('bread_crust', 0.9407138824462891),
 ('feta', 0.9403773546218872)]

In [None]:
try on the process

take favorite recipe in conjunction

user adds in a wildcard component - have a word count of all the words that modifies

