In [5]:
import pandas as pd
import numpy as np
import re
import spacy

In [6]:
recipes = pd.read_csv('RAW_recipes.csv')

In [7]:
recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [8]:
recipes.columns.values

array(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'], dtype=object)

In [9]:
recipes.shape
recipes.isnull().sum()

name                 1
id                   0
minutes              0
contributor_id       0
submitted            0
tags                 0
nutrition            0
n_steps              0
steps                0
description       4979
ingredients          0
n_ingredients        0
dtype: int64

In [10]:
recipes.shape

(231637, 12)

In [11]:
# Dropping the description, contributor_id, submitted cols 
recipes = recipes.drop('description', axis=1)
recipes = recipes.drop('contributor_id', axis=1)
recipes = recipes.drop('submitted', axis=1)
recipes.columns.values


array(['name', 'id', 'minutes', 'tags', 'nutrition', 'n_steps', 'steps',
       'ingredients', 'n_ingredients'], dtype=object)

In [12]:
recipes['ingredients'].head(10)

0    ['winter squash', 'mexican seasoning', 'mixed ...
1    ['prepared pizza crust', 'sausage patty', 'egg...
2    ['ground beef', 'yellow onions', 'diced tomato...
3    ['spreadable cheese with garlic and herbs', 'n...
4    ['tomato juice', 'apple cider vinegar', 'sugar...
5    ['milk', 'vanilla ice cream', 'frozen apple ju...
6    ['fennel seeds', 'green olives', 'ripe olives'...
7    ['pork spareribs', 'soy sauce', 'fresh garlic'...
8    ['chocolate sandwich style cookies', 'chocolat...
9    ['sugar', 'unsalted butter', 'bananas', 'eggs'...
Name: ingredients, dtype: object

In [13]:
recipes['steps'][0]

"['make a choice and proceed with recipe', 'depending on size of squash , cut into half or fourths', 'remove seeds', 'for spicy squash , drizzle olive oil or melted butter over each cut squash piece', 'season with mexican seasoning mix ii', 'for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece', 'season with sweet mexican spice mix', 'bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin', 'be careful not to burn the squash especially if you opt to use sugar or butter', 'if you feel more comfortable , cover the squash with aluminum foil the first half hour , give or take , of baking', 'if desired , season with salt']"

In [14]:
recipes.shape

(231637, 9)

In [15]:
# Recipe instructions with less than 20 characters are not good recipes
empty_instr_ind = [index for i, index in zip(recipes['steps'], recipes.index) if len(i) < 20]
recipes = recipes.drop(index = empty_instr_ind).reset_index(drop=True)
recipes.shape

(231575, 9)

In [16]:
[i for i in recipes['n_steps'] if i == 0]

[]

### Pre processing data

In [17]:
import ast
print(recipes['ingredients'].size)
tmp = [ast.literal_eval(i) for i in recipes['ingredients']]
# ingredients = [ast.literal_eval(recipes['ingredients'][i] for i in recipes['ingredients'])]
# ingredients 

231575


In [18]:
recipes = recipes.drop('ingredients', axis=1)
recipes['ingredients'] = tmp

In [19]:
recipes['ingredients'][0]

['winter squash',
 'mexican seasoning',
 'mixed spice',
 'honey',
 'butter',
 'olive oil',
 'salt']

In [20]:
# Extracting ingredients from their lists and formatting as single strings
recipes['ingredient_text'] = ['; '.join(ingredients) for ingredients in recipes['ingredients']]
recipes['ingredient_text'].head()


0    winter squash; mexican seasoning; mixed spice;...
1    prepared pizza crust; sausage patty; eggs; mil...
2    ground beef; yellow onions; diced tomatoes; to...
3    spreadable cheese with garlic and herbs; new p...
4    tomato juice; apple cider vinegar; sugar; salt...
Name: ingredient_text, dtype: object

In [21]:
all_text = recipes['name'] + ' ' + recipes['ingredient_text'] + ' ' + recipes['steps']
all_text[0]

"arriba   baked winter squash mexican style winter squash; mexican seasoning; mixed spice; honey; butter; olive oil; salt ['make a choice and proceed with recipe', 'depending on size of squash , cut into half or fourths', 'remove seeds', 'for spicy squash , drizzle olive oil or melted butter over each cut squash piece', 'season with mexican seasoning mix ii', 'for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece', 'season with sweet mexican spice mix', 'bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin', 'be careful not to burn the squash especially if you opt to use sugar or butter', 'if you feel more comfortable , cover the squash with aluminum foil the first half hour , give or take , of baking', 'if desired , season with salt']"

In [22]:
# Clean_text Function
import string
import re

def clean_text(documents):
    cleaned_text = []
    for doc in documents:
        try:
            doc = doc.translate(str.maketrans('', '', string.punctuation)) # Remove Punctuation
            doc = re.sub(r'\d+', '', doc) # Remove Digits
            doc = doc.replace('\n',' ') # Remove New Lines
            doc = doc.strip() # Remove Leading White Space
            doc = re.sub(' +', ' ', doc) # Remove multiple white spaces
            cleaned_text.append(doc)
        except Exception as e:
            print(doc)
    return cleaned_text

# Cleaning Text
cleaned_text = clean_text(all_text)

nan


In [23]:
cleaned_text[1]

'a bit different breakfast pizza prepared pizza crust sausage patty eggs milk salt and pepper cheese preheat oven to degrees f press dough into the bottom and sides of a inch pizza pan bake for minutes until set but not browned cut sausage into small pieces whisk eggs and milk in a bowl until frothy spoon sausage over baked crust and sprinkle with cheese pour egg mixture slowly over sausage and cheese s p to taste bake minutes or until eggs are set and crust is brown'

### Tokenizing Using Spacy

For this tokenization, we will lemmatize the words. This is will help create a denser word embeddings. However, no POS tagging, know entities, or noun_phrases will be parsed and added.

In [24]:
# Testing Strategies and Code
nlp = spacy.load('en_core_web_sm')

' '.join([token.lemma_ for token in nlp(cleaned_text[2]) if not token.is_stop])

'kitchen chili ground beef yellow onion dice tomato tomato paste tomato soup rotel tomatoe kidney bean water chili powder ground cumin salt lettuce cheddar cheese brown ground beef large pot add chop onion ground beef brown sautee wilt add ingredient add kidney bean like bean chili cook slow cooker high hour hour low serve cold clean lettuce shred cheese'

In [25]:
# Tokenizing Function that lemmatizes words and removes Stop Words
def text_tokenizer(documents):
    tokenized_documents = []
    for doc in documents:
        tok_doc = ' '.join([token.lemma_ for token in nlp(doc) if not token.is_stop])
        tokenized_documents.append(tok_doc)
    return tokenized_documents

In [26]:
# Tokenizing Function to run in parallel
def text_tokenizer_mp(doc):
    tok_doc = ' '.join([token.lemma_ for token in nlp(doc) if not token.is_stop])
    return tok_doc

In [23]:
import multiprocessing as mp
print("Number of processors: ", mp.cpu_count())

Number of processors:  20


In [24]:
# Parallelzing tokenizing process
pool = mp.Pool(mp.cpu_count())
tokenized_text = pool.map(text_tokenizer_mp, [doc for doc in cleaned_text])

KeyboardInterrupt: 

In [None]:
# Save the tokenized_text variable as a csv in order to return to it;
# Do not attempt to run the parser above, it will simply take too long
# Reload the csv from file insted
pd.Series(tokenized_text).to_csv('tokenized_text.csv')

NameError: name 'pd' is not defined

In [1]:
import pandas as pd
tokenized_text = pd.read_csv('tokenized_text.csv')

In [2]:
tokenized_text['tokenized']

0         arriba baked winter squash mexican style winte...
1         bit different breakfast pizza prepared pizza c...
2         kitchen chili ground beef yellow onion dice to...
3         alouette potatoe spreadable cheese garlic herb...
4         amish tomato ketchup can tomato juice apple ci...
                                ...                        
231569    zydeco soup celery onion green sweet pepper ga...
231570    zydeco spice mix paprika salt garlic powder on...
231571    zydeco ya ya devil egg hardcooke egg mayonnais...
231572    cookie design cookie stick butter eagle brand ...
231573    cookie design sugar shortbread cookies granula...
Name: tokenized, Length: 231574, dtype: object

### Creating Word Embeddings

- TF-IDF
- Pre-trained GloVe Word Embeddings
- GloVe Embeddings trained on the recipe corpora

In an attempt to create dense word embeddings, I could find no reliable examples to follow that integrate GloVe or Word2Vec with document topic modeling.

In [3]:
tokenized_text = tokenized_text['tokenized']

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(lowercase = True,
                            ngram_range = (1,1))

text_tfidf = vectorizer.fit_transform(tokenized_text)
tfidf_words = vectorizer.get_feature_names()
print(text_tfidf.shape)
print(len(tfidf_words))

(231574, 62557)
62557




In [5]:
text_tfidf.shape

(231574, 62557)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

lda = LDA(n_components = 50,
          n_jobs = -1,
          max_iter = 100)
text_lda = lda.fit_transform(text_tfidf)
text_lda.shape