In [68]:
import pandas as pd
import numpy as np

from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

In [41]:
df = pd.read_json('data/unlabelled_recipies.json')

In [42]:
df['number_of_ingredients'] = df['ingredients'].apply(len)

In [43]:
# tokenize each ingredient
tokenizer = RegexpTokenizer(r'\w+')
df['tokenized_ingredients'] = df['ingredients'].apply(str)
df['tokenized_ingredients'] = df['tokenized_ingredients'].apply(tokenizer.tokenize)

In [44]:
df.iloc[0]

ingredients              [pepper, bacon, russet potatoes, sugar, onions...
number_of_ingredients                                                   11
tokenized_ingredients    [pepper, bacon, russet, potatoes, sugar, onion...
Name: 0, dtype: object

In [45]:
# remove numbers but not words with numbers
def remove_numbers(doc):
    return [token for token in doc if not token.isdigit()]
df['tokenized_ingredients'] = df['tokenized_ingredients'].apply(remove_numbers)

In [46]:
# Remove words that are only one character.
def remove_onechar_word(doc):
    return [token for token in doc if len(token) > 3]
df['tokenized_ingredients'] = df['tokenized_ingredients'].apply(remove_onechar_word)

In [47]:
# Lemmatize all words in documents.
lemmatizer = WordNetLemmatizer()
def lemmatize_tokens(doc):
    return [lemmatizer.lemmatize(token) for token in doc]
df['tokenized_ingredients'] = df['tokenized_ingredients'].apply(lemmatize_tokens)

In [48]:
#Create Biagram & Trigram Models 
from gensim.models import Phrases
bigram = Phrases(df['tokenized_ingredients'], min_count=2)
trigram = Phrases(bigram[df['tokenized_ingredients']], min_count=2)

In [49]:
def get_bigrams_and_trigrams(doc):
    return trigram[doc]
    
df['tokenized_ingredients'] = df['tokenized_ingredients'].apply(get_bigrams_and_trigrams)

In [52]:
df['number_of_ingredients'] = df['tokenized_ingredients'].apply(len)

In [62]:
df[df['number_of_ingredients']<=1]['tokenized_ingredients']

12200                   [water]
12457              [sushi_rice]
12673            [konbu_bonito]
14807                  [butter]
20540            [pastry_dough]
2067                  [grained]
21177           [corn_tortilla]
22908         [unsalted_butter]
23371            [plain_yogurt]
24568                  [phyllo]
27348         [spanish_chorizo]
31568         [unsalted_butter]
34467                  [butter]
38426    [lemonade_concentrate]
451                      [udon]
5162                   [butter]
5615          [unsalted_butter]
7875             [jasmine_rice]
9701                [vegetable]
Name: tokenized_ingredients, dtype: object

In [64]:
# drop recipes with one ingredient
index_to_drop = df[df['number_of_ingredients']<=1].index
df.drop(index_to_drop, inplace=True)

In [132]:
# Create a dictionary representation of the recipe ingredients.
dictionary = Dictionary(df['tokenized_ingredients'])
dictionary.filter_extremes(no_below=50, no_above=0.8)
#Create dictionary and corpus required for Topic Modeling
corpus = [dictionary.doc2bow(doc['tokenized_ingredients']) for i, doc in df.iterrows()]

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))
print(corpus[0])

Number of unique tokens: 820
Number of documents: 39755
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)]


In [133]:
# SET LDA parameters
num_topics = 10
chunksize = 500 
passes = 20 
iterations = 400
eval_every = 1 

# Make a index to word dictionary.
temp = dictionary[0] # only to "load" the dictionary.
id2word = dictionary.id2token

#Learn an LDA model
lda_model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

In [134]:
topics = lda_model.print_topics()
for topic in topics:
    print(topic)
    print('\n')

(0, '0.203*"fresh" + 0.149*"juice" + 0.106*"cilantro" + 0.097*"lemon" + 0.095*"lime" + 0.046*"chopped" + 0.032*"jalapeno_chilies" + 0.025*"orange" + 0.021*"avocado" + 0.018*"peeled"')


(1, '0.138*"olive" + 0.096*"fresh" + 0.050*"wine" + 0.046*"parsley" + 0.038*"cheese" + 0.033*"tomato" + 0.032*"basil" + 0.030*"leaf" + 0.029*"extra_virgin" + 0.028*"clove"')


(2, '0.085*"cinnamon" + 0.072*"ground" + 0.050*"honey" + 0.040*"nutmeg" + 0.032*"sugar" + 0.031*"almond" + 0.030*"plain_yogurt" + 0.029*"brown" + 0.026*"powdered" + 0.024*"light_brown"')


(3, '0.289*"powder" + 0.137*"cumin" + 0.117*"chili" + 0.089*"paprika" + 0.053*"chile" + 0.028*"mirin" + 0.023*"roasted" + 0.021*"dried_oregano" + 0.020*"cayenne" + 0.020*"enchilada"')


(4, '0.076*"buttermilk" + 0.064*"cucumber" + 0.063*"mint" + 0.050*"bread" + 0.046*"yeast" + 0.041*"warm_water" + 0.040*"cornmeal" + 0.037*"apple" + 0.031*"tomatillo" + 0.026*"grape"')


(5, '0.105*"pepper" + 0.077*"onion" + 0.077*"salt" + 0.067*"garlic" + 0.061*"

In [167]:
# get topic number and probability of each topic for each document/recipe
topics = (lda_model.get_document_topics(corpus))

In [175]:
# extract the topic number with highest probabilty for each recipe
topic_num = []
for topic in topics:
    max_topic = 0
    max_val = 0
    for i in range(len(topic)):
        if topic[i][1]>max_val:
            max_val = topic[i][1]
            max_topic = topic[i][0]
    topic_num.append(max_topic)
            
# add topic numbers to pandas dataframe as cuisine number
df['cuisine_num'] = topic_num
df.tail()

Unnamed: 0,ingredients,number_of_ingredients,tokenized_ingredients,cuisine_num
9995,"[large eggs, fresh parsley leaves, heavy cream...",14,"[large, egg, fresh, parsley, leaf, heavy_cream...",7
9996,"[dried oregano, wheat flour, canola oil, olive...",25,"[dried_oregano, wheat, flour, canola, olive, s...",5
9997,"[eggs, green chilies, salt, monterey jack, flo...",8,"[egg, green, chilies, salt, monterey, jack, fl...",7
9998,"['paprika', 'vegetable oil', 'salt', 'ground g...",14,"[paprika, vegetable, salt, ground, ginger, wat...",5
9999,"[apricot nectar, salt, garlic powder, pork cho...",8,"[apricot_nectar, salt, garlic, powder, pork_ch...",5


In [211]:
# Cusine 6 appears to be indian cuisine. Check if the recipes classified as cuisine 6 have indian ingredients
indian_recipes = df[df['cuisine_num']==6]['ingredients']
for i in range(10):
    print('Recipe ' + str(i) + ': ' + ', '.join(indian_recipes.iloc[i]))
    print('\n')

Recipe 0: onions, avocado, salt, coconut cream, cumin seed, tumeric, coconut oil, ginger, red chile powder, daal, chile pepper, garam masala, grated coconut, lime, lime juice, mustard seeds, tomatoes, spinach, garlic, cilantro leaves


Recipe 1: frozen spinach, lemon, tomato paste, tumeric, mustard seeds, sour cream, paprika, salt, cinnamon sticks, zucchini, ground coriander, ground cumin, boneless skinless chicken, tomatoes, minced garlic, chopped cilantro, clove, chutney, grapeseed oil, onions, garam masala, chickpeas, curry leaves, baby potatoes, cumin seed, minced ginger


Recipe 2: szechwan peppercorns, clove, fennel seeds, ground cinnamon, star anise


Recipe 3: cinnamon, coriander seeds, black peppercorns, mace, grated nutmeg, green cardamom, bay leaves, cumin seed, whole cloves, chiles


Recipe 4: chili powder, salt, ground turmeric, cinnamon, mustard seeds, cumin seed, water, tomatoes, bay leaves, onions, coconut, clove, black peppercorns, garam masala, curry leaves, star anis