In [1]:
import pickle
import pandas as pd
import string
import re

# nltk.download()
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer, LancasterStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
df = pd.read_csv("full_dataset.csv", index_col = 0)
df.info()

  mask |= (ar1 == a)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2231142 entries, 0 to 2231141
Data columns (total 6 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   title        object
 1   ingredients  object
 2   directions   object
 3   link         object
 4   source       object
 5   NER          object
dtypes: object(6)
memory usage: 119.2+ MB


In [3]:
# select partial dataset before run the full data
df_sample = df.sample(2000)
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 1228954 to 873826
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        2000 non-null   object
 1   ingredients  2000 non-null   object
 2   directions   2000 non-null   object
 3   link         2000 non-null   object
 4   source       2000 non-null   object
 5   NER          2000 non-null   object
dtypes: object(6)
memory usage: 109.4+ KB


### Text Preprocessing 

In [4]:
df_sample['cleaned_text'] = df_sample['directions'].str.lower()
df_sample['cleaned_text'] = df_sample['cleaned_text'].map(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x))
df_sample['cleaned_text'] = df_sample['cleaned_text'].map(lambda x: re.sub('[\d]+'' x ''[\d]+','', x))
df_sample['cleaned_text'] = df_sample['cleaned_text'].map(lambda x: re.sub('[\d]+','', x))
df_sample['cleaned_text'] = df_sample['cleaned_text'].map(lambda x: re.sub('[\s][u][b]','', x))

df_sample['cleaned_text'] = df_sample['cleaned_text'].map(lambda x: re.sub('[[\s]][FfCc][[\s]','', x)) # remove F and C degree letters
df_sample['cleaned_text'] = df_sample['cleaned_text'].map(lambda x: re.sub('[\s][\d]+[\s][FfCc]','', x))

df_sample['cleaned_text']

  df_sample['cleaned_text'] = df_sample['cleaned_text'].map(lambda x: re.sub('[[\s]][FfCc][[\s]','', x)) # remove F and C degree letters
  df_sample['cleaned_text'] = df_sample['cleaned_text'].map(lambda x: re.sub('[[\s]][FfCc][[\s]','', x)) # remove F and C degree letters


1228954    in large bowl combine flour and sugars cut in ...
1766606    combine rice and water in microwaveable l cass...
702621     put boiling water on top of jello and cream ch...
1531055    in a bowl combine the dry ingredients add the ...
915970     preheat oven to  degrees f  degrees c grease a...
                                 ...                        
1119753    combine all ingredients store in a covered con...
12334      cook spinach according to package directions d...
790209     mix the vegetables miracle whip and cheese tog...
863002     mix all ingredients in one bowl pour into inch...
873826     mix dry ingredients stir in milk and molasses ...
Name: cleaned_text, Length: 2000, dtype: object

In [6]:
# comparision between stemmer and lemmatization 
# stemmer seems too harsh so took lemmatization at this time

porter= PorterStemmer()
lancaster= LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

porter_stem_list = ['eggs','servings','hours','remaining']

for word in porter_stem_list:
    print("porter stemmer:", word,"->", porter.stem(word))
    print("lancaster stemmer:", word,"->", lancaster.stem(word))
    print("lemmatization:", word,"->", wordnet_lemmatizer.lemmatize(word))
    print("----" * 10)
    

porter stemmer: eggs -> egg
lancaster stemmer: eggs -> eg
lemmatization: eggs -> egg
----------------------------------------
porter stemmer: servings -> serv
lancaster stemmer: servings -> serv
lemmatization: servings -> serving
----------------------------------------
porter stemmer: hours -> hour
lancaster stemmer: hours -> hour
lemmatization: hours -> hour
----------------------------------------
porter stemmer: remaining -> remain
lancaster stemmer: remaining -> remain
lemmatization: remaining -> remaining
----------------------------------------


In [5]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

#lemmatize and filter a set of pos_tags
def lemmatize_text(text):
    return[lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

df_sample['lemmatized'] = df_sample['cleaned_text'].apply(lemmatize_text)
# df_sample.directions.to_pickle("text_lemmatized.pkl") # save for a later use

df_sample['lemmatized'] = df_sample['lemmatized'].apply(lambda x: " ".join(x))
df_sample['lemmatized']

1228954    in large bowl combine flour and sugar cut in b...
1766606    combine rice and water in microwaveable l cass...
702621     put boiling water on top of jello and cream ch...
1531055    in a bowl combine the dry ingredient add the o...
915970     preheat oven to degree f degree c grease a inc...
                                 ...                        
1119753    combine all ingredient store in a covered cont...
12334      cook spinach according to package direction dr...
790209     mix the vegetable miracle whip and cheese toge...
863002     mix all ingredient in one bowl pour into inch ...
873826     mix dry ingredient stir in milk and molasses a...
Name: lemmatized, Length: 2000, dtype: object

In [7]:
corpus = list(df_sample['lemmatized'])
corpus[:10]

['in large bowl combine flour and sugar cut in butter until mixture resembles coarse crumb set aside cup for topping to the remaining crumb mixture add baking soda and salt beat egg buttermilk and vanilla add to the crumb mixture and mix well pour into a greased xxin baking pan combine candy bar pecan and reserved crumb mixture sprinkle over the top bake atf for minute or until toothpick inserted near the center come out clean',
 'combine rice and water in microwaveable l casserole dish top with fish tomato and dressing cover microwave on high min top with cheese microwave uncovered min or until fish flake easily with fork and rice is tender',
 'put boiling water on top of jello and cream cheese and stir until blended add remaining ingredient chill and serve change with orange jello and orange soda',
 'in a bowl combine the dry ingredient add the oil egg and baby food mix on low speed until well blended stir in pineapple and nut pour into greased and floured in round baking pan bake at

### Base Model (CounterVectorizer and LSA aka SVD)

In [9]:
vectorizer = CountVectorizer(stop_words='english', min_df= 0.1, max_df = 0.8)
doc_word = vectorizer.fit_transform(corpus)
doc_word.shape

(2000, 73)

In [10]:
lsa = TruncatedSVD(5)
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_

array([0.17356883, 0.08289297, 0.04624084, 0.03653046, 0.03514939])

In [12]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)   

topic_word = pd.DataFrame(lsa.components_.round(3),
             index = ["component_1","component_2", "component_3", "component_4", "component_5"],
             columns = vectorizer.get_feature_names())
topic_word

Unnamed: 0,add,aside,bake,baking,beat,boil,bowl,bring,brown,butter,cheese,combine,cook,cool,cover,cream,cup,cut,degree,dish,drain,dry,egg,flour,garlic,half,heat,hot,hour,inch,ingredient,juice,just,large,let,low,make,medium,melt,milk,minute,mix,mixture,oil,onion,oven,pan,pepper,place,pour,preheat,remaining,remove,salt,sauce,saucepan,serve,serving,set,sheet,simmer,skillet,slice,small,smooth,spoon,spread,sprinkle,stir,stirring,sugar,tender,water
component_1,0.407,0.055,0.105,0.092,0.056,0.083,0.169,0.068,0.071,0.114,0.09,0.085,0.175,0.084,0.092,0.08,0.106,0.059,0.043,0.05,0.049,0.041,0.103,0.086,0.066,0.052,0.278,0.051,0.065,0.076,0.072,0.044,0.049,0.128,0.069,0.038,0.053,0.1,0.033,0.057,0.394,0.124,0.175,0.126,0.102,0.141,0.206,0.121,0.136,0.079,0.056,0.08,0.13,0.144,0.097,0.054,0.081,0.035,0.082,0.058,0.064,0.062,0.041,0.054,0.043,0.043,0.039,0.06,0.162,0.074,0.117,0.046,0.141
component_2,-0.318,0.008,0.209,0.198,0.155,-0.091,0.217,-0.073,0.005,0.12,0.057,0.088,-0.257,0.143,-0.028,0.154,0.107,0.031,0.074,0.012,-0.052,0.042,0.159,0.137,-0.1,0.04,-0.306,-0.028,0.052,0.106,0.077,-0.0,0.024,-0.005,0.025,-0.013,0.047,-0.034,0.009,0.054,-0.046,0.163,0.197,-0.138,-0.149,0.187,0.235,-0.172,0.112,0.073,0.091,0.054,0.011,-0.075,-0.089,-0.017,-0.028,0.002,0.043,0.124,-0.097,-0.085,0.012,0.037,0.051,0.031,0.071,0.041,-0.063,-0.058,0.242,-0.068,-0.073
component_3,0.723,-0.027,0.027,-0.046,0.072,-0.047,0.0,-0.052,-0.006,0.005,-0.012,-0.011,-0.085,-0.009,-0.018,0.041,0.024,-0.003,-0.046,-0.003,0.016,0.075,0.152,0.088,-0.021,-0.017,-0.333,-0.016,0.034,-0.021,0.118,-0.004,0.002,-0.027,-0.03,-0.008,0.004,-0.1,-0.009,0.045,-0.328,0.257,-0.068,-0.046,0.004,-0.096,-0.042,-0.07,-0.12,0.026,-0.044,-0.014,-0.093,-0.017,-0.055,-0.059,-0.004,-0.006,-0.028,-0.068,-0.021,-0.052,-0.029,-0.032,0.018,-0.006,-0.027,-0.051,-0.076,-0.083,0.061,-0.023,0.118
component_4,-0.088,0.007,-0.179,-0.065,0.071,0.175,0.12,0.114,-0.044,-0.013,-0.119,0.049,0.019,0.093,0.011,0.115,0.268,-0.059,-0.067,-0.062,0.052,-0.014,0.061,0.047,-0.106,0.008,0.284,0.009,0.048,-0.0,-0.045,0.014,-0.018,0.04,0.006,0.039,-0.019,0.073,0.012,0.079,-0.293,-0.037,0.315,-0.145,-0.081,-0.288,-0.287,-0.163,-0.033,0.001,-0.102,0.082,0.041,-0.118,-0.063,0.126,-0.015,0.02,0.012,-0.002,0.04,-0.013,-0.056,0.051,0.054,0.048,0.008,-0.036,0.172,0.115,0.222,-0.007,0.282
component_5,-0.021,0.023,-0.115,-0.229,0.013,0.064,-0.063,0.05,-0.017,-0.04,-0.222,-0.029,0.012,0.049,0.053,-0.063,-0.035,-0.016,-0.027,-0.119,-0.025,0.041,-0.011,-0.061,-0.038,-0.012,0.13,0.023,0.084,0.008,0.06,-0.018,0.039,0.035,0.006,0.03,-0.001,0.007,-0.02,-0.05,-0.169,-0.075,-0.172,-0.023,-0.091,-0.043,0.775,-0.12,-0.009,0.019,-0.043,-0.026,0.169,-0.114,-0.002,0.004,0.0,0.002,0.08,-0.15,0.003,-0.047,-0.013,-0.011,0.007,0.006,-0.008,-0.06,-0.041,0.016,0.001,-0.024,0.176


In [13]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [16]:
display_topics(lsa, vectorizer.get_feature_names(), 15)


Topic  0
add, minute, heat, pan, mixture, cook, bowl, stir, salt, water, oven, place, remove, large, oil

Topic  1
sugar, pan, bowl, bake, baking, mixture, oven, mix, egg, beat, cream, cool, flour, sheet, butter

Topic  2
add, mix, egg, ingredient, water, flour, dry, beat, sugar, milk, cream, hour, bake, pour, cup

Topic  3
mixture, heat, water, cup, sugar, boil, stir, saucepan, bowl, stirring, cream, bring, cool, remaining, milk

Topic  4
pan, water, remove, heat, hour, set, boil, ingredient, cover, bring, cool, dry, just, large, low


In [15]:
title_per_topic = pd.DataFrame(doc_topic.round(5),
                               index = df_sample.title,
                               columns = ["component_1","component_2","component_3",
                                          "component_4", "component_5"])
title_per_topic.head(25)

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,component_5
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Snickers Coffee Cake,4.13656,2.59439,1.23334,0.96186,-1.08007
One-Dish Italian Fish,0.50442,-0.01159,0.0513,0.15317,-0.16505
7-Up Lime Salad,1.11284,-0.14123,0.89435,0.38372,-0.13747
Pineapple Carrot Cake,3.91056,2.77873,0.59584,-0.25981,0.32264
Cinnamon Swirl Bundt Coffee Cake,6.04007,5.33623,0.15688,-0.1437,0.11795
Cinnamon Orange Popovers Recipe,2.30812,0.87826,-0.96801,-0.64577,-0.65351
Orange'S Famous Oatmeal Scotchies!,5.18086,5.86029,-0.51608,1.11439,-2.8915
Irish Coffee Pudding,5.21692,1.43513,1.65243,3.61582,0.0246
Chewy Oatmeal Peanut Butter Bars,5.72928,2.47579,-2.81733,-0.21214,3.41996
Rice Cooker Bacon And Onion Rice,3.97026,-1.43291,0.98016,1.0739,0.53881


> Tuning the model (hyperparameters, pos_tags, remove redundant and unnecessary customized words)

In [18]:
# remove redundant or unhelpful words to define topic boundaries by looking the LSA coefficient 
word_list = ['add','minute','aside','baking','place','combine','cover','let','cup','half','inch','ingredient','mix','mixture',
            'make','place','remaining','spoon','serve','serving','set']
for word in word_list:
    df_sample['lemmatized'] = df_sample['lemmatized'].map(lambda x: re.sub(word,'', x))

In [19]:
pd.set_option('display.max_colwidth', None)
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

def adj_or_not(text):
    pos_list = pos_tag(word_tokenize(text))
    adjs = ''
    for item in pos_list:
        if item[1] == 'NN' or 'VB':
            adjs = adjs + item[0] + ' '
#        elif item[1] == 'VBN' or 'VBG' or 'VBG' or 'VBD':
            adjs = adjs + item[0] + ' '
        elif item[1] == 'JJ':
            adjs = adjs + item[0] + ' '
    return adjs 

df_sample['lemmatized'] = df_sample['lemmatized'].map(adj_or_not)
df_sample['lemmatized'].head()

1228954                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  in in large large bowl bowl flour flour and and sugar sugar cut cut in in butter butter until until ture ture resembles resembles coarse coarse crumb crumb for for topping topping to to the t

In [26]:
corpus = list(df_sample['lemmatized'])

vectorizer = CountVectorizer(stop_words='english', min_df= 0.2, max_df = 0.8)
doc_word = vectorizer.fit_transform(corpus)
doc_word.shape

(2000, 24)

In [27]:
lsa = TruncatedSVD(5)
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_

array([0.20201286, 0.12733618, 0.06901729, 0.05853261, 0.05643038])

In [28]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)   

topic_word = pd.DataFrame(lsa.components_.round(3),
             index = ["component_1","component_2", "component_3", "component_4", "component_5"],
             columns = vectorizer.get_feature_names())
topic_word

Unnamed: 0,bake,bowl,brown,butter,cheese,cook,cool,cream,egg,heat,hour,large,oil,onion,oven,pan,pepper,pour,remove,salt,stir,sugar,ture,water
component_1,0.147,0.252,0.102,0.169,0.13,0.25,0.126,0.123,0.154,0.414,0.097,0.19,0.178,0.144,0.206,0.325,0.177,0.116,0.194,0.205,0.242,0.175,0.267,0.199
component_2,0.239,0.21,-0.007,0.134,0.053,-0.345,0.155,0.185,0.191,-0.453,0.061,-0.024,-0.201,-0.194,0.208,0.324,-0.252,0.085,-0.002,-0.131,-0.117,0.282,0.206,-0.074
component_3,-0.037,-0.223,0.011,-0.042,-0.164,0.034,-0.025,-0.178,-0.116,0.032,0.012,-0.021,0.072,-0.039,0.085,0.788,-0.008,-0.011,0.106,-0.033,-0.129,-0.17,-0.421,-0.032
component_4,-0.186,-0.124,-0.048,0.023,-0.386,0.033,0.105,-0.019,0.022,0.368,0.03,-0.082,-0.214,-0.186,-0.303,0.052,-0.426,-0.026,0.124,-0.313,0.174,0.239,0.174,0.232
component_5,-0.142,0.478,-0.026,-0.15,-0.439,-0.124,0.049,-0.198,0.046,-0.218,0.089,0.251,0.138,-0.069,-0.088,-0.069,0.138,-0.014,0.035,0.187,-0.172,0.063,-0.155,0.457


In [29]:
# I removed the necessary but redundant codes to run this cell
display_topics(lsa, vectorizer.get_feature_names(), 15)


Topic  0
heat, pan, ture, bowl, cook, stir, oven, salt, water, remove, large, oil, pepper, sugar, butter

Topic  1
pan, sugar, bake, bowl, oven, ture, egg, cream, cool, butter, pour, hour, cheese, remove, brown

Topic  2
pan, remove, oven, oil, cook, heat, hour, brown, pepper, pour, large, cool, water, salt, bake

Topic  3
heat, sugar, water, ture, stir, remove, cool, pan, cook, hour, butter, egg, cream, pour, brown

Topic  4
bowl, water, large, salt, oil, pepper, hour, sugar, cool, egg, remove, pour, brown, pan, onion


In [30]:
title_per_topic = pd.DataFrame(doc_topic.round(5),
                               index = df_sample.title,
                               columns = ["component_1","component_2","component_3",
                                          "component_4", "component_5"])
title_per_topic.head(25)

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,component_5
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Snickers Coffee Cake,5.60146,4.26735,-3.09267,0.60474,0.05616
One-Dish Italian Fish,0.65819,-0.04181,-0.3902,-0.30871,0.03742
7-Up Lime Salad,1.38633,0.09523,-1.00541,0.00029,-0.70308
Pineapple Carrot Cake,6.2109,4.86818,-0.23026,-0.26523,-0.21305
Cinnamon Swirl Bundt Coffee Cake,9.72878,8.13508,-0.07465,1.19757,-1.73405
Cinnamon Orange Popovers Recipe,2.52187,0.17612,-0.54333,-0.14194,-0.28442
Orange'S Famous Oatmeal Scotchies!,8.0307,6.59896,-5.49759,-0.11804,0.66392
Irish Coffee Pudding,9.72108,3.71443,-5.65454,4.37076,2.72242
Chewy Oatmeal Peanut Butter Bars,12.01793,5.00046,4.67058,3.26092,-2.87707
Rice Cooker Bacon And Onion Rice,5.90858,-2.9526,0.032,2.07982,-2.05567


> TF-IDF With NMF

In [52]:
# remove redundant or unhelpful words to define topic boundaries by looking the LSA coefficient 
word_list = ['remove','stir','water']
for word in word_list:
    df_sample['lemmatized'] = df_sample['lemmatized'].map(lambda x: re.sub(word,'', x))
    
corpus = list(df_sample['lemmatized'])

In [78]:
tv = TfidfVectorizer(stop_words='english', min_df = .2, max_df = .8)
X = tv.fit_transform(corpus)
tfidf = pd.DataFrame(X.toarray(), columns=tv.get_feature_names())
tfidf

Unnamed: 0,bake,bowl,brown,butter,cheese,cook,cool,cream,egg,heat,hour,large,oil,onion,oven,pan,pepper,pour,salt,sugar,ture
0,0.181268,0.19264,0.0,0.217599,0.0,0.0,0.0,0.0,0.225968,0.0,0.0,0.210579,0.0,0.0,0.0,0.204515,0.0,0.212747,0.191425,0.211407,0.786673
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.686583,0.0,0.0,0.727052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.184705,0.392585,0.0,0.221724,0.232055,0.0,0.486395,0.245733,0.230253,0.0,0.0,0.0,0.22694,0.0,0.0,0.416785,0.0,0.216781,0.0,0.215415,0.200397
4,0.244902,0.130133,0.0,0.293987,0.0,0.0,0.0,0.16291,0.305295,0.127262,0.0,0.142251,0.0,0.0,0.289271,0.414465,0.0,0.287433,0.0,0.428433,0.398564
5,0.466015,0.495252,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.484323,0.0,0.0,0.0,0.0,0.550443,0.0,0.0,0.0,0.0,0.0,0.0
6,0.126714,0.269328,0.167627,0.456333,0.0,0.0,0.166842,0.0,0.315924,0.0,0.168742,0.147204,0.0,0.0,0.299342,0.0,0.0,0.0,0.133814,0.295565,0.549919
7,0.0,0.403752,0.0,0.0,0.0,0.0,0.0,0.505445,0.315737,0.263228,0.0,0.0,0.155597,0.0,0.0,0.0,0.0,0.148632,0.0,0.443085,0.412195
8,0.112484,0.119541,0.0,0.405087,0.0,0.0,0.148106,0.0,0.0,0.350709,0.149792,0.130673,0.0,0.0,0.265725,0.634548,0.0,0.0,0.0,0.131187,0.366122
9,0.0,0.0,0.0,0.248352,0.0,0.232366,0.0,0.0,0.0,0.430029,0.0,0.0,0.0,0.77106,0.0,0.233419,0.0,0.0,0.0,0.0,0.224464


In [79]:
from sklearn.decomposition import NMF

nmf_model = NMF(4)
doc_topic = nmf_model.fit_transform(X)
topic_word



Unnamed: 0,bake,bowl,brown,butter,cheese,cook,cool,cream,egg,heat,hour,large,oil,onion,oven,pan,pepper,pour,salt,sugar,ture
component_1,1.604,1.3,0.46,1.106,0.0,0.0,1.128,0.626,1.402,0.048,0.822,0.524,0.056,0.0,1.192,1.723,0.0,1.112,0.397,1.766,1.239
component_2,0.0,0.437,0.441,0.274,0.0,1.687,0.027,0.0,0.014,2.082,0.255,0.747,1.197,1.135,0.179,0.22,1.261,0.156,1.003,0.0,0.308
component_3,0.524,0.005,0.103,0.108,2.887,0.0,0.0,1.163,0.0,0.0,0.001,0.0,0.0,0.282,0.273,0.0,0.1,0.131,0.011,0.0,0.373


In [81]:
topic_word = pd.DataFrame(nmf_model.components_.round(3),
             index = ["component_1","component_2","component_3","component_4"],
             columns = tv.get_feature_names())

display_topics(nmf_model, tv.get_feature_names(), 10)


Topic  0
pan, sugar, bake, egg, butter, pour, cool, oven, ture, hour

Topic  1
heat, cook, onion, pepper, oil, salt, brown, large, butter, pan

Topic  2
cheese, cream, bake, ture, onion, oven, pour, brown, pepper, butter

Topic  3
bowl, large, ture, salt, oil, oven, hour, pepper, cool, egg


In [83]:
title_per_topic = pd.DataFrame(doc_topic.round(5),
                               index = df_sample.title,
                               columns = ["component_1","component_2","component_3","component_4"])
title_per_topic[0:10]

Unnamed: 0_level_0,component_1,component_2,component_3,component_4
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Snickers Coffee Cake,0.12569,0.0041,0.00694,0.11338
One-Dish Italian Fish,0.0,0.0,0.2831,0.0
7-Up Lime Salad,0.0,0.0,0.27756,0.0
Pineapple Carrot Cake,0.14286,0.0,0.07346,0.11321
Cinnamon Swirl Bundt Coffee Cake,0.19123,0.00999,0.00728,0.0557
Cinnamon Orange Popovers Recipe,0.05429,0.05412,0.01374,0.13524
Orange'S Famous Oatmeal Scotchies!,0.13973,1e-05,0.00025,0.12989
Irish Coffee Pudding,0.0921,0.0278,0.04598,0.11971
Chewy Oatmeal Peanut Butter Bars,0.14213,0.06078,0.0,0.05269
Rice Cooker Bacon And Onion Rice,0.02899,0.18092,0.01392,0.0


- so far, a total of 3 components showed the best boundaries of food types
1. component 1: sweet and deserts
2. component 2: full meals and heftier than component 3
3. component 3: more veggies and less calories than component 2

In [84]:
title_per_topic['meal_types'] = title_per_topic.idxmax(axis=1)

In [101]:
title_per_topic[(title_per_topic['meal_types']=='component_1')
                &(title_per_topic['component_2'] == 0)
                &(title_per_topic['component_3'] == 0)
                &(title_per_topic['component_4'] == 0)].sort_values(by =['component_2'], ascending = False)

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,meal_types
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mango and Ham With Balsamic,0.0,0.0,0.0,0.0,component_1
Asian Lettuce Wraps,0.0,0.0,0.0,0.0,component_1
Chocolate Pound Cake,0.07148,0.0,0.0,0.0,component_1
Dijon Vinaigrette,0.0,0.0,0.0,0.0,component_1
Christmas Margaritas,0.09375,0.0,0.0,0.0,component_1
Raspberry Coulis,0.0,0.0,0.0,0.0,component_1
Julie'S Mazola Cake,0.13,0.0,0.0,0.0,component_1
Apricot Nectar Cake,0.13,0.0,0.0,0.0,component_1
Heavenly Brownies,0.1323,0.0,0.0,0.0,component_1
Broccoli-Grape Salad,0.10854,0.0,0.0,0.0,component_1


In [98]:
title_per_topic[(title_per_topic['meal_types']=='component_2')
                &(title_per_topic['component_1'] == 0)
                &(title_per_topic['component_3'] == 0)
                &(title_per_topic['component_4'] == 0)].sort_values(by =['component_2'], ascending = False)

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,meal_types
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Home Fries Supremo,0.0,0.26825,0.0,0.0,component_2
Ratatouille,0.0,0.26129,0.0,0.0,component_2
Kenda's Green Beans and Sausage,0.0,0.25029,0.0,0.0,component_2
Lemon Pine Nut Pasta,0.0,0.24993,0.0,0.0,component_2
Spicy Pumpkin & Lentil Soup! (Vegetarian Too!),0.0,0.24623,0.0,0.0,component_2
Mediterranean Fish Soup,0.0,0.24274,0.0,0.0,component_2
Chicken Pot Pie,0.0,0.24249,0.0,0.0,component_2
Tahiri(Aloo Walay Rice),0.0,0.23468,0.0,0.0,component_2
Pork Chops Marsala With Asparagus,0.0,0.2328,0.0,0.0,component_2
Cocktail Wieners,0.0,0.22916,0.0,0.0,component_2


In [99]:
title_per_topic[(title_per_topic['meal_types']=='component_3')
                &(title_per_topic['component_1'] == 0)
                &(title_per_topic['component_2'] == 0)
                &(title_per_topic['component_4'] == 0)].sort_values(by =['component_2'], ascending = False)

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,meal_types
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
One-Dish Italian Fish,0.0,0.0,0.2831,0.0,component_3
7-Up Lime Salad,0.0,0.0,0.27756,0.0,component_3
Buffalo Ranch Chicken Panini,0.0,0.0,0.25893,0.0,component_3
Chili Dip,0.0,0.0,0.30696,0.0,component_3
Fried Mexican chicken mac and cheese,0.0,0.0,0.2831,0.0,component_3
Salmon Ball,0.0,0.0,0.27756,0.0,component_3
Yummy Cheese Ball,0.0,0.0,0.30504,0.0,component_3
Sunset Dip,0.0,0.0,0.30504,0.0,component_3
Fruit And Cheese Kabobs,0.0,0.0,0.2831,0.0,component_3
Easy Wrap Sandwich,0.0,0.0,0.2831,0.0,component_3


In [100]:
title_per_topic[(title_per_topic['meal_types']=='component_4')
                &(title_per_topic['component_1'] == 0)
                &(title_per_topic['component_2'] == 0)
                &(title_per_topic['component_3'] == 0)].sort_values(by =['component_2'], ascending = False)

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,meal_types
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mixed Greens with Oranges and Almonds,0.0,0.0,0.0,0.26889,component_4
Amaretto Cocoa Mix,0.0,0.0,0.0,0.27178,component_4
Blood Fruit Salad,0.0,0.0,0.0,0.26889,component_4
Cherry Chiffon Dessert,0.0,0.0,0.0,0.27178,component_4
Low Calorie Trifle,0.0,0.0,0.0,0.26889,component_4
Crunchy Broccoli Salad,0.0,0.0,0.0,0.24405,component_4
Mexican Corn Bread,0.0,0.0,0.0,0.26889,component_4
Cereal Snack Mix,0.0,0.0,0.0,0.27178,component_4
Summer Gazpacho With Garlic Cumin Sauce,0.0,0.0,0.0,0.29464,component_4
Easy Russian Dressing Recipe,0.0,0.0,0.0,0.26889,component_4
