In [8]:
import pandas as pd
import numpy as np
import math
import re
from ingredient_parser.en import parse 
import cPickle as pickle

%matplotlib inline
import matplotlib
import seaborn as sns
matplotlib.rcParams['savefig.dpi'] = 144

### Read and process scraped recipes

In [3]:
df = pd.read_csv('recipes.csv', header = 0)

In [4]:
df.head(1)

Unnamed: 0,id,title,recipe_id,desc,by,no_made_it,no_reviews,no_ratings,rating,prep_time,cook_time,ready_in,no_ingre,no_steps,ingre,steps,Cat1,Cat2,Cat3,Cat4
0,1,Fabulous Wet Burritos,70404,"Very easy, yet very good wet burritos. I got ...",Cindy Newell,1077,887,1198,4.5,15 m,30 m,45 m,16,3,"{""1 pound ground beef"",""1/2 cup chopped onion""...","{""Crumble ground beef into a skillet over medi...",World Cuisine,Latin American,Mexican,


In [5]:
df['prep_time_d'] = df['prep_time'].str.extract('(\d+) d', expand=False).astype(np.float64).apply(lambda x: x*24*60).fillna(0)
df['prep_time_h'] = df['prep_time'].str.extract('(\d+) h', expand=False).astype(np.float64).apply(lambda x: x*60).fillna(0)
df['prep_time_m'] = df['prep_time'].str.extract('(\d+) m', expand=False).astype(np.float64).fillna(0)
df['prep_time_total'] =  df['prep_time_d'] + df['prep_time_h'] + df['prep_time_m']

df['cook_time_d'] = df['cook_time'].str.extract('(\d+) d', expand=False).astype(np.float64).apply(lambda x: x*24*60).fillna(0)
df['cook_time_h'] = df['cook_time'].str.extract('(\d+) h', expand=False).astype(np.float64).apply(lambda x: x*60).fillna(0)
df['cook_time_m'] = df['cook_time'].str.extract('(\d+) m', expand=False).astype(np.float64).fillna(0)
df['cook_time_total'] =  df['cook_time_d'] + df['cook_time_h'] + df['cook_time_m']

df['ready_in_d'] = df['ready_in'].str.extract('(\d+) d', expand=False).astype(np.float64).apply(lambda x: x*24*60).fillna(0)
df['ready_in_h'] = df['ready_in'].str.extract('(\d+) h', expand=False).astype(np.float64).apply(lambda x: x*60).fillna(0)
df['ready_in_m'] = df['ready_in'].str.extract('(\d+) m', expand=False).astype(np.float64).fillna(0)
df['ready_in_total'] =  df['ready_in_d'] + df['ready_in_h'] + df['ready_in_m']

df = df.drop('prep_time_d', axis=1)
df = df.drop('prep_time_h', axis=1)
df = df.drop('prep_time_m', axis=1)
df = df.drop('cook_time_d', axis=1)
df = df.drop('cook_time_h', axis=1)
df = df.drop('cook_time_m', axis=1)
df = df.drop('ready_in_d', axis=1)
df = df.drop('ready_in_h', axis=1)
df = df.drop('ready_in_m', axis=1)

df['lg_no_made_it'] = df['no_made_it'].apply(lambda x: math.log(x+1,10))
df['lg_no_reviews'] = df['no_reviews'].apply(lambda x: math.log(x+1,10))
df['lg_no_ratings'] = df['no_ratings'].apply(lambda x: math.log(x+1,10))
df['lg_rating'] = df['rating'].apply(lambda x: math.log(x+1,10))
df['lg_prep_time_total'] = df['prep_time_total'].apply(lambda x: math.log(x+1,10))
df['lg_cook_time_total'] = df['cook_time_total'].apply(lambda x: math.log(x+1,10))
df['lg_ready_in_total'] = df['ready_in_total'].apply(lambda x: math.log(x+1,10))
df['lg_no_ingre'] = df['no_ingre'].apply(lambda x: math.log(x+1,10))
df['lg_no_steps'] = df['no_steps'].apply(lambda x: math.log(x+1,10))

df.head(1)

Unnamed: 0,id,title,recipe_id,desc,by,no_made_it,no_reviews,no_ratings,rating,prep_time,...,ready_in_total,lg_no_made_it,lg_no_reviews,lg_no_ratings,lg_rating,lg_prep_time_total,lg_cook_time_total,lg_ready_in_total,lg_no_ingre,lg_no_steps
0,1,Fabulous Wet Burritos,70404,"Very easy, yet very good wet burritos. I got ...",Cindy Newell,1077,887,1198,4.5,15 m,...,45.0,3.032619,2.948413,3.078819,0.740363,1.20412,1.491362,1.662758,1.230449,0.60206


In [6]:
df[['no_reviews', 'rating', 'no_ingre', 'ready_in_total']].corr(method='pearson')

Unnamed: 0,no_reviews,rating,no_ingre,ready_in_total
no_reviews,1.0,0.172856,0.002607,-0.007702
rating,0.172856,1.0,0.084457,0.014978
no_ingre,0.002607,0.084457,1.0,0.009661
ready_in_total,-0.007702,0.014978,0.009661,1.0


In [9]:
with open('df.p', 'wb') as f:
    pickle.dump(df, f)

In [10]:
import pickle
with open('df.p', 'rb') as f:
    df = pickle.load(f)

In [6]:
#pd.scatter_matrix(df[['no_reviews', 'rating', 'no_ingre', 'ready_in_total']], alpha=0.2, figsize=(6, 6), diagonal='kde')

# a vast majority of recipes were not reviewed at all


##### Parse ingredients

In [11]:
from ingredient_parser.en import parse 
parse(',Batter;')#['measure']

{'measure': ', ', 'name': 'Batter;'}

Generate list of ingredients from recipes

In [12]:
ingredients = df.ingre.values
#for item in ingre_list[:2]:
#    print item

l_ingre = []
counter = 0
for item in ingredients:
    ingre = re.findall(r'(\\?")(.*?)\1',item)
    for i in xrange(len(ingre)):
        if ',' in ingre[i][1]:
            try:
                text = re.findall(r'^(.+?),',ingre[i][1])[0]
            except:
                counter += 1
                #print item
                #print ingre
                #print ingre[i][1]
                #break
        else:
            text = ingre[i][1]
        if ('(' in text) and (')' in text):
            text = re.sub(r'\([^)]*\)', '', text)
        
        l_ingre.append(parse(text)['name'])
print counter

20


In [13]:
l_ingre = list(set(l_ingre))
len(l_ingre)


12740

In [14]:
ingre_df = pd.DataFrame.from_records(
      [[r] for r in l_ingre], 
      columns=["ingredient"])

Augment list of ingredients with wiki ingredients

In [15]:
wiki_ingredients = pd.read_csv('wiki_ingredients.csv', header = 0)
#for item in wiki_ingredients.drop_duplicates().values[:2]:
#    print item
wiki_ingredients = list(set([x[0].lower() for x in wiki_ingredients.drop_duplicates().values]))

#### Map out ingredient space for each recipe

In [16]:
# add ingredients (without measurements) to each recipes
df['ingre_only'] = df['ingre']

In [17]:
ingre_df['found'] = ingre_df['ingredient'].apply(lambda x: 0 if all (ingredient not in x.lower() for ingredient in wiki_ingredients) else 1)

In [18]:
len(ingre_df[ingre_df['found'] == 0])
#len(df)

759

In [19]:
header = ['ingredient', 'found']
ingre_df[ingre_df['found'] == 0].to_csv('ingredient.csv', sep='\t', encoding='utf-8', index = False, columns = header)

In [20]:
len(wiki_ingredients)

447

In [21]:
df_ingre = pd.read_csv('ingre_toparge.csv', header = 0)
df_ingre = df_ingre.drop_duplicates()
df_ingre.head(1)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,title,recipe_id,ingre1,ingre2,ingre3,ingre4,ingre5,ingre6,ingre7,ingre8,...,ingre58,ingre59,ingre60,ingre61,ingre62,ingre63,ingre64,ingre65,ingre66,ingre67
0,Fabulous Wet Burritos,70404,1 pound ground beef,",",1/2 cup chopped onion,",","1 clove garlic, minced",",",1/2 teaspoon cumin,",",...,,,,,,,,,,


In [22]:
df_ingre = df_ingre.replace(np.nan,' ', regex=True)
df_ingre['parsed'] = df_ingre['ingre1'].apply(lambda x: parse(x)['name'])
for i in xrange(2,68):
    exec ( "df_ingre['parsed'] = df_ingre['ingre" + str(i) + "'].apply(lambda x: parse(str(x))['name']+'; ').str.cat(df_ingre.parsed)")
    exec ("del df_ingre['ingre"+ str(i) +"']")


In [23]:
len(set(x.replace('rries','rry') for x in wiki_ingredients))

447

In [24]:
for item in wiki_ingredients:
    df_ingre[item] = df_ingre['parsed'].apply(lambda x: 1 if item in x else 0)

In [25]:
#from pandas import ExcelWriter
#headers = list(df_ingre)
#df_ingre.to_csv('df_ingre.csv', sep='\t', index = False, columns = headers)
with open('df_ingre.p', 'wb') as f:
    pickle.dump(df_ingre, f)

In [32]:
with open('wiki_ingredients.p', 'wb') as f:
    pickle.dump(wiki_ingredients, f)

#### Calculate [PMI](https://en.wikipedia.org/wiki/Pointwise_mutual_information) for all ingredients

In [26]:
from collections import defaultdict
dict_prob = defaultdict()
for item in wiki_ingredients:
    #print df[item].sum(axis=0)
    dict_prob[item] = float(df_ingre[item].sum(axis=0))/len(df_ingre)

In [27]:
dict_prob = dict((k, v) for k, v in dict_prob.iteritems() if v)
l_prob = [[k,v] for k, v in dict_prob.items()]
l_prob = sorted(l_prob, key = lambda (k, v) : (-v, k))

In [31]:
l_prob[0]

['salt', 0.5606168869834465]

In [28]:
from itertools import combinations
import math
dict_cooccur = defaultdict()
for subset in combinations(wiki_ingredients, 2):
    item1 = subset[0]
    item2 = subset[1]
    if item1 != item2:
        count_cooccur = float(len(df_ingre[(df_ingre[item1] ==1) & (df_ingre[item2]==1)]))/len(df_ingre)
        try:
            ratio = count_cooccur/(dict_prob[item1] * dict_prob[item2])
            if ratio != 0:
                dict_cooccur[subset] = math.log(ratio)
        except:
            pass
        

In [29]:
l_cooccur = [[k,v] for k, v in dict_cooccur.items()]
l_cooccur = sorted(l_cooccur, key = lambda (k, v) : (-v, k))

In [30]:
l_cooccur[0]

[('icing', 'icing sugar'), 9.72877695874351]

In [34]:
len(l_cooccur)
with open('save.p', 'wb') as f:
    pickle.dump(l_cooccur, f)

In [35]:
no = 40
select_ingredient = list(set([item[0] for item in l_prob[:no]]))

In [38]:
l_cooccur_new = []
for item in l_cooccur:
    if (item[0][0] in select_ingredient) and (item[0][1] in select_ingredient):
        l_cooccur_new.append(item)

In [39]:
l_cooccur_new = (sorted(l_cooccur_new, key = lambda (k, v) : (-v, k)))[:251]

In [143]:
#select_ingredient

In [129]:
#read ingredient cluster dictionary
with open('partition.p', 'rb') as f:
    partition = pickle.load(f)

In [144]:
#from collections import OrderedDict
d_index = dict()
i=0
l_node = []
for item in select_ingredient:
        d_group = dict()
        d_group["name"] = item
        d_group["group"] = partition[item]
        l_node.append(d_group) 
        d_index[item] = i
        i += 1

In [134]:
#l = [item[1] for item in l_cooccur_new]
#sorted(l)

In [145]:
l_link = []
for item in l_cooccur_new:
    d_temp = dict()
    big, small = max([d_index[item[0][0]],d_index[item[0][1]]]), min([d_index[item[0][0]],d_index[item[0][1]]])
    d_temp["source"] = int(big)
    d_temp["target"] = int(small)
    d_temp["value"] = int(100*item[1]/35 +1)
    l_link.append(d_temp)

Write to Json format

In [146]:
import json
data = {}
data['nodes'] = l_node
data['links'] = l_link
json_data = json.dumps(data)
json_data

'{"nodes": [{"group": 0, "name": "olive oil"}, {"group": 0, "name": "cheese"}, {"group": 0, "name": "broth"}, {"group": 2, "name": "carrot"}, {"group": 0, "name": "chicken"}, {"group": 1, "name": "milk"}, {"group": 1, "name": "cream"}, {"group": 1, "name": "baking powder"}, {"group": 1, "name": "lemon"}, {"group": 0, "name": "beef"}, {"group": 0, "name": "parsley"}, {"group": 0, "name": "ice"}, {"group": 1, "name": "sugar"}, {"group": 0, "name": "celery"}, {"group": 1, "name": "mix"}, {"group": 2, "name": "vegetable"}, {"group": 2, "name": "sauce"}, {"group": 2, "name": "pea"}, {"group": 0, "name": "tomato"}, {"group": 0, "name": "pepper"}, {"group": 0, "name": "oil"}, {"group": 0, "name": "mushroom"}, {"group": 0, "name": "rum"}, {"group": 1, "name": "flour"}, {"group": 2, "name": "corn"}, {"group": 2, "name": "water"}, {"group": 2, "name": "bean"}, {"group": 0, "name": "garlic"}, {"group": 0, "name": "olive"}, {"group": 0, "name": "bread"}, {"group": 1, "name": "butter"}, {"group": 2

In [147]:
#headers = ['recipe_id']
#for item in wiki_ingredients:
#    headers.append(item)
#df.to_csv('df.csv', sep='\t', encoding='utf-8', index = False, columns = headers)

In [None]:
########################################################
###################### Clustering ######################
########################################################


In [148]:
from sklearn.cluster import KMeans


In [191]:
df_fit = df_ingre.copy()


In [201]:
del df_fit['title']
del df_fit['recipe_id']
del df_fit['ingre1']
del df_fit['parsed']

In [202]:
list(df_fit)
data_fit = [[x[i] for i in xrange(len(x))] for x in df_fit.to_records(index=False)]

In [None]:
#for item in data_fit[:2]:
#print data_fit[:2]

In [203]:
kmeans = KMeans(n_clusters=3, random_state=0).fit(data_fit)


In [204]:
r = kmeans.labels_

In [205]:
len(df_ingre) == len(r)

True

In [206]:
#imported df_ingre again
df_ingre['cluster'] = r
df_ingre.head(1)

Unnamed: 0,recipe_id,ingre1,parsed,fava beans,ale,mortadella,skate,parmesan,passion fruit,milk,...,wonton,caraway,potato,food color,daikon,soy sauce,sesame seeds,beverage,nutritional yeast flakes,cluster
0,70404,1 pound ground beef,; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [207]:
len(df) - (len(df['recipe_id'].drop_duplicates()))

1

In [208]:
len(df) - len(df_left)

0

In [157]:
from bokeh.charts import Bar, output_notebook, show
output_notebook()


In [None]:
#list(df_merged)

In [None]:
# p = Bar(df_merged[df_merged['cluster']==0], label='Cat1', values='cluster', agg='count', title="Cluster 0")
# show(p)

In [None]:
# p = Bar(df_merged[df_merged['cluster']==1], label='Cat1', values='cluster', agg='count', title="Cluster 0")
# show(p)

In [None]:
# p = Bar(df_merged[df_merged['cluster']==2], label='Cat1', values='cluster', agg='count', title="Cluster 0")
# show(p)

In [None]:
#print(df_merged.columns.tolist())

In [None]:
#df_merged[df_merged['cluster']==0]

In [209]:
from sklearn.decomposition import PCA
#from sklearn.datasets import load_iris
#load_iris().data
pca = PCA(n_components=2).fit(data_fit)
pca_2d = pca.transform(data_fit)

In [210]:
df_ingre['PCA1'] = [item[0] for item in pca_2d]
df_ingre['PCA2'] = [item[1] for item in pca_2d]

In [211]:
#list(df_ingre)

In [212]:
df_left = df[['title', 'recipe_id','Cat1','Cat2','Cat3','Cat4']].drop_duplicates()
df_merged = df_left.merge(df_ingre, how = 'inner', on = ['recipe_id'])

In [213]:
len(df_ingre)

16794

In [218]:

import bokeh.plotting as bp
from bokeh.models import HoverTool 
from bokeh.palettes import brewer

colors = brewer["Spectral"][3]

#colors = [color[item] for item in df_ingre['cluster'].tolist()]]

fig = bp.figure(tools="reset,hover")

s1 = fig.scatter(x=df_ingre['PCA1'],y=df_ingre['PCA2'],fill_alpha = 0.4, color=[colors[item] for item in df_ingre['cluster'].tolist()],size=10)
fig.select(dict(type=HoverTool)).tooltips = {"x":"$x", "y":"$y"}
#show(fig)