In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
sb.set()

In [20]:
data = pd.read_json("data/train.json")
data = data.set_index("id")
data = data.sort_values("id")
#Lower case all ingredients
data.ingredients = data.ingredients.apply(lambda ings : [ing.lower() for ing in ings])
data.head()

Unnamed: 0_level_0,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,spanish,"[mussels, ground black pepper, garlic cloves, ..."
1,mexican,"[tomatoes, diced red onions, paprika, salt, co..."
2,french,"[chicken broth, truffles, pimentos, green pepp..."
3,chinese,"[fresh ginger, sesame oil, frozen peas, cooked..."
4,italian,"[orange peel, cookies, vanilla ice cream, gran..."


In [21]:
#Calculating ingredient co-occurrences
import itertools
from collections import Counter
cooc_counts = Counter()
ing_count = Counter()

for ingredients in data.ingredients:
    for ing in ingredients:
        ing_count[ing] += 1
        for(ing_a, ing_b) in itertools.combinations(set(ingredients), 2):
            #We want the first ingredients to be of higher occurrence in each set
            if(ing_a > ing_b):
                ing_a, ing_b = ing_b, ing_a
            cooc_counts[(ing_a, ing_b)] += 1

In [22]:
cooc_df = pd.DataFrame(((ing_a, ing_b, ing_count[ing_a], ing_count[ing_b], cooc) for (ing_a, ing_b), cooc in cooc_counts.items()), columns=['a', 'b', 'a_count', 'b_count', 'cooc'])
cooc_df.sample(10)

Unnamed: 0,a,b,a_count,b_count,cooc
291611,chinese black vinegar,reduced sodium soy sauce,43,157,11
25390,red pepper flakes,rosemary leaves,669,21,15
337483,chickpeas,tandoori seasoning,402,5,19
143619,jicama,marjoram,88,103,15
27241,fish sauce,unsalted dry roast peanuts,1247,58,375
357806,fresh coriander,vietnamese coriander,285,21,20
354894,shrimp,vermicelli,912,40,28
274748,cooking apples,muscovado sugar,9,21,22
142426,grated lemon peel,ricotta cheese,181,352,59
90429,cayenne pepper,shrimp stock,1523,24,162


In [23]:
#Find ingredients most common with Condensed Milk
cooc_df[cooc_df.a == 'condensed milk'].sort_values('cooc', ascending=False).head(10)

Unnamed: 0,a,b,a_count,b_count,cooc
40206,condensed milk,sugar,74,6434,335
84288,condensed milk,water,74,7457,254
40203,condensed milk,eggs,74,3388,208
71303,condensed milk,vanilla extract,74,1298,200
47909,condensed milk,milk,74,2263,170
86936,condensed milk,salt,74,18049,140
71302,condensed milk,egg yolks,74,542,132
79018,condensed milk,unsalted butter,74,2782,124
86938,condensed milk,flour,74,1348,117
217510,condensed milk,sweet potatoes,74,455,113


In [24]:
#Using Pointwise Mututal Information
# We calculate P(A), P(B) and P(A, B) and PMI(A, B) from the previous df.
# P(A) is counts(A) / num_recipes
# P(A, B) is coocs(A, B) / sum(coocs)
p_a = cooc_df.a_count / sum(ing_count.values())
p_b = cooc_df.b_count / len(ing_count.values())
p_a_b = cooc_df.cooc / cooc_df.cooc.sum()
cooc_df['pmi'] = np.log(p_a_b / (p_a * p_b))

In [25]:
cooc_df.sort_values('pmi', ascending=False).head(10)

Unnamed: 0,a,b,a_count,b_count,cooc,pmi
155104,adobo style seasoning,breakfast sausage links,1,1,36,7.956245
93753,multi-grain penne pasta,shredded romano cheese,1,2,65,7.853966
406241,truffle butter,veal bones,1,1,28,7.70493
212899,chocolate chip cookie mix,sugar cookie dough,1,2,52,7.630822
478963,hawaiian salt,sliced mango,1,1,25,7.591602
479126,hawaiian salt,raw buckwheat groats,1,1,25,7.591602
478961,raw buckwheat groats,sliced mango,1,1,25,7.591602
314002,dried allspice berries,soup bones,1,1,24,7.55078
423176,buckwheat honey,psyllium husks,1,1,23,7.50822
423158,garbanzo bean flour,psyllium husks,1,1,23,7.50822


In [26]:
min_count = 5
cooc_df[(cooc_df.a_count >= min_count) & (cooc_df.b_count >= min_count)].sort_values('pmi', ascending=False).head(20)

Unnamed: 0,a,b,a_count,b_count,cooc,pmi
194612,kewra water,stone flower,6,6,86,5.243554
155714,herdez salsa casera,herdez salsa verde,5,6,42,4.709198
332638,sazon seasoning,sofrito,5,5,33,4.650357
313701,black rice vinegar,chinese sesame paste,9,5,52,4.517307
34053,mo hanh,vegan mayonnaise,5,6,34,4.497889
194602,shahi jeera,stone flower,20,6,130,4.452768
194603,kewra water,shahi jeera,6,20,128,4.437264
446524,burger buns,sweet yellow corn,5,5,26,4.411946
263403,black fungus,lily buds,5,8,40,4.372726
212555,flat iron steaks,fudge brownie mix,9,6,52,4.334985


In [27]:
min_count = 30
cooc_df[(cooc_df.a_count >= min_count) & (cooc_df.b_count >= min_count)].sort_values('pmi', ascending=False).head(20)

Unnamed: 0,a,b,a_count,b_count,cooc,pmi
54947,brown cardamom,green cardamom,40,86,603,2.631416
57109,brown cardamom,mace,40,81,366,2.19203
31831,gari,wasabi,50,32,169,2.124866
111385,mexican chocolate,plantains,32,38,128,2.121434
264843,baby corn,straw mushrooms,37,31,97,1.902532
26584,asafoetida powder,fresh curry leaves,33,69,185,1.862467
134601,galangal,shrimp paste,85,56,368,1.812806
32124,green cardamom,mace,86,81,529,1.794918
125956,fenugreek,mustard oil,36,32,82,1.73019
31849,sushi rice,wasabi,81,32,180,1.705497


In [28]:
#Pairs with lowest PMI
min_count = 30
cooc_df[(cooc_df.a_count >= min_count) & (cooc_df.b_count >= min_count) & (cooc_df.cooc > 1)].sort_values('pmi', ascending=True).head(20)

Unnamed: 0,a,b,a_count,b_count,cooc,pmi
414244,fresh lime juice,milk,1368,2263,5,-8.963388
388483,cucumber,unsalted butter,768,2782,6,-8.41023
132995,garlic cloves,vanilla extract,6237,1298,30,-8.132911
350224,onions,vanilla extract,7972,1298,43,-8.018345
312803,dried oregano,ginger,1707,1755,13,-7.975042
382285,grated parmesan cheese,sesame oil,1886,1773,15,-7.941866
161716,dried oregano,fresh ginger,1707,1503,13,-7.820036
436052,all-purpose flour,lemongrass,4632,364,9,-7.767948
396237,ground cinnamon,sesame oil,1231,1773,12,-7.738378
283364,soy sauce,whipping cream,3296,619,12,-7.670938


In [29]:
#Using matrix factorization
from scipy.sparse import csr_matrix
data_df = cooc_df[cooc_df.pmi > 0].copy()
# Since the matrix is symetric, we add the same values for (b,a) as we have for (a,b)
data_df_t = data_df.copy()
data_df.a, data_df.b = data_df.b, data_df.a
data_df = pd.concat([data_df, data_df_t])

rows_idx, row_keys = pd.factorize(data_df.a)
cols_idx, col_keys = pd.factorize(data_df.b)
values = data_df.pmi

matrix = csr_matrix((values, (rows_idx, cols_idx)))
key_to_row = {key: idx for idx, key in enumerate(row_keys)}

In [30]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(200)
factors = svd.fit_transform(matrix)

In [31]:
from sklearn.metrics.pairwise import cosine_similarity
def most_similar(ingredient, topn=10):
    if ingredient not in key_to_row:
        print("Unknown ingredient.")
    factor = factors[key_to_row[ingredient]]
    cosines = cosine_similarity([factor], factors)[0]
    indices = cosines.argsort()[::-1][:topn + 1]
    keys = [row_keys[idx] for idx in indices if idx != key_to_row[ingredient]]
    return keys, cosines[indices]

def display_most_similar(ingredient, topn=10):
    print("- Most similar to '{}'".format(ingredient))
    for similar_ing, score in zip(*most_similar(ingredient, topn)):
        print("  . {} : {:.2f}".format(similar_ing, score))    

In [40]:
display_most_similar('liver')

- Most similar to 'liver'
  . pork liver : 1.00
  . curing salt : 0.76
  . miswa : 0.75
  . pork heart : 0.73
  . blood : 0.73
  . sweet pickle : 0.67
  . fully cooked luncheon meat : 0.53
  . sweet pickle juice : 0.51
  . sweet pickle relish : 0.50
  . luncheon meat : 0.48
