# Goal:
Make a function that returns the ingredients which are a common combination with the entered ingredient

In [1]:
import pandas as pd #Data manipulation
import numpy as np #Array and Matrix operations
import matplotlib.pyplot as plt #Plotting
import seaborn as sb #Plotting
sb.set()

# Dataset Preparation
The following steps are taken to prepare the data for EDA as well as model training:
1. The `id` column is set as the index
2. The Dataframe is sorted based on the `id` column
3. We convert all ingredients to lowercase

In [2]:
#DataFrame manipulation
data = pd.read_json("data/train.json")
data = data.set_index("id") 
data = data.sort_values("id")
data.ingredients = data.ingredients.apply(lambda ings : [ing.lower() for ing in ings]) #Lower case all ingredients
data.head()

Unnamed: 0_level_0,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,spanish,"[mussels, ground black pepper, garlic cloves, ..."
1,mexican,"[tomatoes, diced red onions, paprika, salt, co..."
2,french,"[chicken broth, truffles, pimentos, green pepp..."
3,chinese,"[fresh ginger, sesame oil, frozen peas, cooked..."
4,italian,"[orange peel, cookies, vanilla ice cream, gran..."


# Ingredient Co-occurrence
The simplest method to recommend ingredients is to see which 2 inredients occur the most together.
We use `itertools.combinations` which goes through an array and makes a combination of 2 ingredients for the entire array

In [3]:
#Calculating ingredient co-occurrences
import itertools
from collections import Counter
cooc_counts = Counter()
ing_count = Counter()

for ingredients in data.ingredients:
    for ing in ingredients:
        ing_count[ing] += 1
    for(ing_a, ing_b) in itertools.combinations(set(ingredients), 2):
        #We want the pairs of ingredients in a consistent order(alphabetically)
        if(ing_a > ing_b):
            ing_a, ing_b = ing_b, ing_a
        cooc_counts[(ing_a, ing_b)] += 1

In [4]:
cooc_df = pd.DataFrame(((ing_a, ing_b, ing_count[ing_a], ing_count[ing_b], cooc) for (ing_a, ing_b), cooc in cooc_counts.items()), columns=['a', 'b', 'a_count', 'b_count', 'cooc'])
cooc_df.sample(10)

Unnamed: 0,a,b,a_count,b_count,cooc
194781,ketchup,lemon slices,432,70,2
10448,chili oil,rice vinegar,109,1204,27
115066,frozen corn,sesame oil,175,1773,4
262239,chicken leg quarters,oyster sauce,25,455,1
212331,asparagus,wonton wrappers,230,158,3
40833,fresh basil,grape tomatoes,1137,228,23
125176,fresh herbs,fresh lime juice,52,1368,3
224088,cold meatloaf,italian seasoning,2,395,1
115263,anaheim chile,chihuahua cheese,51,10,1
435292,sliced mushrooms,vine ripened tomatoes,244,34,1


In [5]:
#Find ingredients most common with Condensed Milk
cooc_df[cooc_df.a == 'condensed milk'].sort_values('cooc', ascending=False).head(10)

Unnamed: 0,a,b,a_count,b_count,cooc
40198,condensed milk,sugar,74,6434,33
84294,condensed milk,water,74,7457,25
40195,condensed milk,eggs,74,3388,19
71299,condensed milk,vanilla extract,74,1298,16
40199,condensed milk,evaporated milk,74,208,13
47910,condensed milk,milk,74,2263,13
71290,condensed milk,egg yolks,74,542,13
166600,condensed milk,whole milk,74,764,9
86947,condensed milk,salt,74,18049,9
81412,condensed milk,whipped cream,74,103,7


As we can see, the most common ingredients are not those which would be common in a recipe with the input ingredient, but are ingredients which are common as a whole in the entire dataset. E.g. Sugar, water, etc.  
___

The clear problem is that co-occurrences are skewed by ingredient frequency in the dataset as mentioned above.

To solve this problem we use something known as ***Pointwise Mutual Information(PMI)*** :
![image.png](attachment:image.png)

**P(X) = Counts(X)/(Total Recipes in Dataset)**  
**P(Y) = Counts(Y)/(Total Recipes in Dataset)**  
**P(X, Y) = Co-occurrence(X, Y)/(Sum of Total Co-occurrences in Dataset)**

The numerator in the formula signifies the **Probability of 2 ingredients occurring together**  
The denominator in the formula signifies the **Probability of 2 ingredients occurring separately**

Therefore, this formula also takes into account the fact that ingredients may be more common but may not occur together.

In [6]:
p_a = cooc_df.a_count / sum(ing_count.values())
p_b = cooc_df.b_count / len(ing_count.values())
p_a_b = cooc_df.cooc / cooc_df.cooc.sum()
cooc_df['pmi'] = np.log(p_a_b / (p_a * p_b))

In [7]:
cooc_df.sort_values('pmi', ascending=False).head(10)

Unnamed: 0,a,b,a_count,b_count,cooc,pmi
267740,frozen basil,red wine vinaigrette,1,1,1,7.053491
22282,aloe juice,chartreuse liqueur,1,1,1,7.053491
267635,mccormick taco seasoning,tomato sauce low sodium,1,1,1,7.053491
447410,pork roll,roast pork seasoning mix,1,1,1,7.053491
105853,hickory-flavored liquid smoke,large sausage casing,1,1,1,7.053491
433694,gluten free cooking spray,hurst family harvest chipotle lime black bean ...,1,1,1,7.053491
134880,johnsonville andouille fully cooked sausage,klondike rose red skin potato,1,1,1,7.053491
444778,america,chilcostle chile,1,1,1,7.053491
22279,aloe juice,tonic water,1,1,1,7.053491
464884,braggs liquid aminos,truvã­aâ® natural sweetener,1,1,1,7.053491


We see ingredients which have a very low occurrence in the overall dataset and can be removed  
`Note:` This also shows that instead of just using ingredients as a single entity, we should treat them like n-grams (Olive Oil and Virgin Olive Oil should be treated similarly)

In [8]:
min_count = 5
cooc_df[(cooc_df.a_count >= min_count) & (cooc_df.b_count >= min_count)].sort_values('pmi', ascending=False).head(20)

Unnamed: 0,a,b,a_count,b_count,cooc,pmi
155700,herdez salsa casera,herdez salsa verde,5,6,3,4.750905
176297,black treacle,porridge oats,7,5,3,4.596755
194579,kewra water,stone flower,6,6,3,4.568584
332619,sazon seasoning,sofrito,5,5,2,4.527762
275453,chinese rose wine,maltose,5,9,3,4.34544
54535,rye whiskey,twists,6,5,2,4.34544
322944,aã§ai powder,frozen banana,5,9,3,4.34544
194378,johnsonville andouille,red goldâ® diced tomatoes,6,5,2,4.34544
34054,mo hanh,vegan mayonnaise,5,6,2,4.34544
313725,black rice vinegar,chinese sesame paste,9,5,3,4.34544


Using a minimum criteria of 5 occurrences shows that some ingredients have the brand names mentioned in them (E.g. Herdez Salsa Casera, Shahi Jeera, etc.). The brand names have nothing to do with cuisine and just increase unique ingredient count.

In [9]:
min_count = 30
cooc_df[(cooc_df.a_count >= min_count) & (cooc_df.b_count >= min_count)].sort_values('pmi', ascending=False).head(20)

Unnamed: 0,a,b,a_count,b_count,cooc,pmi
54940,brown cardamom,green cardamom,40,86,27,2.206101
31855,gari,wasabi,50,32,12,2.160638
104752,dried bonito flakes,konbu,42,66,18,2.016538
60516,bonito flakes,konbu,38,66,16,1.998838
26565,asafoetida powder,fresh curry leaves,33,69,13,1.887826
31811,sushi rice,wasabi,81,32,14,1.832363
36982,coffee granules,coffee liqueur,34,30,5,1.735371
13263,condensed cream of chicken soup,condensed cream of mushroom soup,62,55,16,1.691612
57320,brown cardamom,mace,40,81,15,1.678212
26583,asafoetida powder,dried red chile peppers,33,103,15,1.630304


Using a minimum criteria of 30 occurrences begins to show real relationships

In [10]:
min_count = 30
cooc_df[(cooc_df.a_count >= min_count) & (cooc_df.b_count >= min_count) & (cooc_df.cooc > 1)].sort_values('pmi', ascending=True).head(20)

Unnamed: 0,a,b,a_count,b_count,cooc,pmi
217341,pepper,vanilla extract,4438,1298,2,-7.819901
16599,sesame oil,unsalted butter,1773,2782,2,-7.664716
257328,garlic,powdered sugar,7380,501,2,-7.376497
255708,cooking spray,soy sauce,1490,3296,3,-7.254893
246192,garlic,large garlic cloves,7380,873,4,-7.23868
259894,diced tomatoes,sesame oil,1624,1773,2,-7.126438
167147,buttermilk,soy sauce,863,3296,2,-7.114242
131046,grated parmesan cheese,soy sauce,1886,3296,5,-6.97975
414713,confectioners sugar,garlic cloves,395,6237,2,-6.970503
419455,extra-virgin olive oil,vanilla extract,2747,1298,3,-6.934742


If we ignore those ingredient pairs with just 1 co-occurrence and negative PMI, we see ingredients which don't have any co-occurrence (E.g. Soy Sauce and Whipping Cream).

This is because in the PMI, their denominator was greater than the numerator, making the log negative.  
___

# Matrix Factorisation
We are currently only using direct correlations between 2 ingredients. For example, there are 10 recipes in which `Spaghetti` and `Oregano` appear together. This means we are ignoring more subtle relations. `Parmesan` might not be in many recipes with `Spaghetti` but it could be in many recipes which are closely related to `Spaghetti` as well.

To fix this sparseness situation (where most ingredient pairs have around no correlation ), we use a technique called matric factorisation:
1. Make a matrix where Rows and Columns are all the ingredients and the intersecting values are the PMI values
2. We factorize the matrix - split an NxN matrix into a smaller NxK matrix where K<N (We take K = 120 since it gives good results)

Matrix factorisation removes noisy data and generalises the knowledge to all ingredients. The output is a vector for each ingredient which we can compare with vectors of other ingredients. 

In [11]:
#Using matrix factorization
from scipy.sparse import csr_matrix
data_df = cooc_df[cooc_df.pmi > 0].copy()
# Since the matrix is symetric, we add the same values for (b,a) as we have for (a,b)
data_df_t = data_df.copy()
data_df.a, data_df.b = data_df.b, data_df.a
data_df = pd.concat([data_df, data_df_t])

rows_idx, row_keys = pd.factorize(data_df.a)
cols_idx, col_keys = pd.factorize(data_df.b)
values = data_df.pmi

matrix = csr_matrix((values, (rows_idx, cols_idx)))
key_to_row = {key: idx for idx, key in enumerate(row_keys)}

In [12]:
from sklearn.decomposition import TruncatedSVD

#Using Singular Value Decomposition to reduce dimensionality
svd = TruncatedSVD(200)
factors = svd.fit_transform(matrix)

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
def most_similar(ingredient, topn=10):
    if ingredient not in key_to_row:
        print("Unknown ingredient.")
    factor = factors[key_to_row[ingredient]]
    cosines = cosine_similarity([factor], factors)[0]
    indices = cosines.argsort()[::-1][:topn + 1]
    keys = [row_keys[idx] for idx in indices if idx != key_to_row[ingredient]]
    return keys, cosines[indices]

def display_most_similar(ingredient, topn=10):
    print("- Most similar to '{}'".format(ingredient))
    for similar_ing, score in zip(*most_similar(ingredient, topn)):
        print("  . {} : {:.2f}".format(similar_ing, score))    

In [27]:
display_most_similar('naan')

- Most similar to 'naan'
  . cream yogurt : 1.00
  . chickpeas : 0.86
  . basmati rice : 0.84
  . chicken fillets : 0.84
  . natural low-fat yogurt : 0.84
  . flora original : 0.84
  . knorr chicken stock cubes : 0.84
  . foccacia : 0.84
  . fresh ginger root : 0.84
  . fresh coriander : 0.83
