This file creates a function that takes in an ingredient ID and gives a list of the other ingredients in the dataset with the highest cosine similarities with the given ingredient.

Dependencies:
* cooccurrence_matrix.csv
* ppmi.csv
* ingr_map.pkl
* Known Substitutions - Sheet1.csv



In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from heapq import nlargest

In [None]:
# read in the co-occurrence matrix. Make sure you have 'cooccurrence_matrix.csv' downloaded
# and uploaded to this colab environment.
# index_col = 0 uses the first column of the CSV as the row labels instead of
# loading it as a data column
cooccurrence_matrix = pd.read_csv('cooccurrence_matrix.csv', index_col = 0)
cooccurrence_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8013,8014,8015,8016,8017,8018,8019,8020,8021,8022
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,1,0
8019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8,0
8020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8021,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,1,8,0,0,0


In [None]:
# read in the PPMI matrix. Make sure you have 'ppmi.csv' uploaded to this colab
# environment.
# index_col = 0 uses the first column of the CSV as the row labels (the same way
# the co-occurrence matrix is read) so it is not loaded as a stray 'Unnamed: 0'
# data column
ppmi_matrix = pd.read_csv('ppmi.csv', index_col = 0)
ppmi_matrix

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,8013,8014,8015,8016,8017,8018,8019,8020,8021,8022
0,0,0.0,0.00000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
1,1,0.0,0.00000,12.18912,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
2,2,0.0,12.18912,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
3,3,0.0,0.00000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
4,4,0.0,0.00000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8018,8018,0.0,0.00000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,9.465406,0.000000,0.0,0.932000,0.0
8019,8019,0.0,0.00000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.852881,0.0
8020,8020,0.0,0.00000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
8021,8021,0.0,0.00000,0.00000,0.0,0.0,0.0,0.0,3.960569,0.0,...,0.0,0.0,0.116585,0.0,0.0,0.932000,0.852881,0.0,0.000000,0.0


In [None]:
# open the pickle document using the pickle package
# make sure you have 'ingr_map.pkl' uploaded to this colab environment
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so only
# load an 'ingr_map.pkl' that comes from a trusted source
# NOTE(review): the name `map` shadows Python's built-in map(); it is kept because
# later cells (e.g. get_ingr_name) read this global by that name
with open('ingr_map.pkl', 'rb') as f:
    map = pickle.load(f)

In [None]:
# define a function that takes in a co-occurrence matrix and a single ingredient id
# and returns the n ingredients (5 by default) with the highest cosine similarity
# to the given ingredient
def get_cosine_sim_top_5(cooccurrence_matrix, ingredient_id, n=5):
  """Return the ingredients whose vectors are most cosine-similar to one ingredient.

  Args:
    cooccurrence_matrix: DataFrame whose columns are labelled with ingredient
      IDs as strings ('0', '1', ...); each column is used as that
      ingredient's vector.
    ingredient_id: ID of the query ingredient (string or int).
    n: maximum number of similar ingredients to return (default 5).

  Returns:
    A dict mapping ingredient ID (string) to cosine similarity (float), ordered
    from most to least similar. It holds fewer than n entries when the matrix
    contains fewer than n other ingredients.
  """
  ingredient_id = str(ingredient_id)
  query_id = int(ingredient_id)
  # pull out the query vector and its length once instead of on every iteration
  target = cooccurrence_matrix[ingredient_id].to_numpy(dtype=float)
  target_norm = np.linalg.norm(target)
  # dictionary that will store the cosine similarity for every other ingredient
  cosine_similarities = {}
  # candidate IDs run from 0 to (length of the query vector - 1)
  for i in range(target.size):
    # skip the query ingredient itself (its cosine similarity would always be 1.0)
    if i == query_id:
      continue
    other = cooccurrence_matrix[str(i)].to_numpy(dtype=float)
    denominator = target_norm * np.linalg.norm(other)
    # an all-zero vector has no direction; score it 0.0, which is what
    # scikit-learn's cosine_similarity reports for zero vectors
    if denominator:
      cosine_similarities[str(i)] = float((target @ other) / denominator)
    else:
      cosine_similarities[str(i)] = 0.0
  # get the IDs of the (at most) n largest cosine similarities, best first
  top_ids = nlargest(n, cosine_similarities, key=cosine_similarities.get)
  return {other_id: cosine_similarities[other_id] for other_id in top_ids}

In [None]:
# test case: find the highest cosine similarities for randomly chosen ID 10
top_5_similar = get_cosine_sim_top_5(cooccurrence_matrix, '10')

In [None]:
top_5_similar

{'2276': 0.6854150065397601,
 '7117': 0.680823856725409,
 '7119': 0.6785389502686257,
 '5331': 0.6661004952977292,
 '2568': 0.6520784916843642}

In [None]:
list(top_5_similar.keys())

['2276', '7117', '7119', '5331', '2568']

In [None]:
list(top_5_similar.values())[1]

0.680823856725409

In this case, we can see that the returned ingredients are not very similar to the query ingredient. We can play around with this a little more, but after trying a couple of IDs, this approach doesn't seem to output very similar ingredients. This is fine though! It is still a valid finding; we just have to do a little more work to conclusively verify whether the ingredients with the highest cosine similarity are actually similar.

Next, I will create a function that takes in an ingredient ID and outputs the name of that ingredient.

In [None]:
# look up an ingredient ID in the ingredient map and return that ingredient's
# name as a string
def get_ingr_name(ingr_id):
  """Return the 'replaced' name of the first map row whose 'id' equals ingr_id.

  Args:
    ingr_id: ingredient ID; anything int() accepts (e.g. '10' or 10).

  Returns:
    The value of the 'replaced' column for the first matching row.

  Raises:
    IndexError: if no row of the map has that id.
  """
  has_requested_id = map['id'] == int(ingr_id)
  matching_names = map.loc[has_requested_id, 'replaced']
  # several raw ingredients can share one id; the first match is used
  return matching_names.iloc[0]

In [None]:
# given a matrix and an ingredient ID, return the ingredient's name together with
# the names and cosine similarity scores of its top 5 most similar ingredients
def get_ingr_and_similar(cooccurrence_matrix, ingr_id):
  """Describe an ingredient and its nearest neighbours by name.

  Args:
    cooccurrence_matrix: matrix passed through to get_cosine_sim_top_5.
    ingr_id: ID of the ingredient to look up.

  Returns:
    A 3-tuple (ingredient name, list of similar ingredient names, list of the
    matching cosine similarity scores); the two lists are in the same order as
    the dictionary returned by get_cosine_sim_top_5.
  """
  ingr_name = get_ingr_name(ingr_id)
  top_matches = get_cosine_sim_top_5(cooccurrence_matrix, ingr_id)
  # the dictionary keeps its order, so names and scores stay aligned
  similar_ingr_names = [get_ingr_name(match_id) for match_id in top_matches]
  similar_ingr_cosine_sim = list(top_matches.values())
  return ingr_name, similar_ingr_names, similar_ingr_cosine_sim

In [None]:
test = get_ingr_and_similar(cooccurrence_matrix, '10')
print(test)

('100 proof vodka', ['dried hibiscus flower', 'tea bag', 'tea leaf', 'peppermint oil', 'family-size tea bag'], [0.6854150065397601, 0.680823856725409, 0.6785389502686257, 0.6661004952977292, 0.6520784916843642])


In [None]:
map.loc[map['replaced'].str.contains('baking powder')]

Unnamed: 0,raw_ingr,raw_words,processed,len_proc,replaced,count,id
3380,gluten free baking powder,4,baking powder,13,baking powder,15261,332
3381,sodium-free baking powder,3,baking powder,13,baking powder,15261,332
3382,baking powder,2,baking powder,13,baking powder,15261,332
3780,double-acting baking powder,3,double-acting baking powder,27,double-acting baking powder,59,2187
4690,baking powder biscuits,3,baking powder biscuit,21,baking powder biscuit,2,333
4699,featherweight baking powder,3,featherweight baking powder,27,featherweight baking powder,2,2581


In [None]:
map.loc[map['replaced'] == 'Baking mi']

Unnamed: 0,raw_ingr,raw_words,processed,len_proc,replaced,count,id


In [None]:
subs = pd.read_csv('Known Substitutions - Sheet1.csv')
subs

Unnamed: 0,Ingredient,Ingredient ID,Substitution,Substitution ID
0,Baking Mix,178,Pancake Mix,1657
1,Baking soda,335,Baking powder,332
2,Broth,773,Bouillon,692
3,Butter,840,Shortening,6473
4,Butter,840,Margarine,4574
5,Butter,840,Vegetable Oil,7557
6,Buttermilk,869,Yogurt,7998
7,Yogurt,7998,Sour Cream,6654
8,White Sugar,7790,Brown Sugar,800
9,Vegetable Oil,7557,Applesauce,170


In [None]:
from statistics import mean

In [None]:
# for a pair of ingredients, return their cosine similarity in the given matrix
# and the average of both ingredients' top 5 cosine similarities
def get_matrix_values(comparison_matrix, ingredient_A_id, ingredient_B_id):
  """Compare two ingredients within one comparison matrix.

  Args:
    comparison_matrix: DataFrame with one column per ingredient, labelled by
      the ingredient ID as a string.
    ingredient_A_id: ID of the first ingredient (converted with str()).
    ingredient_B_id: ID of the second ingredient (converted with str()).

  Returns:
    A 2-tuple (cosine_sim, top_5_average). cosine_sim is the 1x1 array that
    cosine_similarity returns for the two columns. top_5_average is the mean
    of the two ingredients' individual top 5 similarity averages.
  """
  id_a = str(ingredient_A_id)
  id_b = str(ingredient_B_id)
  vector_a = comparison_matrix[id_a]
  vector_b = comparison_matrix[id_b]
  cosine_sim = cosine_similarity([vector_a], [vector_b])
  # element [2] of get_ingr_and_similar's result is the list of top 5 scores
  scores_a = get_ingr_and_similar(comparison_matrix, id_a)[2]
  scores_b = get_ingr_and_similar(comparison_matrix, id_b)[2]
  mean_a = mean(scores_a)
  mean_b = mean(scores_b)
  return cosine_sim, (mean_a + mean_b) / 2

In [None]:
def compare_subs(ppmi_matrix, cooccurrence_matrix, sub_matrix):
  """Score every known substitution pair against both matrices.

  Args:
    ppmi_matrix: PPMI matrix passed to get_matrix_values.
    cooccurrence_matrix: co-occurrence matrix passed to get_matrix_values.
    sub_matrix: DataFrame of known substitutions whose first four columns are,
      by position: ingredient name, ingredient ID, substitution name,
      substitution ID.

  Returns:
    A DataFrame with one row per substitution pair, indexed 0..n-1, with the
    columns 'Ingredient', 'Substitution', 'Cosine_sim_ppmi', 'Avg_top_5_ppmi',
    'Cosine_sim_cooccurrence' and 'Avg_top_5_cooccurrence'.
  """
  columns = ['Ingredient', 'Substitution',
    'Cosine_sim_ppmi', 'Avg_top_5_ppmi',
    'Cosine_sim_cooccurrence', 'Avg_top_5_cooccurrence']
  # collect plain rows and build the DataFrame once at the end; concatenating
  # inside the loop re-copies every earlier row on each iteration
  rows = []
  for pair in sub_matrix.itertuples(index=False, name=None):
    # the substitution sheet is read by column position, not by column name
    ingredient, ingredient_A_id, substitution, ingredient_B_id = pair[:4]
    cosine_sim_ppmi, avg_top_5_ppmi = get_matrix_values(
      ppmi_matrix, ingredient_A_id, ingredient_B_id)
    cosine_sim_cooccurrence, avg_top_5_cooccurrence = get_matrix_values(
      cooccurrence_matrix, ingredient_A_id, ingredient_B_id)
    # get_matrix_values can hand back the similarity wrapped in a 1x1 array;
    # unwrap it to a plain float so the table (and any CSV written from it)
    # holds numbers rather than '[[...]]' values
    rows.append([ingredient, substitution,
      float(np.ravel(cosine_sim_ppmi)[0]), avg_top_5_ppmi,
      float(np.ravel(cosine_sim_cooccurrence)[0]), avg_top_5_cooccurrence])
  return pd.DataFrame(rows, columns=columns)





In [None]:
results = compare_subs(ppmi_matrix, cooccurrence_matrix, subs)

In [None]:
results

Unnamed: 0,Ingredient,Substitution,Cosine_sim_ppmi,Avg_top_5_ppmi,Cosine_sim_cooccurrence,Avg_top_5_cooccurrence
0,Baking Mix,Pancake Mix,[[0.012937480368784803]],0.251662,[[0.23904572186687872]],0.522603
0,Baking soda,Baking powder,[[0.6346868273859088]],0.528539,[[0.9162997594414835]],0.933001
0,Broth,Bouillon,[[0.052294186929986095]],0.166949,[[0.8337547854691273]],0.896149
0,Butter,Shortening,[[0.2553173992037207]],0.383992,[[0.895319724696438]],0.945323
0,Butter,Margarine,[[0.3174086000521493]],0.378078,[[0.9780059783875085]],0.935853
0,Butter,Vegetable Oil,[[0.169379747671163]],0.35563,[[0.9202000812260989]],0.951548
0,Buttermilk,Yogurt,[[0.11665648460975936]],0.29841,[[0.7809560716619397]],0.932486
0,Yogurt,Sour Cream,[[0.07023932924137706]],0.312296,[[0.827810641431059]],0.91656
0,White Sugar,Brown Sugar,[[0.3015398532254311]],0.362675,[[0.9191596599069873]],0.948456
0,Vegetable Oil,Applesauce,[[0.09739394674516398]],0.310514,[[0.760695537869387]],0.959776


In [None]:
test = get_ingr_and_similar(cooccurrence_matrix, '1657')[2]
print(test)

[0.6073667251763306, 0.5852763226172619, 0.5686770937278509, 0.56844592134183, 0.564161618840115]


In [None]:
print(mean(test))

0.5787855363406776


In [None]:
test2 = cosine_similarity([cooccurrence_matrix[str('330')]], [cooccurrence_matrix['1657']])
print(test2)

[[0.43531276]]


In [None]:
test3 = cosine_similarity([ppmi_matrix[str('330')]], [ppmi_matrix['1657']])
print(test3)

[[0.07331638]]


In [None]:
# mount Google Drive in this colab environment so files can be written to it
from google.colab import drive
drive.mount('/content/drive')

# NOTE(review): `path` is not used by the to_csv call below, which writes to a
# hard-coded absolute Drive path - confirm whether this variable is still needed
path = './My Drive/IMT575Final/'
# write the `results` table to a CSV file on the mounted Drive
results.to_csv("/content/drive/My Drive/IMT575Final/cosine_comparison.csv")

Mounted at /content/drive
