This notebook builds a co-occurrence matrix of the ingredients that appear together in the given list of recipes.

In [None]:
#packages
import pickle
import pandas as pd
import copy
import numpy as np
import time


In [None]:
# Mount Google Drive at /content/drive so files under the Drive path can be read and saved.
# NOTE: google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder prefix for every file this notebook reads or writes: set it to your
# personal Google Drive path. File names are appended with plain string
# concatenation, so a non-empty value must end with the path separator.
path = ''

In [None]:
# location of the pickled ingredient-mapping table
mapping = path + 'ingr_map.pkl'

# read the raw bytes of the file and unpickle them
# NOTE(review): unpickling can execute arbitrary code, so only load a trusted ingr_map.pkl
# NOTE(review): the name `map` hides Python's built-in map() in every later cell
with open(mapping, 'rb') as f:
    map = pickle.loads(f.read())

In [None]:
# distinct ingredient names from the 'replaced' column (kept for later use)
unique_ingredient_names = map['replaced'].unique()

# distinct ingredient ids, sorted in ascending order
unique_ingredient_ids = np.sort(map['id'].unique())

In [None]:
#one row per distinct 'replaced' value, reduced to the name/id pair and ordered by id
unique_ingredients = (
    map[['replaced', 'id']]
    .drop_duplicates(subset='replaced')
    .sort_values(by='id')
)

#just the ids; the enclosing list is kept, so the array is 2-D with a single row
unique_ingredient_ids = np.array([unique_ingredients['id'].to_numpy()])

In [None]:
#show the id array; it is 2-D (a single row) because it was built from a one-element list
unique_ingredient_ids

array([[   0,    1,    2, ..., 8020, 8021, 8022]], dtype=int16)

In [None]:
#load the cleaned recipe ingredient ids from JSON as a pandas Series
recipe_ids_file = path + 'recipe_ingredient_ids.json'
recipe_ingredient_ids = pd.read_json(recipe_ids_file, typ='series')

In [None]:
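#trivial case to confirm the code works (uncomment both lines below to use it)
#note: ids index the matrix directly, so every id must be smaller than unique_ingredient_ids.size;
#      as written, id 5 is out of range for the 5x5 matrix this example produces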
#unique_ingredient_ids = np.array([1,2,3,4,5])
#recipe_ingredient_ids = pd.Series([[2,4,5],[1,2,4]])

In [None]:
#cache the sizes once so they are not recomputed inside the loop
unique_size = unique_ingredient_ids.size
recipe_size = recipe_ingredient_ids.size

#early-stop knob: set it below recipe_size to process only the first break_point recipes
break_point =  recipe_size

#dense (not sparse) square matrix of counts; rows and columns are indexed directly by ingredient id
coocurrence_matrix = np.zeros((unique_size, unique_size), dtype=int)

start_time = time.time()

#cycles through each recipe in the recipe list
for nth_recipe in range(recipe_size):

    if nth_recipe % 10000 == 0:
        print("Got to", nth_recipe)
    if nth_recipe == break_point:
        print('reached break point of', break_point,'. Stopping.')
        break

    #positional lookup, so the loop does not rely on the Series carrying labels 0..n-1
    each_recipe = recipe_ingredient_ids.iloc[nth_recipe]

    #ingredient ids of this recipe as an integer index array
    recipe_ids = np.array([int(ingredient) for ingredient in each_recipe], dtype=np.intp)

    #add 1 at [other, target] for every ordered pair of positions in the recipe;
    #ufunc.at is used (instead of `+=` with fancy indexing) so repeated ids accumulate
    np.add.at(coocurrence_matrix, (recipe_ids[:, None], recipe_ids[None, :]), 1)
    #take back the pairs that matched a position with itself, leaving only
    #"target vs. every other ingredient of the recipe" counts
    np.subtract.at(coocurrence_matrix, (recipe_ids, recipe_ids), 1)

occurrence_matrix2_runtime = (time.time() - start_time)
print("--- %s seconds ---" % occurrence_matrix2_runtime)

Got to 0
Got to 10000
Got to 20000
Got to 30000
Got to 40000
Got to 50000
Got to 60000
Got to 70000
Got to 80000
Got to 90000
Got to 100000
Got to 110000
Got to 120000
Got to 130000
Got to 140000
Got to 150000
Got to 160000
Got to 170000
--- 44.99012899398804 seconds ---


In [None]:
#wrap the count array in a DataFrame (same variable name) and save it as CSV under `path`
csv_target = path + 'cooccurrence_matrix.csv'
coocurrence_matrix = pd.DataFrame(data=coocurrence_matrix)
coocurrence_matrix.to_csv(csv_target)

In [None]:
#preview the first rows of the co-occurrence table
coocurrence_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8013,8014,8015,8016,8017,8018,8019,8020,8021,8022
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#column totals of the co-occurrence table, smallest first, as a one-column frame (column label 0)
x = pd.DataFrame(coocurrence_matrix.sum().sort_values())

#keep only the ingredients whose total is below 100
x.loc[x[0] < 100]

Unnamed: 0,0
4036,0
3995,0
3017,0
4450,0
4542,0
...,...
1392,99
2928,99
2073,99
4178,99


In [None]:
#cast every ingredient id of every recipe to a plain int
recipe_ingredient_ids = recipe_ingredient_ids.apply(
    lambda ingredients: [int(ingredient) for ingredient in ingredients]
)

#recipe stored under label 169259 (presumably a chocolate-chip recipe, going by the name; confirm)
choc_chip = recipe_ingredient_ids[169259]

#number of ingredients in that recipe besides any single one of them
len(choc_chip) - 1

12