In [1]:
import numpy as np
import json
import random
import sklearn.cluster as sk_cluster
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from scipy.spatial import distance
from sklearn.metrics import confusion_matrix
from sklearn.metrics import jaccard_score

In [2]:
global ingredients_sort # list with sorted pairs of kay,values in descending order according to value
global ingredients_dict_index
global ingredients_dict_count
global subset # list of recipes dicts, with missing ingredient recipes
global unique_ingredients # ingredients list
global original_recipes # 1000 recipes in original form, these recipes miss an ingredient in M
global jaccard_recipes # jaccard score matrix for recipes

## Create the data

In [3]:
f = open("train.json")
data = json.load(f)
len(data)

39774

Keep only italian and non empty ingresients lists

In [4]:
subset = [i for i in data if i["cuisine"]=="italian" and
          not i["ingredients"] is None]

Make a list with all the ingredients in our dataset (only one entry for every ingredient)

In [5]:
ingredients_list = [np.array(i["ingredients"]) for i in subset]
unique_ingredients = np.concatenate(ingredients_list)
unique_ingredients = np.sort(np.unique(unique_ingredients))

Pick 1000 random indexes to remove a single ingredient from each recipe that points

In [6]:
random_indexes = random.sample(range(0, len(subset)-1), 1000)

In [7]:
original_recipes = []
for i in random_indexes:
    # Copy the recipe and sort the ingredients list
    temp = subset[i].copy()
    temp["index"] = i
    temp["ingredients"] = np.sort(temp["ingredients"])
    # Save the original recipe and its index
    original_recipes.append(temp)
    # Shuffle the ingredients and remove the last and sort them again
    random.shuffle(subset[i]["ingredients"])
    subset[i]["ingredients"] = np.sort(subset[i]["ingredients"][:-1])
    
    

print(original_recipes[0]["ingredients"])
print(subset[original_recipes[0]["index"]]["ingredients"])

['fresh green bean' 'garlic' 'grated parmesan cheese' 'olive oil'
 'penne pasta' 'pepper' 'pesto' 'plain yogurt' 'red potato' 'salt']
['fresh green bean' 'grated parmesan cheese' 'olive oil' 'penne pasta'
 'pepper' 'pesto' 'plain yogurt' 'red potato' 'salt']


Create italian_recipes * unique_ingredients matrix (This is M matrix that is created from the subset that we removed the ingredients from 1000 recipes)

In [8]:
M = np.zeros((len(subset), len(unique_ingredients)))

for x, r in enumerate(subset):
    for i in r["ingredients"]:
        y = np.where(unique_ingredients == i)
        M[x, y] = 1

In [9]:
len(random_indexes)

1000

In [10]:
R = np.zeros((len(random_indexes), len(unique_ingredients)))
for k, x in enumerate(random_indexes):
    for i in subset[x]["ingredients"]:
        y = np.where(unique_ingredients == i)
        R[k, y] = 1


In [11]:
R.shape

(1000, 2929)

Create a dictionary with the ingredients frequencies.

In [12]:
ingredients_dict_count = {}

for r in subset:
    for i in r["ingredients"]:
        if i in ingredients_dict_count.keys():
            ingredients_dict_count[i] += 1
        else:
            ingredients_dict_count[i] = 1



In [13]:
ingredients_dict_index = {}
for i, obj in enumerate(unique_ingredients):
    ingredients_dict_index[obj] = i

Create a sorted by value list to store  (key, frequency) in descending order.

In [14]:
ingredients_sort = sorted(ingredients_dict_count.items(), key = lambda kv:(kv[1], kv[0]), reverse=True)

Jaccard Similarity Matrix (Recipes)

In [15]:
jaccard_recipes = metrics.pairwise.pairwise_distances(R,M,metric = 'jaccard')
jaccard_recipes = np.subtract(np.ones((R.shape[0],M.shape[0])), jaccard_recipes)
jaccard_recipes = np.absolute(jaccard_recipes)



Jaccard Similarity Matrix (Ingredients)

In [46]:
def DistJaccard(str1, str2):
    str1 = set(str1.split())
    str2 = set(str2.split())
    return float(len(str1 & str2)) / len(str1 | str2)

In [51]:
jaccard_ingredients = np.zeros((len(unique_ingredients), len(unique_ingredients)))
for x,i in enumerate(unique_ingredients):
    for y,j in enumerate(unique_ingredients):
        jaccard_ingredients[x,y] = DistJaccard(i, j)
jaccard_ingredients

array([[1.   , 0.6  , 0.125, ..., 0.   , 0.   , 0.   ],
       [0.6  , 1.   , 0.125, ..., 0.   , 0.   , 0.   ],
       [0.125, 0.125, 1.   , ..., 0.   , 0.   , 0.   ],
       ...,
       [0.   , 0.   , 0.   , ..., 1.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , ..., 0.   , 1.   , 0.5  ],
       [0.   , 0.   , 0.   , ..., 0.   , 0.5  , 1.   ]])

## Score Formulas

### General purpose functions

In [16]:
def score(r, i , formula):
    
    if formula == "mp":
        return mp(r,i)
        # call most popular
    elif formula == "ucf":
        return 1
        # call micf
    elif formula == "icf":
        return 2
        # call icf
    else:
        print("Wrong formula input.")

Returns the missing ingredient of the given recipe.

In [17]:
def get_missing_ingredient(r):
    for k, i in enumerate(subset[original_recipes[r]["index"]]["ingredients"]):
        
        if i != original_recipes[r]["ingredients"][k]:
            return original_recipes[r]["ingredients"][k]
    return original_recipes[r]["ingredients"][-1]

Returns the indexes of the top-N values of in_array.

In [18]:
def get_top_N(in_array, N):
    args = np.argsort(in_array)
    args = list(reversed(args[-N:]))
    return args

### Most Popular Ingredient (MP)

In [19]:
def mp(r, i):
    for p in ingredients_sort:
        if not p[0] in subset[original_recipes[r]["index"]]["ingredients"]:
            print("Expecting: ", get_missing_ingredient(r))
            print("Returned:  ", p[0])
            return p

In [20]:
mp(100, "salt")

Expecting:  extra-virgin olive oil
Returned:   olive oil


('olive oil', 3076)

### User-based Collaborative filtering (UCF)

In [21]:
def ucf(r, n):
    s_r = []
    r_ingredients = subset[original_recipes[r]["index"]]["ingredients"]
    B_n = get_top_N(jaccard_recipes[r], n+1)[1:]
    
    J_r = [jaccard_recipes[r,s] for s in B_n]
    B_n_M = np.array([M[i] for i in B_n])
    sum_of_J = sum(J_r)
    for l in range(B_n_M.shape[1]):
        if not unique_ingredients[l] in r_ingredients:
            dot = np.dot(J_r, B_n_M[:,l])
            s_r.append(dot/ sum_of_J)
    return s_r

In [22]:
ratios = []
N_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90,100]
for n in N_list:
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>..N:", n)
    count = 0
    for i in range(1000):
        index = np.argmax(ucf(i,n))
        if get_missing_ingredient(i) == unique_ingredients[index]:
            #print("N: ", n)
            #print(get_missing_ingredient(i))
            print(unique_ingredients[index])
            print("+++++++")
            count += 1
    ratios.append(float(count)/1000)
    print(ratios[-1])
print(ratios)
#input("Next")        

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>..N: 1
all-purpose flour
+++++++


  if sys.path[0] == '':


arborio rice
+++++++
all-purpose flour
+++++++
active dry yeast
+++++++
lemon
+++++++
chunky pasta sauce
+++++++
arborio rice
+++++++
chicken breast halves
+++++++
all-purpose flour
+++++++


  # Remove the CWD from sys.path while we load stuff.


boneless chicken breast halves
+++++++
all-purpose flour
+++++++
0.011
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>..N: 10
all-purpose flour
+++++++
all-purpose flour
+++++++
garlic
+++++++
arborio rice
+++++++
arborio rice
+++++++
all-purpose flour
+++++++
active dry yeast
+++++++
arborio rice
+++++++
all-purpose flour
+++++++
boneless chicken breast halves
+++++++
all-purpose flour
+++++++
0.011
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>..N: 20
all-purpose flour
+++++++
all-purpose flour
+++++++
garlic
+++++++
arborio rice
+++++++
arborio rice
+++++++
all-purpose flour
+++++++
active dry yeast
+++++++
dried oregano
+++++++
arborio rice
+++++++
all-purpose flour
+++++++
all-purpose flour
+++++++
0.011
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>..N: 30
all-purpose flour
+++++++
all-purpose flour
+++++++
all-purpose flour
+++++++
active dry yeast
+++++++
arborio rice
+++++++
all-purpose flour
+++++++
black pepper
+++++++
all-purpose flour
+++++++
0.008
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>..N: 40
all-purpose

In [64]:
def icf(r, n):
    s_i = []
    s_i_index = []
    r_ingredients = subset[original_recipes[r]["index"]]["ingredients"]
    for x,i in enumerate(unique_ingredients):
        if not i in r_ingredients:
            
            s_i_index.append(x)
            index = ingredients_dict_index[i]
            B_n = get_top_N(jaccard_ingredients[index], n+1)[1:]
            J_i = [jaccard_ingredients[x,s] for s in B_n]
            #print(len(J_i))
            sum_of_J = np.sum(J_i[0])
            dot = np.dot(J_i[0], M[int(random_indexes[r])])
            s_i.append(dot/ sum_of_J)
    return s_i_index, s_i

In [66]:
ind, sco = icf(0, 10)

  from ipykernel import kernelapp as app


In [78]:
np.nonzero(sco[0])

(array([1145, 1314, 1842, 1949, 1953, 1966, 2023, 2161, 2288]),)

In [83]:
unique_ingredients[ind[0]]

'(    oz.) tomato sauce'