In [1]:
import numpy as np
import json
import random
import sklearn.cluster as sk_cluster
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from scipy.spatial import distance
from sklearn.metrics import confusion_matrix
from sklearn.metrics import jaccard_score

In [2]:
global ingredients_sort # list with sorted pairs of kay,values in descending order according to value
global ingredients_dict_sort
global ingredients_dict_count
global subset # list of recipes dicts, with missing ingredient recipes
global unique_ingredients # ingredients list
global original_recipes # 1000 recipes in original form, these recipes miss an ingredient in M
global jaccard_recipes # jaccard score matrix for recipes

## Create the data

In [3]:
f = open("train.json")
data = json.load(f)
len(data)

39774

Keep only italian and non empty ingresients lists

In [4]:
subset = [i for i in data if i["cuisine"]=="italian" and
          not i["ingredients"] is None]

Make a list with all the ingredients in our dataset (only one entry for every ingredient)

In [5]:
ingredients_list = [np.array(i["ingredients"]) for i in subset]
unique_ingredients = np.concatenate(ingredients_list)
unique_ingredients = np.sort(np.unique(unique_ingredients))

Pick 1000 random indexes to remove a single ingredient from each recipe that points

In [6]:
random_indexes = random.sample(range(0, len(subset)-1), 1000)

In [7]:
original_recipes = []
for i in random_indexes:
    # Copy the recipe and sort the ingredients list
    temp = subset[i].copy()
    temp["index"] = i
    temp["ingredients"] = np.sort(temp["ingredients"])
    # Save the original recipe and its index
    original_recipes.append(temp)
    # Shuffle the ingredients and remove the last and sort them again
    random.shuffle(subset[i]["ingredients"])
    subset[i]["ingredients"] = np.sort(subset[i]["ingredients"][:-1])
    
    

print(original_recipes[0]["ingredients"])
print(subset[original_recipes[0]["index"]]["ingredients"])

['chickpeas' 'extra-virgin olive oil' 'sea salt' 'water']
['extra-virgin olive oil' 'sea salt' 'water']


Create italian_recipes * unique_ingredients matrix (This is M matrix that is created from the subset that we removed the ingredients from 1000 recipes)

In [8]:
M = np.zeros((len(subset), len(unique_ingredients)))

for x, r in enumerate(subset):
    for i in r["ingredients"]:
        y = np.where(unique_ingredients == i)
        M[x, y] = 1

In [9]:
len(random_indexes)

1000

In [10]:
R = np.zeros((len(random_indexes), len(unique_ingredients)))
for k, x in enumerate(random_indexes):
    for i in subset[x]["ingredients"]:
        y = np.where(unique_ingredients == i)
        R[k, y] = 1


In [11]:
R.shape

(1000, 2929)

Create a dictionary with the ingredients frequencies.

In [12]:
ingredients_dict_count = {}

for r in subset:
    for i in r["ingredients"]:
        if i in ingredients_dict_count.keys():
            ingredients_dict_count[i] += 1
        else:
            ingredients_dict_count[i] = 1



In [13]:
ingredients_dict_index = {}
for i, obj in enumerate(unique_ingredients):
    ingredients_dict_index[obj] = i

In [14]:
ingredients_dict_count["salt"]

3405

Create a sorted by value list to store  (key, frequency) in descending order.

In [15]:
ingredients_sort = sorted(ingredients_dict_count.items(), key = lambda kv:(kv[1], kv[0]), reverse=True)

In [16]:
ingredients_sort

[('salt', 3405),
 ('olive oil', 3069),
 ('garlic cloves', 1596),
 ('grated parmesan cheese', 1556),
 ('garlic', 1451),
 ('ground black pepper', 1433),
 ('extra-virgin olive oil', 1340),
 ('onions', 1224),
 ('water', 1037),
 ('butter', 1021),
 ('pepper', 951),
 ('all-purpose flour', 904),
 ('fresh basil', 780),
 ('sugar', 755),
 ('dry white wine', 651),
 ('kosher salt', 648),
 ('black pepper', 632),
 ('fresh parsley', 625),
 ('dried oregano', 622),
 ('eggs', 621),
 ('large eggs', 619),
 ('tomatoes', 594),
 ('flat leaf parsley', 583),
 ('unsalted butter', 554),
 ('cooking spray', 486),
 ('parmesan cheese', 467),
 ('fresh lemon juice', 466),
 ('diced tomatoes', 426),
 ('dried basil', 423),
 ('crushed red pepper', 414),
 ('shredded mozzarella cheese', 407),
 ('mozzarella cheese', 393),
 ('carrots', 376),
 ('tomato paste', 373),
 ('tomato sauce', 349),
 ('red bell pepper', 349),
 ('purple onion', 346),
 ('fresh basil leaves', 344),
 ('italian seasoning', 343),
 ('balsamic vinegar', 343),
 (

Jaccard Similarity Matrix (Recipes)

In [17]:
a = np.array([[1,1,1,1],[0, 0, 1, 1]])

a

array([[1, 1, 1, 1],
       [0, 0, 1, 1]])

In [18]:
b = np.array([[0, 0, 0, 1], [1,1,0,1], [1,0,0,1]])
b

array([[0, 0, 0, 1],
       [1, 1, 0, 1],
       [1, 0, 0, 1]])

In [19]:
jaccard_score(a[0], b[0])
#metrics.pairwise.pairwise_distances(a, b,metric = 'jaccard')

0.25

In [20]:
jaccard_recipes = metrics.pairwise.pairwise_distances(R,M,metric = 'jaccard')
jaccard_recipes = np.subtract(np.ones((R.shape[0],M.shape[0])), jaccard_recipes)
jaccard_recipes = np.absolute(jaccard_recipes)



In [21]:
get_top_N(jaccard_recipes[0], 10)

NameError: name 'get_top_N' is not defined

In [None]:
jaccard_recipes[0][3226]

In [None]:
print(original_recipes[0]["ingredients"])
print(subset[original_recipes[0]["index"]]["ingredients"])
get_missing_ingredient(0)

In [None]:
print(subset[3226]["ingredients"])

## Score Formulas

### General purpose functions

In [22]:
def score(r, i , formula):
    
    if formula == "mp":
        return mp(r,i)
        # call most popular
    elif formula == "ucf":
        return 1
        # call micf
    elif formula == "icf":
        return 2
        # call icf
    else:
        print("Wrong formula input.")

In [23]:
def get_missing_ingredient(r):
    for k, i in enumerate(subset[original_recipes[r]["index"]]["ingredients"]):
        
        if i != original_recipes[r]["ingredients"][k]:
            return original_recipes[r]["ingredients"][k]
    return original_recipes[r]["ingredients"][-1]

In [24]:
def get_top_N(in_array, N):
    args = np.argsort(in_array)
    args = list(reversed(args[-N:]))
    return args

### Most Popular Ingredient (MP)

In [25]:
def mp(r, i):
    for p in ingredients_sort:
        if not p[0] in subset[original_recipes[r]["index"]]["ingredients"]:
            print("Expecting: ", get_missing_ingredient(r))
            print("Returned:  ", p[0])
            return p

In [26]:
mp(100, "salt")

Expecting:  fresh tarragon
Returned:   grated parmesan cheese


('grated parmesan cheese', 1556)

### User-based Collaborative filtering (UCF)

In [27]:
def ucf(r, n):
    s_r = []
    r_ingredients = subset[original_recipes[r]["index"]]["ingredients"]
    B_n = get_top_N(jaccard_recipes[r], n)
    J_r = [jaccard_recipes[r,s] for s in B_n]
    B_n_M = np.array([M[i] for i in B_n])
    sum_of_J = sum(J_r)
    for l in range(B_n_M.shape[1]):
        if not unique_ingredients[l] in r_ingredients:
            dot = np.dot(J_r, B_n_M[:,l])
            s_r.append(dot/ sum_of_J)
    return s_r

In [31]:
ratios = []
for n in range(1,100, 10):
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>..N:", n)
    count = 0
    for i in range(1000):
        index = np.argmax(ucf(i,n))
        if get_missing_ingredient(i) == unique_ingredients[index]:
            print("N: ", n)
            print(get_missing_ingredient(i))
            print(unique_ingredients[index])
            print("+++++++")
            count += 1
    ratios.append(float(count)/1000)
print(ratios)
#input("Next")        

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>..N: 1
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>..N: 11
N:  11
fresh lemon juice
fresh lemon juice
+++++++
N:  11
all-purpose flour
all-purpose flour
+++++++
N:  11
all-purpose flour
all-purpose flour
+++++++
N:  11
all-purpose flour
all-purpose flour
+++++++
N:  11
cracked black pepper
cracked black pepper
+++++++
N:  11
all-purpose flour
all-purpose flour
+++++++
N:  11
all-purpose flour
all-purpose flour
+++++++
N:  11
all-purpose flour
all-purpose flour
+++++++
N:  11
basil
basil
+++++++
N:  11
boiling water
boiling water
+++++++
N:  11
all-purpose flour
all-purpose flour
+++++++
N:  11
fresh basil
fresh basil
+++++++
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>..N: 21
N:  21
tomato basil sauce
tomato basil sauce
+++++++
N:  21
all-purpose flour
all-purpose flour
+++++++
N:  21
all-purpose flour
all-purpose flour
+++++++
N:  21
all-purpose flour
all-purpose flour
+++++++
N:  21
lemon
lemon
+++++++
N:  21
all-purpose flour
all-purpose flour
+++++++
N:  21
all-