In [155]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import feature_extraction
from heapq import nlargest
from functools import reduce
from operator import add
from numpy import array
from scipy.linalg import svd
import copy
import scipy.spatial.distance as dist
import heapq
import random
import numpy
import time
import string
import json
import re

## Returns a list of 1000 recipes with one random ingredient deleted

In [2]:
def clean_data():
    global recipes_before_removing_element
    
    
    f = open('train.json')
    data = json.load(f)

    recipe_list = []

    for i in data:
        #split with comma to get the cuisine.
        split1=str(i).split(',')
        #clean the strings with split and regex
        split2 = split1[1].split(':')
        cuisine = re.sub(r'[.\W]','',split2[1])
        #get the recipe id
        recipe_split = split1[0].split(': ')
        recipe_id=recipe_split[1]
        #keep only the pepperoni guys
        if cuisine=='italian':
            #splitting with [ because ingredients have the only list in the string
            split3=str(i).split('[')
            #final cut
            ingredients=split3[1].split(',')
            #removing ' from the strings
            ingredients = [s.translate(str.maketrans('','',string.punctuation)) for s in ingredients]
            if len(ingredients)>2:
                recipe_list.append(ingredients)
            
    #randomize the recipes
    random.shuffle(recipe_list)
    #get the first 1000 randomized items
    set_of_recipes = recipe_list[:1000]

    #remove whitespaces
    for i,obj in enumerate(set_of_recipes):
        set_of_recipes[i] = [x.strip() for x in set_of_recipes[i]]
    
    recipes_before_removing_element = copy.deepcopy(set_of_recipes)

    
    #remove 1 random ingredient from every recipes
    for i,obj in enumerate(set_of_recipes):
        #randomise the list and remove the first element
        random.shuffle(set_of_recipes[i])
        set_of_recipes[i].pop(0)        

    return set_of_recipes

## Computes Jaccard similarity


In [3]:
def jaccard_similarity(list1, list2):
    set1, set2 = set(list1), set(list2)
    # Jaccard = Intersection(set1,set2) / Union(set1,set2)
    if len(set1 | set2) == 0:
        return 0
    else:
        return len(set1 & set2) / len(set1 | set2)

## Returns the binary matrix of the recipes-ingredients

In [4]:
def create_vectorized_matrix():
    matrix =[]
    for i,obj in enumerate(recipes):
        vector = [0] * len(unique_ingredients)
        current_recipe = recipes[i]
        for j,obj in enumerate(current_recipe):
            if current_recipe[j] in unique_ingredients:
                vector[unique_ingredients.tolist().index(current_recipe[j])] = 1
        matrix.append(vector)
    return matrix

## Returns the most popular score

In [5]:
def most_popular_score():
    # Popular_without_recipes_ingredients is a dict but without the ingredients 
    # of the current recipe of the loop.
    score_list = []
    #popular is a dictionary with : (ingredient,times igredient used in all the recipes)
    popular = dict((x,flattened_recipes_set.count(x)) for x in set(flattened_recipes_set))
    for i in recipes:
        popular_without_recipes_ingredients = list(filter(lambda x: x in i, popular)) 
        score_list.append(popular_without_recipes_ingredients)
    return score_list

## Returns a matrix with all the Jaccard similarities


In [6]:
def compute_jaccard_matrix():
    matrix=[]
    for elem1 in recipes:
        #list with the jaccard of the recipe elem1 with every other
        jaccard_of_recipe = []
        for elem2 in recipes:
            jaccard_of_recipe.append(jaccard_similarity(elem1,elem2))
        matrix.append(jaccard_of_recipe)
    return matrix

## Returns a matrix with all the Jaccard similarities of the ingredients


In [7]:
def compute_jaccard_matrix_ingredients():
    matrix=[]
    for i in unique_ingredients:
        jaccard_of_ingredients = []
        for j in unique_ingredients:
            jaccard_of_ingredients.append(jaccard_similarity(list(i),list(j)))
        matrix.append(jaccard_of_ingredients)
    return matrix

## Returns the n most similar for every recipe or ingredient


In [8]:
def get_n_similar_indexes(n,length,jaccard_matrix):
    similar = []
    for i in range(length):
        a = numpy.array(jaccard_matrix[i])
        #the Jaccard similarity of an item with itself is always 1.
        #we are going to get the n+1 largest values so we can pop out
        #the one with itself.

        #gives the indexes of the n+1 largest values og jaccard
        most_similar = heapq.nlargest(n+1, range(len(a)), a.take)

        #remove the most similar(itself)
        most_similar.pop(0)
        similar.append(most_similar)

    return similar

## Returns the Jaccard similarity of the n most similar recipes for every recipe

In [9]:
def get_n_similar_values(n,length,jaccard_matrix):
    values = []
    for i in range(length):
        #gives the values of jaccard of the n+1 largest values
        jaccard = heapq.nlargest(n+1, jaccard_matrix[i])
        #remove the most similar(itself)
        jaccard.pop(0)
        values.append(jaccard)

    return values

## Computes the UCF score of n similar recipes


In [242]:
def user_based_collaborative_filtering(n,matrix,unique):
    scores = []
    for i,obj in enumerate(recipes):
        scores_1=[]
        ingredients_without_recipe_ingredients = list(set(unique) - set(recipes[i]))
        for j in ingredients_without_recipe_ingredients:
            jaccard_dot_matrix_sum = 0
            jaccard_sum = 0
            index_of_ingredient = unique.tolist().index(j)
            for k in range(n):
                m = set_of_n_similar_recipes[i][k]
                jaccard_dot_matrix_sum += most_similar_jaccard_values[i][k] * matrix[m][index_of_ingredient]
                jaccard_sum += most_similar_jaccard_values[i][k]
            if jaccard_sum != 0:    
                scores_1.append(jaccard_dot_matrix_sum/jaccard_sum)
            else:
                scores_1.append(0)        
        scores.append(scores_1)
    return scores

## Computes the ICF score of n similar ingredients

In [243]:
def item_based_collaborative_filtering(n,matrix,unique):
    scores = []
    for i,obj in enumerate(recipes):
        scores_1=[]
        ingredients_without_current = list(set(unique) - set(recipes[i]))
        for j,obj in enumerate(ingredients_without_current):
            index_of_ingredient = unique.tolist().index(ingredients_without_current[j])
            jaccard_dot_matrix_sum = 0
            jaccard_sum = 0
            for k in range(n):
                m = set_of_n_similar_ingredients[index_of_ingredient][k]
                jaccard_dot_matrix_sum += most_similar_jaccard_values_ingredients[index_of_ingredient][k] * matrix[i][m]
                jaccard_sum += most_similar_jaccard_values_ingredients[index_of_ingredient][k]
            if jaccard_sum != 0:    
                scores_1.append(jaccard_dot_matrix_sum/jaccard_sum)
            else:
                scores_1.append(0)  
        scores.append(scores_1)

    return scores

## MAIN

In [119]:
global most_similar_jaccard_values
global set_of_n_similar_recipes
global recipes
global unique_ingredients
global flattened_recipes_set
global scores
global recipes_before_removing_element
global set_of_recipes


recipes = clean_data()

#make the 2d list 1d so we can count occurences easily
flattened_recipes_set = copy.deepcopy(recipes)
flattened_recipes_set = reduce(lambda x,y :x+y ,flattened_recipes_set)

#remove whitespaces
#flattened_recipes_set = [x.strip() for x in flattened_recipes_set]

#find the unique ingredients list
numpy_of_flattened = numpy.array(flattened_recipes_set)
unique_ingredients = numpy.unique(numpy_of_flattened)

#Create Vectorized Matrix
vectorized_matrix_m = create_vectorized_matrix()

#list that holds every score for every ingredient.
#every line i of the array corresponds to the score
#the ingredients have for the recipe in line i of the recipes array.
most_popular_score_list = most_popular_score()


distance_recipes = compute_jaccard_matrix()
distance_ingredients = compute_jaccard_matrix_ingredients()

print("distance_recipes: row x lengths")
print(len(distance_recipes))
print("x")
print(len(distance_recipes[0]))
print('--------------------')
print("distance_ingredients: row x lengths")
print(len(distance_ingredients))
print("x")
print(len(distance_ingredients[1]))


#distance array now has in line i the jacard similarity of
#the recipe of line i in the recipes set with every other recipe
    

distance_recipes: row x lengths
1000
x
1000
--------------------
distance_ingredients: row x lengths
1249
x
1249


# UCF Scoring

In [308]:
#set_of_n_similar_recipes has in each line i the n most similar recipes
#for the recipe in line i of the recipes array according to the jaccard metric

list_of_lists_of_scores_for_every_n_ucf = []
recipes_length = len(recipes)

##for testing
list_of_lists_of_most_similar_recipes_for_every_n_ucf = []
list_of_lists_of_most_similar_jaccard_values_for_every_n_ucf = []


for i in range(25, 101, 25):
    print("now computing for n=:",i)
    set_of_n_similar_recipes = get_n_similar_indexes(i,recipes_length,distance_recipes)
    most_similar_jaccard_values = get_n_similar_values(i,recipes_length,distance_recipes)
    ucf_scores = user_based_collaborative_filtering(i,vectorized_matrix_m,unique_ingredients)
    print(len(ucf_scores))
    print(len(ucf_scores[1]))

    ##for testing
    list_of_lists_of_most_similar_recipes_for_every_n_ucf.append(set_of_n_similar_recipes)
    list_of_lists_of_most_similar_jaccard_values_for_every_n_ucf.append(most_similar_jaccard_values)
    ##
    
    list_of_lists_of_scores_for_every_n_ucf.append(ucf_scores)
        

now computing for n=: 25
1000
1243
now computing for n=: 50
1000
1243
now computing for n=: 75
1000
1243
now computing for n=: 100
1000
1243


In [284]:
# print("a random recipe with values to test if the jaccard is correct")
# print("88th recipe:")
# print(sorted(recipes[88]))
# print('================')
# print("Jaccard of second recipe with 88th recipe")
# print(distance_recipes[1][88])
# print('================')
a = list_of_lists_of_most_similar_jaccard_values_for_every_n_ucf[1]
b = list_of_lists_of_most_similar_recipes_for_every_n_ucf[1]
# #print(len(a))
# #print(len(b))
# print("5 most similar recipes(indexes) of the 2th recipe")
# print(b[1])
# print("5 most similar jaccard values of the 2th recipe between these:")
# print(a[1])
# print("================")
# # for i in b[0]:
# #     print(i)
# #     print(recipes[i])
# #     print('%%%%%%%%%%%')
# print("=================")
# sum_of_jaccard = sum(a[1])
# print("sum of 5 most similar jaccard values of the 0th recipe between these:")
# print(sum_of_jaccard)
# print("=================")



test_recipe_number = 345

# print("binary matrix values")
# print("M[r',i]")
# for i in b[1]:
#     print("-----------------")
#     print(recipes[i])
#     print("-----------------")
#     for j in sorted(sorted(recipes[test_recipe_number])):
#         print("M[",i,",",j,"]",vectorized_matrix_m[i][unique_ingredients.tolist().index(j)])
#     print("=================")

print("recipe before removing:")
print(sorted(recipes_before_removing_element[test_recipe_number]))
print("recipe:")
print(sorted(recipes[test_recipe_number]))
print('================')
hidden_element = list(set(recipes_before_removing_element[test_recipe_number]) - set(recipes[test_recipe_number]))
print(hidden_element)
print("algorithm found:")
#print("UCF scores for:")
#print(len(list_of_lists_of_scores_for_every_n_ucf[1][test_recipe_number]))
max_value = max(list_of_lists_of_scores_for_every_n_ucf[1][test_recipe_number])
max_index = list_of_lists_of_scores_for_every_n_ucf[1][test_recipe_number].index(max_value)
print(unique_ingredients[max_index])



recipe before removing:
['chopped fresh sage', 'cracked black pepper', 'fresh oregano', 'fresh parsley', 'noodles', 'olive oil', 'parmigiano reggiano cheese', 'partskim ricotta cheese', 'pinenuts', 'salt', 'unsalted butter']
recipe:
['chopped fresh sage', 'cracked black pepper', 'fresh oregano', 'fresh parsley', 'noodles', 'olive oil', 'parmigiano reggiano cheese', 'partskim ricotta cheese', 'pinenuts', 'unsalted butter']
['salt']
algorithm found:
loin pork roast


## ICF Scoring


In [293]:
ingredients_length = len(unique_ingredients)
list_of_lists_of_scores_for_every_n_icf = []

for i in range(25, 101, 25):
    print("Now computing ICF for N=",i)
    set_of_n_similar_ingredients = get_n_similar_indexes(i,ingredients_length,distance_ingredients)
    most_similar_jaccard_values_ingredients = get_n_similar_values(i,ingredients_length,distance_ingredients)

    icf_scores = item_based_collaborative_filtering(i,vectorized_matrix_m,unique_ingredients)
    list_of_lists_of_scores_for_every_n_icf.append(icf_scores)
    

Now computing ICF for N= 25
Now computing ICF for N= 50
Now computing ICF for N= 75
Now computing ICF for N= 100


## SVD

In [307]:
list_of_lists_of_scores_for_every_n_svd_U = []
list_of_lists_of_scores_for_every_n_svd_s = []
list_of_lists_of_scores_for_every_n_svd_VT = []

for i in range(25, 101, 25):
    U, s, VT = svd(vectorized_matrix_m)
    list_of_lists_of_scores_for_every_n_svd_U.append(U)
    list_of_lists_of_scores_for_every_n_svd_s.append(s)
    list_of_lists_of_scores_for_every_n_svd_VT.append(VT)

    


## Precision ucf

In [309]:
for i,obj in enumerate(list_of_lists_of_scores_for_every_n_ucf):
    found_sum = 0
    scores = list_of_lists_of_scores_for_every_n_ucf[i]
    precisions_ucf = []
    
    for j,obj in enumerate(recipes):
        hidden_element = list(set(recipes_before_removing_element[j]) - set(recipes[j]))
        #print(hidden_element)
        #for K=1 we just want the maximum value
        max_value = max(scores[j])
        max_index = scores[j].index(max_value)
        
        ingredient_proposed_by_the_algorithm = unique_ingredients[max_index]

        if ingredient_proposed_by_the_algorithm == hidden_element[0]:
            found_sum += 1
            
    print("iteration:",i)
    print("precision:",found_sum/len(recipes))
    precisions_ucf.append(found_sum/len(recipes))

    
max_value_precisions_ucf = max(precisions_ucf)
max_index_precisions_ucf = precisions_ucf.index(max_value_precisions_ucf)

print(precisions_ucf)

best_n_value_ucf = max_index_precisions_ucf*25 +25

print("Best precision for ucf is computed by:", best_n_value_ucf)
    

iteration: 0
precision: 0.0
iteration: 1
precision: 0.0
iteration: 2
precision: 0.0
iteration: 3
precision: 0.0
[0.0]
Best precision for ucf is computed by: 25


## Precision Icf

In [304]:
for i,obj in enumerate(list_of_lists_of_scores_for_every_n_icf):
    found_sum = 0
    scores = list_of_lists_of_scores_for_every_n_icf[i]
    precisions_icf = []
    
    for j,obj in enumerate(recipes):
    
        hidden_element = list(set(recipes_before_removing_element[j]) - set(recipes[j]))
        
        #for K=1 we just want the maximum value
        max_value = max(scores[j])
        max_index = scores[j].index(max_value)
        
        ingredient_proposed_by_the_algorithm = unique_ingredients[max_index]
        
        if ingredient_proposed_by_the_algorithm == hidden_element[0]:
            found_sum += 1
            
    print("iteration:",i)
    print("precision:",found_sum/len(recipes))
    precisions_icf.append(found_sum/len(recipes))


max_value_precisions_icf = max(precisions_icf)
max_index_precisions_icf = precisions_icf.index(max_value_precisions_icf)

best_n_value_icf = max_index_precisions_icf*25 +25

print("Best precision for icf is computed by N:", best_n_value_icf)
    

iteration: 0
precision: 0.001
iteration: 1
precision: 0.001
iteration: 2
precision: 0.0
iteration: 3
precision: 0.001
Best precision for icf is computed by N: 25
