In [36]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [37]:
recipes_df = pd.read_json("recipes.json")

In [38]:
recipes_df = recipes_df[recipes_df["ingredients"].apply(lambda x: len(x) > 2)]

# Named Entity Recognition

In [51]:
from transformers import *

In [None]:
ner = pipeline("ner", aggregation_strategy="simple", model="davanstrien/deberta-v3-base_fine_tuned_food_ner")

In [39]:
# Using nlk lemmatizer for generating lemmatized words
import nltk
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [54]:
# This function takes recipe list as input and preprocesses it, keeping only ingredients and removing unnecessary words like quantities and processes
def preprocess(ing_list):
    
    text = " ,cup, ".join(ing_list).lower()
    text = text.replace(" ","   ")
    doc_ner = ner(text)
    ings = ""
    count = 0
    for n,i in enumerate(doc_ner):
        if count>0:
            count=count-1
            if count==0:
                ings = ings + ","
            continue
        prev = None

        while n<len(doc_ner) and "FOOD" in doc_ner[n]["entity_group"]:
            if prev and doc_ner[n]["start"] == prev+2:
                ings = ings + " " + doc_ner[n]["word"]
                count = count + 1
                prev = doc_ner[n]["end"]
                n = n + 1

            elif prev and doc_ner[n]["start"] > prev+2:
                ings = ings + "," + doc_ner[n]["word"]
                count = count + 1
                prev = doc_ner[n]["end"]
                n = n + 1

            else:
                ings = ings + "" + doc_ner[n]["word"]
                count = count + 1
                prev = doc_ner[n]["end"]
                n = n + 1
                
    ings = list(set(list(filter(None, ings.split(",")))))
    singular_ings = [clean_spacy(i) for i in ings]
    return singular_ings

In [45]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [53]:
# using spacy lemmatization to convert plural nouns into singular ones
def clean_spacy(text):
    doc = nlp(text)
    clean_text = " ".join([token.lemma_ if token.pos_ == 'NOUN' else token.text for token in doc])
    return clean_text

In [55]:
from tqdm import tqdm

In [56]:
processed_ingredients = []
for i in tqdm(recipes_df["ingredients"]):
    processed_ingredients.append(preprocess(i))

  0%|                                                                                         | 0/9990 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|████████████████████████████████████████████████████████████████████████████| 9990/9990 [2:04:15<00:00,  1.34it/s]


In [57]:
recipes_df["preproc_ingredients"] = processed_ingredients
recipes_df = recipes_df.reset_index()

In [62]:
rec_ing_map = recipes_df[["index","preproc_ingredients"]]
rec_ing_map = rec_ing_map.explode('preproc_ingredients').reset_index(drop=True)

In [65]:
rec_ing_map["preproc_ingredients"] = rec_ing_map["preproc_ingredients"].str.strip()
rec_ing_map = rec_ing_map.groupby('preproc_ingredients')['index'].apply(list).reset_index()

In [80]:
ingredients = pd.DataFrame(list(rec_ing_map["preproc_ingredients"]),columns=["name"])

In [71]:
rec_ing_map = pd.read_json("rec_ing_map.json")

In [70]:
rec_ing_map.to_json("rec_ing_map.json")

# Ingredient similiarity:

In [73]:
import tensorflow_hub as hub

In [74]:
import pickle

In [75]:
# loading universal sentense encoder model
embed_model = hub.load(r"D:\Jupyter\bd_2\project\universal-sentence-encoder")

In [76]:
products_df = pd.read_csv("data/products.csv")

In [77]:
# filtering aisles containing only food related products
products_df = products_df[products_df["aisle_id"].isin([2,4,5,7,9,12,15,14,16,17,18,19,21,24,26,28,29,30,31,32,33,34,35,36,37,39,42,53,63,66,68,69,72,76,81,84,88,89,95,96,98,99,104,105,106,108,110,112,116,117,122,123,128,131])]

In [78]:
products = pd.DataFrame(list(products_df["product_name"]),columns=["name"])

In [79]:
# This function returns universal sentense encoder embeddings
def gen_use_embedding(name):
    embeddings = embed_model(name)
    return embeddings[0]

In [85]:
products["embed"] = products["name"].apply(lambda x: gen_use_embedding([clean_spacy(x.lower())]))

In [86]:
ingredients["embed"] = ingredients["name"].apply(lambda x: gen_use_embedding([x.lower()]))

In [87]:
from collections import Counter

In [88]:
from scipy import spatial

In [89]:
# matching recipe ingredients to instacart products
matched_products=[]
for i,j in tqdm(zip(products["embed"], products["name"])):
    matching_score = {}
    for p,q in zip(ingredients["embed"], ingredients["name"]):
        matching_score[q] = 1 - spatial.distance.cosine(i, p)
    matching_score = Counter(matching_score)
    top_matched_products = matching_score.most_common(5)
    prods = []
    for k,v in top_matched_products:
        if v > 0.7:
            prods.append(k)
    matched_products.append(prods)

19593it [2:07:36,  2.56it/s] 


In [None]:
# tested sbert embeddings but, USE embeddings are giving better results
sbert_model = SentenceTransformer('distilbert-base-nli-mean-tokens')
def gen_sbert_embed(text):
    vectors = model.encode(text)
    return vectors

In [90]:
products["matched_ingredients"] = matched_products

In [96]:
products.to_pickle("products_map.pkl")

In [93]:
prod_ing_map = pd.read_pickle("prod_ing_map.pkl")

In [98]:
prod_ing_map = products[["name","matched_ingredients"]]

In [None]:
for i in prod_ing_map.index:
    if "onion" in prod_ing_map["name"][i].lower():
        print(prod_ing_map["name"][i].lower())
        print(prod_ing_map["matched_ingredients"][i])
        print("\n************************************")

In [100]:
prod_ing_map = prod_ing_map[prod_ing_map['matched_ingredients'].apply(lambda x: len(x) > 0)]

# Recipe recomedation:

In [None]:
rec_ing_map = pd.read_json("rec_ing_map.json")

In [103]:
prod_ing_map = prod_ing_map.explode('matched_ingredients').reset_index(drop=True)

In [104]:
rec_ing_map = rec_ing_map.merge(prod_ing_map, left_on="preproc_ingredients",right_on="matched_ingredients",how="inner")

In [None]:
prods_rec_map = pd.read_json("prods_rec_map")

In [110]:
prods_rec_map

Unnamed: 0,name,recipe_index
0,1% Chocolate Milk,"[354, 422, 528, 635, 664, 704, 719, 748, 778, ..."
1,1% Lowfat Cottage Cheese,"[3020, 4302, 4610, 5647, 5666, 6113, 7043, 922..."
2,1% Milk,"[6542, 27, 31, 35, 36, 39, 40, 54, 55, 58, 59,..."
3,1/4 Pound Burgers,"[3059, 7134]"
4,10 Grain Bread Mix,"[3072, 6438, 7417, 8255, 8388, 9032, 3161]"
...,...,...
10642,for Tots Apple Juice,"[1521, 1892, 2223, 2492, 2940, 3078, 3249, 349..."
10643,gelato Coffee Toffee,"[336, 4264, 5521, 2086, 7332]"
10644,pumpkin spice,"[42, 82, 104, 293, 402, 526, 549, 767, 1057, 1..."
10645,with Olive Oil Mayonnaise,"[295, 358, 562, 1471, 1676, 2357, 3457, 4162, ..."


In [105]:
prods_rec_map = rec_ing_map[["name","index","matched_ingredients"]]

In [106]:
prods_rec_map = prods_rec_map.explode('index').reset_index(drop=True)

In [107]:
prods_rec_map = prods_rec_map.rename(columns={"index": "recipe_index"})

In [111]:
prods_rec_map.to_json("prods_rec_map.json")

In [None]:
for i,j in zip(prods_rec_map["name"],prods_rec_map["recipe_index"]):
    if "tomato" == i.lower():
        print(i)
        print(j)
        print("\n****************************")

In [109]:
prods_rec_map = prods_rec_map.groupby('name')['recipe_index'].apply(list).reset_index()

In [None]:
prods_rec_map = pd.read_json("prods_rec_map")
recipes_df = pd.read_json("recipes.json")

# Inference:

In [113]:
import itertools
import pandas as pd

prods_rec_map = pd.read_json("prods_rec_map")
recipes_df = pd.read_json("recipes.json")

def get_intersection_recipes(input_prod_list):
    recipe_lists = [list(prods_rec_map[prods_rec_map["name"]==i]["recipe_index"].values)[0] for i in input_prod_list]
    common_elements = set(recipe_lists[0]).intersection(*recipe_lists[1:])
    recipes = recipes_df.iloc[list(common_elements),[0,1]]
    return recipes

def get_combinations(input_prod_list):
    combinations = []
    for i in range(2, len(input_prod_list) + 1):
        combinations += itertools.combinations(input_prod_list, i)
    return combinations

def get_recipes(input_prod_list):
    combinations = get_combinations(input_prod_list)
    
    merged_df = pd.concat([get_intersection_recipes(c) for c in combinations])
    recipes_df = merged_df.sample(frac=1)
    return recipes_df.head(10)

In [114]:
input_prod_list = ["Chopped Onion","Tomato","Organic Hass Avocado","Ground Lamb","Pomegranates"]
get_recipes(input_prod_list)

Unnamed: 0,title,link
4807,Spicy rice burritos,https://www.bbcgoodfood.com/recipes/spicy-rice...
7155,Moroccan chard & lamb pan-fry,https://www.bbcgoodfood.com/recipes/moroccan-c...
4360,Aubergine & goat's cheese pasta,https://www.bbcgoodfood.com/recipes/aubergine-...
2097,Lamb scouse,https://www.bbcgoodfood.com/recipes/lamb-scouse
9882,Indian turkey with spinach & new potatoes,https://www.bbcgoodfood.com/recipes/indian-tur...
910,Beef curry,https://www.bbcgoodfood.com/recipes/beef-curry
7805,Russian shashlik with rhubarb sauce,https://www.bbcgoodfood.com/recipes/russian-sh...
8607,Lamb shanks with chickpeas & Moroccan spices,https://www.bbcgoodfood.com/recipes/lamb-shank...
8607,Lamb shanks with chickpeas & Moroccan spices,https://www.bbcgoodfood.com/recipes/lamb-shank...
2526,Sweet potato tacos,https://www.bbcgoodfood.com/premium/sweet-pota...
