In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load only the first 5000 rows of the parsed recipe dataset, then summarise
# its columns (dtypes and non-null counts).
df = pd.read_csv("./data/parsed_ds.csv", nrows=5000)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   recipe_id           5000 non-null   int64 
 1   recipe_name         5000 non-null   object
 2   ingredients_parsed  5000 non-null   object
dtypes: int64(1), object(2)
memory usage: 117.3+ KB


In [3]:
# Drop rows that contain any missing value. The index is not reset here, so
# later lookups must stay positional (.iloc) to remain aligned with the rows
# of the TF-IDF matrix built from this frame.
df = df.dropna()

In [4]:
# Extract the ingredient strings as a NumPy unicode array (one document per
# recipe) so they can be passed straight to the vectorizer.
df_ing_parsed = df["ingredients_parsed"].values.astype("U")

In [5]:
# Peek at the ingredient documents that will be vectorized.
df_ing_parsed

array(['sauerkraut granny smith apples onion caraway apple cider rub thai garlic powder pepper pork loin',
       'chicken wings sprigs rosemary garlic olive lemon pepper',
       'focaccia bread basil pesto chicken bell pepper onion jack cheese',
       ...,
       'smithfield® rosemary olive pork tenderloin sandwich bread olive mayonnaise capers lemon pepper arugula havarti cheese dill',
       'beef chuck onion garlic powder barbeque sauce biscuit cheddar cheese',
       'navy beans soaked overnight ketchup molasses onion mustard bacon'],
      dtype='<U386')

In [6]:
# TF-IDF vectorizer with default settings. It is fitted on the recipe corpus
# below and reused by recommend() to encode query strings.
vectorizer = TfidfVectorizer()

In [7]:
# Learn the vocabulary and IDF weights from the corpus and encode every recipe
# as one TF-IDF row (row i corresponds to df.iloc[i]).
tfidf_recipe = vectorizer.fit_transform(df_ing_parsed)

### Create cosine-similarity matrix

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Shape is (number of recipes, number of vocabulary terms).
tfidf_recipe.shape

(5000, 1992)

In [10]:
# Pairwise recipe-to-recipe cosine similarity (n_recipes x n_recipes array).
# NOTE(review): recommend() computes its own query-vs-corpus similarity and
# does not read cosine_sim in the cells visible here -- confirm whether this
# full matrix is still needed.
cosine_sim = cosine_similarity(tfidf_recipe, tfidf_recipe)

In [12]:
# Inspect the similarity matrix: symmetric, with 1.0 on the diagonal
# (every recipe compared with itself).
print(cosine_sim)

[[1.         0.02737998 0.02049161 ... 0.02900325 0.05889979 0.01027878]
 [0.02737998 1.         0.04300483 ... 0.22471369 0.02751899 0.        ]
 [0.02049161 0.04300483 1.         ... 0.06573276 0.05043095 0.01484717]
 ...
 [0.02900325 0.22471369 0.06573276 ... 1.         0.01989012 0.        ]
 [0.05889979 0.02751899 0.05043095 ... 0.01989012 1.         0.01736296]
 [0.01027878 0.         0.01484717 ... 0.         0.01736296 1.        ]]


In [13]:
import numpy as np

In [17]:
def recommend(strOfIngredients, top_n=20):
    """Return the names of the recipes most similar to a list of ingredients.

    The query is encoded with the already-fitted module-level ``vectorizer``
    and compared, by cosine similarity, against every row of ``tfidf_recipe``.
    Row ``i`` of ``tfidf_recipe`` must correspond to ``df.iloc[i]``; the lookup
    is positional so it stays correct even though ``df``'s index is not reset
    after ``dropna()``.

    Parameters
    ----------
    strOfIngredients : str
        Space-separated ingredient names, in the same format as the
        ``ingredients_parsed`` column (e.g. ``"pork loin apples onion"``).
    top_n : int, default 20
        Maximum number of recipe names to return.

    Returns
    -------
    numpy.ndarray
        Up to ``top_n`` values of ``df["recipe_name"]``, ordered from most to
        least similar. If the query shares no term with the fitted vocabulary,
        every similarity is 0 and the returned order is not meaningful.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Encode the query in the same TF-IDF space as the recipes (1 x n_terms).
    ing_v = vectorizer.transform([strOfIngredients])

    # One similarity score per recipe; the result is 1 x n_recipes, so take row 0.
    similarity = cosine_similarity(ing_v, tfidf_recipe)[0]

    # argsort is ascending; reverse it to put the most similar recipes first,
    # then keep only the requested number of positions.
    ranked_positions = np.argsort(similarity)[::-1][:top_n]

    return df["recipe_name"].iloc[ranked_positions].values

In [18]:
# Show the ingredient string of the first recipe; it is used as a query below.
print(df["ingredients_parsed"].iloc[0])

sauerkraut granny smith apples onion caraway apple cider rub thai garlic powder pepper pork loin


In [20]:
# Sanity check: querying with recipe 0's own ingredients should rank that
# recipe first. The second call tries a single-ingredient query.
print(recommend(df["ingredients_parsed"].iloc[0]))
print(recommend("tomato"))

['Pork Loin, Apples, and Sauerkraut'
 'Grilled Pork Tenderloin with Fried Apples'
 'Slow Cooker Lancaster County Pork and Sauerkraut'
 'Slow Cooker German-Style Pork Roast with Sauerkraut and Potatoes'
 'Apple Cheddar Pork' 'Very Moist and Flavorful Roast Turkey'
 'Creamy Cabbage with Apples and Bacon' "Christy's Pork Chops Normandy"
 'Almond Crusted Pork with Apple-Rosemary Sauce'
 'Pork Chops and Sauerkraut' 'Very Old Meatloaf Recipe'
 'Cranberry and Apple Stuffed Pork Chops'
 'German Pork Chops and Sauerkraut' 'Sausage and Sauerkraut'
 'Apple Cranberry Stuffed Pork Chops' 'Apple and Pork Stew'
 'Brined and Stuffed Pork Loin Roast' 'Thanksgiving Meatloaf'
 'Cabbage Apple Soup' 'Roasted Pork Tenderloin']
['Amazing Ground Turkey Tomato Sauce' 'Rice and Beef Stuffed Tomatoes'
 'Johnny Marzetti III' 'Lancaster County Stuffed Green Pepper Boats'
 "Mom's Sweet Spaghetti Sauce" 'Hotdish No One Likes'
 'Stuffed Red Peppers' "Grandma Slattery's Michigan Sauce"
 'Tomato Bacon Grilled Cheese' '