TODO: Add Title
TODO: Add TOC
TODO: Add Goals

In this notebook, we will load the cleaned dataset created in the first notebook (full_format_recipes_cleaned.csv) and do preprocessing on the data to get it ready for modeling.

In [75]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from sklearn.feature_extraction.text import CountVectorizer


In [76]:
f"pandas version: {pd.__version__}"

'pandas version: 2.1.4'

## Load the cleaned data

In [77]:
df = pd.read_csv('../data/interim/full_recipes_cleaned_2.csv')
df.shape

(14526, 7)

In [78]:
df.head()

Unnamed: 0,recipeId,calories,rating,title,directionsStr,categoriesStr,ingredientsStr
0,0,426.0,2.5,"Lentil, Apple, and Turkey Wrap","['1. Place the stock, lentils, celery, carrot,...","['Sandwich', 'Bean', 'Fruit', 'Tomato', 'turke...",['4 cups low-sodium vegetable or chicken stock...
1,1,403.0,4.375,Boudin Blanc Terrine with Red Onion Confit,['Combine first 9 ingredients in heavy medium ...,"['Food Processor', 'Onion', 'Pork', 'Bake', 'B...","['1 1/2 cups whipping cream', '2 medium onions..."
2,2,165.0,3.75,Potato and Fennel Soup Hodge,['In a large heavy saucepan cook diced fennel ...,"['Soup/Stew', 'Dairy', 'Potato', 'Vegetable', ...","['1 fennel bulb (sometimes called anise), stal..."
3,4,547.0,3.125,Spinach Noodle Casserole,['Preheat oven to 350°F. Lightly grease 8x8x2-...,"['Cheese', 'Dairy', 'Pasta', 'Vegetable', 'Sid...","['1 12-ounce package frozen spinach soufflé, t..."
4,5,948.0,4.375,The Best Blts,"['Mix basil, mayonnaise and butter in processo...","['Sandwich', 'Food Processor', 'Tomato', 'Kid-...",['2 1/2 cups (lightly packed) fresh basil leav...


In [79]:
df.set_index('recipeId')

Unnamed: 0_level_0,calories,rating,title,directionsStr,categoriesStr,ingredientsStr
recipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,426.0,2.500,"Lentil, Apple, and Turkey Wrap","['1. Place the stock, lentils, celery, carrot,...","['Sandwich', 'Bean', 'Fruit', 'Tomato', 'turke...",['4 cups low-sodium vegetable or chicken stock...
1,403.0,4.375,Boudin Blanc Terrine with Red Onion Confit,['Combine first 9 ingredients in heavy medium ...,"['Food Processor', 'Onion', 'Pork', 'Bake', 'B...","['1 1/2 cups whipping cream', '2 medium onions..."
2,165.0,3.750,Potato and Fennel Soup Hodge,['In a large heavy saucepan cook diced fennel ...,"['Soup/Stew', 'Dairy', 'Potato', 'Vegetable', ...","['1 fennel bulb (sometimes called anise), stal..."
4,547.0,3.125,Spinach Noodle Casserole,['Preheat oven to 350°F. Lightly grease 8x8x2-...,"['Cheese', 'Dairy', 'Pasta', 'Vegetable', 'Sid...","['1 12-ounce package frozen spinach soufflé, t..."
5,948.0,4.375,The Best Blts,"['Mix basil, mayonnaise and butter in processo...","['Sandwich', 'Food Processor', 'Tomato', 'Kid-...",['2 1/2 cups (lightly packed) fresh basil leav...
...,...,...,...,...,...,...
20125,28.0,3.125,Parmesan Puffs,['Beat whites in a bowl with an electric mixer...,"['Mixer', 'Cheese', 'Egg', 'Fry', 'Cocktail Pa...","['2 large egg whites', '3 oz Parmigiano-Reggia..."
20126,671.0,4.375,Artichoke and Parmesan Risotto,['Bring broth to simmer in saucepan.Remove fro...,"['Side', 'Kid-Friendly', 'High Fiber', 'Dinner...",['5 1/2 cups (or more) low-salt chicken broth'...
20127,563.0,4.375,Turkey Cream Puff Pie,"['Using a sharp knife, cut a shallow X in bott...","['Onion', 'Poultry', 'turkey', 'Vegetable', 'B...","['1 small tomato', '1 small onion, finely chop..."
20128,631.0,4.375,Snapper on Angel Hair with Citrus Cream,['Heat 2 tablespoons oil in heavy medium skill...,"['Milk/Cream', 'Citrus', 'Dairy', 'Fish', 'Gar...","['4 tablespoons olive oil', '4 shallots, thinl..."


In [80]:
# confirm that there are no null values or duplicated values
print(f"Null values: {df.isna().sum().sum()}")
print(f"Duplicated rows: {df.duplicated().sum()}")

Null values: 0
Duplicated rows: 0


## Vectorize

### Define a custom tokenizer

In [81]:
# Remove units of measurements such as teaspoons, cups, ounces etc. Full list at https://en.wikibooks.org/wiki/Cookbook:Units_of_measurement
measurements = set(line.strip() for line in open('../data/interim/measurement_list.txt'))

# Remove extra adjectives like 'baked', 'thawed', 'cleaned' etc.
extra_adjectives = set(line.strip() for line in open('../data/interim/extra_adjectives_list.txt'))

# Remove some extra words like 'assorted', 'approximately' etc. QUESTION: Is there a smart way to remove the top 100 such words?
extra_words = set(line.strip() for line in open('../data/interim/extra_words_list.txt'))

In [82]:
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS

"""
custom tokenizer examples are in 0604_nlp_part2a_beta and 0531_text_vect_redux and 0531_Text_Data
"""
my_stops = set(ENGLISH_STOP_WORDS) | measurements | extra_adjectives | extra_words

def my_tokenizer(text):
    # convert to lowercase
    text = text.lower()
    # break into characters and weed out punctuation etc.  (include space!)
    chars = list(char for char in text if char in "abcdefghijklmnopqrstuvwxyz ")
    # make back into a single string
    text = "".join(chars)
    # break into words and weed out stop words and short words < 3 characters
    text = list(word for word in text.split() if word not in my_stops and len(word) >=3)
    return text

my_tokenizer(df['ingredientsStr'][1])

['whipping',
 'cream',
 'onions',
 'salt',
 'bay',
 'leaves',
 'cloves',
 'garlic',
 'clove',
 'pepper',
 'nutmeg',
 'thyme',
 'shallots',
 'butter',
 'boneless',
 'center',
 'pork',
 'loin',
 'sinew',
 'cut',
 'chunks',
 'chilled',
 'eggs',
 'purpose',
 'flour',
 'tawny',
 'port',
 'currants',
 'lettuce',
 'leaves',
 'peppercorns',
 'fresh',
 'parsley',
 'bay',
 'leaves',
 'french',
 'bread',
 'baguette',
 'slices',
 'olive',
 'oil',
 'red',
 'onions',
 'halved',
 'currants',
 'red',
 'wine',
 'vinegar',
 'canned',
 'chicken',
 'broth',
 'fresh',
 'thyme',
 'sugar']

### Vectoize ingredientsStr using the custom tokenizer

In [83]:
vect = CountVectorizer(tokenizer=my_tokenizer,
                       min_df=5)
vect.fit(df['ingredientsStr'])
vocab = vect.get_feature_names_out()
text_transformed = vect.transform(df['ingredientsStr'])



In [84]:
len(vocab)

2168

In [85]:
type(vocab)

numpy.ndarray

In [86]:
#vocab.tofile("vocab",sep=" ", format="%s")

In [87]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# vectorizer = TfidfVectorizer(stop_words = "english", min_df=2)
# vectorizer = TfidfVectorizer(stop_words = "english")
# vectorizer = CountVectorizer(stop_words = "english")
vectorizer = CountVectorizer(tokenizer=my_tokenizer,
                       min_df=5)

ingredients_matrix = vectorizer.fit_transform(df['ingredientsStr'])

In [88]:
ingredients_matrix.shape

(14526, 2168)

In [89]:
ingredients_matrix

<14526x2168 sparse matrix of type '<class 'numpy.int64'>'
	with 344380 stored elements in Compressed Sparse Row format>

In [90]:
# 'Artichoke and Parmesan Risotto' and 'Chicken Parmesan' are two recipes we will use to test for now
df[df['title'].str.contains('Chicken Parmesan', na=False)]

Unnamed: 0,recipeId,calories,rating,title,directionsStr,categoriesStr,ingredientsStr
4918,6368,1842.0,5.0,Chicken Parmesan,['Place breadcrumbs and flour in 2 separate sh...,"['Chicken', 'Tomato', 'Broil', 'Kid-Friendly',...","['2 cups fine dry breadcrumbs', '1 cup all-pur..."
7482,9873,3392.0,3.75,Chicken Parmesan Heros,['Heat olive oil in a 4- to 5-quart heavy sauc...,"['Sandwich', 'Cheese', 'Chicken', 'Poultry', '...","['3 tablespoons olive oil', '1 small onion, fi..."
9982,13372,610.0,4.375,New Chicken Parmesan,['Preheat oven to 500° F. Whisk first 3 ingred...,"['Chicken', 'Tomato', 'Roast', 'Kid-Friendly',...","['1/3 cup extra-virgin olive oil', '2 large ga..."
12825,17557,917.0,5.0,Quick Baked Chicken Parmesan,['Arrange racks in top and bottom of oven and ...,"['22-Minute Meals', 'Chicken', 'Parmesan', 'To...","['2 large eggs', '1 1/2 cups breadcrumbs or pa..."


In [91]:
df[df['title'].str.contains('Chicken', na=False)]

Unnamed: 0,recipeId,calories,rating,title,directionsStr,categoriesStr,ingredientsStr
25,35,625.0,3.750,Aztec Chicken,['Melt 2 tablespoons butter with vegetable oil...,"['Chicken', 'Olive', 'Onion', 'Sauté', 'Dinner...","['6 tablespoons (3/4 stick) chilled butter', '..."
37,53,1203.0,5.000,Pancetta Roast Chicken with Walnut Stuffing,['Preheat oven to 400°F. Melt 1/4 cup butter i...,"['Chicken', 'Roast', 'High Fiber', 'Dinner', '...","['8 tablespoons (1 stick) butter, divided', 'C..."
61,80,1172.0,4.375,"Braised Chicken and Rice with Orange, Saffron,...",['Rinse the rice in a sieve under cold running...,"['Chicken', 'Citrus', 'Fruit', 'Nut', 'Poultry...","['1 1/2 cups brown basmati rice', '1/4 cup oli..."
63,82,682.0,4.375,Chicken in Green Pumpkin-Seed Sauce,['Bring all ingredients to boil in large pot. ...,"['Chicken', 'Low/No Sugar', 'Cinco de Mayo', '...","['5 cups water', '6 chicken thighs with skin a..."
67,87,1143.0,0.000,Roast Chicken With Sorghum and Squash,['Bring 5 cups water to a boil in a medium pot...,"['Bon Appétit', 'Dinner', 'Chicken', 'Grains',...","['Kosher salt', '1 cup sorghum', '1/2 large bu..."
...,...,...,...,...,...,...,...
14445,20007,936.0,4.375,"Braised Chicken with Smoked Ham, Chestnuts, an...","['Bring water to a simmer in a small saucepan,...","['Chicken', 'Ginger', 'Braise', 'Marinate', 'D...","['2 3/4 cups water', '12 dried Chinese black m..."
14473,20050,356.0,3.750,Hot Chicken Salad,['1. Preheat the oven to 375°F. Spray a 13-by-...,"['Cheese', 'Chicken', 'Nut', 'Poultry', 'Bake'...","['2 cups cooked chicken breast meat, cubed (Yo..."
14475,20052,878.0,4.375,Chicken Tetrazzini,"['Bring chicken bones, broth, carrot, onion, c...","['Chicken', 'Mushroom', 'Pasta', 'Bake', 'Supe...",['1 to 1 1/2 pound chicken bones (from 2 cooke...
14497,20091,1096.0,3.750,Chicken with Raisins and Lemon,['Arrange chicken in single layer in large Dut...,"['Chicken', 'Potato', 'Poultry', 'Lemon', 'Rai...","['1 3 1/2-pound chicken, cut into 8 pieces', '..."


## Method 1 : Similarity matrix

Got these steps from the Notebook for the upcoming Recommendation Systems class.

In [92]:
ingredients_matrix[(df['title'] == 'Chicken Parmesan').values].todense().squeeze()

matrix([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [93]:
from sklearn.metrics.pairwise import cosine_similarity

recipe_1 = ingredients_matrix[(df['title'] == 'Chicken Parmesan').values,]
recipe_2 = ingredients_matrix[(df['title'] == 'Chicken Parmesan Heros').values,]

print("Similarity:", cosine_similarity(recipe_1, recipe_2)) # Notice the result is a 2D 1X1 array, so to grab
                                                          # the number we will need to index

Similarity: [[0.34776522]]


Using cosine_similarity, 'Chicken Parmesan' and 'Chicken Parmesan Heros' recipes are 34% similar.

In [94]:
recipe_3 = ingredients_matrix[(df['title'] == 'Artichoke and Parmesan Risotto').values,]
print("Similarity:", cosine_similarity(recipe_1, recipe_3))

Similarity: [[0.20628425]]


Using cosine_similarity, 'Chicken Parmesan' and 'Artichoke and Parmesan Risotto' recipes are 20% similar.

Looking at the actual recipes in the dataset this output makes sense.

Let's create a similarity matrix by doing cosine_similarity on the entire ingredients sparse matrix.

In [95]:
from sklearn.metrics.pairwise import cosine_similarity
similarities = cosine_similarity(ingredients_matrix, dense_output=False)

In [96]:
# Check the shape
# rows and columns should be equal, and the number of movies we started with (rows)
similarities.shape

(14526, 14526)

In [97]:
# Test with a sample recipe
df[df['title'] == 'Chicken Parmesan']

Unnamed: 0,recipeId,calories,rating,title,directionsStr,categoriesStr,ingredientsStr
4918,6368,1842.0,5.0,Chicken Parmesan,['Place breadcrumbs and flour in 2 separate sh...,"['Chicken', 'Tomato', 'Broil', 'Kid-Friendly',...","['2 cups fine dry breadcrumbs', '1 cup all-pur..."


In [98]:
# Get the column based upon the index
recipe_index = df[df['title'] == 'Chicken Parmesan'].index

# Create a dataframe with the movie titles
sim_df = pd.DataFrame({'recipe': df['title'],
                       'similarity': np.array(similarities[recipe_index, :].todense()).squeeze()})

In [99]:
# Return the top 10 most similar recipes
sim_df.sort_values(by='similarity', ascending=False).head(10)

Unnamed: 0,recipe,similarity
4918,Chicken Parmesan,1.0
12825,Quick Baked Chicken Parmesan,0.553191
6860,Chicken and Dumplings with Mushrooms,0.522931
11718,Fried Chicken Biscuits,0.520939
13533,Fusilli with Shrimp and Paneed Chicken,0.517769
1937,East-West Barbecued Chicken,0.506168
8877,"Pepper, Rosemary, and Cheese Bread",0.505992
7471,Fried Chicken Thighs with Cheesy Grits,0.503038
4490,Parmesan Muffins,0.501956
14180,BA's Best Eggplant Parmesan,0.500428


With TFIDF Vectorizer:

| recipe |                                       similarity |          |
|-------:|-------------------------------------------------:|----------|
|   4918 |                                 Chicken Parmesan | 1.000000 |
|  11718 |                           Fried Chicken Biscuits | 0.424981 |
|  12825 |                     Quick Baked Chicken Parmesan | 0.424046 |
|   1165 |                          Mozzarella Pesto Spread | 0.412968 |
|   9877 |         Rigatoni with Cheese and Italian Sausage | 0.360720 |
|  14180 |                      BA's Best Eggplant Parmesan | 0.356189 |
|   5786 |                         Horseradish-Yogurt Sauce | 0.351933 |
|   3927 | Chunky Two-Cheese Potatoes with Garlic and Pesto | 0.333565 |
|  13533 |           Fusilli with Shrimp and Paneed Chicken | 0.331478 |
|   3123 |    Spicy Lamb Pizza With Parsley–Red Onion Salad | 0.328433 |

Results with CountVectorizer seem better:

|       |                                            recipe | similarity |
|------:|--------------------------------------------------:|------------|
|  4918 |                                  Chicken Parmesan |   1.000000 |
| 13533 |            Fusilli with Shrimp and Paneed Chicken |   0.587137 |
| 12480 |                          Spicy Oven-Fried Chicken |   0.574641 |
| 11718 |                            Fried Chicken Biscuits |   0.559149 |
| 14180 |                       BA's Best Eggplant Parmesan |   0.557207 |
|  7749 |            Lamb and Eggplant Casserole (Moussaka) |   0.556499 |
| 11245 |                                             Pinon |   0.548443 |
|  9785 | Breaded Skinless Fish Fillets with Red Pepper ... |   0.540222 |
|  6860 |              Chicken and Dumplings with Mushrooms |   0.539425 |
| 11336 | Crispy Chicken Sandwich with Buttermilk Slaw a... |   0.539164 |

TODO: Ana check these results - something does not make sense, should I try with CountVectorizer? Still getting some weird results like Pinon.

Tried with custom tokenizer that removed most measurements and adjectives - TODO see if there is a library for this e.g. https://stackoverflow.com/questions/33587667/extracting-all-nouns-from-a-text-file-using-nltk

The results are better now:
|   |       |                                 recipe | similarity |
|---|------:|---------------------------------------:|------------|
|   |  4918 |                       Chicken Parmesan |   1.000000 |
|   | 12825 |           Quick Baked Chicken Parmesan |   0.553191 |
|   |  6860 |   Chicken and Dumplings with Mushrooms |   0.522931 |
|   | 11718 |                 Fried Chicken Biscuits |   0.520939 |
|   | 13533 | Fusilli with Shrimp and Paneed Chicken |   0.517769 |
|   |  1937 |            East-West Barbecued Chicken |   0.506168 |
|   |  8877 |     Pepper, Rosemary, and Cheese Bread |   0.505992 |
|   |  7471 | Fried Chicken Thighs with Cheesy Grits |   0.503038 |
|   |  4490 |                       Parmesan Muffins |   0.501956 |
|   | 14180 |            BA's Best Eggplant Parmesan |   0.500428 |

In [100]:
selector = (df['title'] == 'Chicken Parmesan') | (df['title'] == 'Fried Chicken Biscuits') | (df['title'] == 'Pinon')
df.loc[selector,['title', 'ingredientsStr']].values

array([['Chicken Parmesan',
        '[\'2 cups fine dry breadcrumbs\', \'1 cup all-purpose flour\', \'4 large eggs\', \'1 cup whole milk\', \'8 small skinless, boneless chicken thighs, pounded to 1/2" thickness\', \'Kosher salt, freshly ground pepper\', \'N/A freshly ground pepper\', \'8 tablespoons olive oil\', \'8 tablespoons prepared sun-dried tomato pesto\', \'1 pound fresh mozzarella, cut into 8 slices\', \'1/2 teaspoon crushed red pepper flakes\', \'4 cups prepared marinara sauce, warmed\', \'Finely grated Parmesan (for serving)\']'],
       ['Pinon',
        "['1 medium onion', '1/2 small green bell pepper', '1/2 small red bell pepper', 'a 14- to 16-ounce can whole tomatoes', '1/3 cup drained pimiento-stuffed green olives', '1 pound ground beef chuck', '1/4 teaspoon salt', '1/4 teaspoon freshly ground black pepper', '1/2 cup tomato sauce', '2 tablespoons raisins', '1 tablespoon cider vinegar', '2 bay leaves', '1/4 teaspoon ground achiote (optional)', '6 semi-ripe (yellow with so

In [101]:
# Check for Aritchoke
# Get the column based upon the index
recipe_index_2 = df[df['title'] == 'Artichoke and Parmesan Risotto'].index

# Create a dataframe with the movie titles
sim_df_2 = pd.DataFrame({'recipe': df['title'],
                       'similarity': np.array(similarities[recipe_index_2, :].todense()).squeeze()})

# Return the top 10 most similar recipes
sim_df_2.sort_values(by='similarity', ascending=False).head(10)

Unnamed: 0,recipe,similarity
14522,Artichoke and Parmesan Risotto,1.0
6459,Asparagus Risotto,0.75724
10936,Risotto with Squash and Pancetta,0.700301
5060,Shrimp Risotto with Baby Spinach and Basil,0.6994
4857,Radicchio Risotto,0.659966
10112,Porcini Mushroom Risotto,0.646762
11204,Asparagus and Leek Risotto with Prosciutto,0.625779
5028,Celery Root Risotto and Pesto,0.625463
11013,Spinach Risotto,0.625463
8493,"Butternut Squash, Rosemary, and Blue Cheese Ri...",0.625


Results look satisfactory.

## Method 2 - Using NearestNeighbors

In [102]:
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors(n_neighbors=11, metric='cosine')
model.fit(ingredients_matrix)


In [103]:
recipe_index = df[df['title'] == 'Chicken Parmesan'].index
distances, indices = model.kneighbors(ingredients_matrix[recipe_index])

In [104]:
# recipe_title = df.loc[4918, ['title']]
recipe_titles = []
for id in indices[0]:
    recipe_titles.append(df.loc[id, ['title']])
print(recipe_titles)

[title    Chicken Parmesan
Name: 4918, dtype: object, title    Quick Baked Chicken Parmesan
Name: 12825, dtype: object, title    Chicken and Dumplings with Mushrooms
Name: 6860, dtype: object, title    Fried Chicken Biscuits
Name: 11718, dtype: object, title    Fusilli with Shrimp and Paneed Chicken
Name: 13533, dtype: object, title    East-West Barbecued Chicken
Name: 1937, dtype: object, title    Pepper, Rosemary, and Cheese Bread
Name: 8877, dtype: object, title    Fried Chicken Thighs with Cheesy Grits
Name: 7471, dtype: object, title    Parmesan Muffins
Name: 4490, dtype: object, title    BA's Best Eggplant Parmesan
Name: 14180, dtype: object, title    Chile-and-Olive-Oil-Fried Egg with Avocado and...
Name: 627, dtype: object]


In [105]:
recipe_index_2 = df[df['title'] == 'Artichoke and Parmesan Risotto'].index

distances2, indices2 = model.kneighbors(ingredients_matrix[recipe_index_2])
recipe_titles2 = []
for id2 in indices2[0]:
    recipe_titles2.append(df.loc[id2, ['title']])
print(recipe_titles2)

[title    Artichoke and Parmesan Risotto
Name: 14522, dtype: object, title    Asparagus Risotto
Name: 6459, dtype: object, title    Risotto with Squash and Pancetta
Name: 10936, dtype: object, title    Shrimp Risotto with Baby Spinach and Basil
Name: 5060, dtype: object, title    Radicchio Risotto
Name: 4857, dtype: object, title    Porcini Mushroom Risotto
Name: 10112, dtype: object, title    Asparagus and Leek Risotto with Prosciutto
Name: 11204, dtype: object, title    Celery Root Risotto and Pesto
Name: 5028, dtype: object, title    Spinach Risotto
Name: 11013, dtype: object, title    Butternut Squash, Rosemary, and Blue Cheese Ri...
Name: 8493, dtype: object, title    Fontina Risotto Cakes with Fresh Chives
Name: 8366, dtype: object]


Results seem to be going in the right direction.

## Trying NearestNeighbors on categoriesStr

### Vectorize categoriesStr using the default tokenizer

In [106]:
# NOTE Using default tokenizer here
vect2 = CountVectorizer(stop_words="english")
vect2.fit(df['categoriesStr'])
vocab_categories = vect2.get_feature_names_out()
categories_matrix = vect2.transform(df['categoriesStr'])

In [107]:
len(vocab_categories)

701

In [108]:
vocab_categories

array(['22', '30', 'added', 'advance', 'alabama', 'alaska', 'alcoholic',
       'almond', 'amaretto', 'anchovy', 'angeles', 'anise', 'anniversary',
       'anthony', 'aperitif', 'app', 'appetizer', 'apple', 'appétit',
       'apricot', 'arizona', 'armagnac', 'artichoke', 'arugula', 'asian',
       'asparagus', 'aspen', 'atlanta', 'australia', 'avocado',
       'backyard', 'bacon', 'bake', 'banana', 'barbecue', 'barley',
       'basil', 'bass', 'bastille', 'bbq', 'beach', 'bean', 'beef',
       'beer', 'beet', 'bell', 'berry', 'beverly', 'birthday', 'biscuit',
       'bitters', 'blackberry', 'blender', 'blue', 'blueberry', 'boil',
       'boiler', 'bok', 'bon', 'boston', 'bourbon', 'bourdain', 'bowl',
       'braise', 'bran', 'brandy', 'bread', 'breadcrumbs', 'breakfast',
       'brie', 'brine', 'brisket', 'broccoli', 'broil', 'brooklyn',
       'brown', 'brownie', 'brunch', 'brussel', 'buffalo', 'buffet',
       'bulgaria', 'bulgur', 'burrito', 'butter', 'buttermilk',
       'butternut

In [109]:
categories_matrix.shape

(14526, 701)

In [110]:
model_cat = NearestNeighbors(n_neighbors=11, metric='cosine')
model_cat.fit(categories_matrix)
distances_cat1, indices_cat1 = model_cat.kneighbors(categories_matrix[recipe_index])
recipe_cat_titles1 = []
for idc1 in indices_cat1[0]:
    recipe_cat_titles1.append(df.loc[idc1, ['title']])
print(recipe_cat_titles1)


[title    Chicken Parmesan
Name: 4918, dtype: object, title    New Chicken Parmesan
Name: 9982, dtype: object, title    Chicken Schnitzel with Chile Cherry Tomatoes a...
Name: 7219, dtype: object, title    Chorizo Bolognese with Buffalo Mozzarella
Name: 6822, dtype: object, title    Broiled Chicken, Romaine, and Tomato Bruschetta
Name: 4994, dtype: object, title    Green Mountain Maple Barbecued Chicken
Name: 292, dtype: object, title    Baked Beans with Slab Bacon and Breadcrumbs
Name: 139, dtype: object, title    Turkey Burritos with Salsa and Cilantro
Name: 2568, dtype: object, title    Eggplant Parmesan With Fresh Mozzarella
Name: 10016, dtype: object, title    Lemon Chicken Cutlets
Name: 4441, dtype: object, title    Fried-Egg Caesar with Sun-Dried Tomatoes and P...
Name: 9787, dtype: object]


In [111]:
distances_cat2, indices_cat2 = model_cat.kneighbors(categories_matrix[recipe_index_2])
recipe_cat_titles2 = []
for idc2 in indices_cat2[0]:
    recipe_cat_titles2.append(df.loc[idc2, ['title']])
print(recipe_cat_titles2)

[title    Artichoke and Parmesan Risotto
Name: 14522, dtype: object, title    Poached Salmon with Artichoke Confit
Name: 1848, dtype: object, title    Roast Chicken with Rosemary, Lemon, and Honey
Name: 11438, dtype: object, title    Potato Salad with 7-Minute Eggs and Mustard Vi...
Name: 11357, dtype: object, title    Roast Chicken With Harissa And Schmaltz
Name: 12453, dtype: object, title    Slow-Roasted Char with Fennel Salad
Name: 1021, dtype: object, title    Beans with Kale and Portuguese Sausage
Name: 13014, dtype: object, title    Pot-Roasted Artichokes With White Wine and Capers
Name: 7465, dtype: object, title    Roasted Asparagus and Baby Artichokes with Lem...
Name: 6549, dtype: object, title    Dai Due's Master Brined Chicken
Name: 5189, dtype: object, title    Milk Pudding with Rose Water Caramel and Figs
Name: 2105, dtype: object]


## TODO List

In [112]:
import nltk

lines = 'lines is some string of words'
# function to test if something is a noun
is_noun = lambda pos: pos[:2] == 'NN'
# do the nlp stuff
tokenized = nltk.word_tokenize(lines)
nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]

print(nouns)

['lines', 'string', 'words']


In [113]:
# Fuzzy Matching and StreamLit tutorial
# My input can be a list of ingredients, or a full recipe name, test fuzzy matcher. How to make this work for an ingredient list instead of recipe name
# How to use categories for pulling out vegeterian recipes - try a query to see how many recipes have this category in them
# Also ask about tokenization

In [114]:
# fuzzy matching, let's put recipe names in a list
recipeNames = list(df['title'])

In [115]:
# %pip install joblib

## For StreamLit

In [116]:
# https://stackoverflow.com/questions/10592605/save-classifier-to-disk-in-scikit-learn

"""
import pickle
# now you can save it to a file
with open('model1.pkl', 'wb') as f:
    pickle.dump(model, f)
"""
import joblib
# now you can save it to a file
joblib.dump(model, 'model_joblib.pkl')

['model_joblib.pkl']

In [117]:
joblib.dump(ingredients_matrix, 'ing_mat.pkl')

['ing_mat.pkl']