In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from sklearn.feature_extraction.text import CountVectorizer

# custom tokenizer (moved to a separate module due to Streamlit requirements)
import cust_tokenizer


In [8]:
df = pd.read_csv('../data/interim/full_recipes_cleaned_2.csv')
df.shape

(14526, 7)

In [9]:
df.set_index('recipeId')

Unnamed: 0_level_0,calories,rating,title,directionsStr,categoriesStr,ingredientsStr
recipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,426.0,2.500,"Lentil, Apple, and Turkey Wrap","['1. Place the stock, lentils, celery, carrot,...","['Sandwich', 'Bean', 'Fruit', 'Tomato', 'turke...",['4 cups low-sodium vegetable or chicken stock...
1,403.0,4.375,Boudin Blanc Terrine with Red Onion Confit,['Combine first 9 ingredients in heavy medium ...,"['Food Processor', 'Onion', 'Pork', 'Bake', 'B...","['1 1/2 cups whipping cream', '2 medium onions..."
2,165.0,3.750,Potato and Fennel Soup Hodge,['In a large heavy saucepan cook diced fennel ...,"['Soup/Stew', 'Dairy', 'Potato', 'Vegetable', ...","['1 fennel bulb (sometimes called anise), stal..."
4,547.0,3.125,Spinach Noodle Casserole,['Preheat oven to 350°F. Lightly grease 8x8x2-...,"['Cheese', 'Dairy', 'Pasta', 'Vegetable', 'Sid...","['1 12-ounce package frozen spinach soufflé, t..."
5,948.0,4.375,The Best Blts,"['Mix basil, mayonnaise and butter in processo...","['Sandwich', 'Food Processor', 'Tomato', 'Kid-...",['2 1/2 cups (lightly packed) fresh basil leav...
...,...,...,...,...,...,...
20125,28.0,3.125,Parmesan Puffs,['Beat whites in a bowl with an electric mixer...,"['Mixer', 'Cheese', 'Egg', 'Fry', 'Cocktail Pa...","['2 large egg whites', '3 oz Parmigiano-Reggia..."
20126,671.0,4.375,Artichoke and Parmesan Risotto,['Bring broth to simmer in saucepan.Remove fro...,"['Side', 'Kid-Friendly', 'High Fiber', 'Dinner...",['5 1/2 cups (or more) low-salt chicken broth'...
20127,563.0,4.375,Turkey Cream Puff Pie,"['Using a sharp knife, cut a shallow X in bott...","['Onion', 'Poultry', 'turkey', 'Vegetable', 'B...","['1 small tomato', '1 small onion, finely chop..."
20128,631.0,4.375,Snapper on Angel Hair with Citrus Cream,['Heat 2 tablespoons oil in heavy medium skill...,"['Milk/Cream', 'Citrus', 'Dairy', 'Fish', 'Gar...","['4 tablespoons olive oil', '4 shallots, thinl..."


In [10]:
# confirm that there are no null values or duplicated values
print(f"Null values: {df.isna().sum().sum()}")
print(f"Duplicated rows: {df.duplicated().sum()}")

Null values: 0
Duplicated rows: 0


## Custom vocabulary

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=(cust_tokenizer.my_tokenizer),
                       min_df=5)
ingredients_matrix = vectorizer.fit_transform(df['ingredientsStr'])




In [12]:
# now we have the vocabulary
print(type(vectorizer.vocabulary_))
print(vectorizer.vocabulary_)
print(len(vectorizer.vocabulary_))
print(type(vectorizer.get_feature_names_out()))
print(len(vectorizer.get_feature_names_out()))

<class 'dict'>
{'lowsodium': 1152, 'vegetable': 2072, 'chicken': 333, 'stock': 1967, 'brown': 210, 'lentils': 1100, 'french': 760, 'green': 858, 'celery': 303, 'carrot': 285, 'sprig': 1947, 'thyme': 2028, 'kosher': 1055, 'salt': 1752, 'tomato': 2041, 'cored': 449, 'seeded': 1803, 'diced': 576, 'fuji': 776, 'apple': 37, 'freshly': 764, 'lemon': 1092, 'juice': 1012, 'extravirgin': 667, 'olive': 1361, 'oil': 1354, 'pepper': 1460, 'taste': 2013, 'sheets': 1844, 'wholewheat': 2116, 'lavash': 1073, 'half': 885, 'crosswise': 494, 'flour': 734, 'tortillas': 2046, 'turkey': 2061, 'breast': 193, 'head': 910, 'bibb': 109, 'lettuce': 1101, 'whipping': 2102, 'cream': 478, 'onions': 1365, 'bay': 81, 'leaves': 1081, 'cloves': 389, 'garlic': 788, 'clove': 387, 'nutmeg': 1345, 'shallots': 1829, 'butter': 235, 'boneless': 157, 'center': 305, 'pork': 1546, 'loin': 1139, 'sinew': 1877, 'chunks': 369, 'chilled': 345, 'eggs': 633, 'purpose': 1601, 'tawny': 2014, 'port': 1547, 'currants': 525, 'peppercorns':

In [13]:
def printMostFrequentWords():
    # this function will sort the vocab and print the 50 most frequent words
    sum_words = ingredients_matrix.sum(axis = 0)
    words_freq = [(word, sum_words[0, i]) for word, i in vectorizer.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse = True)
    # comment the above line and uncomment below to see least frequent words
    # words_freq = sorted(words_freq, key = lambda x: x[1])
    print(words_freq[0:50])

# call the function
printMostFrequentWords()

[('oil', 8486), ('salt', 7708), ('sugar', 7120), ('butter', 5576), ('olive', 5399), ('pepper', 5304), ('juice', 5078), ('garlic', 4726), ('lemon', 4256), ('red', 3793), ('cream', 3782), ('cloves', 3344), ('leaves', 3307), ('flour', 3239), ('onion', 3045), ('pieces', 2991), ('chicken', 2817), ('vinegar', 2649), ('freshly', 2598), ('stick', 2481), ('cheese', 2456), ('vegetable', 2445), ('green', 2260), ('dry', 2244), ('wine', 2231), ('packed', 2228), ('coarsely', 2131), ('egg', 2109), ('parsley', 2058), ('kosher', 2042), ('slices', 2030), ('sauce', 2013), ('halved', 1988), ('eggs', 1971), ('broth', 1899), ('powder', 1843), ('orange', 1823), ('vanilla', 1804), ('tomatoes', 1734), ('seeds', 1710), ('milk', 1668), ('lime', 1640), ('onions', 1618), ('drained', 1570), ('extravirgin', 1519), ('thyme', 1477), ('allpurpose', 1466), ('ginger', 1401), ('brown', 1366), ('lengthwise', 1351)]


In [14]:
# save the vocab to a file
# with open("ingvect", "w") as outfile:
#     outfile.write("\n".join(vectorizer.get_feature_names_out()))

In [15]:
# # TODO: THIS STEP IS MANUAL, DO IT PROGRAMATICALLY
# # Load the updated vocab to a list
# new_vocab_list = None
# with open('ingvect_mod') as f:
#     new_vocab_list = f.read().splitlines()
# len(new_vocab_list)

In [16]:
new_vocab_list = list(vectorizer.get_feature_names_out())
new_vocab_list.append("Peanut butter")
len(new_vocab_list)

2161

In [17]:
# Let's set this new vocab as a hyperparameter of the vectorizer
vectorizer_mod = CountVectorizer(tokenizer=(cust_tokenizer.my_tokenizer),
                       min_df=5, vocabulary=new_vocab_list)
# not calling fit because sending custom vocab..
ingredients_matrix_mod = vectorizer_mod.transform(df['ingredientsStr'])


In [18]:
# Adding a function to print the list of ingredients given a recipe title
def printIngredients(recipeName):
    sel = df['title'] == recipeName
    print(recipeName)
    print(df.loc[sel, ['ingredientsStr']].values)


In [19]:
# from sklearn.neighbors import NearestNeighbors

# model = NearestNeighbors(n_neighbors=11, metric='cosine')
# model.fit(ingredients_matrix)



In [20]:
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors(n_neighbors=11, metric='cosine')
model.fit(ingredients_matrix_mod)

In [21]:
# Using various ingredient lists to test the results

ingInputList = [
    # "Chicken, Parmesan, Breadcrumbs",  # something familiar
    # "Artichoke Pesto",
    # "Chicken thighs, potatoes",  # compare results of potatoes vs potato
    # "Chicken thighs, potato",
    # "Okra",  # single ingredient
    # "Bhindi",  # unknown ingredient - does not exist in the vocabulary
    "Peanut butter"  # This is a 2 word ingredient
]

for ingInput in ingInputList:
    print(f"\n Input ingredients: {ingInput}")
    # Convert the string to a series
    ingInputSeries = pd.Series(ingInput)

    # Let's try to use the vectorizer on this
    ingTransformed = vectorizer_mod.transform(ingInputSeries)

    # pass this to NearestNeighbors trained model
    distOfRes, indicesOfRes = model.kneighbors(ingTransformed)

    # print the output
    print("\n Result")

    for i in range(0, 11):  # TODO: 11 should be made configurable and match the n-neighbors number
        name = df.loc[indicesOfRes[0][i], ['title']].values[0]
        distance = (distOfRes[0][i]).round(3)
        rating = df.loc[indicesOfRes[0][i], ['rating']].values[0]

        # print(f"{name}  :  {distance}")
        print(f"{name}  :  {distance}  :  {rating}")



 Input ingredients: Peanut butter

 Result
To Clarify Butter  :  0.293  :  5.0
Peanut Butter and Banana Sandwiches  :  0.36  :  3.75
Peanut Butter Cheesecake with Peanut Brittle  :  0.411  :  3.75
Peanut Butter and Jelly Layered Sandwiches  :  0.423  :  4.375
Peanut Punch  :  0.423  :  0.0
Peanut Butter, Banana, and Jelly "Ice Cream"  :  0.465  :  0.0
Shallot Butter  :  0.5  :  5.0
Whole-Wheat Peanut Butter Waffles  :  0.513  :  3.75
Milk Chocolate Peanut Butter Sauce  :  0.513  :  2.5
Easy Crepes  :  0.529  :  5.0
Peanut Butter Chocolate Ripple Ice Cream  :  0.529  :  4.375


In [22]:
printIngredients("To Clarify Butter")
printIngredients("Peanut Punch")

To Clarify Butter
[["['Unsalted butter']"]]
Peanut Punch
[["['2 tablespoons cornstarch', '1/2 cup water', '2 cups milk', '6 tablespoons peanut butter', 'Sugar to taste']"]]


In [23]:
recipe_index = df[df['title'] == 'To Clarify Butter'].index
ingredients_matrix_mod[recipe_index].sum(axis=1)

matrix([[1]], dtype=int64)

### Most frequent bigrams

In [24]:
# Get bigrams out and see the most common ones
from sklearn.feature_extraction.text import CountVectorizer

vect_bg = CountVectorizer(tokenizer=(cust_tokenizer.my_tokenizer),
                       min_df=5, ngram_range=(2,2))
ing_bg_mat = vect_bg.fit_transform(df['ingredientsStr'])



In [25]:
def printMostFrequentBigrams():
    # this function will sort the vocab and print the 50 most frequent words
    sum_words = ing_bg_mat.sum(axis = 0)
    words_freq = [(word, sum_words[0, i]) for word, i in vect_bg.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse = True)
    # comment the above line and uncomment below to see least frequent words
    # words_freq = sorted(words_freq, key = lambda x: x[1])
    print(words_freq[0:50])

# call the function
printMostFrequentBigrams()

[('olive oil', 5344), ('garlic cloves', 2617), ('lemon juice', 2415), ('kosher salt', 2008), ('vegetable oil', 2001), ('stick butter', 1901), ('freshly pepper', 1546), ('extravirgin olive', 1473), ('allpurpose flour', 1457), ('chicken broth', 1432), ('brown sugar', 1245), ('purpose flour', 1192), ('whipping cream', 1103), ('salt pepper', 1090), ('lime juice', 1056), ('salt freshly', 1030), ('vanilla extract', 989), ('heavy cream', 953), ('garlic clove', 894), ('dry wine', 868), ('lowsalt chicken', 813), ('bell pepper', 736), ('soy sauce', 730), ('dijon mustard', 716), ('wine vinegar', 707), ('red wine', 703), ('sour cream', 688), ('red onion', 671), ('red bell', 665), ('butter melted', 661), ('red pepper', 660), ('orange juice', 636), ('butter room', 627), ('halved lengthwise', 615), ('egg yolks', 592), ('lemon peel', 578), ('inchthick slices', 568), ('sea salt', 557), ('green onions', 538), ('parsley leaves', 526), ('sticks butter', 498), ('butter pieces', 495), ('parmesan cheese', 48

In [26]:
vect_bg.vocabulary_['peanut butter']

6035

In [27]:
vect_bg.vocabulary_['olive oil']

5462

In [28]:
# sorted_vocab = sorted(vect_bg.vocabulary_.values(), reverse=True)
sorted_vocab = sorted(vect_bg.vocabulary_.items(), key=lambda x:x[1], reverse=True)

In [29]:
sorted_vocab[51:100]

[('yolks sugar', 9656),
 ('yolks sticks', 9655),
 ('yolks stick', 9654),
 ('yolks sour', 9653),
 ('yolks salt', 9652),
 ('yolks packed', 9651),
 ('yolks orange', 9650),
 ('yolks milk', 9649),
 ('yolks lime', 9648),
 ('yolks light', 9647),
 ('yolks lemon', 9646),
 ('yolks honey', 9645),
 ('yolks heavy', 9644),
 ('yolks granulated', 9643),
 ('yolks extravirgin', 9642),
 ('yolks equipment', 9641),
 ('yolks eggs', 9640),
 ('yolks egg', 9639),
 ('yolks cornstarch', 9638),
 ('yolks chilled', 9637),
 ('yolks butter', 9636),
 ('yolks beaten', 9635),
 ('yolks allpurpose', 9634),
 ('yolk whipping', 9633),
 ('yolk vanilla', 9632),
 ('yolk sugar', 9631),
 ('yolk purpose', 9630),
 ('yolk milk', 9629),
 ('yolk lightly', 9628),
 ('yolk ice', 9627),
 ('yolk heavy', 9626),
 ('yolk equipment', 9625),
 ('yolk beaten', 9624),
 ('yogurt vegetable', 9623),
 ('yogurt vanilla', 9622),
 ('yogurt sugar', 9621),
 ('yogurt sour', 9620),
 ('yogurt serving', 9619),
 ('yogurt salt', 9618),
 ('yogurt preferably', 961

Selecting common bigrams

chicken broth  
bell pepper  
soy sauce  

Not necessary to specify them actually. 


### 680 custom vocab

In [30]:
vocab_680 = None
with open('ing680') as f:
    vocab_680 = f.read().splitlines()
len(vocab_680)

668

In [31]:
# Let's set this new vocab as a hyperparameter of the vectorizer
vect_680 = CountVectorizer(tokenizer=(cust_tokenizer.my_tokenizer),
                       min_df=5, vocabulary=vocab_680)
# not calling fit because sending custom vocab..
ingredients_matrix_680 = vect_680.transform(df['ingredientsStr'])

from sklearn.neighbors import NearestNeighbors

model_680 = NearestNeighbors(n_neighbors=11, metric='cosine')
model_680.fit(ingredients_matrix_680)

In [32]:
# Using various ingredient lists to test the results

ingInputList = [
    "Chicken, Parmesan, Breadcrumbs",  # something familiar
    "Artichoke Pesto",
    "Chicken thighs, potatoes",  # compare results of potatoes vs potato
    "Chicken thighs, potato",
    "Okra",  # single ingredient
    "Bhindi",  # unknown ingredient - does not exist in the vocabulary
    "Peanut butter"  # This is a 2 word ingredient
]

for ingInput in ingInputList:
    print(f"\n Input ingredients: {ingInput}")
    # Convert the string to a series
    ingInputSeries = pd.Series(ingInput)

    # Let's try to use the vectorizer on this
    ingTransformed = vect_680.transform(ingInputSeries)

    # pass this to NearestNeighbors trained model
    distOfRes, indicesOfRes = model_680.kneighbors(ingTransformed)

    # print the output
    print("\n Result")

    for i in range(0, 11):  # TODO: 11 should be made configurable and match the n-neighbors number
        name = df.loc[indicesOfRes[0][i], ['title']].values[0]
        distance = (distOfRes[0][i]).round(3)
        rating = df.loc[indicesOfRes[0][i], ['rating']].values[0]

        # print(f"{name}  :  {distance}")
        print(f"{name}  :  {distance}  :  {rating}")



 Input ingredients: Chicken, Parmesan, Breadcrumbs

 Result
Parmesan Chicken with Mixed Baby Greens  :  0.304  :  4.375
Chicken Soup Verde  :  0.388  :  3.125
Breaded Chicken Cutlets with Chunky Vegetable Sauce  :  0.404  :  3.75
Linguine with Sausage, Mushroom and Cream Sauce  :  0.423  :  3.75
Chicken Breasts with Sun-Dried Tomato and Garlic Crust  :  0.423  :  3.75
Lemon-Pepper Chicken  :  0.423  :  4.375
Chicken Divan  :  0.423  :  3.75
Creamed Mushrooms, Onions, and Brussels Sprouts  :  0.423  :  4.375
Parmesan Polenta  :  0.423  :  4.375
Root Vegetable Gratin  :  0.452  :  4.375
Suzanne's Scalloppine  :  0.452  :  3.75

 Input ingredients: Artichoke Pesto

 Result
Artichoke and Olive Crostini  :  0.423  :  3.75
Artichoke Hearts with Garlic, Olive Oil and Parsley  :  0.5  :  3.125
Hot Artichoke and Tarragon Dip  :  0.5  :  3.75
Baked Chicken with Mushrooms and Artichokes  :  0.5  :  3.75
Grilled Fontina with Artichokes and Mushrooms  :  0.5  :  3.125
Roasted Pacific Cod with Spri

In [33]:
printIngredients("Sorrel, Pea, and Leek Soup")

Sorrel, Pea, and Leek Soup
[["['white and pale green parts of 3 leeks (about 3/4 pound), chopped, washed well, and drained', '1 1/2 tablespoons olive oil', '1 small boiling potato (about 1/4 pound)', '1 1/2 cups chicken broth', '1 1/2 cups cold water plus additional to thin soup', '1/2 cup shelled fresh or thawed frozen peas', '1/4 pound sorrel*, stems discarded and leaves washed, spun dry, and cut crosswise into thin strips (about 3 cups loosely packed)', '1/3 cup sour cream', '1 teaspoon fresh lemon juice, or to taste', 'Garnish: chopped hard-boiled egg and thin strips of sorrel', 'available seasonally at some supermarkets and specialty produce markets']"]]


## Spell Check

In [34]:
# spell checker - https://www.geeksforgeeks.org/correcting-words-using-nltk-in-python/
# importing the nltk suite
import nltk

# importing jaccard distance
# and ngrams from nltk.util
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams

correct_words = vectorizer.get_feature_names_out()

# that need to be corrected
incorrect_words=['brocolli', 'mozarela', 'cheese', 'ladyfinger', 'brinjal', 'tomatoe', 'potatoes',
'tomatoe','fish','egg','octopuses','oyster','fetas']

# loop for finding correct spellings
# based on jaccard distance
# and printing the correct word
for word in incorrect_words:
    temp = [(jaccard_distance(set(ngrams(word, 2)),
                              set(ngrams(w, 2))),w)
            for w in correct_words if w[0]==word[0]]
    print(sorted(temp, key = lambda val:val[0])[0][1])

broccoli
mozzarella
cheese
ladyfingers
brine
tomatoes
potatoes
tomatoes
fish
egg
octopus
oyster
feta


In [35]:
# spell checker - https://www.geeksforgeeks.org/correcting-words-using-nltk-in-python/
# importing the nltk suite
import nltk

# importing jaccard distance
# and ngrams from nltk.util
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams

correct_words = vectorizer.get_feature_names_out() # or just put vocab variable here

# that need to be corrected
# incorrect_words=['brocolli', 'mozarela', 'cheese', 'ladyfinger', 'brinjal', 'tomatoe', 'potatoes','fish','egg','octopi','oyster','fetas']
incorrect_words = ['tomato', 'oysterss', 'potato', 'octopodes', 'fetas', 'eggss', 'oyster', 'octopu', 'tomatoess', 'egg', 'potatoes', 'fetum', 'fish']
# incorrect_words=['fetas']
# incorrect_words=['cheese']

# loop for finding correct spellings
# based on jaccard distance
# and printing the correct word

final_words = []
for word in incorrect_words:
    temp = [(jaccard_distance(set(ngrams(word, 2)),
                              set(ngrams(w, 2))),w)
            for w in correct_words if w[0]==word[0]]
    sorted_temp = sorted(temp, key = lambda val:val[0])
    word_distance = sorted_temp[0][0]
    corrected_word = sorted_temp[0][1]
    # print(word_distance)
    # print(corrected_word)
    if word_distance > 0.5:
        print(f"Sorry we could not find recipes with the ingredient {word}")
    else:
        final_words.append(corrected_word)
print(final_words)
    # print(sorted(temp, key = lambda val:val[0])[0][1])

Sorry we could not find recipes with the ingredient octopodes
Sorry we could not find recipes with the ingredient fetum
['tomato', 'oysters', 'potato', 'feta', 'eggs', 'oyster', 'octopus', 'tomatoes', 'egg', 'potatoes', 'fish']


words = ["potato", "tomatoes", "fish", "eggs", "octopus", "oysters", "feta"]

I like the way the nltk works better than inflect even if it requires more setup.

## Plurals

### Using `inflect` package

In [17]:
# plurals
# %pip install inflect


In [18]:
import inflect

p = inflect.engine()
res = []
words = ["potato", "tomatoes", "fish", "eggs", "octopus", "oysters", "feta"]
for word in words:
    print(p.plural(word))

potatoes
tomatoe
fish
egg
octopuses
oyster
fetas


### Using `nltk` and `pattern-en` package

In [63]:
# %pip install pattern

Collecting pattern
  Downloading Pattern-3.6.0.tar.gz (22.2 MB)
     ---------------------------------------- 0.0/22.2 MB ? eta -:--:--
     ---------------------------------------- 0.0/22.2 MB ? eta -:--:--
     --------------------------------------- 0.0/22.2 MB 393.8 kB/s eta 0:00:57
     ---------------------------------------- 0.2/22.2 MB 1.8 MB/s eta 0:00:13
     - -------------------------------------- 1.0/22.2 MB 5.6 MB/s eta 0:00:04
     ---- ----------------------------------- 2.6/22.2 MB 11.2 MB/s eta 0:00:02
     -------- ------------------------------- 4.5/22.2 MB 16.0 MB/s eta 0:00:02
     ----------- ---------------------------- 6.6/22.2 MB 20.1 MB/s eta 0:00:01
     --------------- ------------------------ 8.4/22.2 MB 22.4 MB/s eta 0:00:01
     ------------------ -------------------- 10.8/22.2 MB 38.5 MB/s eta 0:00:01
     -------------------- ------------------ 12.0/22.2 MB 38.5 MB/s eta 0:00:01
     ------------------------ -------------- 14.0/22.2 MB 43.7 MB/s eta 0:

In [1]:
# Python program to pluralize a given
# word using pattern-en package

# Import the NLTK module
from pattern.en import pluralize
from pattern.en import singularize
import nltk

# Installing NLTK data to import
# and run en module of pattern
nltk.download('popular')

# Importing the pattern en module



[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\anami\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\cmudict.zip.
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\anami\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\gazetteers.zip.
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\anami\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\genesis.zip.
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\anami\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\gutenberg.zip.
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\anami\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     C:\Users\anami\AppData\Roaming\nl

children


[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\anami\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | 
[nltk_data]  Done downloading collection popular


In [2]:
# Define the word and make it plural
# by using pluralize() function
print(pluralize('child'))

children


In [5]:
words = ["potato", "tomatoes", "fish", "eggs", "octopus", "oysters", "feta"]
res1 = []
for word in words:
    # print(pluralize(word))
    res1.append(pluralize(word))
    res1.append(singularize(word))
print(set(res1))

{'tomato', 'oysterss', 'potato', 'octopodes', 'fetas', 'eggss', 'oyster', 'octopu', 'tomatoess', 'egg', 'potatoes', 'fetum', 'fish'}


Workflow for spell check and plurals

1. User enters list of ingredients to include and exclude
2. Call the spell checker, and correct the ingredients, print a message if an ingredient is not found. Can add a flag to check where the list is from and print to screen or not.
3. From the corrected ingredients, singularize and pluralize everything, and make a set.
4. Run this once more through the spell checker, to elimiate bad pluralization / singularization
5. Now the list is ready to be vectorized, and sent to the model.

In [19]:
# Including ingredients only
yes_ing_series = pd.Series("Okra")
yes_ing_tx = vectorizer.transform(yes_ing_series)

distOfRes, indicesOfRes = model.kneighbors(yes_ing_tx)

# print the output
print("\n Result")

for i in range(0, 11):  # TODO: 11 should be made configurable and match the n-neighbors number
    name = df.loc[indicesOfRes[0][i], ['title']].values[0]
    distance = (distOfRes[0][i]).round(3)
    rating = df.loc[indicesOfRes[0][i], ['rating']].values[0]

    print(f"{name}  :  {distance}")
    # print(f"{name}  :  {distance}  :  {rating}")

ValueError: X has 2158 features, but NearestNeighbors is expecting 2170 features as input.

In [None]:
# Including and excluding ingredients
yes_ing_series = pd.Series("Okra")
no_ing_series = pd.Series("Tomato Cilantro")

yes_ing_tx = vectorizer.transform(yes_ing_series)
no_ing_tx = (vectorizer.transform(no_ing_series)) * -1

updated_ing_tx = yes_ing_tx + no_ing_tx

distOfRes, indicesOfRes = model.kneighbors(updated_ing_tx)

# print the output
print("\n Result")

for i in range(0, 11):  # TODO: 11 should be made configurable and match the n-neighbors number
    name = df.loc[indicesOfRes[0][i], ['title']].values[0]
    distance = (distOfRes[0][i]).round(3)
    rating = df.loc[indicesOfRes[0][i], ['rating']].values[0]

    print(f"{name}  :  {distance}")
    # print(f"{name}  :  {distance}  :  {rating}")


 Result
Okra with Scallion, Lime, and Ginger  :  0.826
Chicken, Sausage, and Okra Gumbo  :  0.83
Broiled Tomato, Corn, and Okra  :  0.846
Stewed Corn and Tomatoes with Okra  :  0.864
Creole Chicken and Okra Gumbo  :  0.88
Succotash  :  0.885
Catfish and Okra with Pecan Butter Sauce  :  0.898
Crisp Okra in Yogurt Sauce  :  0.898
Chive Shortcakes with Smoky Corn and Okra Stew  :  0.902
Corn and Okra Stew  :  0.905
Spicy Gumbo-Laya  :  0.909
