# Build first model(s)
- explore Kaggle's "pre-processed" data to try it with some clustering models
- see [eda_recipe_database](eda_recipe_database.ipynb) notebook for initial exploration of this data
- I want to build a model that learns which ingredients tend to occur together
- then use that model to predict an ingredient, given the presence of other ingredients

## what's in this notebook:
- convert strings into real lists
- dummify ingredient tokens
- KMeans model
- add cluster labels to raw recipes
- more regex for getting data to lists
- predict cluster for a new recipe
    - fetch some recipes from the Edamam API
    - map recipe ingredients to ingredient codes
    - dummify ingredients & get prediction
    - observations about the prediction


In [127]:
import ast
import importlib
import json
import re
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import get_recipes, modeling

In [2]:
importlib.reload(get_recipes);

---
## pre-processed recipes
- starting with the tokenized data from kaggle

In [3]:
df = pd.read_csv("data/kaggle_food_dot_com/PP_recipes.csv")
df.head(3)

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]"
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,312329,120056,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."


In [4]:
df.shape

(178265, 8)

In [5]:
df.dtypes

id                    int64
i                     int64
name_tokens          object
ingredient_tokens    object
steps_tokens         object
techniques           object
calorie_level         int64
ingredient_ids       object
dtype: object

In [6]:
df["ingredient_tokens"][0]

'[[2911, 1019, 249, 6878], [1353], [6953], [15341, 3261], [2056, 857, 643, 1631, 20480]]'

In [7]:
df["ingredient_ids"][0]

'[389, 7655, 6270, 1527, 3406]'

---
## convert strings into real lists
- some of the fields I want to use look like lists but they're wrapped in strings so I need to listify them with regex

In [8]:
# Splitting on "]" leaves one chunk per inner list; pull the digit runs out of
# each chunk and keep only the chunks that actually contained numbers.
chunks = df["ingredient_tokens"][0].split("]")
digit_runs = (re.findall(r"\b\d+", chunk) for chunk in chunks)
result = [[int(run) for run in runs] for runs in digit_runs if runs]
result

[[2911, 1019, 249, 6878],
 [1353],
 [6953],
 [15341, 3261],
 [2056, 857, 643, 1631, 20480]]

In [9]:
[ int(id) for id in df["ingredient_ids"][0].replace("[", "").replace("]", "").split(", ") ]

[389, 7655, 6270, 1527, 3406]

In [10]:
# confirming that `split` doesn't care if it has anything to split
[ int(id) for id in "[17]".replace("[", "").replace("]", "").split(", ") ]

[17]

In [11]:
def ingr_ids_to_list(id_list):
    """Parse a stringified list of integer ids into a real list of ints.

    Parameters
    ----------
    id_list : str
        Text that looks like a Python list of integers, e.g. ``"[389, 7655]"``.

    Returns
    -------
    list[int]
        The integers in the order they appear; ``[]`` when the text holds none
        (e.g. ``"[]"``).

    Notes
    -----
    The integers are extracted with a regex instead of stripping brackets and
    splitting on ``", "``, so an empty list no longer raises ``ValueError`` and
    the spacing around the commas does not matter.
    """
    return [int(number) for number in re.findall(r"-?\d+", id_list)]

df["ingr_ints"] = df["ingredient_ids"].apply(ingr_ids_to_list)

In [12]:
df.head(3)

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids,ingr_ints
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]","[389, 7655, 6270, 1527, 3406]"
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,...","[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,312329,120056,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696...","[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."


In [13]:
file_name = "data/generated_data/processed_recipes.parquet"
df.to_parquet(file_name, engine="pyarrow", compression="gzip")

---
## dummify ingredient tokens
- these integers don't have any meaning -- ie. they're still just ids to map to the ingredients
- binarize them so the model just has flags for which ones are present, ie. dummify the tokens

In [14]:
map_df = pd.read_pickle("data/kaggle_food_dot_com/ingr_map.pkl")
map_df.head()

Unnamed: 0,raw_ingr,raw_words,processed,len_proc,replaced,count,id
0,"medium heads bibb or red leaf lettuce, washed,...",13,"medium heads bibb or red leaf lettuce, washed,...",73,lettuce,4507,4308
1,mixed baby lettuces and spring greens,6,mixed baby lettuces and spring green,36,lettuce,4507,4308
2,romaine lettuce leaf,3,romaine lettuce leaf,20,lettuce,4507,4308
3,iceberg lettuce leaf,3,iceberg lettuce leaf,20,lettuce,4507,4308
4,red romaine lettuce,3,red romaine lettuce,19,lettuce,4507,4308


In [15]:
# toy example: two "recipes" (lists of ingredient ids) and four ids to flag
test_series = pd.Series([[12, 4308, 5], [3, 29, 12]])
test_ids = [4308, 5, 29, 12]

# one column per id, holding 1 where the recipe contains that id and 0 otherwise
test_df = pd.DataFrame()
for id_col in test_ids:
    test_df[id_col] = [int(id_col in recipe) for recipe in test_series]

test_df.head()

# this is not terribly efficient but let's see if that's a problem with this data

Unnamed: 0,4308,5,29,12
0,1,1,0,1
1,0,0,1,1


In [16]:
ingr_ids = map_df["id"].unique()

In [17]:
# Build one 0/1 column per ingredient id, with one entry per recipe (in df row order).
# Each recipe's id list is turned into a set once, up front, so every membership
# test in the nested loop is a hash lookup instead of a scan of the list.
# (The empty `flags_df` placeholder was dropped: it was never read before being
# reassigned from the concatenated columns.)
recipe_id_sets = [set(recipe_ids) for recipe_ids in df["ingr_ints"]]

cols = [
    [1 if id_col in id_set else 0 for id_set in recipe_id_sets]
    for id_col in ingr_ids
]

In [18]:
# wrap every flag column in a Series so they can be concatenated side by side
col_series = [pd.Series(flag_col) for flag_col in cols]

In [19]:
flags_df = pd.concat(col_series, axis=1)    
flags_df.shape

(178265, 8023)

In [20]:
file_name = "data/generated_data/ingr_dummies.parquet"
flags_df.to_parquet(file_name, engine="pyarrow", compression="gzip")

---
## KMeans model
- to be clear, this is attempting to cluster recipes based on their ingredients, it's not clustering the ingredients themselves
- I had previously been looking at a food database that had distinct labels for 22 types of foods
- started looking at KMeans clustering with `n_clusters=12`; not quite a random number but also not with any real science behind it

In [21]:
id_flags = flags_df.columns

In [22]:
type(id_flags)

pandas.core.indexes.range.RangeIndex

In [23]:
km = KMeans(n_clusters=12, n_init="auto", random_state=42)

In [24]:
km.fit(flags_df)

In [25]:
km.cluster_centers_

array([[ 2.88291583e-02, -3.52365706e-19,  1.82848784e-04, ...,
        -1.01643954e-18, -8.80914265e-20,  6.16639986e-19],
       [ 2.38307350e-02, -2.27004830e-19,  3.71195249e-04, ...,
         7.42390497e-05, -5.67512075e-20,  2.77826807e-19],
       [ 2.56505576e-02,  2.71050543e-19,  1.85873606e-04, ...,
        -6.36968776e-19,  6.77626358e-20,  5.42101086e-20],
       ...,
       [ 3.96327584e-02,  7.58763721e-05,  1.08756133e-03, ...,
         6.73560600e-18,  3.43048344e-19,  7.58763721e-05],
       [ 3.63754466e-02, -3.11708125e-19,  1.94868464e-04, ...,
        -1.00288701e-18, -7.79270311e-20,  5.21772296e-19],
       [ 1.66722241e-04,  3.28648784e-19, -4.55364912e-18, ...,
        -6.91178885e-19,  8.21621959e-20, -8.13151629e-20]])

In [26]:
len(km.cluster_centers_)

12

In [27]:
len(km.cluster_centers_[0])

8023

In [28]:
len(km.labels_) == df.shape[0]

True

In [29]:
silhouette_score(flags_df, km.labels_)

0.016991328560413268

In [30]:
km.inertia_


1318922.8589656104

In [31]:
labeled_recipes = pd.concat([df["id"], pd.Series(km.labels_)], axis=1)

In [32]:
labeled_recipes.columns

Index(['id', 0], dtype='object')

In [33]:
labeled_recipes.rename(columns={"id": "recipe_id", 0: "cluster"}, inplace=True)
labeled_recipes.columns

Index(['recipe_id', 'cluster'], dtype='object')

In [34]:
labeled_recipes["cluster"].unique()

array([ 8,  4,  9,  3,  1,  0,  5, 10,  6,  7, 11,  2], dtype=int32)

In [35]:
labeled_recipes["cluster"].value_counts()

cluster
9     39538
7     18889
3     18225
0     16407
10    15395
8     14377
5     13932
1     13470
4     11022
11     5998
6      5632
2      5380
Name: count, dtype: int64

---
## add cluster labels to raw recipes

In [36]:
raw_recipes = pd.read_csv("data/kaggle_food_dot_com/RAW_recipes.csv")
raw_recipes.shape

(231637, 12)

In [37]:
raw_recipes.columns

Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')

In [38]:
# just checking
set(labeled_recipes["recipe_id"]).issubset(set(raw_recipes["id"]))

True

In [39]:
labeled_recipes = pd.merge(left=labeled_recipes, right=raw_recipes, left_on="recipe_id", right_on="id", how="inner")

In [40]:
labeled_recipes.columns

Index(['recipe_id', 'cluster', 'name', 'id', 'minutes', 'contributor_id',
       'submitted', 'tags', 'nutrition', 'n_steps', 'steps', 'description',
       'ingredients', 'n_ingredients'],
      dtype='object')

In [41]:
labeled_recipes.drop(columns=["id", "contributor_id", "submitted", "nutrition"], inplace=True)

In [42]:
labeled_recipes.head()

Unnamed: 0,recipe_id,cluster,name,minutes,tags,n_steps,steps,description,ingredients,n_ingredients
0,424415,8,aromatic basmati rice rice cooker,61,"['weeknight', 'time-to-make', 'course', 'main-...",6,"['rinse the rice in a fine strainer , then dra...",from the ultimate rice cooker cookbook. the a...,"['basmati rice', 'water', 'salt', 'cinnamon st...",5
1,146223,4,pumpkin pie a la easy,55,"['60-minutes-or-less', 'time-to-make', 'course...",10,"['preheat oven to 350', 'combine flour , oats ...",this is a pampered chef recipe for their stone...,"['flour', 'oats', 'brown sugar', 'pecans', 'bu...",12
2,312329,8,cheesy tomato soup with potatoes,25,"['30-minutes-or-less', 'time-to-make', 'course...",6,"['pour the broth & water into a large pot', 'a...",after modifying another recipe i came up with ...,"['chicken broth', 'water', 'salt', 'black pepp...",15
3,74301,9,mini tacos,15,"['15-minutes-or-less', 'time-to-make', 'course...",8,"['cook hamburger until browned', 'drain the fa...",these can be a easy appetizer or a light dinne...,"['wonton wrappers', 'hamburger', 'taco seasoni...",8
4,76272,9,rosemary s hanky panky s,20,"['30-minutes-or-less', 'time-to-make', 'course...",5,"['fry ground beef and sausage until browned', ...",my girlfriend rosemary gave me this wonderfull...,"['ground beef', 'ground sausage', 'velveeta ch...",4


In [43]:
labeled_recipes["ingredients"][0]

"['basmati rice', 'water', 'salt', 'cinnamon stick', 'green cardamom pods']"

## more regex for getting data to lists

In [44]:
# for now, we'll assume that ingredients do not have any "-" in them
re.findall(r"\b[\w\s]+", labeled_recipes["ingredients"][0])

['basmati rice', 'water', 'salt', 'cinnamon stick', 'green cardamom pods']

In [45]:
# and we'll assume that tags have no spaces
re.findall(r"\b[\w+-?]+", labeled_recipes["tags"][0])

['weeknight',
 'time-to-make',
 'course',
 'main-ingredient',
 'cuisine',
 'preparation',
 'occasion',
 '5-ingredients-or-less',
 'side-dishes',
 'rice',
 'asian',
 'indian',
 'easy',
 'vegan',
 'vegetarian',
 'dietary',
 'pasta-rice-and-grains',
 '4-hours-or-less']

In [46]:
def get_indredients(string_list):
    """Convert a stringified list of ingredient names into a real list of strings.

    Parameters
    ----------
    string_list : str or list
        Text that looks like a Python list of strings, e.g.
        ``"['basmati rice', 'water']"``. A value that is already a list is
        returned unchanged, so re-running the conversion on an already-converted
        column is harmless.

    Returns
    -------
    list[str]
        The ingredient names.

    Notes
    -----
    The text is parsed as a Python literal first, which keeps names containing
    hyphens, apostrophes or other punctuation intact (``'sun-dried tomatoes'``
    stays one ingredient). Only when the text is not a well-formed list literal
    does the function fall back to the word/whitespace regex, which assumes
    names contain nothing but word characters and spaces.
    """
    # already parsed -> nothing to do
    if isinstance(string_list, list):
        return string_list

    try:
        parsed = ast.literal_eval(string_list)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]

    # best-effort fallback for text that is not a valid list literal
    return re.findall(r"\b[\w\s]+", string_list)

labeled_recipes["ingredients"] = labeled_recipes["ingredients"].apply(get_indredients)

In [47]:
def get_tags(string_list):
    """Convert a stringified list of recipe tags into a real list of strings.

    Parameters
    ----------
    string_list : str or list
        Text that looks like a Python list of strings, e.g.
        ``"['weeknight', 'time-to-make']"``. A value that is already a list is
        returned unchanged, so re-running the conversion on an already-converted
        column is harmless.

    Returns
    -------
    list[str]
        The tags.

    Notes
    -----
    The text is parsed as a Python literal first, so a tag is returned exactly
    as written whatever characters it contains. Only when the text is not a
    well-formed list literal does the function fall back to a regex that treats
    a tag as a run of word characters and hyphens. The earlier pattern
    ``[\\w+-?]`` contained ``+-?``, which inside a character class is a *range*
    from ``+`` to ``?`` and therefore also accepted ``, . / : ; < = >``.
    """
    # already parsed -> nothing to do
    if isinstance(string_list, list):
        return string_list

    try:
        parsed = ast.literal_eval(string_list)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]

    # best-effort fallback for text that is not a valid list literal
    return re.findall(r"\b[\w-]+", string_list)

labeled_recipes["tags"] = labeled_recipes["tags"].apply(get_tags)

In [48]:
labeled_recipes["ingredients"][0]

['basmati rice', 'water', 'salt', 'cinnamon stick', 'green cardamom pods']

In [49]:
labeled_recipes[["name", "ingredients", "tags"]].head()

Unnamed: 0,name,ingredients,tags
0,aromatic basmati rice rice cooker,"[basmati rice, water, salt, cinnamon stick, gr...","[weeknight, time-to-make, course, main-ingredi..."
1,pumpkin pie a la easy,"[flour, oats, brown sugar, pecans, butter, egg...","[60-minutes-or-less, time-to-make, course, pre..."
2,cheesy tomato soup with potatoes,"[chicken broth, water, salt, black pepper, oni...","[30-minutes-or-less, time-to-make, course, mai..."
3,mini tacos,"[wonton wrappers, hamburger, taco seasoning, s...","[15-minutes-or-less, time-to-make, course, mai..."
4,rosemary s hanky panky s,"[ground beef, ground sausage, velveeta cheese,...","[30-minutes-or-less, time-to-make, course, pre..."


In [50]:
# trying this again now that I have actual lists
labeled_recipes["tags"].explode().unique()[:20]

array(['weeknight', 'time-to-make', 'course', 'main-ingredient',
       'cuisine', 'preparation', 'occasion', '5-ingredients-or-less',
       'side-dishes', 'rice', 'asian', 'indian', 'easy', 'vegan',
       'vegetarian', 'dietary', 'pasta-rice-and-grains',
       '4-hours-or-less', '60-minutes-or-less', 'for-large-groups'],
      dtype=object)

In [51]:
sm_cluster = labeled_recipes.loc[labeled_recipes["cluster"] == labeled_recipes["cluster"].value_counts().idxmin()]
sm_cluster.shape[0]

5380

In [52]:
sm_cluster.head()

Unnamed: 0,recipe_id,cluster,name,minutes,tags,n_steps,steps,description,ingredients,n_ingredients
40,227025,2,orange yogurt,10,"[15-minutes-or-less, time-to-make, course, mai...",6,['line a sieve or colander with cheesecloth or...,a lovely recipe from the barefoot contessa tha...,"[plain yogurt, raisins, walnuts, pure vanilla ...",9
50,275826,2,baklava latte,5,"[15-minutes-or-less, time-to-make, course, pre...",3,"['in a 10-ounce cup , combine syrups , honey a...",from the cappuccino espresso the book of bever...,"[praline syrup, hazelnut syrup, maple syrup, l...",9
67,305393,2,ancient sweet with honey,65,"[time-to-make, course, main-ingredient, cuisin...",9,"['grind the sesame', 'grease a cake pan and sp...",this is an ancient greek recipe taken from the...,"[flour, almonds, walnuts, honey, eggs, sesame]",6
86,92023,2,triple decker honey peanut butter brownies,45,"[60-minutes-or-less, time-to-make, course, pre...",15,['oven@ 350 line a 7x11 inch pan with aluminum...,the name is a mouth-ful but this is an easy re...,"[peanut butter, honey, brownie mix, vegetable ...",8
162,23469,2,balsamic chicken and veggies,30,"[30-minutes-or-less, time-to-make, course, mai...",11,"['stir together salad dressing , vinegar , hon...",excellant italian chicken dish! nice to sprink...,"[italian salad dressing, balsamic vinegar, hon...",9


---
## predict cluster for a new recipe

### fetch some recipes from the Edamam API

In [53]:
# fetch api key from secrets.json file
with open("secrets.json", "rb") as f:
    key_file = json.loads(f.read())

app_id = key_file["app_id"]
app_key = key_file["key"]

In [86]:
ingr_test = ["tomato sauce", "mushrooms"]
num_ingr_test = (3, 12)

In [87]:
res = get_recipes.get_recipes(ingr=ingr_test, num_ingr=num_ingr_test, app_id=app_id, app_key=app_key)

200


In [88]:
res.columns

Index(['title', 'link', 'servings', 'cuisine', 'diet_labels', 'healh_labels',
       'cautions', 'ingredient_lines', 'ingredient_details',
       'ingredient_labels'],
      dtype='object')

In [89]:
res[["title", "ingredient_lines", "ingredient_labels"]].head()

Unnamed: 0,title,ingredient_lines,ingredient_labels
0,Dinner Tonight: Spaghetti alla Boscaiola (Spag...,"[1 pound spaghetti, 3 tablespoons olive oil, p...","[spaghetti, olive oil, plum tomatoes, garlic, ..."
1,Chicken with Tomatoes and Mushrooms,"[4 boneless, skinless chicken breast halves, a...","[boneless, skinless chicken breast, Coarse sal..."
2,Zucchini Cannelloni with Cashew sauce,"[2 pieces zucchini, 400 grams mushrooms, 1 pie...","[zucchini, mushrooms, onion, garlic, cashew nu..."
3,Tomato and Sausage Tortellini,[1 (19 ounce) package frozen cheese tortellini...,"[cheese tortellini, olive oil, black pepper, k..."
4,Mushrooms paprikash,"[½ tsp caraway seed, 1 tbsp olive oil, 1 onion...","[caraway seed, olive oil, onion, mushroom, gre..."


### map recipe ingredients to ingredient codes
- uses string matching on the ingredients from a recipe to map them to the ingredient ids (tokens) in the food.com data
- some ingredients appear multiple times in that mapping table, with different variations, see the cell below for "lettuce"
- in most of the cases I saw where an ingredient appeared multiple times, all the instances had the same id code in map_df

In [108]:
# list of ingredients for first recipe returned from the API
res["ingredient_labels"][0]

['spaghetti',
 'olive oil',
 'plum tomatoes',
 'garlic',
 'mushrooms',
 'parsley',
 'Salt',
 'pepper']

In [109]:
# reminder of what map_df looks like
map_df.head(3)

Unnamed: 0,raw_ingr,raw_words,processed,len_proc,replaced,count,id
0,"medium heads bibb or red leaf lettuce, washed,...",13,"medium heads bibb or red leaf lettuce, washed,...",73,lettuce,4507,4308
1,mixed baby lettuces and spring greens,6,mixed baby lettuces and spring green,36,lettuce,4507,4308
2,romaine lettuce leaf,3,romaine lettuce leaf,20,lettuce,4507,4308


In [92]:
map_df.loc[map_df["replaced"] == "lettuce"]["id"][0]

4308

In [110]:
# lots of rows that represent "lettuce", these all have the same id
map_df.loc[map_df["raw_ingr"].str.find("lettuce") != -1].head()

Unnamed: 0,raw_ingr,raw_words,processed,len_proc,replaced,count,id
0,"medium heads bibb or red leaf lettuce, washed,...",13,"medium heads bibb or red leaf lettuce, washed,...",73,lettuce,4507,4308
1,mixed baby lettuces and spring greens,6,mixed baby lettuces and spring green,36,lettuce,4507,4308
2,romaine lettuce leaf,3,romaine lettuce leaf,20,lettuce,4507,4308
3,iceberg lettuce leaf,3,iceberg lettuce leaf,20,lettuce,4507,4308
4,red romaine lettuce,3,red romaine lettuce,19,lettuce,4507,4308


In [95]:
# Map each ingredient label of the first API recipe to an ingredient id from map_df;
# -1 marks a label that could not be matched at all.
codes = []

for label in res["ingredient_labels"][0]:
    needle = label.lower()

    # first choice: exact match on the canonical ("replaced") ingredient name
    matches = map_df.loc[map_df["replaced"] == needle]
    source = "replaced"

    if matches.shape[0] == 0:
        # second choice: literal substring match on the raw ingredient text.
        # This scan now runs only when the exact match failed. `na=False` keeps
        # rows with a missing raw_ingr (if there are any) from counting as a
        # match: `.str.find(...) != -1` evaluates to True for missing values.
        matches = map_df.loc[map_df["raw_ingr"].str.contains(needle, regex=False, na=False)]
        source = "raw"

    if matches.shape[0] > 0:
        print(f"found {label} in {source}")
        # several ids can match; keep the one that occurs most often
        code = matches["id"].value_counts().idxmax()
    else:
        print(f"couldn't find {label}")
        code = -1

    codes.append(code)

found spaghetti in raw
found olive oil in replaced
found plum tomatoes in raw
found garlic in replaced
found mushrooms in raw
found parsley in replaced
found Salt in replaced
found pepper in replaced


In [97]:
codes 

[6711, 5006, 5519, 3184, 7856, 5185, 6270, 5319]

In [98]:
len(ingr_ids)

8023

In [112]:
# searches the "replaced" & "raw" columns in map_df & returns the code that most-frequently matches a 
#  given ingredient or -1 if there are no matches in either column; returns a list of codes for each
#  recipe, nested in a list for the whole batch
test_batch = modeling.map_recipe_ingredients(res["ingredient_labels"], map_df)

In [113]:
test_batch

[[6711, 5006, 5519, 3184, 7856, 5185, 6270, 5319],
 [-1, 1573, 3532, 5006, 7856, 3184, 6819, 2320],
 [8021, 7856, 5010, 3184, 1101, 5519, -1, 3217],
 [1186, 5006, 590, 4096, 7979, 7856, 6324, 2131, 7233, 6754, 1910, 5180],
 [1078,
  5006,
  5010,
  4863,
  3440,
  6581,
  7213,
  7545,
  1909,
  5185,
  5205,
  5648,
  3198],
 [5006, 5010, -1, 4096, 3184, 6777, 7229, 2347, 5105, 5177, 7367, 5185],
 [2339, -1, 5006, 4096, 590, 1093, 7979, 3184, -1, 1747, 4836, 7276, 4987],
 [7367, -1, 3184, 2922, 5010, -1, 5724, 3668, 5180, 6270, 5319, 4145],
 [1573, 3532, 8003, -1, 6510, 7719, 7233, 3888, 1257, 6654, 5177],
 [-1, 7719, 6426, -1, 2882, 6270, 3532, 3294, 5205, 7655, 7229, 3184],
 [-1, 840, 254, 6447, 382, 5205, 4836],
 [-1, 1573, 3532, 5006, 7856, 3184, -1, 2320],
 [840, 7856, 6426, 7168, 7258, 7229, 84, 6696, 6444, 7550, 2416, 6270, 5319],
 [-1, 1043, 5351, 5177, 6933, -1, 4836, 6270, 590],
 [6754, 1908, 7258, 4836, 5731, 6085, 2499, 5180, 5205, 7233, 1786, 7892],
 [2339, 7655, -1, 3184

### dummify ingredients & get prediction

In [114]:
# dummifies the recipes to pass to the model
def flag_ingredients(ingreds: list[list[int]], map_df: pd.DataFrame):
    flagged_recipes = []
    # iterate through the list of recipe ingredient ids
    for ing_list in ingreds:
        # iterate through the list of unique ids from mapper
        recipe_row = []
        for id_col in map_df["id"].unique():
            flag = 1 if id_col in ing_list else 0
            recipe_row.append(flag)
        flagged_recipes.append(recipe_row)
    
    return pd.DataFrame(flagged_recipes)

In [115]:
batch_to_predict = flag_ingredients(test_batch, map_df)

In [116]:
# rows = number of recipes, columns = 8023 ingredients
batch_to_predict.shape

(20, 8023)

In [117]:
km.predict(batch_to_predict)

array([ 1, 10,  7, 10, 10, 10, 10,  1,  9,  8,  3, 10,  1,  8,  5,  9,  9,
        1, 10,  9], dtype=int32)

### observations about the prediction
- looking at the most frequent cluster assigned (cluster 10), check out recipes from the original food.com data that were placed in the same cluster
- check out the recipe titles from the Edamam API that were assigned to cluster 10 and the ones that were not
- check out the ingredients in the API recipes that were assigned to cluster 10 and those that were assigned to other clusters

In [124]:
# looks like cluster 10 appears most often in that batch
cluster_10 = labeled_recipes.loc[labeled_recipes["cluster"] == 10]

In [125]:
cluster_10[["name", "ingredients"]].head(10)

Unnamed: 0,name,ingredients
13,sommer s chicken spaghetti,"[chicken breast, spaghetti sauce, garlic, oliv..."
34,italian style spinach,"[bacon, olive oil, onion, garlic, spinach leav..."
41,onion soup with parmigiano and cracked pepper,"[butter, olive oil, onions, coarse salt, chick..."
42,green and gold peas with pasta,"[orecchiette, bacon, onion, frozen peas, garba..."
44,linguine with wild mushroom alfredo,"[linguine, olive oil, dry white wine, button m..."
70,roast beetroot and garlic dip,"[beetroot, garlic, ground coriander, ground cu..."
90,bbq bat wings,"[ketchup, balsamic vinegar, dark brown sugar, ..."
101,tuscan lemon chicken skillet or grill,"[whole chickens, kosher salt, olive oil, lemon..."
105,easy chilli beans,"[olive oil, onion, red kidney beans, chopped t..."
113,roasted cod with prosciutto cherry tomatoes a...,"[cherry tomatoes, pitted black olives, capers,..."


In [137]:
# np.where returns a tuple but I only want the first element
res_10 = np.where(km.predict(batch_to_predict) == 10)[0]
res_10

array([ 1,  3,  4,  5,  6, 11, 18])

In [143]:
# show the names of the recipes that the model put into cluster 10
[ title for index, title in enumerate(res["title"]) if index in res_10 ]

['Chicken with Tomatoes and Mushrooms',
 'Tomato and Sausage Tortellini',
 'Mushrooms paprikash',
 'Pasta With Lentils and Mushrooms',
 'Rigatoni with Mushroom Ragu, Fresh Mozzarella, and Truffle Recipe',
 'Chicken with Tomatoes and Mushrooms',
 'Roast Pork Loin With Mushrooms and Tomatoes']

In [144]:
# show the names of the recipes that the model DID NOT put into cluster 10
[ title for index, title in enumerate(res["title"]) if index not in res_10 ]

['Dinner Tonight: Spaghetti alla Boscaiola (Spaghetti with Tomato Sauce and Mushrooms) Recipe',
 'Zucchini Cannelloni with Cashew sauce',
 'Pasta Frittata With Mushrooms',
 'Beef and Tomato Stroganoff',
 'Baked Tomato, Mushroom & Goat Cheese Pasta',
 'Mushroom & aubergine pizza pie',
 'Lentil Mushroom Stew',
 'Grilled Stuffed Portobello Mushrooms',
 'Pesto Lasagna with Spinach and Mushrooms',
 'Roman-Style Spaghetti Alla Carrettiera (Tomato, Tuna, and Mushroom Pasta) Recipe',
 'Sichuan-Style Tofu with Mushrooms',
 'Baked Rigatoni With Mushrooms And Prosciutto',
 'Rigatoni with Chunky Vegetable Sauce']

In [145]:
ingreds_10 = [ ingreds for index, ingreds in enumerate(res["ingredient_labels"]) if index in res_10]
ingreds_other = [ ingreds for index, ingreds in enumerate(res["ingredient_labels"]) if index not in res_10]

In [150]:
# Tally how often each ingredient label occurs across the API recipes assigned
# to cluster 10. Counter is a dict subclass, so .keys() and item lookups behave
# as they did with the hand-rolled dict, and first-seen order is preserved.
ingreds_10_dict = Counter(ing for labels in ingreds_10 for ing in labels)

ingreds_10_dict.keys()

dict_keys(['boneless, skinless chicken breast', 'Coarse salt', 'ground pepper', 'olive oil', 'mushrooms', 'garlic', 'stewed tomatoes', 'dried oregano', 'cheese tortellini', 'black pepper', 'kosher salt', 'yellow onion', 'chicken sausage', 'diced tomatoes', 'tomato sauce', 'spinach', 'cream cheese', 'parmesan cheese', 'caraway seed', 'onion', 'mushroom', 'green pepper', 'smoked paprika', 'tomato', 'vegetable', 'cream', 'parsley', 'pasta', 'potato', 'garlic bread', 'crimini mushrooms', 'lentils', 'tomato paste', 'red pepper flakes', 'orecchiette', 'Parmesan', 'unsalted butter', 'dried porcini mushrooms', 'Portobello mushrooms', 'Kosher salt', 'carrot', 'tomato basil sauce', 'rigatoni', 'mozzarella', 'Truffle', 'oil', 'can stewed tomatoes', 'plum tomatoes', 'button mushrooms', 'boneless pork loin', 'cumin', 'coriander'])

In [151]:
# Tally how often each ingredient label occurs across the API recipes that were
# NOT assigned to cluster 10. Counter is a dict subclass, so .keys() and item
# lookups behave as they did with the hand-rolled dict, and first-seen order is
# preserved.
ingreds_other_dict = Counter(ing for labels in ingreds_other for ing in labels)

ingreds_other_dict.keys()

dict_keys(['spaghetti', 'olive oil', 'plum tomatoes', 'garlic', 'mushrooms', 'parsley', 'Salt', 'pepper', 'zucchini', 'onion', 'cashew nuts', 'plum tomato', 'nutritional yeasts', 'garlic powder', 'unsalted butter', 'extra-virgin olive oil', 'white mushrooms', 'leftover spaghetti', 'prosciutto', 'heavy cream', 'Parmesan cheese', 'eggs', 'Coarse salt', 'ground pepper', 'wide egg noodles', 'sirloin steak', 'button mushrooms', 'tomato sauce', 'Potatoes', 'low-sodium chicken broth', 'sour cream', 'Parmesan', 'cherry or grape tomatoes', 'shallots', 'fresh sage', 'salt', 'goat cheese', 'pasta', 'water', 'tomato paste', 'ciabatta bread mix', 'butter', 'aubergine', 'shiitake mushroom', 'basil', 'pasta sauce', 'mozzarella', 'thyme', 'flour', 'almond milk', 'soy sauce', 'sherry', 'vegetable broth', 'green lentils', 'portobello mushrooms', 'canola oil', 'pesto sauce', 'sun-dried tomatoes', 'panko bread crumbs', 'mozzarella cheese', 'black pepper', 'spinach', 'cremini mushrooms', 'pesto', 'provolon

- not sure this cluster assignment is particularly reasonable or unreasonable, more to do on this