# Build first model(s)
- explore Kaggle's "pre-processed" data to try it with some clustering models
- see [eda_recipe_database](eda_recipe_database.ipynb) notebook for initial exploration of this data
- I want to build a model that learns which ingredients tend to occur together
- then use that model to predict an ingredient, given the presence of other ingredients

## what's in this notebook:
- convert strings into real lists
- dummify ingredient tokens
- KMeans model
- add cluster labels to raw recipes
- more regex for getting data to lists
- predict cluster for a new recipe

In [1]:
import re
import json
import importlib

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import get_recipes, modeling

In [2]:
importlib.reload(get_recipes);

---
## pre-processed recipes
- starting with the tokenized data from kaggle

In [3]:
df = pd.read_csv("data/kaggle_food_dot_com/PP_recipes.csv")
df.head(3)

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]"
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,312329,120056,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."


In [4]:
df.shape

(178265, 8)

In [5]:
df.dtypes

id                    int64
i                     int64
name_tokens          object
ingredient_tokens    object
steps_tokens         object
techniques           object
calorie_level         int64
ingredient_ids       object
dtype: object

In [6]:
df["ingredient_tokens"][0]

'[[2911, 1019, 249, 6878], [1353], [6953], [15341, 3261], [2056, 857, 643, 1631, 20480]]'

In [7]:
df["ingredient_ids"][0]

'[389, 7655, 6270, 1527, 3406]'

---
## convert strings into real lists
- some of the fields I want to use look like lists but they're wrapped in strings so I need to listify them with regex

In [8]:
# split the stringified nested list on "]" so each chunk holds one ingredient's tokens
raw_chunks = df["ingredient_tokens"][0].split("]")
digit_groups = (re.findall(r"\b\d+", chunk) for chunk in raw_chunks)
# chunks without any digits (e.g. whatever follows the final "]") are skipped
result = [[int(digits) for digits in group] for group in digit_groups if group]
result

[[2911, 1019, 249, 6878],
 [1353],
 [6953],
 [15341, 3261],
 [2056, 857, 643, 1631, 20480]]

In [9]:
[ int(id) for id in df["ingredient_ids"][0].replace("[", "").replace("]", "").split(", ") ]

[389, 7655, 6270, 1527, 3406]

In [10]:
# confirming that `split` doesn't care if it has anything to split
[ int(id) for id in "[17]".replace("[", "").replace("]", "").split(", ") ]

[17]

In [11]:
def ingr_ids_to_list(id_list):
    """Convert a stringified list of integer ids into a real list of ints.

    Parameters
    ----------
    id_list : str
        Text such as ``"[389, 7655, 6270]"``.

    Returns
    -------
    list[int]
        The integers found in ``id_list``, in order. An empty list literal
        (``"[]"``) yields ``[]``.
    """
    # Pull the integers out directly instead of splitting on the exact ", " separator:
    # the split approach raised ValueError on "[]" (int("")) and on any other spacing.
    return [int(match) for match in re.findall(r"-?\d+", id_list)]

df["ingr_ints"] = df["ingredient_ids"].apply(ingr_ids_to_list)

In [12]:
df.head(3)

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids,ingr_ints
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]","[389, 7655, 6270, 1527, 3406]"
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,...","[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,312329,120056,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696...","[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."


In [13]:
file_name = "data/generated_data/processed_recipes.parquet"
df.to_parquet(file_name, engine="pyarrow", compression="gzip")

---
## dummify ingredient tokens
- these integers don't have any meaning -- ie. they're still just ids to map to the ingredients
- binarize them so the model just has flags for which ones are present, ie. dummify the tokens

In [14]:
map_df = pd.read_pickle("data/kaggle_food_dot_com/ingr_map.pkl")
map_df.head()

Unnamed: 0,raw_ingr,raw_words,processed,len_proc,replaced,count,id
0,"medium heads bibb or red leaf lettuce, washed,...",13,"medium heads bibb or red leaf lettuce, washed,...",73,lettuce,4507,4308
1,mixed baby lettuces and spring greens,6,mixed baby lettuces and spring green,36,lettuce,4507,4308
2,romaine lettuce leaf,3,romaine lettuce leaf,20,lettuce,4507,4308
3,iceberg lettuce leaf,3,iceberg lettuce leaf,20,lettuce,4507,4308
4,red romaine lettuce,3,red romaine lettuce,19,lettuce,4507,4308


In [15]:
# toy example: two "recipes" (lists of ingredient ids) and four ids to flag
test_series = pd.Series([[12, 4308, 5], [3, 29, 12]])
test_ids = [4308, 5, 29, 12]

# one 0/1 column per id, one row per recipe
test_df = pd.DataFrame(
    {
        ingr_id: [1 if ingr_id in recipe else 0 for recipe in test_series]
        for ingr_id in test_ids
    }
)

test_df.head()

# this is not terribly efficient but let's see if that's a problem with this data

Unnamed: 0,4308,5,29,12
0,1,1,0,1
1,0,0,1,1


In [16]:
ingr_ids = map_df["id"].unique()

In [17]:
# Build one 0/1 column per ingredient id: cols[j][i] == 1 when recipe i lists ingr_ids[j].
# Rather than testing every (id, recipe) pair with a list scan plus an `.iloc` lookup,
# start from all-zero columns and flip only the entries for ids a recipe actually contains.
n_recipes = df.shape[0]
col_position = {ingr_id: pos for pos, ingr_id in enumerate(ingr_ids)}
cols = [[0] * n_recipes for _ in ingr_ids]

for recipe_row, recipe_ids in enumerate(df["ingr_ints"]):
    for ingr_id in recipe_ids:
        pos = col_position.get(ingr_id)
        # ids that are not in ingr_ids have no column, so they are ignored (as before)
        if pos is not None:
            cols[pos][recipe_row] = 1

In [None]:
# wrap each flag column in a Series so the columns can be concatenated side by side
col_series = [pd.Series(flag_col) for flag_col in cols]

In [None]:
flags_df = pd.concat(col_series, axis=1)    
flags_df.shape

In [None]:
# write the ingredient flag matrix to a gzip-compressed parquet file
# NOTE(review): flags_df is built from unnamed Series, so its column labels are presumably
# integers rather than strings — some pandas versions refuse non-string column names in
# to_parquet; confirm this writes (and reads back) cleanly
file_name = "data/generated_data/ingr_dummies.parquet"
flags_df.to_parquet(file_name, engine="pyarrow", compression="gzip")

---
## KMeans model
- to be clear, this is attempting to cluster recipes based on their ingredients, it's not clustering the ingredients themselves
- I had previously been looking at a food database that had distinct labels for 22 types of foods
- started looking at KMeans clustering with `n_clusters=12`; not quite a random number but also not with any real science behind it

In [None]:
id_flags = flags_df.columns

In [None]:
type(id_flags)

In [None]:
km = KMeans(n_clusters=12, n_init="auto", random_state=42)

In [None]:
km.fit(flags_df)

In [None]:
km.cluster_centers_

In [None]:
len(km.cluster_centers_)

In [None]:
len(km.cluster_centers_[0])

In [None]:
len(km.labels_) == df.shape[0]

In [None]:
silhouette_score(flags_df, km.labels_)

In [None]:
km.inertia_


In [None]:
# pair each recipe id with its cluster label; concat(axis=1) lines rows up by index label,
# so this assumes df still has the default 0..n-1 index it got from read_csv — TODO confirm
labeled_recipes = pd.concat([df["id"], pd.Series(km.labels_)], axis=1)

In [None]:
labeled_recipes.columns

In [None]:
# give both columns descriptive names
column_names = {"id": "recipe_id", 0: "cluster"}
labeled_recipes = labeled_recipes.rename(columns=column_names)
labeled_recipes.columns

In [None]:
labeled_recipes["cluster"].unique()

In [None]:
labeled_recipes["cluster"].value_counts()

---
## add cluster labels to raw recipes

In [None]:
raw_recipes = pd.read_csv("data/kaggle_food_dot_com/RAW_recipes.csv")
raw_recipes.shape

In [None]:
raw_recipes.columns

In [None]:
# just checking
set(labeled_recipes["recipe_id"]).issubset(set(raw_recipes["id"]))

In [None]:
labeled_recipes = pd.merge(left=labeled_recipes, right=raw_recipes, left_on="recipe_id", right_on="id", how="inner")

In [None]:
labeled_recipes.columns

In [None]:
# "id" duplicates "recipe_id" after the merge; the other three fields are dropped as well
unused_columns = ["id", "contributor_id", "submitted", "nutrition"]
labeled_recipes = labeled_recipes.drop(columns=unused_columns)

In [None]:
labeled_recipes.head()

In [None]:
labeled_recipes["ingredients"][0]

## more regex for getting data to lists

In [None]:
# for now, we'll assume that ingredients do not have any "-" in them
re.findall(r"\b[\w\s]+", labeled_recipes["ingredients"][0])

In [None]:
# and we'll assume that tags have no spaces
re.findall(r"\b[\w+-?]+", labeled_recipes["tags"][0])

In [None]:
def get_indredients(string_list):
    """Turn a stringified list of ingredient names into a real list of strings.

    The text is parsed as a Python list literal, so names containing hyphens,
    apostrophes or other punctuation (e.g. ``'all-purpose flour'``) stay in one
    piece instead of being split by a word/space regex.

    Parameters
    ----------
    string_list : str or list
        Text that looks like a list literal, e.g. ``"['honey', 'olive oil']"``.
        A value that is already a list is returned unchanged, which makes
        re-running the ``apply`` cell harmless.

    Returns
    -------
    list[str]
        The ingredient names. If the text is not a valid list literal, the
        previous regex extraction (runs of word characters and whitespace) is
        used as a best-effort fallback.
    """
    import ast  # local import so the function does not depend on the imports cell

    if isinstance(string_list, list):
        return string_list

    try:
        parsed = ast.literal_eval(string_list)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]

    # not a list literal: keep the original regex behaviour
    return re.findall(r"\b[\w\s]+", string_list)

labeled_recipes["ingredients"] = labeled_recipes["ingredients"].apply(get_indredients)

In [None]:
def get_tags(string_list):
    """Extract tag tokens from a stringified list of tags.

    A tag is a run of word characters, ``+``, ``?`` and ``-`` that starts at a
    word boundary; tags are assumed to contain no spaces.

    Parameters
    ----------
    string_list : str
        Text such as ``"['60-minutes-or-less', 'course']"``.

    Returns
    -------
    list[str]
        The tags found, in order of appearance.
    """
    # The hyphen is placed last in the class so it is a literal. Written as `[\w+-?]`,
    # `+-?` is a character *range* that also matches `, . / : ; < = >`, which would
    # glue neighbouring tags together when no quote separates them.
    return re.findall(r"\b[\w+?-]+", string_list)

labeled_recipes["tags"] = labeled_recipes["tags"].apply(get_tags)

In [None]:
labeled_recipes["ingredients"][0]

In [None]:
labeled_recipes[["name", "ingredients", "tags"]].head()

In [None]:
# trying this again now that I have actual lists
labeled_recipes["tags"].explode().unique()[:20]

In [None]:
# isolate the recipes that belong to the least-populated cluster
smallest_label = labeled_recipes["cluster"].value_counts().idxmin()
in_smallest = labeled_recipes["cluster"] == smallest_label
sm_cluster = labeled_recipes.loc[in_smallest]
sm_cluster.shape[0]

In [None]:
sm_cluster.head()

---
## predict cluster for a new recipe
- fetch some recipes from the Edamam API
- map the ingredients from a recipe to the ingredient ids (tokens) in the food.com data
- get prediction for a cluster from my kmeans model

In [None]:
# read the API credentials from the local secrets.json file
with open("secrets.json", "rb") as secrets_file:
    key_file = json.load(secrets_file)

app_id = key_file["app_id"]
app_key = key_file["key"]

In [None]:
ingr_test = ["tomato sauce", "tomato sauce", "mushrooms"]
num_ingr_test = (3, 12)

In [None]:
res = get_recipes.get_recipes(ingr=ingr_test, num_ingr=num_ingr_test, app_id=app_id, app_key=app_key)

In [None]:
res.columns

In [None]:
res[["title", "ingredient_lines", "ingredient_labels"]].head()

In [None]:
res["ingredient_labels"][0]

In [None]:
map_df.head(3)

In [None]:
map_df.loc[map_df["replaced"] == "lettuce"]["id"][0]

In [None]:
# not sure I was expecting this to work!
# map_df.loc[map_df["raw_ingr"].str.find("lettuce") != -1]

In [None]:
# Map each ingredient label of the first fetched recipe to an ingredient id from map_df.
# Lookup order: exact match on the "replaced" name, then substring match in "raw_ingr";
# -1 marks a label that could not be mapped.
codes = []

for label in res["ingredient_labels"][0]:
    needle = label.lower()

    replaced = map_df.loc[map_df["replaced"] == needle]
    if replaced.shape[0] > 0:
        print(f"found {label} in replaced")
        # several rows can share a name; take the id that occurs most often
        code = replaced["id"].value_counts().idxmax()
    else:
        # Literal substring search, only done when the exact match fails.
        # na=False keeps rows with a missing raw_ingr from counting as matches
        # (with `str.find(...) != -1` a NaN result compares as True).
        raw = map_df.loc[map_df["raw_ingr"].str.contains(needle, regex=False, na=False)]
        if raw.shape[0] > 0:
            print(f"found {label} in raw")
            code = raw["id"].value_counts().idxmax()
        else:
            print(f"couldn't find {label}")
            code = -1

    codes.append(code)

In [None]:
# multiple ingredients
codes

In [None]:
# tomatoes
codes 

In [None]:
# plum tomatoes
codes

In [None]:
len(ingr_ids)

In [None]:
test_batch = modeling.map_recipe_ingredients(res["ingredient_labels"], map_df)

In [None]:
test_batch

In [None]:
def flag_ingredients(ingreds: list[list[int]], map_df: pd.DataFrame) -> pd.DataFrame:
    """Turn lists of ingredient ids into a 0/1 flag matrix.

    Parameters
    ----------
    ingreds : list[list[int]]
        One list of ingredient ids per recipe.
    map_df : pd.DataFrame
        Mapping frame with an ``"id"`` column; its unique values, in order of
        first appearance, define the output columns.

    Returns
    -------
    pd.DataFrame
        One row per recipe and one column per unique id, holding 1 where the
        recipe contains that id and 0 otherwise. Ids that do not occur in
        ``map_df["id"]`` are ignored.
    """
    # the unique ids do not change between recipes, so compute them once
    id_columns = map_df["id"].unique()

    flagged_recipes = []
    for ing_list in ingreds:
        # set membership avoids rescanning the recipe's list for every id
        present = set(ing_list)
        flagged_recipes.append([1 if id_col in present else 0 for id_col in id_columns])

    return pd.DataFrame(flagged_recipes)

In [None]:
batch_to_predict = flag_ingredients(test_batch, map_df)

In [None]:
batch_to_predict.shape

In [None]:
km.predict(batch_to_predict)

In [None]:
# one 0/1 flag per known ingredient id (in ingr_ids order) for the single mapped recipe
recipe_to_predict = [1 if id_col in codes else 0 for id_col in ingr_ids]

In [None]:
len(recipe_to_predict)

In [None]:
sum(recipe_to_predict) == len(codes)

In [None]:
pred = pd.DataFrame(recipe_to_predict).T
pred.shape

In [None]:
pred.columns

In [None]:
km.predict(pred)[0]