In [1]:
import json as json
import numpy as np
import requests
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the raw recipe data from disk. JSON text is UTF-8 by specification, so
# the encoding is pinned here instead of relying on the platform's locale
# default (which can mis-decode non-ASCII characters on some systems).
with open("recipes_data.json", "r", encoding="utf-8") as read_file:
    data = json.load(read_file)

In [2]:
# Count the top-level entries of the loaded JSON (presumably one per recipe).
len(data)

994

In [6]:
# One row per top-level JSON value; the keys of `data` are discarded.
df = pd.DataFrame(list(data.values()))

# Columns excluded from the completeness table below (list-valued, judging by
# the variable name), together with "trimmed" and "types".
list_columns = [
    "images",
    "instructions",
    "tools",
    "videos",
    "ingredients",
    "components",
]

# Share of non-null values per remaining column, expressed as a percentage
# rounded to two decimals and wrapped in a single-column frame.
notna = np.round(
    df.drop(columns=[*list_columns, "trimmed", "types"]).notna().mean() * 100,
    2,
).to_frame()
notna

Unnamed: 0,0
displayName,100.0
description,8.85
canonicalName,87.42
prepTimeMinutes,45.17
cookTimeMinutes,40.14
totalTimeMinutes,74.04
cookingMethod,0.0
difficultyLevel,40.64
cuisines,43.96
meals,18.41


In [14]:
# Export the non-null percentage table as LaTeX, formatting values with two decimals.
print(notna.to_latex(caption="Dataproperties of Recipes", float_format="%.2f"))

\begin{table}
\caption{Dataproperties of Recipes}
\begin{tabular}{lr}
\toprule
 & 0 \\
\midrule
displayName & 100.00 \\
description & 8.85 \\
canonicalName & 87.42 \\
prepTimeMinutes & 45.17 \\
cookTimeMinutes & 40.14 \\
totalTimeMinutes & 74.04 \\
cookingMethod & 0.00 \\
difficultyLevel & 40.64 \\
cuisines & 43.96 \\
meals & 18.41 \\
courses & 20.72 \\
occasions & 20.02 \\
diets & 28.97 \\
difficulty & 40.64 \\
servings & 92.76 \\
nutrition & 25.86 \\
\bottomrule
\end{tabular}
\end{table}



In [9]:
# Apply len() element-wise to the selected columns, summarise the resulting
# lengths, and keep only the min / mean / std / max rows.
list_info = (
    df[["instructions", "ingredients", "tools", "images", "videos"]]
    .apply(np.vectorize(len))
    .describe()
    .loc[["min", "mean", "std", "max"]]
)

# Title-case the column labels for presentation.
list_info.columns = list(map(str.title, list_info.columns))
list_info

Unnamed: 0,Instructions,Ingredients,Tools,Images,Videos
min,1.0,1.0,0.0,0.0,0.0
mean,5.457746,10.302817,0.605634,1.003018,0.353119
std,4.069973,5.989572,2.042698,0.212857,0.592888
max,35.0,55.0,15.0,4.0,5.0


In [13]:
# Transpose so each summarised field becomes a row, then export as LaTeX with
# two-decimal formatting.
# NOTE(review): the caption spells "Recipy"; likely meant "Recipe" — confirm before publishing.
print(list_info.T.to_latex(caption="Amount of Recipy data", float_format="%.2f"))

\begin{table}
\caption{Amount of Recipy data}
\begin{tabular}{lrrrr}
\toprule
 & min & mean & std & max \\
\midrule
Instructions & 1.00 & 5.46 & 4.07 & 35.00 \\
Ingredients & 1.00 & 10.30 & 5.99 & 55.00 \\
Tools & 0.00 & 0.61 & 2.04 & 15.00 \\
Images & 0.00 & 1.00 & 0.21 & 4.00 \\
Videos & 0.00 & 0.35 & 0.59 & 5.00 \\
\bottomrule
\end{tabular}
\end{table}



In [11]:
# Flatten the per-recipe tool lists into a single list of tool records.
all_tools = [tool for recipe in data.values() for tool in recipe["tools"]]

# Field names are taken from the first tool record. With no tools at all there
# is nothing to inspect, so fall back to an empty key list instead of raising
# IndexError on `all_tools[0]`.
t_keys = all_tools[0].keys() if all_tools else []


def missing_t(x):
    """Return the fraction of tool records whose field ``x`` is None."""
    return np.mean([tool[x] is None for tool in all_tools])


# Missing-value fraction for every tool field.
{key: missing_t(key) for key in t_keys}

{'displayName': 0.0, 'images': 0.0}

In [60]:
# Flatten the per-recipe ingredient lists into a single list of ingredient records.
all_ingredients = [
    ingredient
    for recipe in data.values()
    for ingredient in recipe["ingredients"]
]

# Build the ingredient frame without the columns that are not analysed here.
dfi = pd.DataFrame(all_ingredients).drop(
    columns=[
        "preparation",
        "productOverride",
        "componentIndex",
        "asinOverride",
        "brand",
        "staple",
    ]
)

# Replace each `images` value by its length, stored as an integer.
dfi["images"] = dfi["images"].apply(len).astype(int)

# Fraction of non-null values per column.
dfi.notna().mean()

displayText     1.000000
ingredient      0.790060
ingredientId    0.699639
quantity        1.000000
unit            1.000000
images          1.000000
dtype: float64

In [80]:
# Summary statistics for every ingredient column, numeric and non-numeric alike.
dfi.describe(include="all")

Unnamed: 0,displayText,ingredient,ingredientId,quantity,unit,images
count,10241,8091,7165,10241.0,10241,10241.0
unique,7600,938,257,,42,
top,1 teaspoon salt,salt,type_02047_00,,COUNT,
freq,50,724,724,,2731,
mean,,,,2.30437,,0.0
std,,,,9.556648,,0.0
min,,,,0.12,,0.0
25%,,,,1.0,,0.0
50%,,,,1.0,,0.0
75%,,,,2.0,,0.0


In [75]:
# Flatten the per-recipe instruction lists into a single list of step records.
all_instructions = [
    step
    for recipe in data.values()
    for step in recipe["instructions"]
]

# Build the step frame first, then discard the columns not analysed here.
ins = pd.DataFrame(all_instructions)
ins = ins.drop(
    columns=[
        "componentStepNumber",
        "stepIngredients",
        "stepImages",
        "stepTips",
        "stepSpecificVideos",
        "stepVideoTimestamp",
        "stepVideoEndTimestamp",
        "componentIndex",
        "stepDurationSeconds",
        "stepPrompts",
    ]
)

# Fraction of non-null values per remaining column.
ins.notna().mean()

stepNumber    1.000000
stepTitle     0.099539
stepText      1.000000
dtype: float64

In [79]:
# Summary statistics for every instruction column, numeric and non-numeric alike.
ins.describe(include="all")

Unnamed: 0,stepNumber,stepTitle,stepText
count,5425.0,540,5425
unique,,448,5146
top,,Step 3,Well done! Want to save this recipe to your Si...
freq,,15,89
mean,4.744885,,
std,4.149913,,
min,1.0,,
25%,2.0,,
50%,4.0,,
75%,6.0,,


In [77]:
# One row per tool record, gathered across all recipes.
tools = pd.DataFrame(
    [tool for recipe in data.values() for tool in recipe["tools"]]
)

# Default summary statistics of the tool frame.
tools.describe()

Unnamed: 0,displayName,images
count,602,602
unique,326,1
top,cutting board,[]
freq,20,602


In [15]:
from data_formats import *
import json
from tqdm import tqdm

# Read the second dataset (with embeddings, going by the file name). JSON text
# is UTF-8 by specification, so the encoding is pinned rather than left to the
# platform's locale default.
with open("data_with_embeddings.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Build one Recipe per decoded record by passing its keys as keyword arguments
# (Recipe is presumably provided by the star import from data_formats).
# NOTE: this rebinds `data`, which earlier cells use for the raw recipe mapping.
data = [Recipe(**d) for d in data]

In [35]:
# Use the first recipe as the running example.
example = data[0]

# Fetch its first image, resize it to 224x224, and write it to disk.
first_image = example.images[0]
img = first_image.get_image().resize((224, 224))
img.save("example/example_img.jpg")

# Show the resized image as the cell output.
img

In [34]:
# Print one ingredient name per line for the example recipe, with colons removed.
# Entries whose `ingredient` is None are skipped: calling `.replace` on None
# would raise AttributeError, and the raw dataset summary earlier in this
# notebook shows the field is missing for a sizeable share of ingredients
# (whether Recipe objects can still carry None here is unconfirmed).
print(
    "\n".join(
        item.ingredient.replace(":", "")
        for item in example.ingredients
        if item.ingredient is not None
    )
)

red pepper flakes
parmesan cheese
Cheese, mozzarella, part skim milk
mozzarella cheese
marinara sauce
chicken breast
Bread crumbs, dry, grated, plain
Water, tap, drinking
egg
ground pepper
salt
flour
oil


In [33]:
# Print one step title per line for the example recipe, with colons removed.
# Steps whose `stepTitle` is None are skipped: calling `.replace` on None would
# raise AttributeError, and the raw dataset summary earlier in this notebook
# shows that most steps have no title (whether Recipe objects can still carry
# None here is unconfirmed).
print(
    "\n".join(
        step.stepTitle.replace(":", "")
        for step in example.instructions
        if step.stepTitle is not None
    )
)

Heat the oven and prepare for frying
Set up a breading station
Pound the chicken
Heat the oil
Bread the chicken
Fry the chicken
Cover with sauce and cheese and bake
