# Instructions
This notebook downloads the ingredient weight table from [King Arthur](https://www.kingarthurbaking.com/learn/ingredient-weight-chart). The data is processed in a similar way to the USDA notebook, and the USDA search endpoint is queried to find the `fdcId` for each ingredient. The result is then saved to a JSON file. The USDA API key is read from the `USDA_KEY` environment variable, so it must be set before running the notebook.

### Parameters

In [57]:
# URL of the King Arthur ingredient weight chart page that the download cell requests
url = "https://www.kingarthurbaking.com/learn/ingredient-weight-chart"

In [58]:
import os
import time
from fractions import Fraction
from io import StringIO

import pandas as pd
import requests

### Download King Arthur Data

In [62]:
# Fetch the chart page. The timeout keeps the cell from hanging indefinitely and
# raise_for_status() stops the notebook on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

# Read the HTML tables into pandas DataFrames. read_html deprecates being handed a
# literal HTML string, so the text is wrapped in a file-like object.
dfs = pd.read_html(StringIO(html_content))

# Extract the table you want (assuming it's the first table on the page)
df = dfs[0].rename(columns={"Ingredient": "description", "Grams": "gramWeight"})

# Placeholder column filled in by the USDA lookup; the explicit dtype avoids the
# version-dependent default dtype of an empty Series.
df['fdcId'] = pd.Series(dtype="object")

# Keep only the columns used downstream (this also leaves out "Ounces").
df = df[['fdcId', 'description', 'gramWeight', 'Volume']]

  dfs = pd.read_html(html_content)


### Cleanup data

In [63]:
def get_value_from_volume(volume: str) -> float:
    """Return the numeric quantity at the start of a volume string.

    Leading whitespace-separated tokens that parse as numbers or fractions are
    summed, so mixed numbers are handled: "2 1/4 teaspoons" -> 2.25,
    "1/4 cup" -> 0.25, "1 large head" -> 1. Parsing stops at the first
    non-numeric token.

    Args:
        volume: Text such as "1 cup" or "1 1/2 cups".

    Returns:
        The quantity as an int when it is a whole number, otherwise as a float.

    Raises:
        ValueError: If the string does not start with a numeric token.
    """
    total = Fraction(0)
    found_quantity = False
    for token in volume.split():
        try:
            # Fraction parses "2", "0.5" and "1/4" without evaluating scraped text as code.
            total += Fraction(token)
        except (ValueError, ZeroDivisionError):
            # First non-numeric token (the unit or a size word) ends the quantity.
            break
        found_quantity = True

    if not found_quantity:
        raise ValueError(f"No leading quantity found in volume {volume!r}")

    # Whole quantities stay ints so they serialise as 1 rather than 1.0.
    return int(total) if total.denominator == 1 else float(total)

def get_uom_from_volume(volume: str) -> str:
    """Return the unit of measure named by the last word of a volume string.

    The last whitespace-separated word is used as the unit, except that
    'large' (as in "1 large") is reported as "each".

    Args:
        volume: Text such as "1 cup" or "2 tablespoons".

    Returns:
        The unit word, "each" when the last word is 'large', or an empty
        string when the input contains no words.
    """
    # split() without a separator ignores leading/trailing/repeated whitespace,
    # so a trailing space cannot produce an empty unit.
    words = volume.split()
    if not words:
        return ""

    unit = words[-1]
    return "each" if unit == 'large' else unit

# Removes a parenthesised note (with an optional leading space) and soft hyphens.
cleanup_pattern = r" {0,1}\(.*\)|\u00AD|\u00AD"
for text_column in ('Volume', 'description'):
    df[text_column] = df[text_column].str.replace(cleanup_pattern, "", regex=True)


def build_food_portions(row):
    """Build the single-entry foodPortions list for one table row.

    gramWeight is the text before the first ' to ' in the row's gramWeight;
    value and uom are both parsed from the row's Volume text.
    """
    volume_text = row["Volume"]
    portion = {
        'gramWeight': row['gramWeight'].split(' to ')[0],
        'value': get_value_from_volume(volume_text),
        'uom': get_uom_from_volume(volume_text),
    }
    return [portion]


df['foodPortions'] = df.apply(build_food_portions, axis=1)

# The raw columns now live inside foodPortions; when a description repeats, keep its last row.
df = (
    df
    .drop(columns=["gramWeight", "Volume"])
    .drop_duplicates(subset='description', keep="last")
)

df.head(100)

Unnamed: 0,fdcId,description,foodPortions
0,,'00' Pizza Flour,"[{'gramWeight': '116', 'value': 1, 'uom': 'cup'}]"
1,,Agave syrup,"[{'gramWeight': '84', 'value': 0.25, 'uom': 'c..."
2,,All-Purpose Flour,"[{'gramWeight': '120', 'value': 1, 'uom': 'cup'}]"
3,,Almond Flour,"[{'gramWeight': '96', 'value': 1, 'uom': 'cup'}]"
4,,Almond meal,"[{'gramWeight': '84', 'value': 1, 'uom': 'cup'}]"
...,...,...,...
117,,Durum Flour,"[{'gramWeight': '124', 'value': 1, 'uom': 'cup'}]"
118,,Easy Roll Dough Improver,"[{'gramWeight': '18', 'value': 2, 'uom': 'tabl..."
119,,Egg,"[{'gramWeight': '50', 'value': 1, 'uom': 'each'}]"
120,,Egg white,"[{'gramWeight': '35', 'value': 1, 'uom': 'each'}]"


### Cleanup "bad" data

In [64]:
# Descriptions dropped from the table before the USDA lookup.
items_to_remove = ["Barley", "Coconut", "Corn", "Cranberries", "Flaxseed", "Quinoa", "Raisins", "Water", "Almonds"]

# .copy() makes the filtered frame independent of the original, so adding a column
# below does not raise SettingWithCopyWarning.
df = df[~df["description"].isin(items_to_remove)].copy()

# Exact-match overrides: description -> text to search USDA with instead.
# NOTE(review): "Almonds" is also listed in items_to_remove, so its override can never
# apply — confirm whether it should be removed or searched as "Almond Nuts".
items_to_replace = {"Almonds": "Almond Nuts",
                    "Brown rice": "Brown rice raw",
                    "Buckwheat": "Buckwheat grain",
                    "Egg": "Whole egg",
                    "Egg yolk": "Egg yolk large",
                    "Milk": "Whole milk vitamin",
                    "Oats": "Rolled oats",
                    "Olives": "Green olives",
                    "Peanuts": "Peanuts raw",
                    "Rice": "White rice grain",
                    "Salt": "Table salt",
                    "Butter": "Butter stick"
                    }

# search_description defaults to the description; only whole-value matches are replaced.
df['search_description'] = df['description'].replace(items_to_replace)

df.head(10)

Unnamed: 0,fdcId,description,foodPortions,search_description
0,,'00' Pizza Flour,"[{'gramWeight': '116', 'value': 1, 'uom': 'cup'}]",'00' Pizza Flour
1,,Agave syrup,"[{'gramWeight': '84', 'value': 0.25, 'uom': 'c...",Agave syrup
2,,All-Purpose Flour,"[{'gramWeight': '120', 'value': 1, 'uom': 'cup'}]",All-Purpose Flour
3,,Almond Flour,"[{'gramWeight': '96', 'value': 1, 'uom': 'cup'}]",Almond Flour
4,,Almond meal,"[{'gramWeight': '84', 'value': 1, 'uom': 'cup'}]",Almond meal
5,,Almond paste,"[{'gramWeight': '259', 'value': 1, 'uom': 'cup'}]",Almond paste
8,,"Almonds, whole","[{'gramWeight': '142', 'value': 1, 'uom': 'cup'}]","Almonds, whole"
9,,Amaranth flour,"[{'gramWeight': '103', 'value': 1, 'uom': 'cup'}]",Amaranth flour
10,,Apple juice concentrate,"[{'gramWeight': '70', 'value': 0.25, 'uom': 'c...",Apple juice concentrate
12,,Apples,"[{'gramWeight': '113', 'value': 1, 'uom': 'cup'}]",Apples


### Get data from USDA

In [65]:
usda_api_key = os.environ['USDA_KEY']

# Kept under its own name so the notebook-level `url` parameter is not overwritten;
# otherwise re-running the download cell would request the USDA endpoint.
usda_search_url = "https://api.nal.usda.gov/fdc/v1/foods/search"

# Placeholder for the matched USDA description; object dtype because it holds strings.
df['fdcDescription'] = pd.Series(dtype="object")

for index, row in df.iterrows():
    # Passing the query through `params` lets requests URL-encode it, so descriptions
    # containing spaces, quotes or '&' cannot corrupt the query string.
    search_params = {
        "query": row['search_description'],
        "pageSize": 1,
        "requireAllWords": "true",
        "dataType": "Foundation",
        "api_key": usda_api_key,
    }
    response = requests.get(usda_search_url, params=search_params, timeout=30)
    # Stop on HTTP errors (bad key, rate limit) instead of recording them as "not found".
    response.raise_for_status()

    foods = response.json().get('foods') or []
    if foods:
        # pageSize=1, so the first entry is the only match returned.
        df.at[index, 'fdcId'] = foods[0]['fdcId']
        df.at[index, 'fdcDescription'] = foods[0]['description']

    # Pause between requests.
    time.sleep(.5)

df.head(20)

Unnamed: 0,fdcId,description,foodPortions,search_description,fdcDescription
0,,'00' Pizza Flour,"[{'gramWeight': '116', 'value': 1, 'uom': 'cup'}]",'00' Pizza Flour,
1,,Agave syrup,"[{'gramWeight': '84', 'value': 0.25, 'uom': 'c...",Agave syrup,
2,789890.0,All-Purpose Flour,"[{'gramWeight': '120', 'value': 1, 'uom': 'cup'}]",All-Purpose Flour,"Flour, wheat, all-purpose, enriched, bleached"
3,2261420.0,Almond Flour,"[{'gramWeight': '96', 'value': 1, 'uom': 'cup'}]",Almond Flour,"Flour, almond"
4,,Almond meal,"[{'gramWeight': '84', 'value': 1, 'uom': 'cup'}]",Almond meal,
5,,Almond paste,"[{'gramWeight': '259', 'value': 1, 'uom': 'cup'}]",Almond paste,
8,2346393.0,"Almonds, whole","[{'gramWeight': '142', 'value': 1, 'uom': 'cup'}]","Almonds, whole","Nuts, almonds, whole, raw"
9,2512371.0,Amaranth flour,"[{'gramWeight': '103', 'value': 1, 'uom': 'cup'}]",Amaranth flour,"Flour, amaranth"
10,2003590.0,Apple juice concentrate,"[{'gramWeight': '70', 'value': 0.25, 'uom': 'c...",Apple juice concentrate,"Apple juice, with added vitamin C, from concen..."
12,1750340.0,Apples,"[{'gramWeight': '113', 'value': 1, 'uom': 'cup'}]",Apples,"Apples, fuji, with skin, raw"


### Filter out items not found in USDA data

In [66]:
# Keep only the rows for which the USDA lookup stored an fdcId.
has_fdc_id = df['fdcId'].notna()
df_filtered = df.loc[has_fdc_id]
df_filtered.head(10)

Unnamed: 0,fdcId,description,foodPortions,search_description,fdcDescription
2,789890,All-Purpose Flour,"[{'gramWeight': '120', 'value': 1, 'uom': 'cup'}]",All-Purpose Flour,"Flour, wheat, all-purpose, enriched, bleached"
3,2261420,Almond Flour,"[{'gramWeight': '96', 'value': 1, 'uom': 'cup'}]",Almond Flour,"Flour, almond"
8,2346393,"Almonds, whole","[{'gramWeight': '142', 'value': 1, 'uom': 'cup'}]","Almonds, whole","Nuts, almonds, whole, raw"
9,2512371,Amaranth flour,"[{'gramWeight': '103', 'value': 1, 'uom': 'cup'}]",Amaranth flour,"Flour, amaranth"
10,2003590,Apple juice concentrate,"[{'gramWeight': '70', 'value': 0.25, 'uom': 'c...",Apple juice concentrate,"Apple juice, with added vitamin C, from concen..."
12,1750340,Apples,"[{'gramWeight': '113', 'value': 1, 'uom': 'cup'}]",Apples,"Apples, fuji, with skin, raw"
13,2346414,Applesauce,"[{'gramWeight': '255', 'value': 1, 'uom': 'cup'}]",Applesauce,"Applesauce, unsweetened, with added vitamin C"
23,1105073,Bananas,"[{'gramWeight': '227', 'value': 1, 'uom': 'cup'}]",Bananas,"Bananas, overripe, raw"
27,2512376,Barley flour,"[{'gramWeight': '85', 'value': 1, 'uom': 'cup'}]",Barley flour,"Flour, barley"
30,2258588,Bell peppers,"[{'gramWeight': '142', 'value': 1, 'uom': 'cup'}]",Bell peppers,"Peppers, bell, green, raw"


### Export to JSON for manual review

In [68]:
# Write the matched rows to a JSON file, one object per row.
output_path = 'king_arthur_data.json'
df_filtered.to_json(output_path, orient='records')