In [1]:
import requests
from bs4 import BeautifulSoup
import json
import random
import time
import pandas as pd

# Sitemap index that the crawl starts from.
base = "https://www.halfbakedharvest.com/sitemap_index.xml"

# Headers sent with every request so the site owner can identify the bot.
header = {
    "User-Agent": "ingredient_bot/1.0 (+mailto:jakehowell@duck.com; for educational purposes)",
}

In [2]:
def parseXML(url, timeout=30):
    """Download a sitemap XML document and return the unique URLs it lists.

    The text of every ``<loc>`` element is collected in document order.
    Entries whose text ends in ``jpg`` are skipped (presumably image URLs
    embedded in the sitemap), and a URL that appears more than once is kept
    only the first time it is seen.

    Args:
        url: Address of the sitemap (or sitemap index) to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on a stalled server.

    Returns:
        list[str]: De-duplicated ``<loc>`` values in first-seen order.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: The server did not answer within ``timeout``.
    """
    site_mapXML = requests.get(url, headers=header, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty result.
    site_mapXML.raise_for_status()

    soup = BeautifulSoup(site_mapXML.text, "xml")

    locations = (loc_tag.get_text(strip=True) for loc_tag in soup.find_all("loc"))
    # dict.fromkeys drops duplicates while preserving insertion order.
    return list(dict.fromkeys(loc for loc in locations if not loc.endswith("jpg")))

In [3]:
# Fetch the sitemap index and show the child sitemaps it references.
site_map = parseXML(url=base)
print(site_map)


['https://www.halfbakedharvest.com/post-sitemap.xml', 'https://www.halfbakedharvest.com/post-sitemap2.xml', 'https://www.halfbakedharvest.com/post-sitemap3.xml', 'https://www.halfbakedharvest.com/post-sitemap4.xml', 'https://www.halfbakedharvest.com/post-sitemap5.xml', 'https://www.halfbakedharvest.com/page-sitemap.xml', 'https://www.halfbakedharvest.com/web-story-sitemap.xml', 'https://www.halfbakedharvest.com/category-sitemap.xml', 'https://www.halfbakedharvest.com/video-sitemap.xml']


In [4]:
# Collect the page URLs listed in the second sitemap of the index.
# `links` is kept as a list of URL lists (one entry per parsed sitemap)
# because the scraping cell iterates over `links[0]`.
links = [parseXML(site_map[1])]

# Preview a small sample. Only the last expression of a cell is displayed,
# so the bare `len(links[0])` that used to sit here had no effect.
first_ten_links = links[0][:10]
first_ten_links

['https://www.halfbakedharvest.com/strawberry-margarita-cream-filled-donuts/',
 'https://www.halfbakedharvest.com/teriyaki-chicken-sub-sandwiches/',
 'https://www.halfbakedharvest.com/tortilla-chip-crusted-chicken-salad-with-avocado-chipotle-lime-dressing-and-queso-fresco/',
 'https://www.halfbakedharvest.com/margarita-chicken-quesadilla-with-margarita-guacamole/',
 'https://www.halfbakedharvest.com/crunchy-oatmeal-peanut-butter-oats-n-honey-bars/',
 'https://www.halfbakedharvest.com/brown-butter-crepe-ice-cream-tacos/',
 'https://www.halfbakedharvest.com/whipped-feta-and-roasted-jalapeno-greek-pizza/',
 'https://www.halfbakedharvest.com/jerk-chicken-with-fired-rice-and-grilled-pineapple-salsa/',
 'https://www.halfbakedharvest.com/chocolate-peanut-butter-swirl-fudge-brownie-ice-cream/',
 'https://www.halfbakedharvest.com/cinnamon-crunch-bagels/']

In [5]:
# Load the recipes saved by earlier runs. "output.csv" is only created by the
# `df.to_csv("output.csv", ...)` cell further down, so on a first run the file
# does not exist yet: start from an empty frame with the same columns instead
# of stopping the notebook with FileNotFoundError.
try:
    df = pd.read_csv("output.csv")
except FileNotFoundError:
    df = pd.DataFrame(columns=["url", "recipe_name", "ingredients"])

df.head()

Unnamed: 0,url,recipe_name,ingredients
0,https://www.halfbakedharvest.com/strawberry-ma...,Strawberry Margarita Cream Filled Donuts,"['1 1/8 cups milk (warm)', '1/4 cup sugar', '2..."
1,https://www.halfbakedharvest.com/teriyaki-chic...,Teriyaki Chicken Sub Sandwiches,['1 pound boneless skinless chicken (cut into ...
2,https://www.halfbakedharvest.com/tortilla-chip...,Tortilla Chip Crusted Chicken Salad with Avoca...,"['4 boneless (skinless chicken breasts, cut i..."
3,https://www.halfbakedharvest.com/margarita-chi...,Margarita Chicken Quesadilla with Margarita Gu...,['1 pound boneless skinless chicken breast or ...
4,https://www.halfbakedharvest.com/crunchy-oatme...,Crunchy Oatmeal Peanut Butter Oats &#8216;n Ho...,['3 cups rolled oats (use gluten free oats for...


In [6]:
MAX_NEW_RECIPES = 2   # stop after this many newly scraped pages per run
REQUEST_TIMEOUT = 30  # seconds before an unresponsive request is abandoned
RECIPE_COLUMNS = ["url", "recipe_name", "ingredients"]


def _ld_nodes(payload):
    """Flatten decoded JSON-LD (dict, list or ``@graph`` wrapper) into node dicts."""
    if isinstance(payload, list):
        for item in payload:
            yield from _ld_nodes(item)
    elif isinstance(payload, dict):
        if "@graph" in payload:
            yield from _ld_nodes(payload["@graph"])
        else:
            yield payload


def _is_recipe(node):
    """Tell whether a JSON-LD node is typed as a Recipe (string or list form)."""
    node_type = node.get("@type")
    if isinstance(node_type, list):
        return "Recipe" in node_type
    return node_type == "Recipe"


def _extract_recipe(page_html):
    """Return ``(name, ingredients)`` from a page's JSON-LD, or None if there is no recipe."""
    page = BeautifulSoup(page_html, "html.parser")
    nodes = []
    for script in page.find_all("script", type="application/ld+json"):
        try:
            nodes.extend(_ld_nodes(json.loads(script.string or "")))
        except json.JSONDecodeError:
            continue  # malformed block; another block may still hold the recipe

    recipe = next((node for node in nodes if _is_recipe(node)), None)
    if recipe is None or "recipeIngredient" not in recipe:
        return None

    # Prefer the page headline (what was stored before); fall back to the
    # recipe's own name when no node carries a headline.
    name = next((node["headline"] for node in nodes if "headline" in node), recipe.get("name"))
    return name, recipe["recipeIngredient"]


# Scrape recipe pages that are not in `df` yet and append them to it.
# URLs already stored are looked up in a set built once. Note that
# `url in df['url']` would test the Series *index*, not its values.
known_urls = set(df["url"])
new_rows = []
counter = 0

s = requests.Session()
try:
    for url in links[0]:
        if url in known_urls:
            print(f"Url: {url}, already exists in dataset")
            continue

        resp = s.get(url, headers=header, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()  # raise for 4xx/5xx before touching the body

        parsed = _extract_recipe(resp.text)
        if parsed is None:
            # Previously this either crashed or silently reused the recipe
            # found on the preceding page.
            print(f"Skipped: {url} (no Recipe JSON-LD found)")
        else:
            name, ingredients = parsed
            # NOTE(review): rows loaded from the CSV hold `ingredients` as
            # text, while rows added here hold a list until the next save.
            new_rows.append({"url": url, "recipe_name": name, "ingredients": ingredients})
            known_urls.add(url)
            print(f"Success: {url} (status {resp.status_code})")

            counter += 1
            if counter >= MAX_NEW_RECIPES:
                break
            print(f"{MAX_NEW_RECIPES - counter} more links.")

        # Be polite: pause a random interval between requests.
        delay = random.uniform(1, 5)
        print(f"Sleeping {delay:.2f}s before next request...")
        time.sleep(delay)
finally:
    s.close()
    # Keep whatever was scraped even if a request failed part-way through.
    if new_rows:
        scraped = pd.DataFrame(new_rows, columns=RECIPE_COLUMNS)
        df = scraped if df.empty else pd.concat([df, scraped], ignore_index=True)



Url: https://www.halfbakedharvest.com/strawberry-margarita-cream-filled-donuts/, already exists in dataset
Url: https://www.halfbakedharvest.com/teriyaki-chicken-sub-sandwiches/, already exists in dataset
Url: https://www.halfbakedharvest.com/tortilla-chip-crusted-chicken-salad-with-avocado-chipotle-lime-dressing-and-queso-fresco/, already exists in dataset
Url: https://www.halfbakedharvest.com/margarita-chicken-quesadilla-with-margarita-guacamole/, already exists in dataset
Url: https://www.halfbakedharvest.com/crunchy-oatmeal-peanut-butter-oats-n-honey-bars/, already exists in dataset
Url: https://www.halfbakedharvest.com/brown-butter-crepe-ice-cream-tacos/, already exists in dataset
Url: https://www.halfbakedharvest.com/whipped-feta-and-roasted-jalapeno-greek-pizza/, already exists in dataset
Url: https://www.halfbakedharvest.com/jerk-chicken-with-fired-rice-and-grilled-pineapple-salsa/, already exists in dataset
Url: https://www.halfbakedharvest.com/chocolate-peanut-butter-swirl-fu

In [7]:
# Write the dataset back to disk, leaving the row index out of the file.
df.to_csv(path_or_buf="output.csv", index=False)

In [8]:
# Take the ingredients column and inspect the entry labelled 0.
ing_df = df.loc[:, "ingredients"]
ing_df.loc[0]

"['1 1/8 cups milk (warm)', '1/4 cup sugar', '2 1/4 teaspoons instant or active dry yeast (one package)', '2  whole large eggs (beaten)', '1 1/4  stick unsalted butter (melted)', '4 cups all-purpose flour', '1/4 teaspoon salt', '1 1/2 cups whole milk', '1/2 cup granulated sugar', '1/4 cup cake flour', '1/2 teaspoon salt', '4  large egg yolks', '1 teaspoon vanilla extract', '1/2  lime (juice)', '1 tablespoon tequila (optional)', '2 1/2 tablespoons strawberry fruit spread or jam', '1/2 cup cold heavy cream', '1 cup stick butter (softened, 1/2)', '4  strawberries', '1 tablespoon strawberry fruit spread or jam', '1-2 tablespoons milk', '1-2 tablespoons tequila', '1 teaspoon vanilla extract', '1/2  lime (juiced)', '2 1/2 cups powdered sugar', '1 cup powdered suger', '1  lime (juiced)', '1 tablespoon tequila (optional)', 'salt for sprinkling']"

In [9]:
# Fetch a single recipe page and collect the <span> tags that carry the
# bare ingredient names (class "wprm-recipe-ingredient-name").
resp = requests.get(
    "https://www.halfbakedharvest.com/whole-wheat-graham-cracker-banana-bread/",
    headers=header,
    timeout=30,  # do not hang forever on a stalled connection
)
resp.raise_for_status()  # stop on 4xx/5xx rather than parsing an error page

soup = BeautifulSoup(resp.text, "html.parser")
ingredients = soup.find_all("span", class_="wprm-recipe-ingredient-name")

In [10]:
# Reduce every matched tag to its whitespace-trimmed text.
ingredient_list = [
    name_tag.get_text(strip=True)
    for name_tag in ingredients
]
print(ingredient_list)

['[graham cracker crumbs|https://www.halfbakedharvest.com/honey-bunny-grahams/]', 'whole wheat pastry flour or white whole wheat flour', 'baking soda', 'salt', 'cinnamon', 'eggs', 'loosely packed brown sugar', 'canned coconut milk', 'canola oil', 'very ripe bananas', 'vanilla extract']
