In [19]:
import requests
from bs4 import BeautifulSoup
import json
import random
import time
import pandas as pd

# Entry point for the crawl: the site's sitemap index, which lists the individual sitemap files.
base ="https://www.halfbakedharvest.com/sitemap_index.xml"
# Headers passed with every request in this notebook; the User-Agent identifies the bot and gives a contact address.
header = {"User-Agent": "ingredient_bot/1.0 (+mailto:jakehowell@duck.com; for educational purposes)"}

In [20]:
def parseXML(url, timeout=30, skip_suffixes=("jpg",)):
    """Download a sitemap XML document and return the unique URLs it lists.

    Parameters
    ----------
    url : str
        Address of the sitemap (or sitemap index) to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would hang the notebook.
    skip_suffixes : tuple of str, optional
        ``<loc>`` values ending in any of these suffixes are dropped.
        The default, ``("jpg",)``, skips entries ending in "jpg".

    Returns
    -------
    list of str
        Stripped text of every ``<loc>`` element, in document order,
        without duplicates and without the skipped suffixes.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    requests.RequestException
        For connection problems, including the timeout expiring.
    """
    response = requests.get(url, headers=header, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page
    # and silently returning an empty list.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "xml")

    suffixes = tuple(skip_suffixes)
    site_maps = []
    seen = set()  # O(1) duplicate check; the list keeps document order
    for loc_tag in soup.find_all("loc"):
        loc = loc_tag.get_text(strip=True)
        if loc.endswith(suffixes) or loc in seen:
            continue
        seen.add(loc)
        site_maps.append(loc)
    return site_maps

In [21]:
# Fetch the sitemap index and show the sitemap files it points to.
site_map = parseXML(base)
print(site_map)


['https://www.halfbakedharvest.com/post-sitemap.xml', 'https://www.halfbakedharvest.com/post-sitemap2.xml', 'https://www.halfbakedharvest.com/post-sitemap3.xml', 'https://www.halfbakedharvest.com/post-sitemap4.xml', 'https://www.halfbakedharvest.com/post-sitemap5.xml', 'https://www.halfbakedharvest.com/page-sitemap.xml', 'https://www.halfbakedharvest.com/web-story-sitemap.xml', 'https://www.halfbakedharvest.com/category-sitemap.xml', 'https://www.halfbakedharvest.com/video-sitemap.xml']


In [22]:
# Pull every URL out of the second post sitemap. `links` is kept as a list of
# URL lists, so the page URLs for this sitemap live at links[0].
links = [parseXML(site_map[1])]
len(links[0])  # not the last expression in the cell, so nothing is displayed
first_ten_links = links[0][:10]
first_ten_links

['https://www.halfbakedharvest.com/strawberry-margarita-cream-filled-donuts/',
 'https://www.halfbakedharvest.com/teriyaki-chicken-sub-sandwiches/',
 'https://www.halfbakedharvest.com/tortilla-chip-crusted-chicken-salad-with-avocado-chipotle-lime-dressing-and-queso-fresco/',
 'https://www.halfbakedharvest.com/margarita-chicken-quesadilla-with-margarita-guacamole/',
 'https://www.halfbakedharvest.com/crunchy-oatmeal-peanut-butter-oats-n-honey-bars/',
 'https://www.halfbakedharvest.com/brown-butter-crepe-ice-cream-tacos/',
 'https://www.halfbakedharvest.com/whipped-feta-and-roasted-jalapeno-greek-pizza/',
 'https://www.halfbakedharvest.com/jerk-chicken-with-fired-rice-and-grilled-pineapple-salsa/',
 'https://www.halfbakedharvest.com/chocolate-peanut-butter-swirl-fudge-brownie-ice-cream/',
 'https://www.halfbakedharvest.com/cinnamon-crunch-bagels/']

In [31]:
# Load the recipes scraped by earlier runs so the loop below only fetches new URLs.
# On a first run output.csv does not exist yet; start from an empty frame that
# has the same columns the scraping loop fills in.
try:
    df = pd.read_csv("output.csv")
except FileNotFoundError:
    df = pd.DataFrame(
        columns=["recipe_name", "date_published", "url", "ingredients", "Keywords"]
    )
df.head()

Unnamed: 0,recipe_name,date_published,url,ingredients,Keywords


In [32]:
# Show the column layout that the scraping loop appends rows against.
df.columns

Index(['recipe_name', 'date_published', 'url', 'ingredients', 'Keywords'], dtype='object')

In [33]:
# Scrape up to MAX_NEW_RECIPES recipe pages that are not in `df` yet and append
# one row per page: headline, publish date, URL, ingredient names, keywords.
MAX_NEW_RECIPES = 10   # pages fetched per run of this cell
REQUEST_TIMEOUT = 30   # seconds before a stalled request is abandoned

s = requests.Session()
counter = 0

# Build the lookup once. `url in df['url']` would test the Series *index*
# (row labels), not the stored URLs, so membership is checked against a set
# of the values, which is extended as rows are appended.
known_urls = set(df["url"].values)

for url in links[0]:
    if url in known_urls:
        print(f"Url: {url}, already exists in dataset")
        continue

    resp = s.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    # Check the status before parsing, so an error page is never parsed
    # or written to the dataset.
    resp.raise_for_status()  # raise for 4xx/5xx
    soup = BeautifulSoup(resp.text, "html.parser")

    # Page metadata is read from the first JSON-LD <script> block.
    # NOTE(review): assumes the first "@graph" node carries headline,
    # keywords and datePublished - confirm for pages with a different layout.
    scripts = soup.find("script", type="application/ld+json")
    data = json.loads(scripts.string)
    info = data["@graph"][0]
    headline = info["headline"]
    keywords = info["keywords"]
    date_published = info["datePublished"]

    ingredients = soup.find_all("span", class_="wprm-recipe-ingredient-name")
    ingredient_list = [span.get_text(strip=True) for span in ingredients]

    # Appending row by row keeps already-scraped rows in `df` even if a
    # later page raises.
    df.loc[len(df)] = [headline, date_published, url, ingredient_list, keywords]
    known_urls.add(url)
    print(f"Success: {url} (status {resp.status_code})")

    counter += 1
    # Stop once exactly MAX_NEW_RECIPES pages were fetched, matching the
    # countdown printed below.
    if counter >= MAX_NEW_RECIPES:
        break
    print(f"{MAX_NEW_RECIPES - counter} more links.")

    # Random pause between requests to avoid hammering the site.
    delay = random.uniform(1, 5)
    print(f"Sleeping {delay:.2f}s before next request...")
    time.sleep(delay)


Success: https://www.halfbakedharvest.com/strawberry-margarita-cream-filled-donuts/ (status 200)
9 more links.
Sleeping 1.78s before next request...
Success: https://www.halfbakedharvest.com/teriyaki-chicken-sub-sandwiches/ (status 200)
8 more links.
Sleeping 3.41s before next request...
Success: https://www.halfbakedharvest.com/tortilla-chip-crusted-chicken-salad-with-avocado-chipotle-lime-dressing-and-queso-fresco/ (status 200)
7 more links.
Sleeping 2.83s before next request...
Success: https://www.halfbakedharvest.com/margarita-chicken-quesadilla-with-margarita-guacamole/ (status 200)
6 more links.
Sleeping 2.01s before next request...
Success: https://www.halfbakedharvest.com/crunchy-oatmeal-peanut-butter-oats-n-honey-bars/ (status 200)
5 more links.
Sleeping 2.39s before next request...
Success: https://www.halfbakedharvest.com/brown-butter-crepe-ice-cream-tacos/ (status 200)
4 more links.
Sleeping 3.05s before next request...
Success: https://www.halfbakedharvest.com/whipped-fet

In [34]:
# Persist the accumulated rows; index=False keeps the row index out of the file.
# NOTE(review): list-valued cells (ingredients, Keywords) are presumably written
# in their string form and will load back as strings - confirm before relying on them.
df.to_csv("output.csv", index= False)

In [36]:
# Preview the first rows of the updated dataset.
df.head()

Unnamed: 0,recipe_name,date_published,url,ingredients,Keywords
0,Strawberry Margarita Cream Filled Donuts,2013-05-03T10:00:51+00:00,https://www.halfbakedharvest.com/strawberry-ma...,"[milk, sugar, instant or active dry yeast, who...","[cinco de mayo, donut, doughnut, margarita, sp..."
1,Teriyaki Chicken Sub Sandwiches,2013-05-02T10:00:06+00:00,https://www.halfbakedharvest.com/teriyaki-chic...,"[boneless skinless chicken, red bell pepper, a...","[asian, Chicken, Chinese, dinner teriyaki, lun..."
2,Tortilla Chip Crusted Chicken Salad with Avoca...,2013-05-01T10:00:16+00:00,https://www.halfbakedharvest.com/tortilla-chip...,"[boneless, salt, pepper, egg whites, finely cr...","[avocado, Chicken, chipotle, cinco de mayo, Ea..."
3,Margarita Chicken Quesadilla with Margarita Gu...,2013-04-29T10:00:14+00:00,https://www.halfbakedharvest.com/margarita-chi...,"[boneless skinless chicken breast or tenders, ...","[avocado, Chicken, cinco de mayo, dinner, guac..."
4,Crunchy Oatmeal Peanut Butter Oats &#8216;n Ho...,2013-04-28T10:00:28+00:00,https://www.halfbakedharvest.com/crunchy-oatme...,"[rolled oats, rice crispies cereal, roasted pe...","[bars, crunchy, Healthy, nature valley, oatmea..."
