In [109]:
import pandas as pd 
import requests
import json
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import urllib.request
import time


## AllRecipes.com
* Easily scrapeable
* Has good images
* Has nutrition

In [6]:
recipe_page = "https://www.allrecipes.com/saucy-vegetable-pinwheels-recipe-7568002"

In [7]:
food_pages_soup = BeautifulSoup(requests.get(recipe_page).text, 'html.parser')

### Details

In [138]:
food_pages_soup.find("div",attrs={"id":"recipe-details_1-0"}).findAll("div",attrs={"class":"mntl-recipe-details__item"})

[<div class="mntl-recipe-details__item">
 <div class="mntl-recipe-details__label">Prep Time:</div>
 <div class="mntl-recipe-details__value">10 mins</div>
 </div>,
 <div class="mntl-recipe-details__item">
 <div class="mntl-recipe-details__label">Cook Time:</div>
 <div class="mntl-recipe-details__value">1 mins</div>
 </div>,
 <div class="mntl-recipe-details__item">
 <div class="mntl-recipe-details__label">Total Time:</div>
 <div class="mntl-recipe-details__value">11 mins</div>
 </div>,
 <div class="mntl-recipe-details__item">
 <div class="mntl-recipe-details__label">Servings:</div>
 <div class="mntl-recipe-details__value">2 </div>
 </div>,
 <div class="mntl-recipe-details__item">
 <div class="mntl-recipe-details__label">Yield:</div>
 <div class="mntl-recipe-details__value">2 wraps</div>
 </div>]

In [144]:
def get_details(page):
    return {itm.find("div",attrs={"class":"mntl-recipe-details__label"}).text : itm.find("div",attrs={"class":"mntl-recipe-details__value"}).text \
        for itm in page.findAll("div",attrs={"class":"mntl-recipe-details__item"}) }

### Ingredients

In [145]:
ingredient_itms = food_pages_soup.find("ul",attrs={"class":"mntl-structured-ingredients__list"})\
    .findAll("li",attrs={"class":"mntl-structured-ingredients__list-item"})

In [146]:
def parse_ingred(ingredient_itm):
    ingred_obj = {}
    qty = ingredient_itm.find("span",attrs={"data-ingredient-quantity":"true"})
    ingred_obj["amt"] = qty.text if qty else 1 
    unit = ingredient_itm.find("span",attrs={"data-ingredient-unit":"true"})
    ingred_obj["unit"] =unit.text if unit else "item"
    ingred_obj["ingred"] = ingredient_itm.find("span",attrs={"data-ingredient-name":"true"}).text
    return ingred_obj

### Instructions

In [22]:
instructions = [p.text.replace("\n"," ").strip() for p in food_pages_soup.find("div",attrs={"id":"recipe__steps-content_1-0"}).findAll("p")]

In [150]:
def scrape_all_recipes_page(url):
    page = requests.get(url)

    food_pages_soup = BeautifulSoup(page.text, 'html.parser')
    time.sleep(0.5)
    img_link = food_pages_soup.find("img",attrs={"sizes":"750px"})["src"]
    steps = [p.text.replace("\n"," ").strip() for p in food_pages_soup.find("div",attrs={"id":"recipe__steps-content_1-0"}).findAll("p")]
    ingredient = [parse_ingred(obj) for obj in food_pages_soup.find("ul",attrs={"class":"mntl-structured-ingredients__list"})\
        .findAll("li",attrs={"class":"mntl-structured-ingredients__list-item"})]
    name = food_pages_soup.find("h1",attrs={"id":"article-heading_1-0"}).text.replace("\n","").strip()
    details = get_details(food_pages_soup)
    ret_obj = {"name":name,
        "steps":steps,
        "ingredients":ingredient,
        "img":img_link}
    ret_obj.update(details)
    return ret_obj

In [151]:
scrape_all_recipes_page("https://www.allrecipes.com/saucy-vegetable-pinwheels-recipe-7568002")

{'name': 'Saucy Vegetable Pinwheels',
 'steps': ['For sauce, process roasted red peppers, yogurt, orange zest (or other CITRUS ZEST), orange juice (or other CITRUS JUICE), chopped parsley (or other FRESH HERB), salt, and black pepper in a food processor until smooth.',
  'Spread tortillas evenly with sauce and top with arugula and carrots (or other 2 VEGETABLES). Roll up and wrap tightly in plastic wrap. Chill at least 1 hour or up to 12 hours. To serve, unwrap tortillas and trim ends. Cut rolls into slices.',
  '8 cups arugula and/or spring mix',
  '4 cups shredded red cabbage',
  '2 red bell peppers, julienned (2 cups)',
  '2 zucchini, sliced (4 cups)',
  '2 large carrots, julienned (2 cups)',
  '4 ounces radishes, sliced (1 cup)',
  'LIME-CABBAGE',
  'lime zest + lime juice + cilantro + red cabbage + red bell pepper',
  'LEMON-ZUCCHINI',
  'lemon zest + lemon juice + chives + zucchini +radishes'],
 'ingredients': [{'amt': '1', 'unit': 'cup', 'ingred': 'roasted red peppers'},
  {'amt

### Crawl Pages

In [152]:
root_url = "https://www.allrecipes.com/recipes/"

In [153]:
recipe_objs = []
nav_links = ["https://www.allrecipes.com/" ]
recipe_links = []
scraped = set()
while (nav_links or recipe_links) and len(recipe_objs) <1000:
    if recipe_links:
        link = recipe_links.pop()
    else:
        link = nav_links.pop()
    print(f"Parsing {link}")
    scraped.add(link)
    page = BeautifulSoup(requests.get(link).text, 'html.parser')
    if page.find("h1",attrs={"id":"article-heading_1-0"}) and page.find("img",attrs={"sizes":"750px"}):
        print("adding recipe")
        try:
            recipe_objs.append(scrape_all_recipes_page(link))
        except AttributeError:
            print(f"Error scraping {link}. Skipping")
    
    else:
        possible_nav_links = [itm["href"] for itm in root_soup.findAll("a") if "https://www.allrecipes.com/recipes/" in itm["href"]]
        possible_recipe_links = [itm["href"] for itm in root_soup.findAll("a") \
                if "https://www.allrecipes.com/" in itm["href"] and "https://www.allrecipes.com/recipes/" not in itm["href"]\
                    and "authentication" not in itm["href"]]
        for recipe_link in possible_recipe_links:
            if recipe_link not in scraped:
                recipe_links.append(recipe_link)

        for new_link in possible_nav_links:
            if new_link not in scraped:
                nav_links.append(new_link)


Parsing https://www.allrecipes.com/
Parsing https://www.allrecipes.com/food-news-trends/
Parsing https://www.allrecipes.com/kitchen-tips/
Parsing https://www.allrecipes.com/grilled-ribeye-steaks-recipe-7552430
adding recipe
Parsing https://www.allrecipes.com/skillet-chicken-breasts-with-cherry-tomatoes-recipe-7553999
adding recipe
Parsing https://www.allrecipes.com/dill-ranch-chicken-wraps-recipe-7552440
adding recipe
Parsing https://www.allrecipes.com/air-fryer-zucchini-fries-recipe-7554904
adding recipe
Parsing https://www.allrecipes.com/simple-homemade-dreamsicles-recipe-7550349
Parsing https://www.allrecipes.com/copycat-wendy-s-ranch-sauce-recipe-7556719
adding recipe
Parsing https://www.allrecipes.com/copycat-chick-fil-a-honey-mustard-recipe-7556788
adding recipe
Parsing https://www.allrecipes.com/chipotle-citrus-marinated-chicken-tacos-recipe-7553405
adding recipe
Parsing https://www.allrecipes.com/pitmaster-potato-salad-recipe-7556905
adding recipe
Parsing https://www.allrecipes

KeyboardInterrupt: 

In [159]:
with open("/mnt/c/Users/james/OneDrive/Desktop/all_recipes.json","w") as f:
    json.dump({"recipes":recipe_objs},f)

In [160]:
recipe_objs[0].keys()

dict_keys(['name', 'steps', 'ingredients', 'img', 'Prep Time:', 'Cook Time:', 'Marinate Time:', 'Stand Time:', 'Rest Time:', 'Total Time:', 'Servings:'])

In [126]:
recipe_objs

[{'name': 'These Grilled Ribeye Steaks Are Juicy and Flavorful',
  'steps': ['Combine lime juice, carne asada seasoning, and oil in a resealable plastic bag. Add ribeye steak, reseal the bag, and marinate in the refrigerator for 30 to 45 minutes.',
   'Drain marinade and allow steak to come to room temperature, about 30 minutes. Blot any excess moisture off the steak.',
   'Preheat the grill to 500 degrees F (260 degrees C), and lightly oil the grate.',
   'Place steak on the grill over direct heat. For rare: cook steaks until browned on the outside and red in the center, about 3 minutes per side. An instant-read thermometer inserted into the center should read 125 degrees F (52 degrees C).',
   'Remove steak from grill; rest 5 minutes before serving.',
   'Additional internal temperatures for cooking steak: Rare: 125-130 degrees F (52-54 degrees C); Medium rare: 130-140 degrees F (54-60 degrees C); Medium:\xa0 140-150 degrees F (54-65 degrees C); Medium well: 150-160 degrees F (65-70 

### Reformat For Prisma Schema Format

In [189]:
def map_to_db(obj):
    new_obj = {}
    new_obj["title"] = obj["name"]
    new_obj["ingredients"] = json.dumps(obj["ingredients"])
    new_obj["instructions"] = json.dumps(obj["steps"])
    if "Prep Time:" in obj:
        new_obj["prep_time"] = obj["Prep Time:"]
    if "Cook Time:" in obj:
        new_obj["cook_time"] = obj["Cook Time:"]
    new_obj["img"] = obj["img"]

    filename = new_obj["title"]
    filename = filename.lower().replace(" ","-")
    img_loc = f"/home/james/hackathon-mvp/prisma/data/avatars/{filename}.png"
    new_obj["img_name"] = filename+".png"
    new_obj["img_loc"] = img_loc
    new_obj["url"] = obj["img"]

    return new_obj


In [190]:
db_objs = [map_to_db(obj) for obj in recipe_objs]

In [191]:
with open("../../../../hackathon-mvp/prisma/data/recipes_seed.json","w") as f:
    json.dump({"recipes":db_objs},f)

In [194]:
db_objs

[{'title': 'These Grilled Ribeye Steaks Are Juicy and Flavorful',
  'ingredients': '[{"amt": "1/2", "unit": "lime", "ingred": ", juiced"}, {"amt": "2", "unit": "teaspoons", "ingred": "seasoning"}, {"amt": "1", "unit": "tablespoon", "ingred": "olive oil"}, {"amt": "1 (8", "unit": "ounce)", "ingred": "ribeye steak"}]',
  'instructions': '["Combine lime juice, carne asada seasoning, and oil in a resealable plastic bag. Add ribeye steak, reseal the bag, and marinate in the refrigerator for 30 to 45 minutes.", "Drain marinade and allow steak to come to room temperature, about 30 minutes. Blot any excess moisture off the steak.", "Preheat the grill to 500 degrees F (260 degrees C), and lightly oil the grate.", "Place steak on the grill over direct heat. For rare: cook steaks until browned on the outside and red in the center, about 3 minutes per side. An instant-read thermometer inserted into the center should read 125 degrees F (52 degrees C).", "Remove steak from grill; rest 5 minutes befo

In [193]:
for obj in db_objs:
    if "prep_time" not in obj:
        print("not prep time")
    if "cook_time" not in obj:
        print("not cook time")

not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time
not cook time


In [176]:
db_objs[0]["title"

'These Grilled Ribeye Steaks Are Juicy and Flavorful'

In [177]:
recipe_objs[0]

{'name': 'These Grilled Ribeye Steaks Are Juicy and Flavorful',
 'steps': ['Combine lime juice, carne asada seasoning, and oil in a resealable plastic bag. Add ribeye steak, reseal the bag, and marinate in the refrigerator for 30 to 45 minutes.',
  'Drain marinade and allow steak to come to room temperature, about 30 minutes. Blot any excess moisture off the steak.',
  'Preheat the grill to 500 degrees F (260 degrees C), and lightly oil the grate.',
  'Place steak on the grill over direct heat. For rare: cook steaks until browned on the outside and red in the center, about 3 minutes per side. An instant-read thermometer inserted into the center should read 125 degrees F (52 degrees C).',
  'Remove steak from grill; rest 5 minutes before serving.',
  'Additional internal temperatures for cooking steak: Rare: 125-130 degrees F (52-54 degrees C); Medium rare: 130-140 degrees F (54-60 degrees C); Medium:\xa0 140-150 degrees F (54-65 degrees C); Medium well: 150-160 degrees F (65-70 degrees

In [178]:
db_objs[1]

{'title': 'Skillet Chicken Breasts with Cherry Tomatoes',
 'ingredients': '[{"amt": "1 1/2", "unit": "pounds", "ingred": "boneless, skinless chicken breasts"}, {"amt": "1/2", "unit": "teaspoon", "ingred": "smoked paprika"}, {"amt": 1, "unit": "item", "ingred": "salt and freshly ground black pepper to taste"}, {"amt": "2 ", "unit": "tablespoons", "ingred": "herb-infused olive oil"}, {"amt": "1", "unit": "tablespoon", "ingred": "unsalted butter"}, {"amt": "1/2", "unit": "item", "ingred": "red onion, thinly sliced"}, {"amt": "3", "unit": "cloves", "ingred": "garlic, minced"}, {"amt": "1/2", "unit": "cup", "ingred": "white wine"}, {"amt": "3", "unit": "cups", "ingred": "cherry tomatoes, halved"}, {"amt": "1/4", "unit": "teaspoon", "ingred": "red pepper flakes, or to taste"}, {"amt": "1/2", "unit": "cup", "ingred": "cream, or to taste"}, {"amt": "1", "unit": "tablespoon", "ingred": "fresh basil, plus more for garnish (optional)"}, {"amt": "1/4", "unit": "cup", "ingred": "grated Parmesan che

In [180]:
for obj in db_objs:
    filename = obj["title"]
    filename = filename.lower().replace(" ","-")
    urllib.request.urlretrieve(obj["img"], f"../../../../hackathon-mvp/prisma/data/avatars/{filename}.png")