# Recipe scraping
This script scrapes recipetineats.com to collect the ingredients of every recipe in a chosen cuisine category on the site. Each run writes one CSV file for that cuisine, listing every ingredient with its amount, unit and notes alongside the recipe name and servings, for use in association rule mining.

The follow-up to this script is the "recipe association rule mining" script, which uses the Apriori algorithm to find the most common groupings of ingredients in each cuisine.

# Import packages

In [None]:
# Import packages
import pandas as pd
import numpy as np
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup
# import mechanicalsoup
import lxml.html as html
import requests
import time
#browser = mechanicalsoup.Browser()
# Show full cell contents and every row when displaying dataframes.
# None is pandas' documented "no limit" value; negative integers are
# rejected by current pandas versions.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

# Scraping script

In [1]:
# Set url for home page of cuisine based on user input.
# Surrounding whitespace is stripped because a stray space would otherwise
# end up in both the request URL and the name of the exported CSV file.
cuisine = input("What cuisine do you want to explore? Choose from Chinese, Korean, Thai, Vietnamese, Indian, Italian, French, Greek, Mediterranean, South-american, Mexican, Middle-eastern. (Include hyphens).").strip()
url = "https://www.recipetineats.com/category/" + cuisine + "-recipes/"

# Download and parse the first listing page of the category.
# The context manager closes the HTTP response once the body has been read.
with urlopen(url) as page:
    html_1 = page.read().decode('utf-8')
soup = BeautifulSoup(html_1, 'html.parser')

# Create list of urls within cuisine category
# This is needed because not all recipes will be found on the home page.
# Starting from the already-parsed first page (`soup`), follow each page's
# <link rel="next"> until a page no longer advertises one, so the number of
# listing pages does not have to be guessed in advance.
all_pages = [url]
seen_pages = {url}  # guards against a "next" link that points back to an earlier page
while True:
    next_link = soup.find("link", rel = "next")
    if next_link is None or not next_link.get("href"):
        break

    url = next_link["href"]
    if url in seen_pages:
        break
    seen_pages.add(url)
    all_pages.append(url)

    # Fetch the next listing page; the response is closed once read.
    with urlopen(url) as page:
        html_1 = page.read().decode('utf-8')
    soup = BeautifulSoup(html_1, 'html.parser')

# Create list of recipe pages by visiting every listing page in 'all_pages'
recipe_pages = []
for url in all_pages:
    # The context manager closes each HTTP response once its body is read
    with urlopen(url) as page:
        html_1 = page.read().decode('utf-8')
    soup = BeautifulSoup(html_1, 'html.parser')

    # Collect the href of every <a rel="bookmark"> on the listing page
    recipe_pages.extend(
        anchor["href"] for anchor in soup.find_all("a", rel = "bookmark")
    )


# Build list of ingredients, amounts and associated servings
def _first_text(node, query):
    """Return the text of the first element matching *query* under *node*.

    Returns NaN when nothing matches, so missing ingredient parts show up as
    missing values in the resulting dataframe.
    """
    matches = node.xpath(query)
    return matches[0].text_content() if matches else np.nan


recipes_list = []
for n in recipe_pages:
    # A timeout stops one unresponsive page from hanging the whole scrape
    page = requests.get(n, timeout=30)
    pagehtml = html.fromstring(page.content)
    recipe_title = pagehtml.xpath("//title")[0].text_content().replace(" | RecipeTin Eats", "")

    # Servings are held within one of two html structures. Default to NaN for
    # every recipe so a page with neither structure (or a non-numeric value)
    # neither crashes nor silently reuses the previous recipe's servings.
    servings = np.nan
    try:
        servings = int(pagehtml.xpath("//*[@class='wprm-recipe-servings-with-unit']")[0][0].text_content())
    except (IndexError, ValueError):
        try:
            servings = int(pagehtml.xpath("//div[@class='wprm-entry-servings']")[0][0][1].text_content())
        except (IndexError, ValueError):
            pass

    # One row per child element of the recipe's ingredient container
    for i in pagehtml.xpath("//*[@class='wprm-recipe-ingredients']/*"):
        notes = _first_text(
            i,
            "*[@class='wprm-recipe-ingredient-notes wprm-recipe-ingredient-notes-faded']",
        )
        if isinstance(notes, str):
            # Drop the leading ", " separator characters from the note text
            notes = notes.lstrip(", ")

        recipes_list.append(
            {
                "meal": recipe_title,
                'servings': servings,
                "name": _first_text(i, "*[@class='wprm-recipe-ingredient-name']"),
                "amount": _first_text(i, "*[@class='wprm-recipe-ingredient-amount']"),
                "unit": _first_text(i, "*[@class='wprm-recipe-ingredient-unit']"),
                "notes": notes,
            }
        )

# Create dataframe from list.
# Declaring the columns here fixes their order and keeps the frame
# well-formed (all six columns present) even when nothing was scraped.
data = pd.DataFrame(
    recipes_list,
    columns=['meal', 'servings', 'name', 'amount', 'unit', 'notes'],
)

# Check number of meals captured (used when manually verifying all pages were scraped)
print(f'number of meals scraped: {data["meal"].nunique()}')

# Export to csv, one file per cuisine, without the dataframe index
data.to_csv(f"{cuisine}.csv", index = False)