In [1]:
import requests
import re
import numbers
import pandas as pd
import numpy as np
import sqlite3
from bs4 import BeautifulSoup
from p2func import *

In [33]:
# Notebook configuration: the recipes index page handed to the scraper and
# the SQLite file that the scraping functions connect to.
url = 'https://www.homechef.com/recipes'
sqlitefile = "t_homechef.sqlite"

# Web Scraping

### Batch-add meals by providing the URL of the recipes page that lists the categories.

In [35]:
# Scrape every category linked from `url` and write the meals into `sqlitefile`.
# NOTE(review): scrape_homechef is defined in a later cell of this notebook, so
# on a fresh kernel that cell has to run first (unless the `p2func` star-import
# also provides it -- confirm).
scrape_homechef(sqlitefile, url)

Main recipe_page: 200

Category: 200 vegetarian

Category: 200 without-nuts

Category: 200 poultry

Category: 200 without-soy

Category: 200 without-wheat

Category: 200 customer-favorites

Category: 200 seafood

Category: 200 pork

Category: 200 calorie-conscous-and-carb-conscious

Category: 200 staff-picks

Category: 200 without-milk

Category: 200 breakfast
138 meals added to database


### Add a single meal by entering its page URL only:

In [36]:
# Inputs for a one-off insert: the category label and rating to store with
# the meal, and the URL of the meal page to scrape.
ctgry = "beef"
m_rating = 4
page_url = "https://www.homechef.com/meals/spooky-ground-beef-cottage-pie"

add_single_meal(sqlitefile, page_url, ctgry, m_rating)

Meal ID 4287 Added!


# WEB SCRAPING FUNCTIONS
    - get_meal_overview
    - get_single_meal_id
    - get_meal_overview_and_ingredients
    - scrape_homechef
    - add_single_meal

In [34]:
def get_meal_overview(meal_ov, idx, w0, w1):
    """Find a labelled value in the list of meal-overview spans.

    Scans ``meal_ov`` from position ``idx`` for a span whose first two
    lower-cased, whitespace-separated words are ``w0`` and ``w1`` (for example
    ``'cook'`` and ``'time:'``) and reads the value from the span that
    immediately follows it.

    Parameters
    ----------
    meal_ov : sequence
        Objects exposing ``get_text()``.
    idx : int
        Position at which the scan starts.
    w0, w1 : str
        Lower-case first and second word of the label to look for.

    Returns
    -------
    tuple
        ``(value, value_idx)``: the first lower-cased word of the span after
        the label, and that span's position. If the label is not found, or it
        has no usable value after it, ``(0, idx)`` is returned with the
        *original* ``idx`` so that callers can always unpack two values and
        keep searching for the next label from the same position.
    """
    start = idx
    # A label sitting in the very last span has no value span after it, so the
    # scan stops one position early instead of indexing past the end.
    last = len(meal_ov) - 1
    while idx < last:
        words = meal_ov[idx].get_text().lower().split()
        # Slicing tolerates empty and one-word spans, which plain indexing
        # (words[1]) would not.
        if words[:2] == [w0, w1]:
            value = meal_ov[idx + 1].get_text().lower().split()
            if value:
                return value[0], idx + 1
            break  # label present but its value span is blank
        idx += 1
    return 0, start

def get_single_meal_id(meal):
    """Extract the numeric meal id from a meal page's share link.

    Looks at the ``href`` of the first element matching
    ``a.button.link--inverse.list--share__link`` and returns the first run of
    digits that is enclosed in slashes (``/<digits>/``).

    Parameters
    ----------
    meal : object
        Parsed meal page exposing ``select(css_selector)``.

    Returns
    -------
    str or float
        The id as a string of digits. If there is no share link, the link has
        no ``href``, or the ``href`` contains no ``/<digits>/`` segment, prints
        ``'Meal_id not found'`` and returns ``np.nan``.
    """
    share_links = meal.select('a.button.link--inverse.list--share__link')
    # Guard both lookups so a page without the share link reaches the
    # "not found" fallback instead of raising IndexError/TypeError.
    href = share_links[0].get('href') if share_links else None
    match = re.search(r"/(\d+)/", href) if href else None
    if match:
        return match.group(1)
    print('Meal_id not found')
    return np.nan

def get_meal_overview_and_ingredients(meal, mId):
    """Pull the overview fields and the ingredient list out of a meal page.

    ``meal`` is the parsed meal page (anything exposing ``select``); ``mId``
    is accepted but not used in this function.

    Returns ``[0, 0]`` when the page has no overview spans. Otherwise returns
    ``(ingredients, [time_spent, expire, exp_level, spicy])`` where
    ``ingredients`` is a list of normalised ingredient strings.
    """
    overview_spans = meal.select('div.meal__overviewItem span')
    # Some recipes are simple enough to carry no overview info at all
    # (e.g. the spring fruit basket), so there is nothing to extract.
    if overview_spans == []:
        return [0, 0]

    # Labels are looked up one after another; each search resumes from the
    # position at which the previous value was found.
    labels = (
        ('cook', 'time:'),
        ('cook', 'within:'),
        ('difficulty', 'level:'),
        ('spice', 'level:'),
    )
    overview = []
    position = 0
    for first_word, second_word in labels:
        value, position = get_meal_overview(overview_spans, position, first_word, second_word)
        overview.append(value)

    # Example of the words in one list item:
    #   ['info', '3¾', 'oz.', "font'ina", 'cheese', 'slices']
    # The text is lower-cased, "info" is removed, apostrophes become
    # underscores, and whitespace is collapsed to single spaces.
    ingredient_items = meal.select('ul.list--unstyled.group.position--relative.text--center--bpDown2 li')
    ingredients = [
        ' '.join(item.get_text().lower().replace("info", "").replace("'", "_").split())
        for item in ingredient_items
    ]
    return ingredients, overview

def scrape_homechef(sqlitefile, url):
    """Scrape every recipe category linked from ``url`` into a SQLite database.

    Downloads the recipes index page, follows each category link
    (``a.size--xs``), and for every meal card on a category page downloads the
    meal page, extracts its overview and ingredients with
    ``get_meal_overview_and_ingredients`` and inserts one row into
    ``meal_info`` plus one row per ingredient into ``meal_ingredients``.

    Parameters
    ----------
    sqlitefile : str
        Path of the SQLite database. The two tables must already exist; this
        function does not create them.
    url : str
        URL of the recipes index page, without a trailing slash. Its last
        eight characters (``/recipes``) are stripped to build meal-page URLs.

    Returns
    -------
    None
        Progress and the number of inserted meals are printed. Changes are
        committed only when the whole scrape finishes; the connection is
        closed in every case.
    """
    # SQL VARIABLES FOR MEAL_INFO TABLE
    table_name     = 'meal_info'
    id_field       = 'meal_id'
    title_col      = 'title'
    rating_col     = 'rating'
    duration_col   = 'duration'
    expiration_col = 'expiration'
    expertise_col  = 'exp_level'
    spicy_col      = 'spiciness'
    cat_col        = 'category'

    # SQL VARIABLES FOR INGREDIENTS TABLE
    table_name_2   = 'meal_ingredients'
    id_column      = 'meal_id'
    ing_name       = 'ingredients'

    # Identifiers cannot be bound as parameters, so only these fixed table and
    # column names are formatted into the SQL text. Every scraped value is
    # bound through a "?" placeholder, which keeps quotes in scraped text from
    # breaking (or altering) the statement.
    insert_info = (
        "INSERT INTO {tn} ({idf}, {tc}, {rc}, {dc}, {ec}, {ex}, {sc}, {ct}) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?)"
    ).format(tn=table_name, idf=id_field, tc=title_col, rc=rating_col,
             dc=duration_col, ec=expiration_col, ex=expertise_col,
             sc=spicy_col, ct=cat_col)
    insert_ingredient = "INSERT OR IGNORE INTO {tn} ({idf}, {tc}) VALUES (?, ?)".format(
        tn=table_name_2, idf=id_column, tc=ing_name)

    recipe_page = requests.get(url)
    print('Main recipe_page:', recipe_page.status_code)
    recipes = BeautifulSoup(recipe_page.content, 'html.parser')

    # FIND THE LINKS CONTAINING THE CATEGORY ENDPOINTS AND BUILD THE LINK
    # The first nine characters of each href are dropped -- presumably the
    # '/recipes/' prefix (confirm against the live page). Keeping the name
    # next to its URL avoids recovering it later with a slice that only works
    # for one specific ``url`` length.
    prefix_len = len('/recipes/')
    categories = []
    for link in recipes.select("a.size--xs"):
        href = link.get('href')
        if not href:
            continue  # anchor without a target: nothing to follow
        cat_name = href[prefix_len:]
        categories.append((cat_name, url + '/' + cat_name))

    # Meal hrefs are appended to the url with its trailing '/recipes' removed.
    site_root = url[:-len('/recipes')]

    # OPEN CONNECTION TO DATABASE TO START INSERTING VALUES
    conn = sqlite3.connect(sqlitefile)
    try:
        c = conn.cursor()
        total_meals = 0

        for cat_name, cat_link in categories:
            cat_page = requests.get(cat_link)
            print('\nCategory:', cat_page.status_code, cat_name)

            # GET THE MEALS ID'S, TITLES AND ENDPOINTS
            soup            = BeautifulSoup(cat_page.content, 'html.parser')
            meal_id_list    = soup.select("article.card")
            meal_url_list   = soup.select("article.card a")
            meal_title_list = soup.select("div.card__body h2")
            meal_co_list    = soup.select("div.card__body p")

            # Quotes/commas are still stripped from titles so that newly
            # scraped rows look like the ones stored by earlier runs.
            meal_list = [
                (meal_id_list[m].get("id"),
                 meal_title_list[m].get_text().replace("'", "_").replace(",", "").replace('"', ''),
                 meal_co_list[m].get_text().replace("'", "_").replace(",", "").replace("\t", ""),
                 meal_url_list[m].get("href"))
                for m in range(len(meal_title_list))
            ]

            # START SCRAPING MEALS FOR THIS CATEGORY
            # Iterate the assembled records rather than the raw anchor list:
            # the two can differ in length, and indexing one by the other's
            # range raises IndexError.
            for indx, (card_id, title, subtitle, href) in enumerate(meal_list):
                meal_page = requests.get(site_root + href)

                # LOADED PAGE CHECK -- a failed load abandons the rest of
                # this category and moves on to the next one.
                if meal_page.status_code != 200:
                    print('\n Error loading ', indx, meal_page.status_code, site_root + href)
                    break

                meal = BeautifulSoup(meal_page.content, 'html.parser')
                meal_id = card_id[4:]  # drop the 4-character prefix of the card's id attribute

                # ingredients = list of ingredient strings
                # overview    = [time_spent, expire, exp_level, spicy]
                ingredients, overview = get_meal_overview_and_ingredients(meal, meal_id)

                # DATA CHECK
                if not (ingredients or overview):
                    continue

                # Values are passed as text, exactly as the previous
                # string-formatted query stored them (the rating included,
                # which was -- and stays -- the literal text 'None').
                row = (meal_id, title + ' ' + subtitle, str(None),
                       str(overview[0]), str(overview[1]),
                       str(overview[2]), str(overview[3]), cat_name)
                try:
                    c.execute(insert_info, row)
                except sqlite3.IntegrityError:
                    # Presumably the meal is already stored (the same meal can
                    # appear under several categories); skip it quietly.
                    continue
                except sqlite3.Error as db_err:
                    # Anything else (missing table, locked file, ...) is a
                    # real problem: report it instead of hiding it.
                    print('Meal ID', meal_id, 'was not added\n', db_err)
                    continue

                total_meals += 1

                # INSERT INGREDIENTS RECORD IN MEAL_INGREDIENTS TABLE
                c.executemany(insert_ingredient,
                              [(meal_id, item) for item in ingredients])

        print(total_meals, 'meals added to database')
        conn.commit()
    finally:
        # Release the database file even when a request or a parse step fails.
        conn.close()

def add_single_meal(sqlitefile, page_url, ctgry, m_rating=0):
    """Scrape one meal page and insert it into the SQLite database.

    Downloads ``page_url``, reads the meal id, title, subtitle, overview and
    ingredients from it, then inserts one row into ``meal_info`` and one row
    per ingredient into ``meal_ingredients``.

    Parameters
    ----------
    sqlitefile : str
        Path of the SQLite database; the two tables must already exist.
    page_url : str
        Full URL of the meal page.
    ctgry : str
        Category label stored with the meal.
    m_rating : int, optional
        Rating stored with the meal (default 0).

    Returns
    -------
    None
        The outcome is printed. If the page cannot be loaded, or the id, title
        or ingredient data cannot be found, a message is printed and nothing
        is written to the database.
    """
    # Table and column names, matching the ones scrape_homechef writes to.
    # They are defined here because nothing visible in this notebook defines
    # them at module level (they are locals of scrape_homechef).
    table_name     = 'meal_info'
    id_field       = 'meal_id'
    title_col      = 'title'
    rating_col     = 'rating'
    duration_col   = 'duration'
    expiration_col = 'expiration'
    expertise_col  = 'exp_level'
    spicy_col      = 'spiciness'
    cat_col        = 'category'
    table_name_2   = 'meal_ingredients'
    id_column      = 'meal_id'
    ing_name       = 'ingredients'

    meal_page = requests.get(page_url)

    # LOADED PAGE CHECK
    if meal_page.status_code != 200:
        print('\n Error loading ', meal_page.status_code, page_url)
        return

    meal = BeautifulSoup(meal_page.content, 'html.parser')

    # GET SINGLE MEAL ID
    m_id = get_single_meal_id(meal)
    if not isinstance(m_id, str):
        # get_single_meal_id returns NaN when it finds no id; storing that
        # would create a row keyed by the text 'nan'.
        print('Meal was not added: no meal id found on', page_url)
        return

    # GET MEAL TITLE
    title_tags = meal.select('header.meal__header.order--1.group h1')
    if not title_tags:
        print('Meal ID', m_id, 'was not added: no title found on', page_url)
        return
    m_title = title_tags[0].get_text()
    subtitle_tags = meal.select('header.meal__header.order--1.group h2')
    # The subtitle is optional; without one the title is stored on its own.
    full_title = m_title + ' ' + subtitle_tags[0].get_text() if subtitle_tags else m_title

    # ingredients = list of ingredient strings
    # overview    = [time_spent, expire, exp_level, spicy]
    ingredients, overview = get_meal_overview_and_ingredients(meal, m_id)

    # DATA CHECK
    if not (ingredients or overview):
        print('No ingredients info found for', m_title)
        return

    # Only the fixed identifiers are formatted into the SQL text; the scraped
    # values are bound with "?" so a title containing an apostrophe no longer
    # breaks the statement.
    insert_info = (
        "INSERT INTO {tn} ({idf}, {tc}, {rc}, {dc}, {ec}, {ex}, {sc}, {ct}) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?)"
    ).format(tn=table_name, idf=id_field, tc=title_col, rc=rating_col,
             dc=duration_col, ec=expiration_col, ex=expertise_col,
             sc=spicy_col, ct=cat_col)
    insert_ingredient = "INSERT OR IGNORE INTO {tn} ({idf}, {tc}) VALUES (?, ?)".format(
        tn=table_name_2, idf=id_column, tc=ing_name)

    # Values are passed as text, exactly as the previous string-formatted
    # query stored them.
    row = (m_id, full_title, str(m_rating),
           str(overview[0]), str(overview[1]),
           str(overview[2]), str(overview[3]), ctgry)

    # OPEN CONNECTION TO DATABASE TO START INSERTING VALUES
    conn = sqlite3.connect(sqlitefile)
    try:
        c = conn.cursor()
        try:
            c.execute(insert_info, row)
        except sqlite3.Error as oe:
            print('Meal ID', m_id, 'was not added\n', oe)
        else:
            print('Meal ID', m_id, 'Added!')
            # INSERT INGREDIENTS RECORD IN MEAL_INGREDIENTS TABLE
            c.executemany(insert_ingredient,
                          [(m_id, item) for item in ingredients])
        conn.commit()
    finally:
        # Release the database file even if an insert fails unexpectedly.
        conn.close()