In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np
import json

def create_dict_from_pickle(filename):
    """Load and return the object pickled at ``filename``.

    The file is read with ``pd.read_pickle``; whatever object was pickled is
    returned unchanged.

    NOTE(review): unpickling can execute arbitrary code embedded in the file,
    so this should only be pointed at dumps produced by a trusted crawler.
    """
    loaded_pages = pd.read_pickle(filename)
    return loaded_pages

def filter_old_html(pages_dict, pattern='\r\n\r\n\r\n<!DOCTYPE html>'):
    """Return a copy of ``pages_dict`` without the pages that start with ``pattern``.

    Args:
        pages_dict: mapping whose values are strings (page sources).
        pattern: prefix that marks a page to discard. The default is three
            CRLF line breaks followed by an HTML doctype declaration.

    Returns:
        A new dict holding, in the original order, every entry whose value
        does not begin with ``pattern``.
    """
    kept_pages = {}
    for url, html in pages_dict.items():
        if html.startswith(pattern):
            continue
        kept_pages[url] = html
    return kept_pages

def filter_non_recipes(pages_dict, pattern=".*www.allrecipes.com/recipe/.*"):
    """Keep only the pages whose URL key matches ``pattern``.

    Args:
        pages_dict: mapping keyed by page URL.
        pattern: regular expression source; it is compiled here and applied
            with ``match`` (i.e. anchored at the start of the URL).

    Returns:
        A new dict with the matching entries. The site home page
        ``https://www.allrecipes.com/`` is always excluded, whatever the
        pattern is.
    """
    url_matcher = re.compile(pattern)
    home_page = 'https://www.allrecipes.com/'
    recipe_pages = {}
    for url, html in pages_dict.items():
        if url_matcher.match(url) is None or url == home_page:
            continue
        recipe_pages[url] = html
    return recipe_pages
    
def make_soup(recipe_tuple):
    """Parse the HTML half of a ``(url, html)`` pair.

    Args:
        recipe_tuple: indexable pair; element 0 is passed through untouched,
            element 1 is parsed with BeautifulSoup's ``html.parser`` backend.

    Returns:
        A ``(url, soup)`` tuple.
    """
    url, html = recipe_tuple[0], recipe_tuple[1]
    parsed_page = BeautifulSoup(html, "html.parser")
    return (url, parsed_page)

def get_recipe_json(soup):
    """Decode the first ``application/ld+json`` script tag of a parsed page.

    Args:
        soup: parsed page exposing ``find``.

    Returns:
        The Python object produced by ``json.loads`` on the tag's text.

    Raises:
        AttributeError: if the page has no such script tag (``find`` returned
            ``None``).
    """
    ld_json_tag = soup.find('script', type="application/ld+json")
    return json.loads(ld_json_tag.text)

def fetch_title(recipe_json):
    """Return the ``name`` field of entry ``[1]`` of the ld+json payload."""
    recipe_entry = recipe_json[1]
    title = recipe_entry['name']
    return title

def fetch_ingredients(recipe_json):
    """Return the ``recipeIngredient`` field of entry ``[1]`` of the payload."""
    recipe_entry = recipe_json[1]
    ingredients = recipe_entry['recipeIngredient']
    return ingredients

def fetch_steps(recipe_json):
    """Return the ``text`` of every item under ``recipeInstructions``.

    Reads entry ``[1]`` of the ld+json payload and returns a new list with
    one string per instruction, in the payload's order.
    """
    step_texts = []
    for step in recipe_json[1]['recipeInstructions']:
        step_texts.append(step['text'])
    return step_texts

def fetch_description(recipe_json):
    """Return the ``description`` field of entry ``[1]`` of the payload."""
    recipe_entry = recipe_json[1]
    description = recipe_entry['description']
    return description

def fetch_prep_time(recipe_json):
    """Return the ``totalTime`` field of entry ``[1]`` of the payload.

    Note that, despite the function name, the key read is ``totalTime``.
    """
    recipe_entry = recipe_json[1]
    total_time = recipe_entry['totalTime']
    return total_time

def fetch_categories(recipe_json):
    """Return the ``recipeCategory`` field of entry ``[1]`` of the payload."""
    recipe_entry = recipe_json[1]
    categories = recipe_entry['recipeCategory']
    return categories
    
def fetch_nutrients(recipe_json, regex, nutrient_name):
    """Extract the part of one nutrition value that ``regex`` matches.

    Args:
        recipe_json: decoded ld+json payload; entry ``[1]`` is read and its
            optional ``nutrition`` mapping is consulted.
        regex: compiled pattern applied with ``search`` to the raw value.
        nutrient_name: key to look up inside the ``nutrition`` mapping
            (e.g. ``"calories"``).

    Returns:
        The matched substring, or ``None`` when the payload has no
        ``nutrition`` block, the nutrient is absent, or the value contains
        nothing that ``regex`` matches.
    """
    nutrition = recipe_json[1].get('nutrition')
    if nutrition is None:
        return None

    nutrient = nutrition.get(nutrient_name)
    if nutrient is None:
        return None

    # search() yields None when nothing matches; calling .group() on that
    # would abort the whole extraction run for one malformed value.
    match = regex.search(nutrient)
    return match.group() if match is not None else None

def fetch_rating_score(recipe_json):
    """Return ``aggregateRating.ratingValue`` from entry ``[1]`` of the payload.

    Returns:
        The stored rating value, or ``None`` when either the
        ``aggregateRating`` block or its ``ratingValue`` key is missing.
    """
    # The inner lookup already tolerated a missing key; treat a missing
    # aggregateRating block the same way instead of raising KeyError.
    aggregate_rating = recipe_json[1].get('aggregateRating')
    if aggregate_rating is None:
        return None
    return aggregate_rating.get('ratingValue')

def fetch_number_of_ratings(recipe_json):
    """Return ``aggregateRating.ratingCount`` from entry ``[1]`` of the payload.

    Returns:
        The stored rating count, or ``None`` when either the
        ``aggregateRating`` block or its ``ratingCount`` key is missing.
    """
    # The inner lookup already tolerated a missing key; treat a missing
    # aggregateRating block the same way instead of raising KeyError.
    aggregate_rating = recipe_json[1].get('aggregateRating')
    if aggregate_rating is None:
        return None
    return aggregate_rating.get('ratingCount')

def fetch_reviews_count(soup, regex):
    """Extract the review counter shown on the page.

    Args:
        soup: parsed page exposing ``find``.
        regex: compiled pattern applied with ``search`` to the text of the
            ``<a class="ugc-ratings-link ugc-reviews-link">`` element.

    Returns:
        The matched substring, or ``None`` when the link is absent or its
        text contains nothing that ``regex`` matches.
    """
    reviews_link = soup.find('a',  class_="ugc-ratings-link ugc-reviews-link")
    if reviews_link is None:
        return None

    # Guard the no-match case: search() returns None and .group() on it
    # would raise AttributeError.
    match = regex.search(reviews_link.text)
    return match.group() if match is not None else None

def fetch_reviews(recipe_json):
    """Return the ``reviewBody`` of every item under ``review``.

    Reads entry ``[1]`` of the ld+json payload and returns a new list with
    one review text per item, in the payload's order.
    """
    review_bodies = []
    for review in recipe_json[1]['review']:
        review_bodies.append(review['reviewBody'])
    return review_bodies

def fetch_photo_count(soup, regex):
    """Extract the photo counter shown on the page.

    Args:
        soup: parsed page exposing ``find``.
        regex: compiled pattern applied with ``search`` to the text of the
            ``<a class="ugc-ratings-link ugc-photos-link">`` element.

    Returns:
        The matched substring, or ``None`` when the link is absent or its
        text contains nothing that ``regex`` matches.
    """
    photos_link = soup.find('a',  class_="ugc-ratings-link ugc-photos-link")
    if photos_link is None:
        return None

    # Guard the no-match case: search() returns None and .group() on it
    # would raise AttributeError.
    match = regex.search(photos_link.text)
    return match.group() if match is not None else None

def mise_en_place(soup_tuple, regex=re.compile(r"\d+.\d*")):
    """Flatten one parsed recipe page into a dict with one key per column.

    Args:
        soup_tuple: ``(link, soup)`` pair; ``soup`` is the parsed page.
        regex: compiled pattern used to pull the numeric part out of the
            nutrition strings and out of the review/photo counters.

    Returns:
        A dict with the keys ``link``, ``recipe_name``, ``description``,
        ``ingredients_list``, ``categories_list``, the eleven nutrient
        columns, ``prep_time``, ``ratings_number``, ``rating_score``,
        ``reviews_number``, ``reviews``, ``photos_number`` and ``steps``,
        in that order.

    NOTE(review): in the default pattern the ``.`` is unescaped, so it
    matches any character, not just a decimal point. A value such as
    "285 calories" therefore yields "285 " (trailing space included), and
    a single digit with nothing after it does not match at all. The
    pattern is kept as it was because tightening it changes the extracted
    values; confirm with downstream consumers before doing so.
    """
    link, soup = soup_tuple[0], soup_tuple[1]

    recipe_json = get_recipe_json(soup)

    # Descriptive fields.
    recipe_name = fetch_title(recipe_json)
    ingredients_list = fetch_ingredients(recipe_json)
    categories_list = fetch_categories(recipe_json)
    description = fetch_description(recipe_json)
    steps = fetch_steps(recipe_json)

    # Nutritional info: (output column, key inside the ld+json ``nutrition``
    # block), listed in the order the columns appear in the result.
    nutrient_fields = (
        ("calories", "calories"),
        ("fat", "fatContent"),
        ("carbs", "carbohydrateContent"),
        ("protein", "proteinContent"),
        ("cholesterol", "cholesterolContent"),
        ("sodium", "sodiumContent"),
        ("saturated_fat", "saturatedFatContent"),
        ("sugar", "sugarContent"),
        ("fiber", "fiberContent"),
        ("trans_fat", "transFatContent"),
        ("unsaturated_fat", "unsaturatedFatContent"),
    )
    nutrients = {
        column: fetch_nutrients(recipe_json, regex, nutrient_name=json_key)
        for column, json_key in nutrient_fields
    }

    # Timing (read from the payload's ``totalTime`` by fetch_prep_time).
    prep_time = fetch_prep_time(recipe_json)

    # Ratings.
    number_of_ratings = fetch_number_of_ratings(recipe_json)
    rating = fetch_rating_score(recipe_json)

    # Reviews.
    num_reviews = fetch_reviews_count(soup, regex)
    reviews = fetch_reviews(recipe_json)

    # Photos.
    photos = fetch_photo_count(soup, regex)

    # Key order defines the DataFrame column order, so keep it stable.
    return {
        "link": link,
        "recipe_name": recipe_name,
        "description": description,
        "ingredients_list": ingredients_list,
        "categories_list": categories_list,
        **nutrients,
        "prep_time": prep_time,
        "ratings_number": number_of_ratings,
        "rating_score": rating,
        "reviews_number": num_reviews,
        "reviews": reviews,
        "photos_number": photos,
        "steps": steps,
    }


In [2]:
def create_recipe_df(filename):
    """Build the recipes DataFrame from one pickled crawl dump.

    Args:
        filename: identifier of the dump; the file that is read is
            ``datasets/pickles/recipes_<filename>.p``, resolved against the
            current working directory.

    Returns:
        A ``pd.DataFrame`` with one row per recipe page that survives both
        filters and one column per key produced by ``mise_en_place``.
    """
    print('Reading file..')
    # Use the argument rather than the notebook-global NAME, so the result
    # depends only on what the caller passed in.
    pages_dict = create_dict_from_pickle("datasets/pickles/recipes_{}.p".format(filename))
    print('Filtering old recipes pages')
    recipes_dict = filter_old_html(pages_dict)
    print('Filtering non recipes pages')
    recipes_dict = filter_non_recipes(recipes_dict)
    print('Creating soup dictionary')
    soups_dict = dict(map(make_soup, recipes_dict.items()))
    print('Extracting recipes info')
    list_recipes_dict = list(map(mise_en_place, soups_dict.items()))
    print('Creating dataframe')
    return pd.DataFrame(list_recipes_dict)

In [3]:
# Make /media/juan/DATA/ the working directory so that the relative
# "datasets/pickles/..." path opened by create_recipe_df resolves under it.
# NOTE(review): hard-coded absolute path; the notebook only runs on a machine
# with this mount.
import os
os.chdir('/media/juan/DATA/')

In [4]:
# Identifier of the crawl dump to process; it is interpolated into both the
# input pickle file name and the output CSV file name.
NAME = "old_part5_2020_12"

In [5]:
# Run the whole load -> filter -> parse -> extract pipeline for the selected dump.
df = create_recipe_df(NAME)

Reading file..
Filtering old recipes pages
Filtering non recipes pages
Creating soup dictionary
Extracting recipes info
Creating dataframe


In [6]:
# Quick check of the result size as (rows, columns).
df.shape

(661, 23)

In [7]:
# Switch the working directory again so that the relative
# "datasets/dataframes/..." path used when saving resolves under this folder.
# NOTE(review): hard-coded absolute path, machine specific.
os.chdir('/home/juan/ds_projects/recipes')

In [8]:
# Save the extracted frame as CSV, named after the dump; index=False leaves
# the row index out of the file.
df.to_csv("datasets/dataframes/recipe_df_{}_new.csv".format(NAME),index=False)