In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np
import json

def create_dataframe_from_pickle(filename):
    """Load a pickled mapping and return it as a two-column DataFrame.

    Every ``(key, value)`` pair of the unpickled object becomes one row:
    the key goes into the ``link`` column and the value into ``html``.

    NOTE(review): unpickling can execute arbitrary code, so only point this
    at pickle files that come from a trusted source.
    """
    pages = pd.read_pickle(filename)
    rows = [(link, html) for link, html in pages.items()]
    return pd.DataFrame(rows, columns=['link', 'html'])

def filter_old_html(df, pattern='<!doctype html>'):
    """Keep only the rows whose ``html`` value begins with ``pattern``."""
    starts_with_pattern = df['html'].str.startswith(pattern)
    return df[starts_with_pattern]

def filter_non_recipes(df, pattern=r"www\.allrecipes\.com/(?:recipes|article|gallery)"):
    """Drop rows whose ``link`` is the site home page or a non-recipe page.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``link`` column.
    pattern : str
        Regular expression; any link containing a match is removed. The
        default matches the ``/recipes``, ``/article`` and ``/gallery``
        sections directly under the site root.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that survive both filters.
    """
    # The alternation is wrapped in a non-capturing group so that the words
    # only match as the first path segment after the host. Written as
    # "host/(recipes)|(article)|(gallery)" the pattern would also drop any
    # URL that merely contains "article" or "gallery" (e.g. in a recipe slug),
    # and the capture groups make pandas emit a match-group warning.
    is_home_page = df.link == 'https://www.allrecipes.com'
    is_non_recipe = df.link.str.contains(pattern, regex=True)
    return df[~is_home_page & ~is_non_recipe]
    
def make_soup(response):
    """Parse ``response`` into a BeautifulSoup tree using ``html.parser``."""
    parser_name = "html.parser"
    return BeautifulSoup(response, parser_name)

def get_recipe_json(soup):
    """Decode the first ``application/ld+json`` script tag of ``soup`` as JSON."""
    ld_json_tag = soup.find('script', type="application/ld+json")
    raw_payload = ld_json_tag.text
    return json.loads(raw_payload)

def fetch_title(recipe_json):
    """Return the ``name`` field of the entry at index 1 of ``recipe_json``."""
    recipe = recipe_json[1]
    return recipe['name']

def fetch_ingredients(recipe_json):
    """Return the ``recipeIngredient`` field of the entry at index 1."""
    recipe = recipe_json[1]
    return recipe['recipeIngredient']

def fetch_steps(recipe_json):
    """Return the ``text`` of every ``recipeInstructions`` entry, in order."""
    # Instructions list
    instructions = recipe_json[1]['recipeInstructions']
    step_texts = []
    for instruction in instructions:
        step_texts.append(instruction['text'])
    return step_texts

def fetch_description(recipe_json):
    """Return the ``description`` field of the entry at index 1."""
    recipe = recipe_json[1]
    return recipe['description']

def fetch_prep_time(recipe_json):
    """Return the ``totalTime`` field of the entry at index 1, unmodified."""
    recipe = recipe_json[1]
    return recipe['totalTime']

def fetch_categories(recipe_json):
    """Return the ``recipeCategory`` field of the entry at index 1."""
    recipe = recipe_json[1]
    return recipe['recipeCategory']
    
def fetch_nutrients(recipe_json, regex, nutrient_name):
    """Extract one nutrient value from the recipe's ``nutrition`` section.

    Parameters
    ----------
    recipe_json : sequence
        Decoded JSON; the entry at index 1 is read.
    regex : compiled regular expression
        Applied with ``search`` to the nutrient's text.
    nutrient_name : str
        Key inside the ``nutrition`` mapping, e.g. ``"calories"``.

    Returns
    -------
    str or None
        The text matched by ``regex``, or ``None`` when the nutrition
        section is absent, the nutrient is absent/``None``, or the text
        contains no match.
    """
    nutrition = recipe_json[1].get('nutrition')
    if nutrition is None:
        return None

    nutrient = nutrition.get(nutrient_name)
    if nutrient is None:
        return None

    # search() returns None on no match; guard it instead of letting
    # .group() raise AttributeError for values without a number.
    match = regex.search(nutrient)
    return match.group() if match is not None else None

def fetch_rating_score(recipe_json):
    """Return ``aggregateRating.ratingValue`` of the entry at index 1.

    Returns ``None`` when the recipe has no ``aggregateRating`` section (or
    it is ``None``) or when the section lacks ``ratingValue``, so unrated
    recipes do not abort a whole extraction run.
    """
    aggregate_rating = recipe_json[1].get('aggregateRating') or {}
    return aggregate_rating.get('ratingValue')

def fetch_number_of_ratings(recipe_json):
    """Return ``aggregateRating.ratingCount`` of the entry at index 1.

    Returns ``None`` when the recipe has no ``aggregateRating`` section (or
    it is ``None``) or when the section lacks ``ratingCount``, so unrated
    recipes do not abort a whole extraction run.
    """
    aggregate_rating = recipe_json[1].get('aggregateRating') or {}
    return aggregate_rating.get('ratingCount')

def fetch_reviews_count(soup, regex):
    """Extract the review count from the page's reviews link.

    Looks up the first ``<a>`` with class
    ``"ugc-ratings-link ugc-reviews-link"`` and applies ``regex.search`` to
    its text.

    Returns
    -------
    str or None
        The matched text, or ``None`` when the link is missing or its text
        contains no match.
    """
    reviews_link = soup.find('a',  class_="ugc-ratings-link ugc-reviews-link")
    if reviews_link is None:
        return None

    # Guard the no-match case instead of calling .group() on None.
    match = regex.search(reviews_link.text)
    return match.group() if match is not None else None

def fetch_reviews(recipe_json):
    """Return the ``reviewBody`` of every ``review`` entry, in order."""
    review_entries = recipe_json[1]['review']
    review_bodies = []
    for entry in review_entries:
        review_bodies.append(entry['reviewBody'])
    return review_bodies

def fetch_photo_count(soup, regex):
    """Extract the photo count from the page's photos link.

    Looks up the first ``<a>`` with class
    ``"ugc-ratings-link ugc-photos-link"`` and applies ``regex.search`` to
    its text.

    Returns
    -------
    str or None
        The matched text, or ``None`` when the link is missing or its text
        contains no match.
    """
    photos_link = soup.find('a',  class_="ugc-ratings-link ugc-photos-link")
    if photos_link is None:
        return None

    # Guard the no-match case instead of calling .group() on None.
    match = regex.search(photos_link.text)
    return match.group() if match is not None else None

def mise_en_place(soup, regex):
    """Extract every recipe feature from one parsed page into a flat list.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed recipe page.
    regex : compiled regular expression
        Used to pull the numeric part out of nutrient strings and out of
        the review/photo link texts.

    Returns
    -------
    list
        22 items, in this fixed order (consumers index it by position):
        recipe name, description, ingredients, categories, calories, fat,
        carbohydrates, protein, cholesterol, sodium, saturated fat, sugar,
        trans fat, unsaturated fat, total time, number of ratings, rating
        score, number of reviews, review bodies, photo count, steps, and a
        ``None`` placeholder for the "made it" count.
    """
    recipe_json = get_recipe_json(soup)

    # Basic descriptive fields
    recipe_name = fetch_title(recipe_json)
    ingredients_list = fetch_ingredients(recipe_json)
    categories_list = fetch_categories(recipe_json)
    description = fetch_description(recipe_json)
    steps = fetch_steps(recipe_json)

    # Nutritional info, listed in the exact order it appears in the result.
    # NOTE(review): "fiberContent" is not extracted here; it was never part
    # of the returned list, and adding it would shift the positional indices.
    nutrient_keys = (
        "calories",
        "fatContent",
        "carbohydrateContent",
        "proteinContent",
        "cholesterolContent",
        "sodiumContent",
        "saturatedFatContent",
        "sugarContent",
        "transFatContent",
        "unsaturatedFatContent",
    )
    nutrient_values = [
        fetch_nutrients(recipe_json, regex, nutrient_name=key)
        for key in nutrient_keys
    ]

    # Total time, returned exactly as stored in the JSON
    prep_time = fetch_prep_time(recipe_json)

    # Rating
    number_of_ratings = fetch_number_of_ratings(recipe_json)
    rating = fetch_rating_score(recipe_json)

    # Reviews
    num_reviews = fetch_reviews_count(soup, regex)
    reviews = fetch_reviews(recipe_json)

    # Photos
    photos = fetch_photo_count(soup, regex)

    # Made it: not extracted; kept as a placeholder so positions stay stable
    made_it_number = None

    return (
        [recipe_name, description, ingredients_list, categories_list]
        + nutrient_values
        + [prep_time, number_of_ratings, rating, num_reviews,
           reviews, photos, steps, made_it_number]
    )


In [24]:
# Dataset identifier; interpolated into the input pickle path and the output CSV path.
NAME = "drinks_categories_2020_11"

In [25]:
# Load the pickled pages for NAME into a frame with `link` and `html` columns,
# then drop the home page and non-recipe pages.
df = create_dataframe_from_pickle("datasets/pickles/recipes_{}.p".format(NAME))
# The doctype-based filter is currently disabled.
#df = filter_old_html(df)
df = filter_non_recipes(df)

In [6]:
# (rows, columns) of the current frame.
df.shape

(441, 2)

In [26]:
# Parse each page's raw HTML once, then drop the raw markup column.
df['soup'] = df.html.apply(make_soup)
df = df.drop(columns="html")

In [27]:
# Quick fix (TODO: fold into filter_non_recipes): keep only individual recipe
# pages. The prefix is matched literally (regex=False) and ends with a slash
# so that category pages under "/recipes/" cannot pass the filter.
df = df[df.link.str.contains('https://www.allrecipes.com/recipe/', regex=False)]

In [17]:
# (rows, columns) of the current frame.
df.shape

(440, 2)

In [28]:
# Spot-check: the `nutrition` entry of the row labelled 1.
get_recipe_json(df.loc[1,'soup'])[1].get('nutrition')

{'@type': 'NutritionInformation',
 'calories': '339.2 calories',
 'carbohydrateContent': '41.6 g',
 'cholesterolContent': '57 mg',
 'fatContent': '18.2 g',
 'fiberContent': '3.7 g',
 'proteinContent': '4.1 g',
 'saturatedFatContent': '11.4 g',
 'servingSize': None,
 'sodiumContent': '84.6 mg',
 'sugarContent': '2.1 g',
 'transFatContent': None,
 'unsaturatedFatContent': None}

In [23]:
# Spot-check: the `nutrition` entry of the row labelled 1.
# NOTE(review): this expression also appears in an earlier cell with a
# different output; presumably it was run against another dataset - confirm
# or remove.
get_recipe_json(df.loc[1,'soup'])[1].get('nutrition')

{'@type': 'NutritionInformation',
 'calories': '372.1 calories',
 'carbohydrateContent': '18.5 g',
 'cholesterolContent': '98 mg',
 'fatContent': '24.7 g',
 'fiberContent': '1 g',
 'proteinContent': '18.2 g',
 'saturatedFatContent': '9.9 g',
 'servingSize': None,
 'sodiumContent': '334.6 mg',
 'sugarContent': None,
 'transFatContent': None,
 'unsaturatedFatContent': None}

In [30]:
# First number in a string: digits, optional comma-grouped thousands, and an
# optional decimal part (e.g. "339.2", "57", "1,244"). Written as a raw string
# with an escaped dot so that "57 mg" yields "57" rather than "57 " and a
# lone digit such as "5" still matches.
REGEX = re.compile(r"\d+(?:,\d{3})*(?:\.\d+)?")
# Build the positional feature list for every parsed page.
df["feature_list"] = df.soup.apply(mise_en_place, args=(REGEX,))

In [31]:
# The parsed trees are no longer needed once the features are extracted.
df = df.drop(columns="soup")

In [32]:
# Expand `feature_list` into one named column per feature. The order of
# FEATURE_COLUMNS must mirror the order of the list built by mise_en_place.
FEATURE_COLUMNS = [
    "recipe_name",
    "description",
    "ingredients_list",
    "categories_list",
    "calories",
    "fat",
    "carbs",
    "protein",
    "cholesterol",
    "sodium",
    "saturated_fat",
    "sugar",
    "trans_fat",
    "unsaturated_fat",
    "prep_time",
    "number_of_rating",
    "rating_score",
    "num_reviews",
    "reviews",
    "photos",
    "steps",
    "made_it",
]
for position, column in enumerate(FEATURE_COLUMNS):
    # Bind `position` as a default argument so the lambda carries its own index.
    df[column] = df.feature_list.apply(lambda features, i=position: features[i])

In [33]:
# Write the expanded frame to CSV without the row index; the file name is derived from NAME.
df.to_csv("datasets/dataframes/recipe_df_{}_new.csv".format(NAME),index=False)

In [34]:
# Preview rows at positions 2-4 and columns at positions 6-17 (positional slicing).
df.iloc[2:5,6:18]

Unnamed: 0,calories,fat,carbs,protein,cholesterol,sodium,saturated_fat,sugar,trans_fat,unsaturated_fat,prep_time,number_of_rating
3,21.0,0.0,5.0,0.4,0,302.1,0.0,3.4,,,P0DT0H11M,232.0
4,111.8,6.7,4.8,7.5,191,1395.7,1.7,1.4,,,P0DT0H10M,703.0
5,30.2,1.2,5.9,0.5,0,1.1,0.8,1.0,,,P0DT0H5M,51.0


In [13]:
# Show the first ten rows of the frame.
df.head(10)

Unnamed: 0,link,feature_list,recipe_name,description,ingredients_list,categories_list,calories,fat,carbs,protein,...,trans_fat,unsaturated_fat,prep_time,number_of_rating,rating_score,num_reviews,reviews,photos,steps,made_it
1,https://www.allrecipes.com/recipe/241038/micro...,"[Microwave Chocolate Mug Cake, This is my own ...",Microwave Chocolate Mug Cake,This is my own version of the chocolate microw...,"[¼ cup all-purpose flour, ¼ cup white sugar, 2...","[Dessert Recipes, Cakes, Chocolate Cake Recipes]",603.1,30.4,82.0,6.9,...,,,P0DT0H7M,1527.0,4.489194,1244,[This cake helped me out of a dire situation t...,397,"[Mix flour, sugar, cocoa powder, baking soda, ...",
2,https://www.allrecipes.com/recipe/22180/waffle...,"[Waffles I, You can keep the batter covered in...",Waffles I,You can keep the batter covered in the fridge ...,"[2 eggs, 2 cups all-purpose flour, 1 ¾ cups mi...","[Breakfast and Brunch Recipes, Waffle Recipes]",382.0,21.6,38.0,8.7,...,,,P0DT0H20M,4894.0,4.50613,3668,[5+++++ I love this recipe. I made some health...,622,[Preheat waffle iron. Beat eggs in large bowl ...,
3,https://www.allrecipes.com/recipe/245534/roast...,"[Roast Spatchcock Turkey, This roasting method...",Roast Spatchcock Turkey,"This roasting method results in the juiciest, ...","[1 (10 pound) whole turkey, ½ cup olive oil, 1...","[Meat and Poultry Recipes, Turkey, Whole Turke...",777.1,42.7,0.3,92.0,...,,,P0DT1H70M,40.0,4.875,31,[I love making a turkey this way! I place my t...,11,[Preheat oven to 350 degrees F (175 degrees C)...,
4,https://www.allrecipes.com/recipe/17209/absolu...,"[Absolutely the Best Chocolate Chip Cookies, T...",Absolutely the Best Chocolate Chip Cookies,This is a secret family recipe for chocolate c...,"[1 cup butter flavored shortening, ¾ cup white...","[Dessert Recipes, Cookie Recipes, Drop Cookie ...",241.2,13.7,28.1,2.7,...,,,P0DT0H20M,1488.0,4.511425,1145,[Great cookie. I'm very strict about using on...,131,[Preheat oven to 350 degrees F (175 degrees C)...,
5,https://www.allrecipes.com/recipe/238691/simpl...,"[Simple Macaroni and Cheese, A very quick and ...",Simple Macaroni and Cheese,A very quick and easy fix to a tasty side-dish...,"[1 (8 ounce) box elbow macaroni, ¼ cup butter,...","[Main Dish Recipes, Pasta, Macaroni and Cheese...",630.2,33.6,55.0,26.5,...,,,P0DT0H30M,781.0,4.408451,597,[Made it almost exactly like the recipe stated...,202,[Bring a large pot of lightly salted water to ...,
6,https://www.allrecipes.com/recipe/92462/slow-c...,"[Slow Cooker Texas Pulled Pork, Slow cooked, T...",Slow Cooker Texas Pulled Pork,"Slow cooked, Texas-style pulled pork that is s...","[1 teaspoon vegetable oil, 1 (4 pound) pork sh...","[Main Dish Recipes, Pork, Pulled Pork Recipes]",527.7,23.2,45.5,31.9,...,,,P0DT5H15M,2926.0,4.527683,2087,[Perfect! Followed the recipe to a T and it w...,210,[Pour the vegetable oil into the bottom of a s...,
7,https://www.allrecipes.com/recipe/76604/make-a...,"[Make-Ahead Turkey Gravy, I discovered this ex...",Make-Ahead Turkey Gravy,I discovered this excellent Thanksgiving gravy...,"[6 turkey wings, 2 medium onions, peeled and q...","[Side Dish, Sauces and Condiments Recipes, Gra...",58.9,2.8,3.2,4.9,...,,,P0DT2H30M,434.0,4.679724,359,[I'm only giving this a 4 but I think it could...,21,[Preheat oven to 400 degrees F (200 degrees C)...,
8,https://www.allrecipes.com/recipe/12409/apple-...,"[Apple Crisp II, A simple dessert that's great...",Apple Crisp II,A simple dessert that's great served with ice ...,"[10 cups all-purpose apples, peeled, cored and...","[Dessert Recipes, Fruit Dessert Recipes, Apple...",316.0,8.4,60.5,2.4,...,,,P0DT0H80M,7919.0,4.696931,5907,"[Oh my gosh, this is delicious! I have never h...",956,[Preheat oven to 350 degrees F (175 degree C)....,
9,https://www.allrecipes.com/recipe/244265/argen...,"[Argentinian Beef Empanadas, My family loves t...",Argentinian Beef Empanadas,My family loves these juicy meat pies with rai...,"[½ pound ground beef, 3 tablespoons butter, 2 ...","[Main Dish Recipes, Savory Pie Recipes, Beef P...",822.7,57.2,61.4,16.6,...,,,P0DT1H25M,25.0,4.16,19,[These are wonderful and so easy to make. I le...,12,"[Set racks in upper and lower thirds of oven, ...",
10,https://www.allrecipes.com/recipe/214651/donut...,"[Donut Muffins, Mini muffins that taste just l...",Donut Muffins,Mini muffins that taste just like cinnamon sug...,"[½ cup white sugar, ¼ cup margarine, melted, ¾...","[Bread, Quick Bread Recipes, Muffin Recipes]",88.0,3.9,12.8,0.8,...,,,P0DT0H40M,2799.0,4.492319,2148,[I'm curious about a couple of things about th...,490,[Preheat oven to 375 degrees F (190 degrees C)...,
