In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np
import json

def create_dataframe_from_pickle(filename):
    """Load a pickled mapping and return it as a two-column DataFrame.

    Every (key, value) pair of the unpickled object becomes one row: the key
    goes into the ``link`` column and the value into the ``html`` column.
    """
    # NOTE(review): unpickling executes arbitrary code; only load trusted files.
    scraped_pages = pd.read_pickle(filename)
    rows = [(key, value) for key, value in scraped_pages.items()]
    return pd.DataFrame(rows, columns=['link', 'html'])

def filter_old_html(df, pattern='<!doctype html>'):
    """Keep only the rows whose ``html`` text begins with ``pattern``."""
    starts_with_pattern = df['html'].str.startswith(pattern)
    return df[starts_with_pattern]

def filter_non_recipes(df, pattern="www.allrecipes.com/recipes", regex=False):
    """Drop the rows whose ``link`` contains ``pattern``.

    Parameters
    ----------
    df : DataFrame with a string ``link`` column.
    pattern : text to look for inside each link.
    regex : when False (default) ``pattern`` is matched literally, so the dots
        in a URL only match dots. Pass True to have ``pattern`` interpreted
        as a regular expression.

    Returns
    -------
    The rows of ``df`` whose link does NOT contain ``pattern``.
    """
    # pandas' str.contains defaults to regex matching, where every "." in the
    # URL would match any character; match literally unless asked otherwise.
    has_pattern = df['link'].str.contains(pattern, regex=regex)
    return df[~has_pattern]
    
def make_soup(response):
    """Parse ``response`` with BeautifulSoup's ``html.parser`` backend."""
    soup = BeautifulSoup(response, "html.parser")
    return soup

def get_recipe_json(soup):
    """Decode the ``application/ld+json`` script tag found by ``soup.find``.

    Returns whatever ``json.loads`` produces from the tag's text.
    """
    ld_json_tag = soup.find('script',  type="application/ld+json")
    payload = ld_json_tag.text
    return json.loads(payload)

def fetch_title(recipe_json):
    """Return the ``name`` field of the entry at index 1 of the parsed JSON-LD."""
    # Index 1 is presumably the Recipe object of the JSON-LD list -- TODO confirm.
    recipe_entry = recipe_json[1]
    return recipe_entry['name']

def fetch_ingredients(recipe_json):
    """Return the ``recipeIngredient`` field of the entry at index 1."""
    recipe_entry = recipe_json[1]
    return recipe_entry['recipeIngredient']

def fetch_steps(recipe_json):
    """Return the ``text`` of each entry in the recipe's ``recipeInstructions``."""
    instructions = recipe_json[1]['recipeInstructions']
    step_texts = []
    for instruction in instructions:
        step_texts.append(instruction['text'])
    return step_texts

def fetch_description(recipe_json):
    """Return the ``description`` field of the entry at index 1."""
    recipe_entry = recipe_json[1]
    return recipe_entry['description']

def fetch_prep_time(recipe_json):
    """Return the ``totalTime`` field of the entry at index 1.

    Despite the function name, the value read is ``totalTime``.
    """
    recipe_entry = recipe_json[1]
    return recipe_entry['totalTime']

def fetch_categories(recipe_json):
    """Return the ``recipeCategory`` field of the entry at index 1."""
    recipe_entry = recipe_json[1]
    return recipe_entry['recipeCategory']
    
def fetch_nutrients(recipe_json, regex, nutrient_name):
    """Extract the part of one nutrition field that ``regex`` matches.

    Parameters
    ----------
    recipe_json : parsed JSON-LD; the entry at index 1 is read.
    regex : compiled pattern applied with ``regex.search``.
    nutrient_name : key to look up inside the entry's ``nutrition`` mapping.

    Returns
    -------
    The matched text, or None when the entry has no ``nutrition`` mapping,
    the mapping has no ``nutrient_name`` key, or the value contains nothing
    the pattern matches.
    """
    nutrition = recipe_json[1].get('nutrition')
    if nutrition is None:
        return None

    nutrient = nutrition.get(nutrient_name)
    if nutrient is None:
        return None

    # search() returns None when nothing matches; calling .group() on that
    # would raise AttributeError, so report "no value" instead.
    match = regex.search(nutrient)
    return match.group() if match is not None else None

def fetch_rating_score(recipe_json):
    """Return ``aggregateRating.ratingValue`` of the entry at index 1.

    Returns None when the entry has no ``aggregateRating`` object or that
    object has no ``ratingValue`` key.
    """
    # A missing aggregateRating used to raise KeyError even though a missing
    # ratingValue already yielded None; treat both as "no rating".
    aggregate_rating = recipe_json[1].get('aggregateRating')
    if aggregate_rating is None:
        return None
    return aggregate_rating.get('ratingValue')

def fetch_number_of_ratings(recipe_json):
    """Return ``aggregateRating.ratingCount`` of the entry at index 1.

    Returns None when the entry has no ``aggregateRating`` object or that
    object has no ``ratingCount`` key.
    """
    # A missing aggregateRating used to raise KeyError even though a missing
    # ratingCount already yielded None; treat both as "no ratings".
    aggregate_rating = recipe_json[1].get('aggregateRating')
    if aggregate_rating is None:
        return None
    return aggregate_rating.get('ratingCount')

def fetch_reviews_count(soup, regex):
    """Return the part of the reviews link's text that ``regex`` matches.

    The link is the ``<a>`` element with class
    ``ugc-ratings-link ugc-reviews-link``. Returns None when no such element
    is found or when its text contains nothing the pattern matches.
    """
    reviews_link = soup.find('a',  class_="ugc-ratings-link ugc-reviews-link")
    if reviews_link is None:
        return None

    # search() returns None on no match; guard before calling .group().
    match = regex.search(reviews_link.text)
    return match.group() if match is not None else None

def fetch_reviews(recipe_json):
    """Return the ``reviewBody`` of each entry in the recipe's ``review`` list."""
    review_entries = recipe_json[1]['review']
    review_bodies = []
    for entry in review_entries:
        review_bodies.append(entry['reviewBody'])
    return review_bodies

def fetch_photo_count(soup, regex):
    """Return the part of the photos link's text that ``regex`` matches.

    The link is the ``<a>`` element with class
    ``ugc-ratings-link ugc-photos-link``. Returns None when no such element
    is found or when its text contains nothing the pattern matches.
    """
    photos_link = soup.find('a',  class_="ugc-ratings-link ugc-photos-link")
    if photos_link is None:
        return None

    # search() returns None on no match; guard before calling .group().
    match = regex.search(photos_link.text)
    return match.group() if match is not None else None

def mise_en_place(soup, regex, verbose=True):
    """Collect every extracted feature of one recipe page into a flat list.

    Parameters
    ----------
    soup : parsed page, passed to ``get_recipe_json`` and to the link-based
        count helpers.
    regex : compiled pattern forwarded to the nutrient, review-count and
        photo-count helpers.
    verbose : when True (default) the recipe name is printed as a progress
        trace; pass False to avoid one output line per recipe.

    Returns
    -------
    A list of 18 items in this fixed order: recipe name, description,
    ingredients, categories, calories, fat, carbs, protein, cholesterol,
    sodium, prep time (the JSON ``totalTime`` value), number of ratings,
    rating score, review count, review bodies, photo count, steps, and a
    ``made it`` placeholder that is always None.
    """
    recipe_json = get_recipe_json(soup)

    recipe_name = fetch_title(recipe_json)
    if verbose:
        print(recipe_name)

    ingredients_list = fetch_ingredients(recipe_json)
    categories_list = fetch_categories(recipe_json)
    description = fetch_description(recipe_json)
    steps = fetch_steps(recipe_json)

    # Nutritional info, looked up in the order the values appear in the result.
    nutrient_keys = ("calories", "fatContent", "carbohydrateContent",
                     "proteinContent", "cholesterolContent", "sodiumContent")
    cal, fat, carb, prot, chol, sod = [
        fetch_nutrients(recipe_json, regex, nutrient_name=key)
        for key in nutrient_keys
    ]

    prep_time = fetch_prep_time(recipe_json)

    # Ratings
    number_of_ratings = fetch_number_of_ratings(recipe_json)
    rating = fetch_rating_score(recipe_json)

    # Reviews
    num_reviews = fetch_reviews_count(soup, regex)
    reviews = fetch_reviews(recipe_json)

    # Photos
    photos = fetch_photo_count(soup, regex)

    # "Made it" count is not extracted; the slot is kept so positions stay stable.
    made_it_number = None

    return [recipe_name, description, ingredients_list, categories_list, cal, fat,
            carb, prot, chol, sod, prep_time, number_of_ratings, rating, num_reviews,
            reviews, photos, steps, made_it_number]


In [2]:
# Dataset suffix: interpolated into the input pickle path and the output CSV path.
NAME = "main_dish_main_categories"

In [3]:
# Load the scraped pages, then apply the HTML-prefix filter followed by the link filter.
pickle_path = "datasets/pickles/recipes_{}.p".format(NAME)
df = filter_non_recipes(filter_old_html(create_dataframe_from_pickle(pickle_path)))

In [4]:
# (rows, columns) left after filter_old_html and filter_non_recipes.
df.shape

(1138, 2)

In [5]:
# Parse every raw HTML string, store the result as `soup`, then discard the raw text.
parsed_pages = df.html.apply(make_soup)
df['soup'] = parsed_pages
df = df.drop(columns="html")

In [6]:
# Quick fix -- this belongs in filter_non_recipes: keep only links that contain
# the recipe URL prefix and are not the home page, an article, or a gallery.
links = df.link
is_home_page = links == 'https://www.allrecipes.com'
is_article = links.str.contains('https://www.allrecipes.com/article')
is_gallery = links.str.contains('https://www.allrecipes.com/gallery')
is_recipe = links.str.contains('https://www.allrecipes.com/recipe')
df = df[is_recipe & ~(is_home_page | is_article | is_gallery)]

In [7]:
# (rows, columns) left after the link-based quick fix.
df.shape

(1136, 2)

In [8]:
# Matches an integer or decimal number. The raw string avoids the invalid "\d"
# escape warning, and the escaped dot stops a stray trailing character from
# being captured (the old pattern returned "5 " for "5 g").
REGEX = re.compile(r"\d+(?:\.\d+)?")
df["feature_list"] = df.soup.apply(lambda x: mise_en_place(x, REGEX))

Rosemary Ranch Chicken Kabobs
Salsa Chicken
Slow Cooker Pulled Pork
Grilled Salmon I
Maple Salmon
Chicken Marsala
Garlic Chicken
Baked Pork Chops I
Zesty Slow Cooker Chicken Barbecue
Chicken Cordon Bleu II
Sloppy Joes II
Baked Teriyaki Chicken
Easy Meatloaf
Baked Ziti I
Brown Sugar Meatloaf
Slow Cooker Chicken and Dumplings
Awesome Slow Cooker Pot Roast
Broiled Tilapia Parmesan
Chicken Pot Pie IX
World's Best Lasagna
Copycat Fried Chicken Sandwich
Keto Turkey-Stuffed Peppers
Beef Stroganoff with Ground Beef
Gemelli Pasta with Roasted Pumpkin and Pancetta
Sweet Potato and Venison Shepherd's Pie
Al's Burmese Chicken Curry
Spicy Pork Tenderloin with Apples and Sweet Potatoes
Cardamom Maple Salmon
Festival-Style Grilled Italian Sausage Sandwiches
Pasta Lasagna
Taco Lasagna
Easy Lasagna I
Baked Spaghetti Squash Lasagna Style
Lasagna Alfredo Roll Ups
Alysia's Basic Meat Lasagna
Mexican Lasagna
No-Noodle Zucchini Lasagna
Spinach Lasagna III
Creamy Chicken Lasagna
Hearty Vegetable Lasagna
Easy

High Temperature Eye-of-Round Roast
Mini Meatloaves
Slow Cooker Salisbury Steak
Rempel Family Meatloaf
Italian Spaghetti Sauce with Meatballs
Hamburger Steak with Onions and Gravy
Meatball Nirvana
Salisbury Steak
Crispy Orange Beef
Blue Cheese Beef Tenderloin
Beef Tenderloin with Ginger-Shiitake Brown Butter
Beef Stroganoff III
Beef Yakitori
Pork Chops with Raspberry Sauce
London Broil II
Beef Bulgogi
Slow Cooker Pork Chops II
Oven-Fried Pork Chops
Gravy Baked Pork Chops
Sage Pork Chops
Pork Chops for the Slow Cooker
Savory Garlic Marinated Steaks
Filet Mignon with Rich Balsamic Glaze
Sour Cream Pork Chops
Lamb Chops with Balsamic Reduction
Marinated Flank Steak
Famous Pork Chops
Mushroom Pork Chops
Caramel Apple Pork Chops
Italian Breaded Pork Chops
Marinated Baked Pork Chops
Cuban Marinated Steak
Lamb Chops in Duck Sauce
Chef John's Smothered Pork Chops
Sweet Maple Pork Chops
Brandy Flamed Peppercorn Steak
Vietnamese Aromatic Lamb Chops
Veal Chop with Portabello Mushrooms
Perfect Por

Tender Breaded Turkey Cutlets
Marinated Turkey Breast
The Greatest Grilled Turkey
Chef John's Turkey Sloppy Joes
Herb-Glazed Roasted Turkey
Roasted Turkey Legs
Delicious Turkey Burgers
Turkey in a Bag
Unbelievably Awesome Barbeque Chicken Pizza
Chicken Garlic Pizza
Vegetable Pizza I
Slow Cooker Pizza
Chicken Pesto Pizza
Pizza Without the Red Sauce
BBQ Chicken Pizza II
Bubble Pizza
BBQ Chicken Pizza
BBQ Chicken Pizza I
Pizza On The Grill I
Chicago-Style Pan Pizza
Four Cheese Margherita Pizza
Veggie Pizza
Whole Wheat and Honey Pizza Dough
Jimmy's Mexican Pizza
Garden Veggie Pizza Squares
Pizza Pinwheels
Gourmet Chicken Pizza
Buffalo Style Chicken Pizza
Arugula and Hummus Mini Pizzas
D's Taco Pizza
Greek Pizza
Jan's Jalapeno Popper Pizza
Mediterranean Pesto Pizza
Barbeque Chicken Grilled Pizza
Campfire Pepperoni Pizza
Blue Cheese, Walnut, and Pear Pizza
Gourmet Thai Chicken Pizza
Mouse's Macaroni and Cheese
Red Lentil Curry
Vegetarian Chickpea Sandwich Filling
Delicious Black Bean Burrito

Curry in a Hurry
Kerala Chicken Curry
Cholay (Curried Chickpeas)
Indian Chicken Curry (Murgh Kari)
Golden Pork Chops
Modenese Pork Chops
Grilled Brown Sugar Pork Chops
Awesome Honey Pecan Pork Chops
Cola Pork Chops
Potato and Pork Bake
Chesapeake Bay Pork Chops
Apple Cider Sauce and Pork Loin Chops
Grilled Asian Ginger Pork Chops
San Francisco Pork Chops
Creamy Herbed Pork Chops
Mom's Best Pork Chops
Cheesy Pork Chops with Spicy Apples
Make Ahead Lunch Wraps
Best Tuna Melt (New Jersey Diner Style)
Grilled Cheese Sandwich
Slow Cooked Corned Beef for Sandwiches
French Dip Sandwiches
Philly Cheesesteak Sandwich with Garlic Mayo
Easy French Dip Sandwiches
Neat Sloppy Joes
Meatball Sandwich
California Grilled Veggie Sandwich
Cucumber Sandwiches III
Classic Cuban Midnight (Medianoche) Sandwich
Vietnamese Sandwich
Sensational Steak Sandwich
Super-Easy Pulled Pork Sandwiches
Chicago-Inspired Italian Beef Sandwich
Grilled Chicken Salad Sandwich
Lorraine's Club Sandwich
Tasty Baked Meatballs
Haz

In [9]:
# Drop the parsed pages; the following cells only read `feature_list`.
df = df.drop(columns="soup")

In [10]:
# One output column per position of the list stored in `feature_list`,
# named in that list's order.
FEATURE_COLUMNS = [
    "recipe_name", "description", "ingredients_list", "categories_list",
    "calories", "fat", "carbs", "protein", "cholesterol", "sodium",
    "prep_time", "number_of_rating", "rating_score", "num_reviews",
    "reviews", "photos", "steps", "made_it",
]
for position, column in enumerate(FEATURE_COLUMNS):
    # Bind `position` as a default argument so each lambda keeps its own index.
    df[column] = df.feature_list.apply(lambda features, i=position: features[i])

In [11]:
# Write the frame to CSV without the index column.
csv_path = "datasets/dataframes/recipe_df_{}_new.csv".format(NAME)
df.to_csv(csv_path, index=False)

In [12]:
# Preview the first ten rows of the expanded frame.
df.head(10)

Unnamed: 0,link,feature_list,recipe_name,description,ingredients_list,categories_list,calories,fat,carbs,protein,cholesterol,sodium,prep_time,number_of_rating,rating_score,num_reviews,reviews,photos,steps,made_it
1,https://www.allrecipes.com/recipe/64513/rosema...,"[Rosemary Ranch Chicken Kabobs, This rosemary ...",Rosemary Ranch Chicken Kabobs,This rosemary ranch chicken recipe is so delic...,"[½ cup olive oil, ½ cup ranch dressing, 3 tabl...","[Meat and Poultry, Chicken, Chicken Breasts]",377.7,30.7,4.8,19.9,59.2,1097.2,P0DT0H60M,4633.0,4.711202,3283,[These were HEAVENLY! My husband and I would ...,334,"[In a medium bowl, stir together the olive oil...",
2,https://www.allrecipes.com/recipe/16700/salsa-...,"[Salsa Chicken, Someone gave me this recipe a ...",Salsa Chicken,Someone gave me this recipe a few years back a...,"[4 skinless, boneless chicken breast halves,...","[World Cuisine, Latin American, Mexican]",286.7,12.4,6.8,35.5,101.3,863.1,P0DT0H45M,4667.0,4.626955,3252,[Re-write of old review (again) - \nI've been ...,482,[Preheat oven to 375 degrees F (190 degrees C...,
3,https://www.allrecipes.com/recipe/141678/slow-...,"[Slow Cooker Pulled Pork, Pork simmered in roo...",Slow Cooker Pulled Pork,Pork simmered in root beer makes all the diffe...,"[1 (2 pound) pork tenderloin, 1 (12 fluid ounc...","[Main Dishes, Pork, Pulled Pork]",335.3,5.0,49.4,21.2,49.1,990.1,P0DT7H10M,5027.0,4.60931,3585,[Here's the deal folks! Most would agree that...,213,[Place the pork tenderloin in a slow cooker; p...,
4,https://www.allrecipes.com/recipe/12720/grille...,"[Grilled Salmon I, A simple soy sauce and brow...",Grilled Salmon I,"A simple soy sauce and brown sugar marinade, w...","[1 ½ pounds salmon fillets, ½ teaspoon lemon p...","[Main Dishes, Seafood Main Dishes, Salmon, Sal...",317.9,20.1,13.2,20.5,55.8,1091.8,P0DT2H31M,5411.0,4.763999,3693,[I have one thing that I do every time I cook ...,364,"[Season salmon fillets with lemon pepper, garl...",
5,https://www.allrecipes.com/recipe/51283/maple-...,"[Maple Salmon, This is the best and most delic...",Maple Salmon,This is the best and most delicious salmon rec...,"[¼ cup maple syrup, 2 tablespoons soy sauce, 1...","[Main Dish, Seafood, Salmon, Baked Salmon]",265.0,12.4,14.1,23.2,67.0,633.0,PT1H,5550.0,4.672252,3666,[Hi - this is Starflower and I submitted this ...,459,"[In a small bowl, mix the maple syrup, soy sau...",
6,https://www.allrecipes.com/recipe/8887/chicken...,"[Chicken Marsala, A delicious, classic chicken...",Chicken Marsala,"A delicious, classic chicken dish -- lightly c...","[¼ cup all-purpose flour for coating, ½ teaspo...","[World Cuisine, European, Italian]",447.7,26.6,13.3,28.8,99.0,543.0,P0DT0H30M,5584.0,4.542443,3891,[This recipe was really good. I modified it a ...,452,"[In a shallow dish or bowl, mix together the f...",
7,https://www.allrecipes.com/recipe/8652/garlic-...,"[Garlic Chicken, Simple to make, just dip and ...",Garlic Chicken,"Simple to make, just dip and bake! Garlicky go...","[¼ cup olive oil, 2 cloves garlic, crushed, ¼ ...","[Meat and Poultry, Chicken, Chicken Breasts]",300.4,16.8,5.7,30.3,72.8,261.2,P0DT0H55M,6110.0,4.573159,4234,[I've made this 8 times now. The first 7 times...,320,[Preheat oven to 425 degrees F (220 degrees C)...,
8,https://www.allrecipes.com/recipe/16348/baked-...,"[Baked Pork Chops I, A pork chop recipe that i...",Baked Pork Chops I,A pork chop recipe that is quick and easy. You...,"[6 pork chops, 1 teaspoon garlic powder, 1 t...","[Main Dishes, Pork, Pork Chops, Baked]",457.4,19.9,36.0,29.9,128.5,1142.4,P0DT1H60M,6144.0,4.551107,4594,[I took some advice from pervious reviewers an...,404,[Preheat oven to 350 degrees F (175 degrees C)...,
9,https://www.allrecipes.com/recipe/65896/zesty-...,"[Zesty Slow Cooker Chicken Barbecue, Use your ...",Zesty Slow Cooker Chicken Barbecue,Use your slow cooker to prepare this great twi...,"[6 frozen skinless, boneless chicken breast ...","[Meat and Poultry, Chicken, Chicken Breasts]",299.5,8.1,32.4,23.0,60.8,1058.4,P0DT4H10M,6330.0,4.576461,4358,"[7/27/06 My girlfriend made this, then called...",342,"[Place chicken in a slow cooker. In a bowl, mi...",
10,https://www.allrecipes.com/recipe/8669/chicken...,"[Chicken Cordon Bleu II, 'Cordon Bleu' is a Fr...",Chicken Cordon Bleu II,"'Cordon Bleu' is a French term, literally tran...","[6 skinless, boneless chicken breast halves,...","[World Cuisine, European, French]",584.3,40.9,7.7,41.6,195.3,655.3,P0DT0H60M,6901.0,4.779452,4762,[This is very good -- but I'd recommend a few ...,638,[Pound chicken breasts if they are too thick. ...,


In [None]:
# (rows, columns) of the final frame.
df.shape