In [3]:
import json
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np

def create_dataframe_from_pickle(filename):
    """Load a pickled mapping and return it as a two-column DataFrame.

    Parameters
    ----------
    filename : str
        Path handed to ``pd.read_pickle``; the unpickled object must
        expose ``.items()``.

    Returns
    -------
    pandas.DataFrame
        One row per item: the key goes to ``link`` and the value to ``html``.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    pages = pd.read_pickle(filename)
    rows = [(link, html) for link, html in pages.items()]
    return pd.DataFrame(rows, columns=['link', 'html'])

def filter_old_html(df, pattern='<!doctype html>'):
    """Return the rows of ``df`` whose ``html`` text does NOT start with ``pattern``.

    The prefix comparison is an exact, case-sensitive string match.
    """
    starts_with_pattern = df["html"].str.startswith(pattern)
    return df.loc[~starts_with_pattern]

def filter_non_recipes(df, pattern="www.allrecipes.com/recipes", regex=False):
    """Return the rows of ``df`` whose ``link`` does NOT contain ``pattern``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``link`` column.
    pattern : str
        Text to look for inside each link.
    regex : bool, default False
        When False, ``pattern`` is matched literally, so the dots in the
        default URL fragment are not treated as regex wildcards. Pass True
        to have ``pattern`` interpreted as a regular expression.

    Returns
    -------
    pandas.DataFrame
        The rows whose link does not contain ``pattern``; rows with a
        missing link are kept.
    """
    # na=False keeps the mask strictly boolean so that ``~`` is always valid.
    matches_pattern = df["link"].str.contains(pattern, regex=regex, na=False)
    return df[~matches_pattern]
    
def make_soup(response):
    """Parse ``response`` into a BeautifulSoup tree with the "html.parser" backend."""
    parser_name = "html.parser"
    return BeautifulSoup(response, parser_name)

def fetch_title(soup):
    """Return the text of the page's ``title`` element."""
    title_tag = soup.title
    return title_tag.text

def fetch_ingredients(soup):
    """Return the non-empty texts of every element with ``itemprop="recipeIngredient"``."""
    ingredients = []
    for tag in soup.find_all(itemprop="recipeIngredient"):
        tag_text = tag.text
        # Skip elements whose text is empty.
        if tag_text:
            ingredients.append(tag_text)
    return ingredients

def fetch_steps(soup):
    """Return the non-empty texts of every element with class ``recipe-directions__list--item``."""
    steps = []
    for tag in soup.find_all(class_="recipe-directions__list--item"):
        tag_text = tag.text
        # Skip elements whose text is empty.
        if tag_text:
            steps.append(tag_text)
    return steps

def fetch_description(soup):
    """Return the text of the first element with ``itemprop="description"``.

    Returns
    -------
    str or None
        The element's text exactly as found (not stripped), or None when
        the page has no such element, matching the other ``fetch_*``
        helpers that report missing data as None instead of raising.
    """
    description_tag = soup.find(itemprop="description")
    if description_tag is None:
        return None
    return description_tag.text

def fetch_prep_time(soup):
    """Return the text of the first ``span`` with class ``ready-in-time``, or None if absent."""
    time_tag = soup.find('span', class_="ready-in-time")
    if time_tag is None:
        return None
    return time_tag.text

def fetch_categories(soup):
    """Return the stripped, non-empty texts of every element with ``itemprop="name"``."""
    categories = []
    for tag in soup.find_all(itemprop="name"):
        label = tag.text.strip()
        # Entries that are empty once whitespace is stripped are dropped.
        if label:
            categories.append(label)
    return categories

def fetch_calories(soup, regex):
    """Extract the calorie figure from the element with ``itemprop="calories"``.

    Parameters
    ----------
    soup : parsed page exposing ``find``.
    regex : compiled pattern
        Searched against the element's stripped text; the whole match is
        returned.

    Returns
    -------
    str or None
        The matched text, or None when the element is missing or its text
        does not match ``regex``.
    """
    calories_tag = soup.find(itemprop="calories")
    if calories_tag is None:
        return None
    match = regex.search(calories_tag.text.strip())
    # A non-matching text would otherwise fail on ``None.group()``.
    return match.group() if match is not None else None
    
def fetch_nutrients(soup, nutrient_name):
    """Return the stripped text of the element whose ``itemprop`` equals ``nutrient_name``.

    Returns None when the page has no such element.
    """
    nutrient_tag = soup.find(itemprop=nutrient_name)
    if nutrient_tag is None:
        return None
    return nutrient_tag.text.strip()

def fetch_number_made_it(soup):
    """Return the first whitespace-separated token of the second ``span`` text containing "made it".

    Note: the site abbreviates thousands, e.g. 1000 is expressed as "1k",
    so the value is returned as text.
    """
    made_it_texts = []
    for span in soup.find_all("span"):
        span_text = span.text
        if "made it" in span_text:
            made_it_texts.append(span_text)
    # NOTE(review): index 1 (the second match) is used deliberately by the
    # original code — presumably the first match is an enclosing span; confirm.
    second_match = made_it_texts[1]
    return second_match.split()[0]

def fetch_social_data(soup, item):
    """Return the ``content`` attribute of the first ``meta`` tag whose ``itemprop`` is ``item``.

    Returns
    -------
    str or None
        The attribute value, or None when the tag (or its ``content``
        attribute) is absent, so a page lacking this metadata does not
        abort a whole batch run.
    """
    meta_tag = soup.find('meta', itemprop=item)
    if meta_tag is None:
        return None
    return meta_tag.get('content')

def fetch_number_of_ratings(soup, regex=re.compile(r'(\d+) Ratings')):
    """Extract the ratings count from the ``h4`` with class ``helpful-header``.

    Parameters
    ----------
    soup : parsed page exposing ``find``.
    regex : compiled pattern
        Must define capture group 1, which is the value returned. The
        default is written as a raw string so that ``\\d`` is not an
        invalid string escape.

    Returns
    -------
    str or None
        Capture group 1, or None when the header is missing or its text
        does not match ``regex``.

    TODO: check whether ``find`` or ``find_all`` is the right lookup here.
    """
    header = soup.find('h4', class_="helpful-header")
    if header is None:
        return None
    match = regex.search(header.text)
    # Header text in a different shape must not fail on ``None.group(1)``.
    return match.group(1) if match is not None else None

def fetch_reviews(soup):
    """Return the review texts embedded in the page's ``reviewsInitialSet`` script.

    The first ``script`` element whose text mentions ``reviewsInitialSet``
    is used. The JSON value assigned to that variable is decoded directly
    from the position after the ``=`` sign, so the result does not depend
    on the exact whitespace or line endings around the assignment, nor on
    what follows the JSON value.

    Returns
    -------
    list of str
        The ``text`` field of every entry under ``["reviews"]["Reviews"]``;
        an empty list when no script defines ``reviewsInitialSet``.

    Raises
    ------
    json.JSONDecodeError
        If the assigned value is not valid JSON.
    """
    marker = "reviewsInitialSet"
    decoder = json.JSONDecoder()
    for script in soup.find_all('script'):
        script_text = script.text
        marker_at = script_text.find(marker)
        if marker_at == -1:
            continue
        assign_at = script_text.find("=", marker_at)
        if assign_at == -1:
            continue
        payload = script_text[assign_at + 1:].lstrip()
        # raw_decode stops at the end of the JSON value and ignores the
        # trailing ";" and anything after it.
        data, _ = decoder.raw_decode(payload)
        return [review['text'] for review in data["reviews"]["Reviews"]]
    return []

def fetch_photo_count(soup, regex):
    """Extract the photo count from the ``span`` with class ``picture-count-link``.

    Parameters
    ----------
    soup : parsed page exposing ``find``.
    regex : compiled pattern
        Searched against the span's text; the whole match is returned.

    Returns
    -------
    str or None
        The matched text, or None when the span is missing or its text
        does not match ``regex``.
    """
    count_tag = soup.find('span', class_="picture-count-link")
    if count_tag is None:
        return None
    match = regex.search(count_tag.text)
    return match.group() if match is not None else None

def mise_en_place(soup, regex):
    """Collect every scraped feature of one recipe page into a flat list.

    Parameters
    ----------
    soup : parsed recipe page.
    regex : compiled pattern
        Forwarded to ``fetch_calories`` and ``fetch_photo_count``.

    Returns
    -------
    list
        Eighteen values, always in this order: recipe name, description,
        ingredients, categories, calories, fat, carbs, protein,
        cholesterol, sodium, prep time, number of ratings, rating,
        review count, reviews, photo count, steps, "made it" count.
    """
    # Ingredients and categories
    ingredients_list = fetch_ingredients(soup)
    categories_list = fetch_categories(soup)

    # The recipe name is the last itemprop="name" entry; a page without
    # any such entry yields None instead of raising IndexError.
    recipe_name = categories_list[-1] if categories_list else None

    description = fetch_description(soup)
    steps = fetch_steps(soup)

    # Nutritional info
    cal = fetch_calories(soup, regex)
    fat, carb, prot, chol, sod = (
        fetch_nutrients(soup, nutrient_name=nutrient)
        for nutrient in ("fatContent", "carbohydrateContent", "proteinContent",
                         "cholesterolContent", "sodiumContent")
    )

    # Prep time: reuse the dedicated helper instead of repeating its lookup.
    prep_time = fetch_prep_time(soup)

    # Number of people who made the recipe
    made_it_number = fetch_number_made_it(soup)

    # Ratings
    number_of_ratings = fetch_number_of_ratings(soup)
    rating = fetch_social_data(soup, item="ratingValue")

    # Reviews
    num_reviews = fetch_social_data(soup, item="reviewCount")
    reviews = fetch_reviews(soup)

    # Photos
    photos = fetch_photo_count(soup, regex)

    return [recipe_name, description, ingredients_list, categories_list, cal, fat,
            carb, prot, chol, sod, prep_time, number_of_ratings, rating, num_reviews,
            reviews, photos, steps, made_it_number]

### Agregar:
* cantidad de ratings LISTO
* review positiva
* review negativa
* Description LISTO
* Cantidad de personas que lo hicieron LISTO

In [83]:
# Dataset tag; interpolated into the input pickle path and the output CSV path below.
NAME = "smoothies"

In [84]:
# Load the scraped pages for this dataset, then apply both row filters
# (each with its default pattern).
df = (
    create_dataframe_from_pickle("datasets/pickles/recipes_{}.p".format(NAME))
    .pipe(filter_old_html)
    .pipe(filter_non_recipes)
)

In [85]:
# Sanity check: (rows, columns) remaining after the filters.
df.shape

(1245, 2)

In [86]:
# Parse every page once, then discard the raw HTML text.
df['soup'] = df["html"].apply(make_soup)
df = df.drop(columns="html")

In [87]:
# Quick fix: drop the bare home-page link (this should eventually be
# handled inside filter_non_recipes).
df = df[df.link != 'https://www.allrecipes.com']

In [88]:
# Raw string: "\d" in a normal string literal is an invalid escape sequence
# (a warning in current Python versions); the compiled pattern is unchanged.
REGEX = re.compile(r"\d+")
# One list of scraped features per parsed page.
df["feature_list"] = df.soup.apply(lambda soup: mise_en_place(soup, REGEX))

In [89]:
# The parsed pages are no longer needed once the features are extracted.
df = df.drop(columns="soup")

In [90]:
# Column names, in the same order as the values stored in each feature_list.
FEATURE_COLUMNS = [
    "recipe_name", "description", "ingredients_list", "categories_list",
    "calories", "fat", "carbs", "protein", "cholesterol", "sodium",
    "prep_time", "number_of_rating", "rating_score", "num_reviews",
    "reviews", "photos", "steps", "made_it",
]
# Expand each position of feature_list into its own column.
for position, column in enumerate(FEATURE_COLUMNS):
    df[column] = df.feature_list.apply(lambda features, i=position: features[i])

In [91]:
# Persist the feature table for this dataset; index=False leaves the row index out of the file.
# NOTE(review): feature_list is still a column at this point, so the CSV holds
# the raw list next to the expanded columns — confirm this is intended.
df.to_csv("datasets/dataframes/recipe_df_{}_old.csv".format(NAME),index=False)

In [92]:
# Sanity check: (rows, columns) of the final table.
df.shape

(1245, 20)

In [93]:
# Preview the first rows of the final table.
df.head()

Unnamed: 0,link,feature_list,recipe_name,description,ingredients_list,categories_list,calories,fat,carbs,protein,cholesterol,sodium,prep_time,number_of_rating,rating_score,num_reviews,reviews,photos,steps,made_it
1,https://www.allrecipes.com/recipe/23530/orange...,"[Orange Smoothie, \r\n""A great drink for orang...",Orange Smoothie,"\r\n""A great drink for orange lovers, and a fu...",[1 (6 ounce) can frozen orange juice concentra...,"[Home, Recipes, Drinks, Smoothies, Orange, Ora...",183,1.3,40.0,3.3,5,28,2 m,165.0,4.32,128,[This is good but it lacks the egg white that ...,13,"[In a blender, combine orange juice concentrat...",274
3,https://www.allrecipes.com/recipe/244943/banan...,"[Banana, Orange, and Ginger Smoothie, \r\n""Thi...","Banana, Orange, and Ginger Smoothie","\r\n""This is a nice fruity smoothie that I thr...","[1 orange, peeled, 1/2 banana, 3 ice cubes, 2 ...","[Home, Recipes, Breakfast and Brunch, Drinks, ...",176,2.1,34.6,7.1,7,89,5 m,,5.0,6,"[Delicious! I added pineapple juice, stevia, ...",6,"[Layer orange, banana, ice cubes, honey, and g...",30
5,https://www.allrecipes.com/recipe/241365/banan...,"[Banana, Avocado, and Spinach Smoothie, \r\n""Q...","Banana, Avocado, and Spinach Smoothie","\r\n""Quick and delicious smoothie that can be ...","[1 banana, sliced, 1/2 avocado, peeled and sli...","[Home, Recipes, Breakfast and Brunch, Drinks, ...",190,8.2,27.5,4.0,2,45,10 m,46.0,4.61,42,[I had spinach and avocado I needed to use bef...,15,"[Blend banana, avocado, spinach, milk, ice cub...",134
6,https://www.allrecipes.com/recipe/241611/avoca...,"[Avocado Banana Nut Smoothie, \r\n""This thick ...",Avocado Banana Nut Smoothie,"\r\n""This thick smoothie is not the most appea...","[1 cup almond milk, 1 avocado, peeled and pitt...","[Home, Recipes, Breakfast and Brunch, Drinks, ...",806,57.2,66.8,18.8,0,401,10 m,,4.0,1,[This thick smoothie packs plenty of flavor. B...,1,"[Blend almond milk, avocado, banana, peanut bu...",10
7,https://www.allrecipes.com/recipe/246908/avoca...,"[Avocado, Blueberry, Banana, and Chia Smoothie...","Avocado, Blueberry, Banana, and Chia Smoothie","\r\n""Don't let avocado scare you! Smooth, ligh...","[1 cup vanilla-flavored almond milk, 1 avocado...","[Home, Recipes, Breakfast and Brunch, Drinks, ...",643,35.3,85.6,8.6,0,179,5 m,,4.33,4,[It was okay not one of my favorites although ...,2,"[Combine almond milk, avocado, blueberries, ba...",10
