# This Notebook is used for generating the data of Meal planning for the new Millennium (MnM) problem

In [1]:
! pip install selenium

[33mDEPRECATION: Python 2.7 reached the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 is no longer maintained. A future version of pip will drop support for Python 2.7. More details about Python 2 support in pip, can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support[0m


## 1. Import all the necessary library

In [2]:
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen as url 
import pickle
import json
import multiprocessing
import pandas as pd 
import numpy as np
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

## 2. Define EP_Recipe class to store all the data.

In [3]:
class EP_Recipe():
    title           = None
    rating          = None
    time            = None
    price           = None
    calories        = None
    sodium          = None
    fat             = None
    protein         = None
    carbs           = None
    personal_rating = []
    ingredients = []
    url = None

    def get_title(self, page):
        return page.find('h1', {'class':'recipe-title font-bold h2-text primary-dark'}).text

    def rating_calculator(self, box):
        try:
            full_star = box.find_all('span', {'class':'icon full-star y-icon'})
            half_star = box.find_all('span', {'class':'icon half-star y-icon'})
            empty_star = box.find_all('span', {'class':'icon empty-star y-icon'})
            if (full_star != []) or (half_star != []) or (empty_star != []):
                return float(0.5*len(half_star)+len(full_star))
        except:
            return None

    def get_rating(self, page):
        try:
            rating_box = page.find('a', {'class': 'recipe-details-rating p2-text primary-orange'})
            return EP_Recipe.rating_calculator(self, rating_box)
        except:
            return None
    
    def get_time(self,page):
        try:
            return float(page.find('div',{'class':'recipe-summary-item unit h2-text'}).find('span',{'class':'value font-light h2-text'}).text)
        except:
            return None
    
    def get_price(self,page,url):
        try:
            options=webdriver.ChromeOptions()
            options.add_argument('--ignore-certificate-errors')
            options.add_argument('--ignore-ssl-errors')
            browser = webdriver.Chrome('/Users/bear/Downloads/chromedriver', options=options)
            browser.implicitly_wait(10)
            browser.get(url)
            detail = browser.find_element_by_css_selector('.basketful-btn .btn-primary')
            detail.click()
            browser.switch_to.frame('basketful_i1')
            total_price = browser.find_element_by_css_selector('#footer p')
            price = total_price.text.split('$')[-1]
            browser.quit()
            return price
        except:
            return None
      
    def get_calories(self,page):
        try:
            return float(page.find('div',{'class':'recipe-summary-item nutrition h2-text'}).find('span',{'class':'value font-light h2-text'}).text)
        except:
            return None

    def get_nutrition(self,page):
        try:
            nutrition = page.find_all('span',{'class': 'raw-value micro-text'})
            sodium = nutrition[0].text[:-2]
            fat = nutrition[1].text[:-1]
            protein = nutrition[2].text[:-1]
            carbs = nutrition[3].text[:-1]
            return sodium, fat, protein, carbs 
        except:
            return None
    
    def get_personal_rating(self, page):
        try: 
            p_persons = page.find_all('div', {'class': 'review-name'})
            p_ratings_box = page.find_all('div', {'class': 'review-rating'})
            p_r = []            
            for i in range(len(p_ratings_box)):
                temp = [p_persons[i].find('a').text, EP_Recipe.rating_calculator(self, p_ratings_box[i])]
                p_r.append(temp)
            return p_r
        except:
            return None
        
    def get_ingredients(self, page):
        try: 
            main_ingredients=page.find_all('span',{'class':'ingredient'})
            for i in range(len(main_ingredients)):
                main_ingredients[i] = ''.join(ch for ch in main_ingredients[i].text if ch.isalnum() or ch == ' ' or ch =='/') #keep only number/alphabetic/space
            return main_ingredients
        except:
            return None
        
    def build_recipe(self, page,url):
        self.title = self.get_title(page)
        self.rating = self.get_rating(page)
        self.time = self.get_time(page)
        self.price = self.get_price(page,url)
        self.calories = self.get_calories(page)
        self.sodium, self.fat, self.protein, self.carbs  = self.get_nutrition(page)
        self.personal_rating = self.get_personal_rating(page)
        self.ingredients = self.get_ingredients(page)
        self.url = url

    def __init__(self, page):
        print('attempting to build from: '+page)
        try:
            self.build_recipe(bs(url(page), 'html.parser'),page)
        except Exception as x:
            print('Could not build from %s, %s'%(page,x))

## 3. Find all the addresses for recipes and store them in ep_urls

In [4]:
# all_url = ['https://www.yummly.com/recipes']

# ep_urls = set()

# initializer = url(all_url[0])
# res = bs(initializer.read(),"html5lib")
# for div in res.findAll('div', {'class': 'recipe-card-img-wrapper'})[:10]:
#         ep_urls.update(['https://www.yummly.com'+div.find('a')['href']]) 

In [6]:
# browser = webdriver.Chrome('./chromedriver')

# browser.get("https://www.yummly.com/guided-video-recipes")
# time.sleep(10)

# elem = browser.find_element_by_tag_name("body")
# elem.click() 
# no_of_pagedowns = 20

# while no_of_pagedowns:
#     elem.send_keys(Keys.END)
#     time.sleep(10)
#     no_of_pagedowns-=1

# for div in browser.find_elements_by_class_name('recipe-card-img-wrapper'):
#     href = div.find_element_by_tag_name('a').get_attribute('href')
#     ep_urls.update([href]) 

## 4. Scrape the website of recipes and generate the data. Store the data as recipes_data.json.

In [7]:
# p = multiprocessing.Pool(4)
# output = p.map(EP_Recipe,ep_urls)
# pickle.dump(output,open('epi_recipes.final','wb'))

# data = pickle.load(open('epi_recipes.final','rb'))
# ar = []
# for i in data:
#     ar.append(i.__dict__)
# pickle.dump(ar,open('epi_recipe_dict_form.dict','wb'))

# with open('recipes_data.json', 'w') as fp:
#     json.dump(ar, fp)

attempting to build from: https://www.yummly.com/recipe/New-York-Style-Cheesecake-_without-Sour-Cream_-2650354
attempting to build from: https://www.yummly.com/recipe/Herb-_-Garlic-Roasted-Chicken-2696765
attempting to build from: https://www.yummly.com/recipe/Gluten-Free_-Dairy-Free_-Egg-Free-Pumpkin-Pie-9101981
attempting to build from: https://www.yummly.com/recipe/Sweet-Potato-Casserole-with-Bacon-Pecan-Crumble-9073113
Could not build from https://www.yummly.com/recipe/Gluten-Free_-Dairy-Free_-Egg-Free-Pumpkin-Pie-9101981, HTTP Error 500: Internal Server Error
attempting to build from: https://www.yummly.com/recipe/Roasted-Vegan-Thanksgiving-Feast-Platter-9103255
attempting to build from: https://www.yummly.com/recipe/Four-Cheese-Baked-Spaghetti-2684086
attempting to build from: https://www.yummly.com/recipe/Baked-Greek-Chicken-with-Fresh-Lemon-and-Dill-9088936
attempting to build from: https://www.yummly.com/recipe/Juicy-Herbed-Weeknight-Chicken-Thighs-9029462
attempting to build 

## 5. Read the data

In [8]:
df = pd.read_json('recipes_data.json')
df

Unnamed: 0,title,rating,time,price,calories,sodium,fat,protein,carbs,personal_rating,ingredients,url
0,New York Style Cheesecake (without Sour Cream),5.0,2.0,40.37,430.0,210.0,24.0,5,49,"[[Amalia Christine, 5.0]]","[graham crackers , light brown sugar , cinnamo...",https://www.yummly.com/recipe/New-York-Style-C...
1,Four-Cheese Baked Spaghetti,4.5,75.0,38.35,570.0,1230.0,28.0,31,51,"[[Kaitlin Judd, 3.0], [Em Spreadborough, 5.0],...","[spaghetti , lowmoisture ricotta cheese , grat...",https://www.yummly.com/recipe/Four-Cheese-Bake...
2,Skillet Zucchini Noodle Lasagna,,90.0,20.54,460.0,2150.0,30.0,28,21,[],"[zucchini , salt , 85 lean ground beef , black...",https://www.yummly.com/recipe/Skillet-Zucchini...
3,Creamy Spinach Stuffed Mushrooms,4.5,33.0,17.99,70.0,180.0,5.0,3,3,"[[Avery, 1.0], [Ischa Bremer, 4.0], [Kiauna Ga...","[cremini mushrooms , cream cheese , grated Par...",https://www.yummly.com/recipe/Creamy-Spinach-S...
4,Honey-Roasted Chickpeas,1.0,50.0,12.54,570.0,230.0,9.0,28,99,"[[Jaclyn Wieland, 1.0]]","[garbanzo beans , honey , cinnamon , salt , ho...",https://www.yummly.com/recipe/Honey-Roasted-Ch...
...,...,...,...,...,...,...,...,...,...,...,...,...
432,Individual Bacon-Wrapped Meatloaves,4.5,40.0,24.57,290.0,420.0,22.0,16,6,"[[Angie Hall, 5.0], [Sarah Nixon, 4.0]]","[85 lean ground beef , ground veal , ground po...",https://www.yummly.com/recipe/Individual-Bacon...
433,Easy Bread and Butter Roast Beef,2.0,3.0,8.33,60.0,300.0,6.0,<1,1,"[[Ehrlich, 3.0], [Tyler Boer, 1.0]]","[beef crossrib chuck roast , salt , black pepp...",https://www.yummly.com/recipe/Easy-Bread-and-B...
434,Crispy Panko-Parmesan Chicken Breasts,4.5,28.0,17.16,310.0,340.0,16.0,28,11,"[[Jennell Marie, 5.0], [LaGree, 3.0], [Jacquel...","[chicken breasts , panko breadcrumbs , shredde...",https://www.yummly.com/recipe/Crispy-Panko-Par...
435,Sausage and Mushroom Egg Casserole,5.0,55.0,24.23,100.0,300.0,7.0,8,2,"[[Tamara Anderson, 5.0]]","[cremini mushrooms , breakfast sausage , butte...",https://www.yummly.com/recipe/Sausage-and-Mush...


## 6. Data Processing: Delete the row with NaN data and only consider the recipes with at least four reviews.

In [9]:
df = df.dropna(axis=0) #delete row with NaN 
df = df[df['personal_rating'].map(len) > 4] ##delete the recipes with less than 4 reviews
df = df[df['protein'] != "<1"]
df.protein = df.protein.astype(int)
df = df[df['carbs'] != "<1"]
df.carbs = df.carbs.astype(int)

In [10]:
df

Unnamed: 0,title,rating,time,price,calories,sodium,fat,protein,carbs,personal_rating,ingredients,url
1,Four-Cheese Baked Spaghetti,4.5,75.0,38.35,570.0,1230.0,28.0,31,51,"[[Kaitlin Judd, 3.0], [Em Spreadborough, 5.0],...","[spaghetti , lowmoisture ricotta cheese , grat...",https://www.yummly.com/recipe/Four-Cheese-Bake...
3,Creamy Spinach Stuffed Mushrooms,4.5,33.0,17.99,70.0,180.0,5.0,3,3,"[[Avery, 1.0], [Ischa Bremer, 4.0], [Kiauna Ga...","[cremini mushrooms , cream cheese , grated Par...",https://www.yummly.com/recipe/Creamy-Spinach-S...
7,Best Ever Chocolate Chip Muffins,4.5,41.0,30.70,270.0,270.0,14.0,4,34,"[[Sophia, 5.0], [Heidi Crosgrove-Trobaugh, 4.0...","[unsalted butter , granulated sugar , light br...",https://www.yummly.com/recipe/Best-Ever-Chocol...
8,Sheet Pan Yellow Squash and Chicken Sausage,4.5,42.0,20.08,340.0,960.0,24.0,22,10,"[[Holly Lynch, 5.0], [Amy J., 5.0], [Cathleen ...","[yellow summer squash , cooked chicken sausage...",https://www.yummly.com/recipe/Sheet-Pan-Yellow...
9,Hearty Italian Beef and Vegetable Soup,4.5,40.0,23.10,240.0,920.0,7.0,27,19,"[[Isabella, 5.0], [Rejeana Black, 5.0], [Stace...","[medium onion , garlic cloves , large carrots ...",https://www.yummly.com/recipe/Hearty-Italian-B...
...,...,...,...,...,...,...,...,...,...,...,...,...
409,Italian Sausage-Stuffed Mushrooms,4.0,38.0,16.58,360.0,780.0,27.0,16,14,"[[Michael T., 5.0], [Garza O'Shaughnessy, 3.0]...","[cremini mushrooms , cream cheese , grated Par...",https://www.yummly.com/recipe/Italian-Sausage-...
412,Vegetarian Bean and Cheese Taco Casserole,4.5,34.0,31.47,290.0,770.0,14.0,18,27,"[[Kimberly Beasley, 5.0], [leah g., 3.0], [Rav...","[nonstick cooking spray , canned black beans ,...",https://www.yummly.com/recipe/Vegetarian-Bean-...
421,Easy Honey-Mustard Chicken Thighs,4.5,30.0,19.60,310.0,500.0,8.0,39,19,"[[Yazc, 5.0], [7ae19c22-1c55-4c7a-8781-4c9d828...","[cooking spray , whole grain mustard , honey ,...",https://www.yummly.com/recipe/Easy-Honey-Musta...
425,Garlic Roasted Broccoli,4.5,30.0,7.47,130.0,440.0,9.0,5,11,"[[Rebeca Pereira, 5.0], [Lily, 4.0], [JW008, 4...","[broccoli florets , olive oil , salt , black p...",https://www.yummly.com/recipe/Garlic-Roasted-B...


In [11]:
df.to_csv('recipe_info.csv')

## 7. Analysize the user ratings data

In [12]:
p_r = pd.DataFrame(columns=['title', 'user', 'rating'])

In [13]:
count = 0
for i in range(df.shape[0]):
    for j in df.iloc[i,9]:
        p_r.loc[count] = [df.iloc[i,0],j[0], j[1]]
        count += 1

In [14]:
p_r

Unnamed: 0,title,user,rating
0,Four-Cheese Baked Spaghetti,Kaitlin Judd,3.0
1,Four-Cheese Baked Spaghetti,Em Spreadborough,5.0
2,Four-Cheese Baked Spaghetti,Nichole McNeese,5.0
3,Four-Cheese Baked Spaghetti,Strauß,5.0
4,Four-Cheese Baked Spaghetti,Tanya Hollabaugh,4.0
...,...,...,...
1900,Crispy Panko-Parmesan Chicken Breasts,Cheyanne Cruz,4.0
1901,Crispy Panko-Parmesan Chicken Breasts,Bethany Her,5.0
1902,Crispy Panko-Parmesan Chicken Breasts,Natalie Whyte,5.0
1903,Crispy Panko-Parmesan Chicken Breasts,Hshshs,4.0


In [15]:
#Count users' rating
cnt = p_r['user'].value_counts()
cnt

Robert Kelly           8
User Testing Chef      5
Jen                    5
rezist la victoriei    5
aaron gould            4
                      ..
Megan Oothoudt         1
Chaz McGutter          1
Kristi McCracken       1
Lisa S.                1
tyler doran            1
Name: user, Length: 1754, dtype: int64

In [16]:
cnt.describe()

count    1754.000000
mean        1.086089
std         0.398228
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         8.000000
Name: user, dtype: float64

In [17]:
cnt.value_counts()

1    1643
2      88
3      13
4       6
5       3
8       1
Name: user, dtype: int64

In [18]:
user_reviews2 = p_r['user'].value_counts()[p_r['user'].value_counts() > 0].index
trun_recipes_user_review = p_r[p_r['user'].isin(user_reviews2)]
trun_recipes_user_review

Unnamed: 0,title,user,rating
0,Four-Cheese Baked Spaghetti,Kaitlin Judd,3.0
1,Four-Cheese Baked Spaghetti,Em Spreadborough,5.0
2,Four-Cheese Baked Spaghetti,Nichole McNeese,5.0
3,Four-Cheese Baked Spaghetti,Strauß,5.0
4,Four-Cheese Baked Spaghetti,Tanya Hollabaugh,4.0
...,...,...,...
1900,Crispy Panko-Parmesan Chicken Breasts,Cheyanne Cruz,4.0
1901,Crispy Panko-Parmesan Chicken Breasts,Bethany Her,5.0
1902,Crispy Panko-Parmesan Chicken Breasts,Natalie Whyte,5.0
1903,Crispy Panko-Parmesan Chicken Breasts,Hshshs,4.0


In [19]:
#Construct a 2D matrix for the recipe-user-rating data
trun_recipes_user_review = trun_recipes_user_review.drop_duplicates(['user','title'])
trun_recipes_user_review_matrix = trun_recipes_user_review.pivot(index='user', columns='title', values='rating')
trun_recipes_user_review_matrix

title,20-Minute Pesto Chicken Pizza,30-Minute Sheet Pan Chicken Caprese,4-Ingredient BBQ Baked Salmon,5-Ingredient Cheesecake,5-Ingredient Honey-BBQ Baked Chicken Wings,5-Ingredient Pound Cake,Amazing Rosemary Sweet Potato Fries,Apple Cinnamon Muffins,Apple Pie Baked Oatmeal,Asian Sesame Roasted Broccoli,...,Sweet and Salty Candied Bacon,Sweet and Spicy Pecans,Twice Baked Potatoes,Vegan Baked Ziti,Vegan Banana Bread,Vegan Black Bean Enchiladas,"Vegan Chickpea ""Meatloaf""",Vegan Spinach Lasagna,Vegetarian Bean and Cheese Taco Casserole,Za'atar-Roasted Cauliflower Steaks with Bean Salad
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
06,,,,,,,,,,,...,,,,,,,,,,
7ae19c22-1c55-4c7a-8781-4c9d828c62f2,,,,,,,,,,,...,,,,,,,,,,
86d1fa17-1974-4695-973a-36badbb3e65b,,,,,,,,,,,...,,,,,,,,,,
AT,,,,,,,,,,,...,,,,,,,,,,
Aavan,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ștefania Sofian,,,,,,,,,,,...,,,5.0,,,,,,,
Елеонора Војновић,,,,,,,,,,,...,,,,,,,,,,
Ольга Раменская,,,,,,,,,,,...,,,,,,,,,,
Оля Балєнсіага,,,,,,,,,,,...,,,,,,,,,,


## 8. Construct the final data set for recipes-user-rating.

In [20]:
# Considering most user only rate one recipe, the recipes-user-rating matrix is too sparse. Here we see 50 users as one user, by combine the 50 users' rating as one person's rating.
final_rating_data = pd.DataFrame(columns=trun_recipes_user_review_matrix.columns)
for i in range(trun_recipes_user_review_matrix.shape[0]//50):
    temp = trun_recipes_user_review_matrix.iloc[50*i:50*i+50].mean(skipna=True, axis=0)
    temp.name = 'user' + str(i)
    final_rating_data.loc[i] = temp
final_rating_data

title,20-Minute Pesto Chicken Pizza,30-Minute Sheet Pan Chicken Caprese,4-Ingredient BBQ Baked Salmon,5-Ingredient Cheesecake,5-Ingredient Honey-BBQ Baked Chicken Wings,5-Ingredient Pound Cake,Amazing Rosemary Sweet Potato Fries,Apple Cinnamon Muffins,Apple Pie Baked Oatmeal,Asian Sesame Roasted Broccoli,...,Sweet and Salty Candied Bacon,Sweet and Spicy Pecans,Twice Baked Potatoes,Vegan Baked Ziti,Vegan Banana Bread,Vegan Black Bean Enchiladas,"Vegan Chickpea ""Meatloaf""",Vegan Spinach Lasagna,Vegetarian Bean and Cheese Taco Casserole,Za'atar-Roasted Cauliflower Steaks with Bean Salad
0,5.0,,,5.0,5.0,,,,,2.0,...,,,5.0,,,,5.0,5.0,,
1,4.0,4.0,5.0,5.0,5.0,3.0,4.0,5.0,4.0,,...,1.0,,5.0,5.0,,5.0,,,,
2,,,5.0,,4.0,1.0,,,,4.0,...,4.0,,5.0,5.0,5.0,5.0,,,,
3,5.0,5.0,5.0,,5.0,5.0,,,,3.0,...,5.0,,,,,5.0,,,,
4,4.5,4.5,,5.0,5.0,,,,,5.0,...,5.0,,5.0,5.0,,,,4.0,,4.0
5,5.0,4.0,4.0,5.0,5.0,,,,,,...,,4.0,4.0,,,,,,5.0,
6,5.0,,4.0,,,,,4.0,,1.0,...,,,,,,5.0,,,,
7,,5.0,,,5.0,4.0,,,,4.0,...,,,,,,,5.0,5.0,,
8,5.0,5.0,4.0,5.0,,5.0,5.0,,,,...,5.0,,,,5.0,5.0,,,,
9,,,4.5,,3.0,,,,,5.0,...,,,3.0,,,5.0,,,5.0,


In [21]:
#Save the data to csv file for future use
final_rating_data.to_csv('final_rating_data.csv')