In [None]:
# CSV file locations shared by the cells of this notebook
# (all relative to the current working directory).
# Written by the scraping cell: one (recipe link, recipe name, ingredient) row per ingredient.
rawData_file_path = './long_info370_assignment1_rawData.csv'
# Written by the cleaning cell from the raw file above.
cleanData_file_path = './long_info370_assignment1_cleanData.csv'
# Written by the analysis cell: the most frequent ingredient words.
calculated_file_path = './long_info370_assignment1_calculate.csv'

In [None]:
import requests
import time
import os
import csv

from bs4 import BeautifulSoup
from selenium import webdriver

# Scrapes the listing pages of interest, gathers each recipe's name and
# ingredients, and writes one row per ingredient
# (recipe link, recipe name, ingredient text) to rawData_file_path.

# Pages to be scraped
# Note: It takes a really long time to run webdriver on each page and wait for the ads to load.
#       Comment out the rest and leave only one page for testing purposes.
pages = [
#         'https://www.yummly.com/recipes/lunch',
#         'https://www.yummly.com/recipes/dinner',
#         'https://www.yummly.com/browse/recommended',
#         'https://www.yummly.com/browse/seasonal',
#         'https://www.yummly.com/browse/popular-now',
        'https://www.yummly.com/browse/quick-and-easy'
        ]

# Download webdriver from: https://chromedriver.storage.googleapis.com/index.html?path=2.33/
# and put it in the same directory of the scripts
# Download selenium from: https://pypi.python.org/pypi/selenium
# NOTE: "__file__" is a plain string here, so this resolves to the current
# working directory.
driver_path = os.path.dirname(os.path.abspath("__file__")) + '/chromedriver'
driver = webdriver.Chrome(driver_path)

# recipe page base url
base_url = 'https://www.yummly.com/#recipe/'

try:
    # newline='' is what the csv module expects; without it Windows output
    # gets an extra blank line after every row.
    with open(rawData_file_path, 'w', newline='') as f:
        writer = csv.writer(f, delimiter=',')

        for page in pages:

            # Request the listing page and collect its recipe tiles
            page_req = requests.get(page)
            page_soup = BeautifulSoup(page_req.text, 'html.parser')
            recipe_grids = page_soup.find_all("div", {"class" : "single-recipe"})

            first_time = True
            for recipe_grid in recipe_grids: # go to each recipe from the links in recipe_grids
                data_url = recipe_grid.get('data-url')
                if not data_url:
                    # Tile without a recipe link: nothing to visit
                    continue
                recipe_link = base_url + data_url
                driver.get(recipe_link)

                # Sleep for page loading: first time 3 secs, and 1 sec for the rest
                time.sleep(3 if first_time else 1)
                first_time = False

                # Extract the information from the rendered page
                recipe_soup = BeautifulSoup(driver.page_source, 'html.parser')
                info_block = recipe_soup.find("div", {"class" : "primary-info-text"})
                title = info_block.find("h1") if info_block else None
                if title is None:
                    # Page did not render a recipe header; skip it
                    continue

                recipe_name = title.get_text()
                ingredient_elements = recipe_soup.find_all("span", {"class" : "ingredient"})
                for ingredient_element in ingredient_elements:
                    writer.writerow([recipe_link, recipe_name, ingredient_element.get_text()])
finally:
    # Always shut the browser down, even if scraping fails part-way
    driver.quit()

In [None]:
import csv

# Takes the data in rawData_file_path and cleans it: every field is
# lower-cased, leading/trailing whitespace is removed and internal runs of
# whitespace are collapsed to a single space. Blank rows and rows that are
# duplicates after cleaning are dropped.
# This script outputs results to cleanData_file_path.
with open(rawData_file_path, "r", newline='') as csvDataFile, \
        open(cleanData_file_path, 'w', newline='') as f:
    writer = csv.writer(f, delimiter=',')
    row_set = set()  # cleaned rows already written, for duplicate detection
    for row in csv.reader(csvDataFile):
        # split() + join strips the ends and collapses internal whitespace
        cleanedRow = [" ".join(ele.split()).lower() for ele in row]
        if not cleanedRow:
            # Blank line in the raw file
            continue
        row_key = tuple(cleanedRow)  # lists are unhashable; use a tuple as the set key
        if row_key in row_set:
            continue
        row_set.add(row_key)
        writer.writerow(cleanedRow)

In [None]:
import csv
import operator
from collections import Counter


# Takes data in cleanData_file_path and calculates the top 10 most
# frequently occurring ingredient words, then outputs one row per word
# (word, total occurrences, rows containing the word / number of recipes)
# to calculated_file_path.
word_freq_dict = Counter()  # total occurrences of each word over all rows
word_prop_dict = Counter()  # number of rows in which each word appears at least once
recipe_set = set()          # distinct recipe names (column 1)

with open(cleanData_file_path, "r", newline='') as csvDataFile:
    for row in csv.reader(csvDataFile):
        if len(row) < 3:
            # Blank or malformed row: no recipe name / ingredient to count
            continue
        recipe_set.add(row[1])

        words = row[2].split()
        word_freq_dict.update(words)
        # Count each word only once per row; dict.fromkeys de-duplicates
        # while keeping first-appearance order.
        word_prop_dict.update(list(dict.fromkeys(words)))

# Total number of recipes
recipes_num = len(recipe_set)

# Sort word_prop_dict by row count, highest first
sorted_word_prop_dict = sorted(word_prop_dict.items(), key=operator.itemgetter(1))
desc_sorted_word_prop_dict = tuple(reversed(sorted_word_prop_dict))

with open(calculated_file_path, 'w', newline='') as f:
    writer = csv.writer(f, delimiter=',')
    # Slicing copes with fewer than 10 distinct words
    for word, row_count in desc_sorted_word_prop_dict[:10]: # top 10
        # NOTE(review): row_count is per ingredient row, not per recipe, so
        # this ratio can exceed 1 when a word occurs in several ingredient
        # rows of the same recipe - confirm this is the intended proportion.
        prop = row_count / recipes_num
        count = word_freq_dict[word]
        writer.writerow([word, count, prop])