# Collect food images and other information from websites
The collected data includes each recipe's URL, title, nutrients, ingredients, serving size, and image, scraped from two food websites (www.bbcgoodfood.com and www.thekitchn.com).

## Scrape from www.bbcgoodfood.com

In [3]:
import os
import re
import shutil
import requests
import pandas as pd
from PIL import Image
from recipe_scrapers import scrape_me
import sys
sys.setrecursionlimit(50000)

### get urls of recipes

In [4]:
def category_information(name, pages):
    """Bundle one category's search keyword and page count into a dict.

    Args:
        name: Category keyword (e.g. 'burger'); stored under 'name'.
        pages: Number of search-result pages; stored under 'pages'.

    Returns:
        A dict with exactly the keys 'name' and 'pages'.
    """
    return {'name': name, 'pages': pages}


def define_required_data():
    """Return the category dicts that drive the scrape.

    Each entry is built by category_information() from a category keyword
    and the hard-coded number of search-result pages recorded for it.
    """
    categories = ['burger', 'salad', 'cake', 'steak', 'rice', 'sandwich', 'pizza', 'cookies', 'soup', 'pasta', 'pie', 'bread']
    pages = [18, 53, 36, 12, 23, 9, 7, 10, 21, 20, 20, 20]
    # Reference values that were recorded next to the page counts but are not
    # used (presumably the number of search hits per category - TODO confirm):
    # 421, 1271, 857, 269, 531, 193, 146, 221, 490, 474, 471, 459

    return [
        category_information(keyword, page_count)
        for keyword, page_count in zip(categories, pages)
    ]


def get_search_urls(dic):
    """Build the paginated bbcgoodfood.com search URLs for one category.

    Args:
        dic: Category dict with the keys 'name' (search keyword) and
            'pages' (number of result pages).

    Returns:
        One URL per page, for pages 1 through dic['pages'] inclusive.
    """
    prefix = 'https://www.bbcgoodfood.com/search/recipes/page/'
    return [
        prefix + str(page) + '/?q=' + dic['name'] + '&sort=-relevance'
        for page in range(1, dic['pages'] + 1)
    ]


def get_recipe_urls(page_urls):
    """Collect the recipe URLs linked from a list of search-result pages.

    Every page is loaded with scrape_me() and its links are filtered by
    extract_recipe_links(), which accumulates the matches.

    Args:
        page_urls: Search-result page URLs of one category.

    Returns:
        The accumulated list of recipe URLs, in page order.
    """
    collected = []
    for search_page in page_urls:
        anchors = scrape_me(search_page).links()
        collected = extract_recipe_links(collected, anchors)

    return collected


def extract_recipe_links(recipe_links, links):
    """Append the recipe URLs found among *links* to *recipe_links*.

    A link counts as a recipe link when its 'class' value equals
    ['standard-card-new__article-title', 'qa-card-link']; its 'href' is then
    appended with the site prefix 'https://www.bbcgoodfood.com'.

    Args:
        recipe_links: List to extend; it is mutated in place.
        links: Iterable of subscriptable link objects exposing 'class' and
            'href' entries.

    Returns:
        The same *recipe_links* list, for convenience.
    """
    card_classes = ['standard-card-new__article-title', 'qa-card-link']
    for link in links:
        try:
            if link['class'] == card_classes:
                recipe_links.append('https://www.bbcgoodfood.com' + link['href'])
        except (KeyError, TypeError):
            # Links without a 'class'/'href' entry (or with an unusable one)
            # are simply not recipe cards. Anything else, e.g. an interrupt,
            # is allowed to propagate instead of being silently swallowed.
            continue

    return recipe_links

### scrape information from recipe urls

In [9]:
def scrape_from_web(categories):
    """Scrape every category: images to disk, recipe metadata to a CSV file.

    For each category the search pages are expanded into recipe links, the
    recipes are scraped, their images are downloaded into
    './extracted_data/bbc/<name>/' and the structured information is written
    to './csv_files/bbc/<name>.csv'.

    Args:
        categories: Iterable of category dicts with the keys 'name' and
            'pages'.
    """
    image_root = './extracted_data/bbc/'
    csv_root = './csv_files/bbc/'

    # loop through each category
    for category in categories:
        name = category['name']
        page_urls = get_search_urls(category)
        print("Scraping " + name + "...")

        # makedirs also creates missing parent folders and tolerates existing
        # ones; the CSV folder has to exist before to_csv() can write into it.
        path = image_root + name + '/'
        os.makedirs(path, exist_ok=True)
        os.makedirs(csv_root, exist_ok=True)

        # get structured information
        recipe_links = get_recipe_urls(page_urls)
        structured_information = extract_from_recipes(recipe_links)
        # NOTE(review): the whole category dict is passed through unchanged;
        # confirm that download_images' `category` parameter is meant to
        # receive the dict rather than `name`.
        download_images(path, structured_information, category)

        df = pd.DataFrame(structured_information)
        df.to_csv(csv_root + name + '.csv', index=False)


def extract_from_recipes(recipe_links):
    """Scrape every recipe link and keep only the complete recipes.

    A recipe is dropped when the scraped dict is empty or when its
    'nutrients', 'ingredients' or 'image' entry is missing or empty. The
    check uses truthiness because the entries are not all dicts: comparing
    an empty ingredient list or a missing image URL against {} never
    matches, so such recipes used to slip through the filter.

    Args:
        recipe_links: Recipe URLs of one category.

    Returns:
        List of recipe dicts as produced by structured_information_in_recipe().
    """
    required_fields = ('nutrients', 'ingredients', 'image')
    structured_information = []
    for recipe_link in recipe_links:
        recipe = structured_information_in_recipe(recipe_link)
        # remove recipes that contain incomplete information
        if recipe and all(recipe.get(field) for field in required_fields):
            structured_information.append(recipe)
    return structured_information


def structured_information_in_recipe(recipe_link):
    """Scrape one recipe page into a flat dict.

    Args:
        recipe_link: URL of the recipe page; it is printed before scraping.

    Returns:
        Dict with the keys 'title', 'image', 'ingredients', 'nutrients',
        'serving' and 'url', filled from the scrape_me() scraper.
    """
    print(recipe_link)
    page = scrape_me(recipe_link)

    return {
        'title': page.title(),
        'image': page.image(),
        'ingredients': page.ingredients(),
        'nutrients': page.nutrients(),
        'serving': page.yields(),
        'url': page.url,
    }

    
def download_images(path, structured_information, category):
    """Download the image of every recipe into *path*.

    Files are written as '<prefix><index>.jpg', where index is the position
    of the recipe in *structured_information*.

    Args:
        path: Target folder, including the trailing slash.
        structured_information: Recipe dicts; each 'image' entry is fetched
            with requests.get().
        category: Optional file-name prefix. Only a string is used as prefix.
            scrape_from_web() passes the whole category dict, which cannot be
            concatenated into a path (it raised TypeError); for such values
            no prefix is added, because the rename step later in this
            notebook prepends the category name to '<index>.jpg' files.
    """
    # download images from image links
    print("Downloading images...")
    prefix = category if isinstance(category, str) else ''
    for index, recipe in enumerate(structured_information):
        image = requests.get(recipe['image'])
        with open(path + prefix + str(index) + '.jpg', 'wb') as f:
            f.write(image.content)


In [None]:
# Build the category list, then scrape, download and export every category.
categories = define_required_data()
scrape_from_web(categories)

### Rename files to class+index.jpg (e.g., burger0.jpg)
Add the file name and the category as two new columns (`file_name`, `category`) to each category's CSV file.

In [None]:
# sort by 0,1,2,3..., instead of 1, 10, 11... 
def sort_key(s):
    """Return the first run of digits in *s* as an int, or -1 if there is none.

    Used as a sort key so that file names order numerically
    (0, 1, 2, ..., 10) rather than lexicographically (1, 10, 11, ...).
    Names without digits, such as '.DS_Store', sort first.
    """
    try:
        # Raw string: '\d' in a normal string literal is an invalid escape.
        return int(re.findall(r'\d+', s)[0])
    except (IndexError, TypeError):
        # No digits in the name (or not a string at all).
        return -1

In [None]:
categories = ['bread', 'burger', 'cake', 'cookies', 'pasta', 'pie',
              'pizza', 'rice', 'salad', 'sandwich', 'soup', 'steak']

# rename images from 0.jpg to burger0.jpg
for category in categories:
    path = './extracted_data/bbc/' + category + '/'
    for name in os.listdir(path):
        # Leave hidden files (e.g. .DS_Store) alone and skip files that
        # already carry the prefix, so re-running this cell does not produce
        # names such as burgerburger0.jpg.
        if name.startswith('.') or name.startswith(category):
            continue
        os.rename(path + name, path + category + name)


# add the file_name and category columns to the csv files
for category in categories:
    print(category)
    # Drop '.DS_Store' / 'cake.DS_Store' entries wherever they appear; the
    # positional check on names[0] and names[1] raised IndexError for folders
    # holding fewer than two entries.
    names = [name for name in os.listdir('./extracted_data/bbc/' + category + '/')
             if not name.endswith('DS_Store')]
    names.sort(key=sort_key)

    csv_file = pd.read_csv('./csv_files/bbc/' + category + '.csv')
    # Assigning a list of the wrong length raises ValueError, which flags a
    # mismatch between the CSV rows and the image files.
    csv_file['file_name'] = names
    csv_file['category'] = [category] * len(csv_file)
    csv_file.to_csv('./csv_files/bbc/' + category + '.csv', index=False)

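### Split each category 85% / 15% into training and test sets
The first 85% of each category's rows go to the training CSV and the last 15% to the test CSV, so that each class is included in the test set; the images themselves are all moved to `./data/all_data/`. Add "source" as a new column.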

In [None]:
categories = ['bread', 'burger', 'cake', 'cookies', 'pasta', 'pie',
              'pizza', 'rice', 'salad', 'sandwich', 'soup', 'steak']

csv_root = './csv_files/bbc/'
all_data_dir = './data/all_data/'
# shutil.move() needs an existing folder in order to move files *into* it.
os.makedirs(all_data_dir, exist_ok=True)

# make sure the csv files are changed too
train_parts = []
test_parts = []

for category in categories:
    print(category)
    for root, subpath, files in os.walk(csv_root):
        for csv_name in sorted(files):
            if not csv_name.endswith(category + ".csv"):
                continue
            data = pd.read_csv(os.path.join(root, csv_name))

            # The last 15% of the rows form the test split. The boundary is
            # computed explicitly because slicing with data[:-0] yields an
            # empty training split when the rounded test size is 0.
            test_size = round(len(data) * 0.15)
            boundary = len(data) - test_size
            train_parts.append(data[:boundary])
            test_parts.append(data[boundary:])

            # Images are named <category><row index>; training and test
            # images all end up in the same folder.
            for index in range(len(data)):
                src_path = './extracted_data/bbc/' + category + '/' + category + str(index)
                try:
                    shutil.move(src_path + '.jpg', all_data_dir)
                except OSError:
                    # Not available as .jpg: fall back to .png.
                    shutil.move(src_path + '.png', all_data_dir)


# DataFrame.append was removed in pandas 2.0; concat works on every version.
train_data = pd.concat(train_parts, ignore_index=True) if train_parts else pd.DataFrame()
test_data = pd.concat(test_parts, ignore_index=True) if test_parts else pd.DataFrame()

train_data['source'] = ['bbc'] * len(train_data)
test_data['source'] = ['bbc'] * len(test_data)
train_data.to_csv('./data/bbc_train.csv', index=False)
test_data.to_csv('./data/bbc_test.csv', index=False)

### Integrate images from train/test folder back to original folders

In [None]:
categories = ['bread', 'burger', 'cake', 'cookies', 'pasta', 'pie',
              'pizza', 'rice', 'salad', 'sandwich', 'soup', 'steak']

# Remove the split CSVs. Each file is handled on its own so that a missing
# train file does not prevent the test file from being removed.
for csv_path in ('./data/bbc_train.csv', './data/bbc_test.csv'):
    try:
        os.remove(csv_path)
    except FileNotFoundError:
        pass

src = './data/all_data/'
for category in categories:
    dst = './extracted_data/bbc/' + category + '/'

    # One pass is enough: training and test images live in the same folder,
    # so a second walk over it would find nothing left to move.
    for root, subpath, files in os.walk(src):
        for file_name in sorted(files):
            if file_name.startswith(category):
                shutil.move(os.path.join(root, file_name), dst)

## Scrape from www.thekitchn.com 

In [None]:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Chrome options handed to webdriver.Chrome() in get_recipe_links().
chrome_options = Options()
# NOTE(review): presumably meant to make the browser report a desktop Chrome
# user agent; Chrome's own switch is spelled `--user-agent=...` - confirm that
# this spelling is actually honoured.
chrome_options.add_argument('User-Agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36')

### get urls of search results

In [None]:
# get urls of recipes

def category_information(name, pages):
    """Bundle one category's search keyword and page count into a dict.

    Args:
        name: Category keyword (e.g. 'burger'); stored under 'name'.
        pages: Number of search-result pages; stored under 'pages'.

    Returns:
        A dict with exactly the keys 'name' and 'pages'.
    """
    return {'name': name, 'pages': pages}


def define_required_data():
    """Return the category dicts that drive the scrape.

    Each entry is built by category_information() from a category keyword
    and the hard-coded number of search-result pages recorded for it.
    """
    categories = ['burger', 'salad', 'cake', 'steak', 'rice', 'sandwich', 'pizza', 'cookies', 'soup', 'pasta', 'pie', 'bread']
    pages = [2, 23, 50, 8, 25, 7, 6, 11, 12, 16, 13, 12]
    # Reference values that were recorded next to the page counts but are not
    # used (presumably the number of search hits per category - TODO confirm):
    # 35, 452, 1687, 151, 500, 136, 113, 213, 240, 320, 260, 240

    return [
        category_information(keyword, page_count)
        for keyword, page_count in zip(categories, pages)
    ]


def get_search_urls(dic):
    """Build the paginated thekitchn.com recipe-search URLs for one category.

    Args:
        dic: Category dict with the keys 'name' (search keyword) and
            'pages' (number of result pages).

    Returns:
        One URL per page, for pages 1 through dic['pages'] inclusive.
    """
    prefix = 'https://www.thekitchn.com/search?filter=recipes&page='
    return [
        prefix + str(page) + '&q=' + dic['name']
        for page in range(1, dic['pages'] + 1)
    ]


### get urls of recipes

In [None]:
def get_recipe_links(search_results, max_retries=None):
    """Collect the recipe links of one category from its search-result pages.

    Args:
        search_results: URLs of the search-result pages of one category.
        max_retries: Maximum number of extra attempts for a page that yields
            no new links. None (the default) keeps retrying without limit.

    Returns:
        List of recipe URLs gathered by parse_search_results().
    """
    # initialize selenium instance, wait for a while to load the content
    driver = webdriver.Chrome(executable_path='./chromedriver', options=chrome_options)

    recipe_links = []
    last_links = 0

    # try/finally: the browser is closed even when a page load or the
    # parsing raises, instead of leaving a Chrome process behind.
    try:
        driver.implicitly_wait(10)

        for search_result in search_results:
            recipe_links = parse_search_results(driver, search_result, recipe_links)
            # if it fails at extracting this page (e.g., robot test), then keep trying...
            retries = 0
            while last_links == len(recipe_links):
                if max_retries is not None and retries >= max_retries:
                    break
                recipe_links = parse_search_results(driver, search_result, recipe_links)
                retries += 1
                time.sleep(5)
            time.sleep(5)
            last_links = len(recipe_links)
            print(last_links)
    finally:
        driver.quit()

    return recipe_links


def parse_search_results(driver, search_result, recipe_links):
    """Load one search-result page and append its recipe links.

    Args:
        driver: Selenium driver used to load the page.
        search_result: URL of the search-result page; printed once loaded.
        recipe_links: List to extend; it is mutated in place.

    Returns:
        The same *recipe_links* list, extended by the 'data-gtm-search-url'
        value of every matching anchor.
    """
    driver.get(search_result)
    html = driver.page_source
    soup = BeautifulSoup(html)
    print(search_result)

    # attrs has to be a dict. The former {'class', '...'} was a *set*, which
    # BeautifulSoup treats as a list of alternative class values, so anchors
    # whose class is literally "class" matched as well.
    page_links = soup.find_all('a', attrs={'class': 'Teaser__headline Teaser__link'})
    for anchor in page_links:
        recipe_links.append(anchor['data-gtm-search-url'])
    return recipe_links


### scrape information from recipe urls

In [None]:
def extrct_structured_information(recipe_links):
    """Scrape every recipe link of one category and keep the usable recipes.

    A recipe is skipped when scraping returned an empty dict, when no
    calorie value is available (missing or 'NaN'), or when its image is
    missing or a gif.

    get_structured_information() stores the nutrients as a nested dict under
    'nutrients', so 'Calories ' is looked up there; indexing the recipe dict
    directly with 'Calories ' raised KeyError. A top-level 'Calories ' entry
    is still honoured if present.

    Args:
        recipe_links: Recipe links of a specific category.

    Returns:
        List of recipe dicts (title, link, image, ingredients, nutrients,
        serving).
    """
    structured_information = []

    for recipe_link in recipe_links:
        temp = get_structured_information(recipe_link)
        if not temp:
            continue

        nutrients = temp.get('nutrients') or {}
        calories = temp.get('Calories ', nutrients.get('Calories ', 'NaN'))
        image = temp.get('image') or ''
        if calories == 'NaN' or not image or image.endswith('gif'):
            continue

        structured_information.append(temp)

    return structured_information
    
    
def get_structured_information(recipe_link):
    """Scrape one recipe page into a dict.

    Args:
        recipe_link: Recipe URL; it is printed before scraping.

    Returns:
        Dict with the keys 'title', 'link', 'image', 'ingredients',
        'nutrients' and 'serving'. When anything goes wrong an empty dict is
        returned (never a half-filled one), so callers can detect the
        failure with a plain emptiness check.
    """
    print(recipe_link)
    try:
        scraper = scrape_me(recipe_link)
        # structured_information_in_recipe reads `scraper.url` as a plain
        # attribute while this function used to call it; accept both forms
        # so that a string attribute no longer fails with TypeError.
        link = scraper.url
        if callable(link):
            link = link()
        return {
            'title': scraper.title(),
            'link': link,
            'image': scraper.image(),
            'ingredients': scraper.ingredients(),
            'nutrients': get_nutrients(scraper),
            'serving': get_serving(scraper),
        }
    except Exception:
        # Best effort: one broken page must not abort the whole crawl.
        # KeyboardInterrupt and SystemExit are deliberately not caught.
        print('something went wrong...↑')
        return {}


def get_nutrients(scraper):
    """Read the nutrient table of a recipe page into a dict.

    Args:
        scraper: Object whose soup() call returns a sequence; its first
            element is searched for the nutrient name and quantity spans.

    Returns:
        Dict mapping the text of each nutrient-name span to the text of the
        quantity span at the same position.
    """
    # scrape nutrients from soup
    page = scraper.soup()[0]

    name_tags = page.find_all('span', attrs={'class':'jsx-1041931414 NutritionalGuide__nutrient-name'})
    value_tags = page.find_all('span', attrs={'class':'jsx-1041931414 NutritionalGuide__nutrient-quantity'})

    # Names and quantities are paired up by position.
    return {
        name_tag.text: value_tags[position].text
        for position, name_tag in enumerate(name_tags)
    }
    

def get_serving(scraper):
    """Return the number of servings stated on a recipe page.

    Args:
        scraper: Object whose soup() call returns a sequence; its first
            element is searched for the 'Recipe__yield' paragraph.

    Returns:
        The first integer found in the yield paragraph, or None when the
        paragraph is missing or contains no number.
    """
    information = scraper.soup()[0]
    try:
        yield_serving = information.find_all('p', attrs={'class':'jsx-2401602051 Recipe__yield'})[0].text
        # Raw string: '\d' in a normal string literal is an invalid escape.
        serving = int(re.findall(r'\d+', yield_serving)[0])
    except (IndexError, AttributeError, TypeError):
        # No yield paragraph, no digits in it, or no usable text.
        serving = None
    return serving


def download_images(path, structured_information):
    """Download the image of every recipe into *path* as '<index>.jpg'.

    Args:
        path: Target folder, including the trailing slash.
        structured_information: Recipe dicts; each 'image' entry is fetched
            with requests.get(). The index is the recipe's list position.
    """
    for position, recipe in enumerate(structured_information):
        response = requests.get(recipe['image'])
        target = path + str(position)+'.jpg'
        with open(target, 'wb') as image_file:
            image_file.write(response.content)
        

In [None]:
categories = define_required_data()

image_root = './extracted_data/thekitchen/'
csv_root = './csv_files/thekitchen/'
# makedirs creates missing parent folders as well and tolerates existing
# ones; the CSV folder has to exist before to_csv() can write into it.
os.makedirs(csv_root, exist_ok=True)

for category in categories:
    name = category['name']
    search_results = get_search_urls(category)
    recipe_links = get_recipe_links(search_results)
    structured_information = extrct_structured_information(recipe_links)

    path = image_root + name + '/'
    os.makedirs(path, exist_ok=True)

    df = pd.DataFrame(structured_information)
    df.to_csv(csv_root + name + '.csv', index=False)

    # download images from extracted image links
    download_images(path, structured_information)

### Rename files to kitchen_class+index.jpg (e.g., kitchen_burger0.jpg)
Add the file name and the category as two new columns (`file_name`, `category`) to each category's CSV file.

In [None]:
# sort by 0,1,2,3..., instead of 1, 10, 11... 
def sort_key(s):
    """Return the first run of digits in *s* as an int, or -1 if there is none.

    Used as a sort key so that file names order numerically
    (0, 1, 2, ..., 10) rather than lexicographically (1, 10, 11, ...).
    Names without digits, such as '.DS_Store', sort first.
    """
    try:
        # Raw string: '\d' in a normal string literal is an invalid escape.
        return int(re.findall(r'\d+', s)[0])
    except (IndexError, TypeError):
        # No digits in the name (or not a string at all).
        return -1

In [None]:
categories = ['bread', 'burger', 'cake', 'cookies', 'pasta', 'pie',
              'pizza', 'rice', 'salad', 'sandwich', 'soup', 'steak']

# rename images from 0.jpg to kitchen_burger0.jpg
for category in categories:
    path = './extracted_data/thekitchen/' + category + '/'
    prefix = 'kitchen_' + category
    for name in os.listdir(path):
        # Leave hidden files (e.g. .DS_Store) alone and skip files that
        # already carry the prefix, so re-running this cell does not prefix
        # the same file twice.
        if name.startswith('.') or name.startswith(prefix):
            continue
        os.rename(path + name, path + prefix + name)


# add the file_name and category columns to the csv files
for category in categories:
    print(category)
    # Drop '.DS_Store' entries (renamed or not) wherever they appear; the
    # positional check on names[0] and names[1] raised IndexError for folders
    # holding fewer than two entries.
    names = [name for name in os.listdir('./extracted_data/thekitchen/' + category + '/')
             if not name.endswith('DS_Store')]
    names.sort(key=sort_key)

    csv_file = pd.read_csv('./csv_files/thekitchen/' + category + '.csv')
    # Assigning a list of the wrong length raises ValueError, which flags a
    # mismatch between the CSV rows and the image files.
    csv_file['file_name'] = names
    csv_file['category'] = [category] * len(csv_file)
    csv_file.to_csv('./csv_files/thekitchen/' + category + '.csv', index=False)

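### Split each category 85% / 15% into training and test sets
The first 85% of each category's rows go to the training CSV and the last 15% to the test CSV, so that each class is included in the test set; the images themselves are all moved to `./data/all_data/`. Add "source" as a new column.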

In [None]:
categories = ['bread', 'burger', 'cake', 'cookies', 'pasta', 'pie',
              'pizza', 'rice', 'salad', 'sandwich', 'soup', 'steak']

csv_root = './csv_files/thekitchen/'
all_data_dir = './data/all_data/'
# shutil.move() needs an existing folder in order to move files *into* it.
os.makedirs(all_data_dir, exist_ok=True)

train_parts = []
test_parts = []

for category in categories:
    print(category)
    for root, subpath, files in os.walk(csv_root):
        for csv_name in sorted(files):
            if not csv_name.endswith(category + ".csv"):
                continue
            data = pd.read_csv(os.path.join(root, csv_name))

            # The last 15% of the rows form the test split. The boundary is
            # computed explicitly because slicing with data[:-0] yields an
            # empty training split when the rounded test size is 0.
            test_size = round(len(data) * 0.15)
            boundary = len(data) - test_size
            train_parts.append(data[:boundary])
            test_parts.append(data[boundary:])

            # Images are named kitchen_<category><row index>; training and
            # test images all end up in the same folder.
            for index in range(len(data)):
                src_path = './extracted_data/thekitchen/' + category + '/kitchen_' + category + str(index)
                try:
                    shutil.move(src_path + '.jpg', all_data_dir)
                except OSError:
                    # Not available as .jpg: try .jp2, otherwise skip it.
                    if os.path.exists(src_path + '.jp2'):
                        shutil.move(src_path + '.jp2', all_data_dir)

# DataFrame.append was removed in pandas 2.0; concat works on every version.
train_data = pd.concat(train_parts, ignore_index=True) if train_parts else pd.DataFrame()
test_data = pd.concat(test_parts, ignore_index=True) if test_parts else pd.DataFrame()

train_data['source'] = ['kitchen'] * len(train_data)
test_data['source'] = ['kitchen'] * len(test_data)

train_data.to_csv('./data/kitchen_train.csv', index=False)
test_data.to_csv('./data/kitchen_test.csv', index=False)

### Integrate images from train/test folder back to original folders

In [None]:
categories = ['bread', 'burger', 'cake', 'cookies', 'pasta', 'pie',
              'pizza', 'rice', 'salad', 'sandwich', 'soup', 'steak']

# Remove the split CSVs. Each file is handled on its own so that a missing
# train file does not prevent the test file from being removed.
for csv_path in ('./data/kitchen_train.csv', './data/kitchen_test.csv'):
    try:
        os.remove(csv_path)
    except FileNotFoundError:
        pass

src = './data/all_data/'
for category in categories:
    dst = './extracted_data/thekitchen/' + category + '/'

    # One pass is enough: training and test images live in the same folder,
    # so a second walk over it would find nothing left to move.
    for root, subpath, files in os.walk(src):
        for file_name in sorted(files):
            if file_name.startswith('kitchen_' + category):
                shutil.move(os.path.join(root, file_name), dst)