In [None]:
import json
import re
import time
from typing import List, Tuple, Dict
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


In [None]:
# --- Crawl settings -------------------------------------------------------
base_url = 'https://www.sephora.com'
crawl_delay = 6  # seconds slept between page requests (passed to time.sleep below)
DRIVER_PATH = '../../chromedriver_mac64/chromedriver'
data_dir = "data/"

# User agent string sent by the headless browser.
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'

# --- Chrome options shared by every webdriver instance --------------------
options = Options()
options.headless = True
for _chrome_flag in ("--window-size=1920,1200", '--disable-gpu', '--no-sandbox'):
    options.add_argument(_chrome_flag)
# Drop the "enable-automation" switch and the automation extension.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('user-agent=' + user_agent)

In [None]:
def get_sku(soup) -> str:
    """
    Read the SKU label from a parsed product page.

    Looks up the <p data-at="item-sku"> element and returns its text
    unchanged (formatted like 'Item #######').
    """
    sku_tag = soup.find('p', attrs={'data-at': 'item-sku'})
    return sku_tag.text


def get_breadcrumb_categories(soup) -> List:
    """
    List the breadcrumb categories shown in the header of a product page.

    Returns the text of every <li> inside the breadcrumb <nav>, in page order,
        for ex. ['Skincare','Moisturizer']
    """
    breadcrumb_nav = soup.find('nav', attrs={'data-comp': "ProductBreadCrumbs BreadCrumbs BreadCrumbs "})
    categories = []
    for crumb in breadcrumb_nav.find_all('li'):
        categories.append(crumb.text)
    return categories


def get_brand_name(soup) -> str:
    """
    Read the brand name from a parsed product page.

    Returns the text of the <a data-at="brand_name"> element.
    """
    brand_link = soup.find("a", attrs={'data-at': "brand_name"})
    return brand_link.text


def get_product_name(soup) -> str:
    """
    Read the product name exactly as it is written on the product page.

    Returns the text of the <span data-at="product_name"> element.
    """
    name_tag = soup.find("span", attrs={'data-at': "product_name"})
    return name_tag.text


def get_num_loves(soup) -> str:
    """
    Read the number of 'love' votes for the product.

    Returns the text of the first <span> inside the
    <div data-comp="LovesCount "> element, as displayed on the page.
        loves seem to be used to track product frequently repurchased
    """
    loves_container = soup.find("div", attrs={"data-comp": "LovesCount "})
    loves_label = loves_container.span
    return loves_label.text


def get_ingredients(soup) -> str:
    """
    Read the full ingredient list as a single blob of text.

    Returns the text of the first <div id="ingredients"> on the page.
    Raises IndexError when the page has no such element.
    """
    ingredient_sections = soup.find_all("div", {"id": "ingredients"})
    first_section = ingredient_sections[0]
    return first_section.text


def get_rating_data(soup) -> Tuple[str, str]:
    """
    Read the star rating and the review count from a product page.

    Both values come from the link that points at "#ratings-reviews-container":
    the rating is the 'aria-label' of its <span data-comp="StarRating ">,
    and the review count is the link's own text.

    The page also shows a 1-5 bar histogram of votes, but that data is hard
    to retrieve and is not collected here (might be figured out later).

    Returns (star_rating, num_reviews) as displayed, both strings.
    """
    reviews_link = soup.find("a", {"href": "#ratings-reviews-container"})
    stars_tag = reviews_link.find("span", {"data-comp": "StarRating "})
    return stars_tag['aria-label'], reviews_link.text
    
    
def get_product_buttons(soup) -> List[Dict[str, str]]:
    """
    Collect the selectable product options shown on a product page.

    Products with multiple size options will have different volume - price options,
    one <div data-comp="SwatchGroup "> per option. For each group this records:
        name  - text of the group's first <p>
        size  - text of the first tag nested inside the group's first <button>
                (the button's own text when it has no nested tags)
        class - first CSS class of that button
                Class is used to click buttons with selenium to fetch price for each option

    Groups without a <p>, without a <button>, or whose button has no class are
    skipped: they cannot be labelled or clicked, and one malformed group should
    not abort scraping of the whole product.

    Returns a list with one dict per usable option (empty list if there are none).
    """
    size_options = []
    for swatch_group in soup.find_all("div", attrs={"data-comp": "SwatchGroup "}):
        label = swatch_group.find("p")
        button = swatch_group.find("button")
        if label is None or button is None:
            continue

        css_classes = button.get("class") or []
        if not css_classes:
            continue

        # find_all() with no filter yields every tag nested inside the button.
        nested_tags = button.find_all()
        size_text = nested_tags[0].text if nested_tags else button.text

        size_options.append({
            "name": label.text,
            "size": size_text,
            "class": css_classes[0],
        })
    return size_options


def get_all_brands(soup):
    """
    Collect brand names and links from the parsed brand list page.

    Every <a data-at="brand_link"> becomes one dict with:
        name - text of the first <span> inside the link
        link - the link's href attribute (None if the attribute is absent)

    Returns the dicts as a list, in page order.
    """
    return [
        {'name': anchor.span.text, 'link': anchor.get('href')}
        for anchor in soup.find_all('a', attrs={"data-at": "brand_link"})
    ]


def get_brand_products():
    """
    Placeholder: not implemented yet.

    Takes no arguments and always returns None.
    """
    return None

In [None]:
# Collect brand names and links from the brand list page.
url = "https://www.sephora.com/ca/en/brands-list"
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
try:
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always close the browser, even when loading the page fails, so that no
    # chromedriver process is left running behind the notebook.
    driver.quit()
brand_data = get_all_brands(soup)
# Save the brand list (one row per brand: name, link).
pd.DataFrame(brand_data).to_csv(data_dir+'brand_list.csv', index=False)


In [None]:
# For each brand: open the brand page, scroll down step by step so the lazily
# loaded product tiles get rendered, and store the de-duplicated product links
# under brand['products']. brand_data is re-saved to JSON after every brand so
# an interrupted crawl keeps what it has collected so far.
SCROLL_PAUSE_TIME = 0.5  # seconds to wait after each scroll so new tiles can load
SCROLL_STEP = 1000       # pixels added to the scroll target on every iteration
LAZY_TILE_XPATH = '//a[@data-comp="LazyLoad ProductTile "]'
SHOW_MORE_XPATH = "//button[@class='css-bk5oor eanm77i0']"

test_counter = 0
for brand in brand_data:
    print(brand['name'])
    url = base_url+brand['link']
    product_urls = []
    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # The product count is only printed for a manual sanity check; a page
        # without it should not abort the crawl.
        expected_count = soup.find("p", attrs={'data-at':'number_of_products'})
        if expected_count is not None:
            print("Expecting ", expected_count.getText())
        else:
            print("Expecting ", "unknown number of products")

        # Tiles already present in the initial HTML.
        products_on_load = soup.find_all('a', attrs={'data-comp':"ProductTile "}, href=True)
        product_urls.extend(prod['href'].split(" ")[0] for prod in products_on_load)

        # Scrolling approach based on
        # https://stackoverflow.com/questions/20986631/how-can-i-scroll-a-web-page-using-selenium-webdriver-in-python
        y = 0
        while True:
            lazy_products = driver.find_elements(By.XPATH, LAZY_TILE_XPATH)
            product_urls.extend(prod.get_attribute('href') for prod in lazy_products)
            # Scroll down
            driver.execute_script("window.scrollTo(0, "+str(y)+");")
            y += SCROLL_STEP
            # Wait to load page
            time.sleep(SCROLL_PAUSE_TIME)
            # The scroll target has moved past the bottom of the document:
            # either a 'show more' button loads another batch, or we are done.
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height < y:
                try:
                    driver.find_element(By.XPATH, SHOW_MORE_XPATH).click()
                except WebDriverException:
                    # No clickable 'show more' button: end of the listing.
                    # Only Selenium errors are treated as "end of page", so a
                    # kernel interrupt is no longer silently swallowed here.
                    final_tiles = driver.find_elements(By.XPATH, LAZY_TILE_XPATH)
                    product_urls.extend(prod.get_attribute('href') for prod in final_tiles)
                    break
    finally:
        # Close the browser even if scraping this brand failed.
        driver.quit()

    # Drop missing hrefs and resolve relative links against base_url so the
    # same product is not stored twice and every entry can be opened directly.
    brand['products'] = list({urljoin(base_url, href) for href in product_urls if href})
    print("Retrieved ", len(brand['products']))
    time.sleep(crawl_delay)
    # Serializing json
    with open(data_dir+"brand_data.json", "w") as outfile:
        json.dump(brand_data, outfile, indent=4)
    if test_counter%16==0:
        print(test_counter)
    test_counter+=1

In [None]:
# NOTE(review): presumably brands whose pages caused problems while scraping - TODO confirm
issue_brands = [
    'Tom Ford',
    'tarte',
    'Moroccanoil',
    'Dior',
    'Anastasia Beverly Hills',
]

In [None]:
# Visit every product URL collected for each brand and scrape its details.
# A fresh Chrome driver is started and quit for every single product page.
# value kits will need to be separate or excluded...

for brand in brand_data: 
    # Records scraped for this brand; attached to the brand as 'product_data' below.
    product_data = []    
    for url in brand["products"]:
        # Pause between product pages.
        time.sleep(crawl_delay)

        # Example product URL, handy for testing a single page:
        # url = 'https://www.sephora.com/ca/en/product/charlotte-tilbury-airbrush-flawless-setting-spray-P461147?skuId=2368439&icid2=products%20grid:p461147:product'
        product = {}
        product["url"] = url

        # get class names of buttons and grab prices with selenium 
        # NOTE(review): the driver is not closed if anything between here and
        # driver.quit() raises - consider try/finally.
        driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Click each size/option button and record the price displayed afterwards.
        # NOTE(review): the XPath compares the whole class attribute with the
        # FIRST class returned by get_product_buttons, so a button carrying
        # several classes would presumably not match - verify against a live page.
        option_buttons = get_product_buttons(soup)
        for button_info in option_buttons:
            button_element = driver.find_element(By.XPATH, "//button[@class='"+button_info['class']+"']")
            button_element.click()
            button_info['price'] = driver.find_element(By.XPATH, "//b[@class='css-0']").text

        driver.quit()
        # At this point product only holds its URL.
        print(product)
        # Skip pages that show the "not available" heading.
        # NOTE(review): find_all() returns a list, so the `is not None` check looks
        # always true; a page without any <h1> would make soup.find('h1').text raise.
        if soup.find_all('h1') is not None and soup.find('h1').text != 'Sorry, this product is not available.':
            product["product_name"] = get_product_name(soup)
            product["brand_name"] = get_brand_name(soup)
            product["options"] = option_buttons
            # product["description"] = get_description(soup)
            product["rating"], product["n_reviews"] = get_rating_data(soup)
            product["ingredients"] = get_ingredients(soup)
            product["n_loves"] = get_num_loves(soup)
            product["categories"] = get_breadcrumb_categories(soup)
            product_data.append(product)
            # Re-attached on every product; always the same list object.
            brand['product_data'] = product_data
            # NOTE(review): the SKU is stored on the brand, so every product
            # overwrites the previous value - probably meant product['sku']; confirm.
            brand['sku'] = get_sku(soup)