In [175]:
import requests
import time
import re
import pandas as pd
from bs4 import BeautifulSoup
import json
from typing import List, Tuple, Dict

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options


In [176]:
# --- Crawl configuration ---
base_url = 'https://www.sephora.com'
crawl_delay = 5  # seconds slept between page loads
DRIVER_PATH = '../../chromedriver_mac64/chromedriver'
data_dir = "data/"

# --- Chrome options shared by every webdriver.Chrome(...) call in this notebook ---
options = Options()
# Pass the flag explicitly instead of assigning `options.headless = True`:
# the `headless` property was deprecated and later removed in Selenium 4, after
# which the assignment would silently do nothing. The explicit argument is
# what the property setter added on older versions as well.
options.add_argument("--headless")
options.add_argument("--window-size=1920,1200")
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
# Switch off Chrome's automation switch/extension.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
# Send a fixed desktop-Chrome user agent string.
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
options.add_argument('user-agent={0}'.format(user_agent))

In [184]:
def get_sku(soup) -> str:
    """
    Returns sku code from product page in format 'Item #######',
    or None when the page has no sku element.

    soup: parsed product page (anything exposing a BeautifulSoup-style `find`).
    """
    # NOTE(review): this looks up data-at='item-sku' (hyphen) while
    # get_product_buttons looks up data-at='item_sku' (underscore) -- confirm
    # which spelling the live page uses.
    sku_tag = soup.find('p', attrs={'data-at':'item-sku'})
    if sku_tag is None:
        # Missing element: return None like the other getters instead of
        # raising AttributeError on `None.text`.
        return None
    return sku_tag.text


def get_breadcrumb_categories(soup) -> List:
    """
    Returns list of categorical values used to describe product in header of product page
        for ex. ['Skincare','Moisturizer']
    Returns None when the breadcrumb <nav> is not present on the page.
    """
    try:
        breadcrumb_nav = soup.find('nav', attrs={'data-comp':"ProductBreadCrumbs BreadCrumbs BreadCrumbs "})
        return [item.text for item in breadcrumb_nav.find_all('li')]
    except AttributeError:
        # find() returned None (no breadcrumb nav). Only this case is treated as
        # "missing"; any other error -- including Ctrl-C -- is allowed to surface.
        return None

def get_brand_name(soup) -> str:
    """
    Returns product brand name from product page,
    or None when the brand link is not present.
    """
    try:
        return soup.find("a", attrs={'data-at':"brand_name"}).text
    except AttributeError:
        # find() returned None: no brand link on this page. Other errors
        # (and Ctrl-C) are deliberately not swallowed.
        return None

def get_product_name(soup) -> str:
    """
    Returns product name as written on product page,
    or None when the name element is not present.
    """
    try:
        return soup.find("span", attrs={'data-at':"product_name"}).text
    except AttributeError:
        # find() returned None: no product-name span on this page. Other errors
        # (and Ctrl-C) are deliberately not swallowed.
        return None


def get_num_loves(soup) -> str:
    """
    Returns number of 'love' votes for product
        loves seem to be used to track product frequently repurchased
    Returns None when the loves counter (or its inner <span>) is not on the page.
    """
    try:
        return soup.find("div", attrs={"data-comp": "LovesCount "}).span.text
    except AttributeError:
        # Either the div or its span is missing. Return None like the other
        # getters so one page without a counter does not abort the crawl.
        return None


def get_ingredients(soup) -> str:
    """
    Looks up the <div id="ingredients"> section of the product page and
    returns its text as a single blob; returns None if the section is absent.
    """
    section = soup.find("div", {"id": "ingredients"})
    return None if section is None else section.text


def get_rating_data(soup) -> Tuple[str, str]:
    """
        Sephora product page displays a 1-5 bar histogram of votes but it is difficult to retrieve the histogram data
        ******might be able to figure this out later
        Returns star rating and number of reviews as tuple

        The star rating is the raw `style` attribute of the star span and the
        review count is the full text of the ratings link; neither is parsed here.
        Returns (None, None) when any of those elements/attributes is missing.
    """
    try:
        rr_container = soup.find("a", {"href": "#ratings-reviews-container"})
        star_rating = rr_container.find('span', attrs={'data-at':'star_rating_style'})['style']
        num_reviews = rr_container.text
        return star_rating, num_reviews
    except (AttributeError, TypeError, KeyError):
        # AttributeError: ratings link not found (None.find / None.text)
        # TypeError:      star span not found (None['style'])
        # KeyError:       star span has no style attribute
        # Anything else (including Ctrl-C) is not swallowed.
        return None, None
    
    
def get_product_buttons(driver, click_delay=0.5) -> List[Dict]:
    """
    Clicks every swatch button on the product page currently loaded in `driver`
    and records what the page shows after each click.

    driver:      selenium webdriver with the product page already loaded
    click_delay: seconds to sleep before each click

    Returns a list with one dict per clicked button, with keys
        'swatch_group', 'size', 'name', 'price', 'sku'
    Any value whose element cannot be found is stored as None.
    If a button cannot be clicked, a message is printed and the options
    collected so far are returned immediately (remaining buttons are skipped).
    """
    def _text_or_none(root, by, value):
        # Text of the first element matching (by, value) under `root`, or None
        # when the lookup fails (e.g. the element is not on this page).
        try:
            return root.find_element(by, value).text
        except Exception:
            return None

    product_options = []
    for swatch_group in driver.find_elements(By.XPATH, "//div[@data-comp='SwatchGroup ']"):
        for button in swatch_group.find_elements(By.TAG_NAME, "button"):
            time.sleep(click_delay)
            try:
                button.click()
            except Exception:
                print('reached nonclickable web element')
                return product_options

            # Read the labels only after the click, so they describe the
            # option that was just selected.
            product_options.append({
                'swatch_group': _text_or_none(swatch_group, By.TAG_NAME, "p"),
                'size': _text_or_none(driver, By.XPATH, "//span[@data-at='sku_size_label']"),
                'name': _text_or_none(driver, By.XPATH, "//div[@data-at='sku_name_label']"),
                'price': _text_or_none(driver, By.XPATH, "//p[@data-comp='Price ']//b"),
                'sku': _text_or_none(driver, By.XPATH, "//p[@data-at='item_sku']"),
            })
    return product_options



In [None]:
# For each brand: open the brand page, scroll through it so lazily loaded
# product tiles render, and save the collected product links on the brand dict.
#
# NOTE: `brand_data` is not defined in this cell. It has to be in memory
# already (another cell loads it from data_dir + "brand_data.json").
SCROLL_PAUSE_TIME = 0.5  # seconds to wait after each scroll step
SCROLL_STEP = 1000       # pixels added to the scroll target per step
LAZY_TILE_XPATH = '//a[@data-comp="LazyLoad ProductTile "]'
SHOW_MORE_XPATH = "//button[@class='css-bk5oor eanm77i0']"

test_counter = 0
for brand in brand_data:
    print(brand['name'])
    url = base_url+brand['link']
    product_urls = []
    # NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of
    # `service=Service(DRIVER_PATH)`; left as-is to match the rest of the notebook.
    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Product count advertised by the page (printed for a manual sanity check).
        expected_tag = soup.find("p", attrs={'data-at':'number_of_products'})
        print("Expecting ", expected_tag.getText() if expected_tag is not None else None)

        # Tiles present in the initial HTML; keep only the part of the href
        # before the first space.
        products_on_load = soup.find_all('a', attrs={'data-comp':"ProductTile "}, href=True)
        product_urls.extend([prod['href'].split(" ")[0] for prod in products_on_load])

        # Scrolling approach based on
        # https://stackoverflow.com/questions/20986631/how-can-i-scroll-a-web-page-using-selenium-webdriver-in-python
        y = 0
        while True:
            lazy_products = driver.find_elements(By.XPATH, LAZY_TILE_XPATH)
            product_urls.extend([prod.get_attribute('href') for prod in lazy_products])
            driver.execute_script("window.scrollTo(0, "+str(y)+");")
            y += SCROLL_STEP
            time.sleep(SCROLL_PAUSE_TIME)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height < y:
                # Scrolled past the bottom: load more via the 'show more'
                # button if there is one, otherwise this brand is done.
                try:
                    driver.find_element(By.XPATH, SHOW_MORE_XPATH).click()
                except Exception:
                    # No (clickable) button: final sweep of the tiles, then stop.
                    product_urls.extend([prod.get_attribute('href') for prod in driver.find_elements(By.XPATH, LAZY_TILE_XPATH)])
                    break
    finally:
        # Always release the browser, even if the page failed to load or parse.
        driver.quit()

    # NOTE(review): set() de-duplicates on the exact string, so the same product
    # reached with different query strings is kept more than once -- confirm
    # whether links should be normalised before saving.
    brand['products'] = list(set(product_urls))
    print("Retrieved ", len(brand['products']))
    time.sleep(crawl_delay)

    # Re-write the whole brand list after every brand so progress is kept on disk.
    json_object = json.dumps(brand_data, indent=4)
    with open(data_dir+"brand_data.json", "w") as outfile:
        outfile.write(json_object)
    if test_counter%16==0:
        print(test_counter)
    test_counter+=1

In [178]:
# Scratch values for manual testing; neither list is referenced elsewhere in
# the visible notebook.
issue_brands = ['Tom Ford', 'tarte', 'Moroccanoil', 'Dior', 'Anastasia Beverly Hills']
prods = ['clinique']
# Sample product pages kept (commented out) for spot-checking the scrapers.
# Uncomment a single `url = ...` line to try that page.
# NOTE(review): each short label below appears to describe the URL on the
# line after it -- confirm. The last label has no URL following it.
# url = 'https://www.sephora.com/ca/en/product/charlotte-tilbury-airbrush-flawless-setting-spray-P461147?skuId=2368439&icid2=products%20grid:p461147:product'

# url = 'https://www.sephora.com/ca/en/product/ambient-lighting-blush-collection-P384963?skuId=1581321&icid2=products%20grid:p384963:product'
# lots of options all same size
# url = "https://www.sephora.com/ca/en/product/saie-glowy-super-skin-lightweight-hydrobounce-serum-foundation-P504907?icid2=new_ca_skugrid_ufe:p504907:product"
# ran into issue getting product name
# url = "https://www.sephora.com/ca/en/product/book-personal-travel-spray-P501954?icid2=homepage_productlist_brandnewadditions_ca_rwd_092022"
# value set works
# url = 'https://www.sephora.com/ca/en/product/laneige-midnight-to-morning-hydration-set-P504208?skuId=2639557&icid2=products%20grid:p504208:product'
# hair brush with no ingredients
# url = 'https://www.sephora.com/ca/en/product/sephora-collection-scalp-massager-P472069?skuId=2414555&icid2=products%20grid:p472069:product'
# new prod
# url = 'https://www.sephora.com/ca/en/product/gxve-by-gwen-stefani-check-my-glow-platinum-glow-highlighter-P505655?icid2=new_ca_skugrid_ufe:p505655:product'
# sale prod
# url = 'https://www.sephora.com/ca/en/product/green-microalgae-retinol-regenerating-serum-P506077?skuId=2672301&icid2=products%20grid:p506077:product'
# no reviews yet or ratings

In [179]:
# Read the saved brand list from data_dir back into memory.
with open(data_dir + 'brand_data.json') as file:
    brand_data = json.load(file)



In [192]:
# Crawl every product page of every brand (starting at index `offset`) and
# write one JSON file per brand under data/products/.
start_time = time.time()
# value kits will need to be separate or excluded...
crawl_delay = 5
offset = 227  # index into brand_data to start from

# <h1> texts that mark a page with no product to scrape.
UNAVAILABLE_HEADINGS = (
    'Sorry, this product is not available.',
    'Sorry! The page you’re looking for cannot be found.',
)


def _scrape_product(url):
    """
    Loads one product page in a fresh Chrome instance and returns a dict of
    scraped fields.

    The dict always contains 'url'. When the page has no <h1>, or its <h1> is
    one of UNAVAILABLE_HEADINGS, nothing else is added. Otherwise the dict also
    gets 'product_name', 'brand_name', 'options', 'rating', 'product_reviews',
    'ingredients', 'n_loves' and 'categories'.
    The browser is closed before returning, including when an error is raised.
    """
    product = {"url": url}
    # get class names of buttons and grab prices with selenium
    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        print(product)

        heading = soup.find('h1')
        if heading is None:
            # Nothing recognisable on the page: keep the url-only record and
            # stop here. (Previously execution fell through after `pass`,
            # re-using a stale/undefined `h1` and a driver that was already quit.)
            return product

        if heading.text not in UNAVAILABLE_HEADINGS:
            product["product_name"] = get_product_name(soup)
            product["brand_name"] = get_brand_name(soup)
            product["options"] = get_product_buttons(driver)
            product["rating"], product["product_reviews"] = get_rating_data(soup)
            product["ingredients"] = get_ingredients(soup)
            product["n_loves"] = get_num_loves(soup)
            product["categories"] = get_breadcrumb_categories(soup)
        return product
    finally:
        driver.quit()


for i, brand in enumerate(brand_data[offset:]):
    print("brand # ", i + offset, brand["name"])
    product_data = []
    for url in brand["products"]:
        time.sleep(crawl_delay)
        product_data.append(_scrape_product(url))
    # "/" is stripped from the brand name so it cannot create a sub-directory.
    fname = 'data/products/'+brand["name"].replace("/","")+".json"

    print("Saving ", fname)
    with open(fname, "w") as outfile:
        outfile.write(json.dumps(product_data, indent=4))

        # broken page https://www.sephora.com/ca/en/product/beautyblender-bronze-besties-P505630?skuId=2662559&icid2=products
end_time = time.time()

brand #  227TAN-LUXE
{'url': 'https://www.sephora.com/ca/en/product/tan-luxe-the-gradual-illuminating-gradual-tan-lotion-P469201?skuId=2444867&icid2=products%20grid:p469201:product'}
{'url': 'https://www.sephora.com/ca/en/product/tan-luxe-the-face-illuminating-self-tan-drops-light-medium-P469879?skuId=2444826&icid2=products'}
{'url': 'https://www.sephora.com/ca/en/product/tan-luxe-the-body-illuminating-self-tan-drops-light-medium-P469198?skuId=2444842&icid2=products'}
{'url': 'https://www.sephora.com/ca/en/product/tan-luxe-super-glow-hyaluronic-self-tan-serum-P469190?skuId=2444875&icid2=products%20grid:p469190:product'}
{'url': 'https://www.sephora.com/ca/en/product/tan-luxe-the-face-illuminating-self-tan-drops-medium-dark-P469880?skuId=2444834&icid2=products'}
{'url': 'https://www.sephora.com/ca/en/product/tan-luxe-the-gradual-illuminating-gradual-tan-lotion-P469201?skuId=2444867&icid2=products'}
{'url': 'https://www.sephora.com/ca/en/product/tan-luxe-super-glow-hyaluronic-self-tan-se

In [None]:
# Elapsed wall-clock time of the product crawl in seconds
# (both values are time.time() timestamps taken around the crawl loop).
end_time - start_time

In [112]:
# Load one brand's saved product records into a DataFrame for inspection.
with open('data/products/beautyblender.json') as file:
    test = pd.DataFrame(json.load(file))

