In [12]:
import requests

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import pandas as pd

import time
import re

In [2]:
base_url = 'https://www.sephora.com'
crawl_delay=6


DRIVER_PATH = '../../chromedriver_mac64/chromedriver'

In [83]:
def get_brand_name(soup):
    return soup.find("a", attrs={'data-at':"brand_name"}).text

def get_product_name(soup):
    return soup.find("span", attrs={'data-at':"product_name"}).text

def get_num_loves(soup):
    """
    """
    return soup.find("div", attrs={"data-comp": "LovesCount "}).span.text

def get_ingredients(soup):
    """
    """
    # inside of div in id='ingredients'
    return soup.find_all("div", {"id": "ingredients"})[0].text


def get_rating_data(soup):
    """
        difficult to get full histogram data
        can get summary stats
        
    """
    rr_container = soup.find("a", {"href": "#ratings-reviews-container"})
    star_rating = rr_container.find("span", {"data-comp":"StarRating "})['aria-label']
    num_reviews = rr_container.text
    return star_rating, num_reviews
    
def get_product_buttons(soup):
    """
    """
    size_options = []
    for prod_option in soup.find_all("div", attrs={"data-comp":"SwatchGroup "}):
        selected = {}
        selected["name"] = prod_option.find("p").text
        selected["size"] = prod_option.button()[0].text
        selected["class"] = prod_option.find("button")['class'][0]
        size_options.append(selected)
    return size_options

    
# def get_description(soup):
#     """
#     """
#     return soup.find('div', attrs={'class':'eanm77i0'})
        

def get_sku(soup):
    # need to add this in
    return soup.find('p', attrs={'data-at':'item-sku'}).text    


In [91]:
options = Options()
options.headless = True
options.add_argument("--window-size=1920,1200")
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
options.add_argument('user-agent={0}'.format(user_agent))
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
url = "https://www.sephora.com/ca/en/brands-list"
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()

# collecting brand names and links from brand list page 
brand_data = []
for brand_link in soup.findAll('a', attrs={"data-at": "brand_link"}):
    brand = {}
    brand['name'] = brand_link.span.text
    brand['link'] = brand_link.get('href') 
    brand_data.append(brand)

test_counter = 0
# for each brand, grab products on brand page, save products as list of links 
for brand in brand_data:
    url = base_url+brand['link']
    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()
    product_tiles = soup.find_all(attrs={"data-comp":"ProductTile "})
    brand['products'] = [product.get("href").split(" ")[0] for product in product_tiles]
    time.sleep(crawl_delay)
    test_counter +=1
    if test_counter > 3:
        break


In [105]:
for brand in brand_data: 
    product_data = []    
    for url in brand["products"]:
        time.sleep(crawl_delay)

        # url = 'https://www.sephora.com/ca/en/product/charlotte-tilbury-airbrush-flawless-setting-spray-P461147?skuId=2368439&icid2=products%20grid:p461147:product'
        product = {}
        product["url"] = url

        # get class names of buttons and grab prices with selenium 
        driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        option_buttons = get_product_buttons(soup)
        for button_info in option_buttons:
            button_element = driver.find_element(By.XPATH, "//button[@class='"+button_info['class']+"']")
            button_element.click()
            button_info['price'] = driver.find_element(By.XPATH, "//b[@class='css-0']").text

        driver.quit()
        print(product)
        if soup.find_all('h1') is not None and soup.find('h1').text != 'Sorry, this product is not available.':
            product["product_name"] = get_product_name(soup)
            product["brand_name"] = get_brand_name(soup)
            product["options"] = option_buttons
            # product["description"] = get_description(soup)
            product["rating"], product["n_reviews"] = get_rating_data(soup)
            product["ingredients"] = get_ingredients(soup)
            product["n_loves"] = get_num_loves(soup)
            product_data.append(product)
            brand['product_data'] = product_data

{'url': 'https://www.sephora.com/ca/en/product/peonia-nobile-P413669?skuId=2219483&icid2=products'}
{'url': 'https://www.sephora.com/ca/en/product/rosa-nobile-P388670?skuId=2044774&icid2=products'}
{'url': 'https://www.sephora.com/ca/en/product/acqua-di-parma-sakura-P456549?skuId=2339703&icid2=products'}
{'url': 'https://www.sephora.com/ca/en/product/blu-mediterraneo-fico-di-amalfi-P307801?skuId=2223428&icid2=products'}
{'url': 'https://www.sephora.com/ca/en/product/acqua-di-parma-yuzu-P456550?skuId=2339687&icid2=products'}
{'url': 'https://www.sephora.com/ca/en/product/adwoa-beauty-melon-berry-leave-in-conditioner-P481372?skuId=2538536&icid2=products'}
{'url': 'https://www.sephora.com/ca/en/product/adwoa-beauty-melonberry-vitamin-c-frizz-fighting-hair-gel-P505476?skuId=2679439&icid2=products'}
{'url': 'https://www.sephora.com/ca/en/product/adwoa-beauty-blue-tansy-leave-in-conditioning-styler-P474808?skuId=2466761&icid2=products'}
{'url': 'https://www.sephora.com/ca/en/product/adwoa-be

KeyError: 'products'

In [106]:
brand

{'name': 'ALTERNA Haircare', 'link': '/ca/en/brand/alterna'}

In [107]:
brand_data

[{'name': 'Acqua di Parma',
  'link': '/ca/en/brand/acqua-di-parma',
  'products': ['https://www.sephora.com/ca/en/product/peonia-nobile-P413669?skuId=2219483&icid2=products',
   'https://www.sephora.com/ca/en/product/rosa-nobile-P388670?skuId=2044774&icid2=products',
   'https://www.sephora.com/ca/en/product/acqua-di-parma-sakura-P456549?skuId=2339703&icid2=products',
   'https://www.sephora.com/ca/en/product/blu-mediterraneo-fico-di-amalfi-P307801?skuId=2223428&icid2=products',
   'https://www.sephora.com/ca/en/product/acqua-di-parma-yuzu-P456550?skuId=2339687&icid2=products'],
  'product_data': [{'url': 'https://www.sephora.com/ca/en/product/peonia-nobile-P413669?skuId=2219483&icid2=products',
    'product_name': 'Peonia Nobile',
    'brand_name': 'Acqua di Parma',
    'options': [{'name': 'Standard size',
      'size': '0.70oz/20mL Eau de Parfum Spray',
      'class': 'css-1sn75vo',
      'price': '$117.00'}],
    'rating': '4 stars',
    'n_reviews': '26',
    'ingredients': 'Alco