### Scraping

In [1]:
import os
import pickle
import random
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

from libs import scrape
# Filesystem location of the ChromeDriver binary (machine-specific absolute path).
chromedriver = "/Applications/chromedriver"
# Also publish the path through the process environment.
# NOTE(review): unclear whether Selenium's Python bindings read this variable — confirm it is needed.
os.environ["webdriver.chrome.driver"] = chromedriver
# Start the Chrome session; the scraping functions below use it as the global `driver`.
driver = webdriver.Chrome(chromedriver)

#### Get the links to the products

In [325]:
def get_links():
    """Collect product URLs from the listing page currently loaded in ``driver``.

    The page is scrolled to its midpoint and then to the bottom, pausing after
    each scroll, and the anchors inside ``div.SkuGrid`` are read after each
    step (presumably so items rendered on scroll are picked up — confirm).

    Returns:
        list: the ``href`` values found, de-duplicated in first-seen order.
        Anchors without an ``href`` are skipped.
    """
    def _collect(seen):
        # A dict is used as an insertion-ordered set of URLs.
        for anchor in driver.find_elements_by_css_selector('div.SkuGrid a'):
            url = anchor.get_attribute('href')
            if url and url not in seen:
                seen[url] = None

    found = {}
    time.sleep(1)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
    time.sleep(2)
    _collect(found)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    # Anchors already seen after the first scroll are not recorded a second time.
    _collect(found)
    return list(found)

In [294]:
# Listing-page URLs for the face-makeup category, pages 1 through 15.
start_urls = list(map('http://www.sephora.com/face-makeup?currentPage={}'.format, range(1, 16)))

In [None]:
# Accumulator for product page URLs; later cells append newly found links to it.
hrefs = []

In [None]:
# Visit each listing page and record every product link that is not already in hrefs.
# NOTE(review): the [2:] slice skips the first two listing pages — presumably left
# over from resuming an interrupted run; confirm before re-running from scratch.
for url in start_urls[2:]:
    driver.get(url)
    links = get_links()
    print(len(links))
    for link in links:
        if link in hrefs:
            continue
        hrefs.append(link)
        print(link)

**remove duplicates**

In [361]:
# Drop duplicate links while keeping first-seen order. list(set(...)) gives an
# arbitrary order that can differ between runs (string hashing is randomized),
# which makes the scrape order non-reproducible.
hrefs = list(dict.fromkeys(hrefs))

#### Pickling: reload the saved product links

In [6]:
import pickle
import pandas as pd
import re

In [3]:
# pickle.dump(hrefs, open('links.pkl', 'wb'))
# Reload the previously saved product links; the context manager closes the
# file handle, which the bare open(...) call never did.
# NOTE: pickle.load can execute arbitrary code — only load files you created yourself.
with open('data/links.pkl', 'rb') as links_file:
    hrefs = pickle.load(links_file)

### Getting URLs for cheek, eye, and lip

In [353]:
def add_links(base_url, max_page):
    """Walk a category listing and add unseen product links to the global ``hrefs``.

    Args:
        base_url: listing URL up to (not including) the page number; the page
            number is appended to it as text.
        max_page: last page number to visit; pages 1..max_page are loaded.

    For every page the number of links found is printed, followed by each
    link that was newly added. Nothing is returned.
    """
    for page_no in range(1, max_page + 1):
        driver.get(base_url + str(page_no))
        page_links = get_links()
        print(len(page_links))
        for href in page_links:
            if href in hrefs:
                continue
            hrefs.append(href)
            print(href)

In [None]:
# Eye makeup: collect product links from listing pages 1 through 15.
add_links('http://www.sephora.com/eye-makeup?currentPage=', 15)

In [None]:
# Cheek makeup: collect product links from listing pages 1 through 7.
add_links('http://www.sephora.com/cheek-makeup?currentPage=', 7)

In [None]:
# Lip makeup: collect product links from listing pages 1 through 9.
add_links('http://www.sephora.com/lips-makeup?currentPage=', 9)

In [360]:
# Number of product links currently held in hrefs.
len(hrefs)

836

### Scraping Product

In [929]:
# Product table: one row per scraped product page.
# 'price' is declared here because scrape_item() fills it in; without it the
# column only appears implicitly once the first scraped row is added.
df_p = pd.DataFrame(columns=['product_id', 'product_name', 'brand_name', 'img_url',
                             'product_url', 'price',
                             'size', 'rev_count', 'love_count', 'categories', 'keywords', 'brand_info', 'use',
                             'ingredients', 'description', 'details'])

In [1025]:
# Review table: one row per customer review.
df_c = pd.DataFrame(
    columns=[
        'username',
        'product_id',
        'status',
        'location',
        'skin_tone',
        'eye_color',
        'age',
        'title',
        'body',
        'helpful',
        'not_helpful',
        'star_count',
        'date',
    ]
)

In [908]:
# Variation table: one row per colour swatch of a product.
df_v = pd.DataFrame(
    columns=[
        'product_id',
        'finish',
        'img_url',
        'color',
    ]
)

#### Helper Functions
    

In [8]:
def selenium_scrape(qry, obj, method='xpath', many=False, attribute=None):
    """Query ``obj`` for elements and return their text, an attribute, or click one.

    Args:
        qry: the XPath expression or CSS selector to look up.
        obj: object exposing ``find_elements_by_xpath`` and
            ``find_elements_by_css_selector`` (a driver or an element).
        method: ``'xpath'`` or ``'css'``; any other value matches nothing.
        many: return a list with one entry per match instead of the first match.
        attribute: attribute name to read instead of the element text. The
            special value ``'click'`` clicks the first match instead.

    Returns:
        With ``attribute='click'``: ``True`` if an element was clicked, else ``False``.
        Otherwise the text/attribute of the first match (or a list of them when
        ``many`` is set), and ``''`` when nothing matched — also for ``many=True``.
    """
    if method == 'xpath':
        found = obj.find_elements_by_xpath(qry)
    elif method == 'css':
        found = obj.find_elements_by_css_selector(qry)
    else:
        found = None

    # Click mode reports success as a boolean rather than returning content.
    if attribute == 'click':
        if not found:
            return False
        found[0].click()
        return True

    if not found:
        return ''

    if attribute:
        def read(element):
            return element.get_attribute(attribute)
    else:
        def read(element):
            return element.text

    if many:
        return [read(element) for element in found]
    return read(found[0])
def random_sleep(long=False):
    """Pause for a randomized interval.

    Args:
        long: when true, sleep between 10 and 11 seconds; otherwise sleep
            between 0.5 and 1.5 seconds.

    Exactly one sleep is performed per call; previously the long branch fell
    through and ran the short sleep as well.
    """
    base = 10 if long else 0.5
    time.sleep(base + random.random())
    

#### Scraping functions

In [9]:
def scrape_item(max_attempts=5):
    """Scrape the product page currently loaded in the global ``driver``.

    Static fields are read from meta tags and page elements via
    ``selenium_scrape``. The Details / Ingredients / How to Use / About the
    Brand tabs are then clicked one by one and the panel text is stored; a tab
    that is not found leaves its key out of the record.

    Args:
        max_attempts: how many times the tab sequence is tried when a click or
            lookup raises. The previous unbounded retry with a bare ``except``
            could spin forever and also swallowed ``KeyboardInterrupt``.

    Returns:
        tuple: ``(product_id, frame)`` where ``frame`` is a one-row DataFrame
        built from the collected fields. If every attempt fails, the record
        holds whatever was gathered up to that point.
    """
    item = {}
    item['categories'] = ';'.join(selenium_scrape('li.Breadcrumb-item', driver, 'css', True))
    item['keywords'] = selenium_scrape('//meta[@name="keywords"]', driver, attribute='content')
    item['brand_name'] = selenium_scrape('//meta[@property="product:brand"]', driver, attribute='content')
    # The <h1> text can span several lines; the last line is kept as the name.
    item['product_name'] = selenium_scrape('//h1', driver).split('\n')[-1]
    item['rev_count'] = selenium_scrape('//a[@href="#pdp-reviews"]/span', driver)
    item['love_count'] = selenium_scrape('//span[@seph-love-count]', driver)
    item['product_id'] = selenium_scrape('//meta[@property="product:id"]', driver, attribute='content')
    item['size'] = selenium_scrape('div.InfoRow > span.InfoRow-size > span.InfoRow-value', driver, 'css')
    item['price'] = selenium_scrape('//meta[@property="product:price"]', driver, attribute='content')
    item['img_url'] = selenium_scrape('//meta[@property="og:image"]', driver, attribute='content')
    item['description'] = selenium_scrape('//meta[@property="og:description"]', driver, attribute='content')
    item['product_url'] = selenium_scrape('//meta[@property="og:url"]', driver, attribute='content')

    # (tab label, record key, CSS selector of the panel read after clicking the tab)
    tabs = (
        ('Details', 'details', 'div#details'),
        ('Ingredients', 'ingredients', 'div#ingredients'),
        ('How to Use', 'use', 'div#use'),
        ('About the Brand', 'brand_info', 'div#brand'),
    )
    for attempt in range(1, max_attempts + 1):
        try:
            for label, key, panel in tabs:
                tab_xpath = '//span[contains(text(), "{}")]'.format(label)
                if selenium_scrape(tab_xpath, driver, attribute='click'):
                    item[key] = selenium_scrape(panel, driver, 'css')
            break
        except Exception as exc:
            print('Error (attempt {}/{}): {!r}'.format(attempt, max_attempts, exc))
            if attempt < max_attempts:
                # Give the page a moment before clicking through the tabs again.
                time.sleep(1)
    return item['product_id'], pd.DataFrame([item])

In [10]:
def scrape_swatches(p_id):
    """Collect the colour swatches shown on the product page loaded in ``driver``.

    Args:
        p_id: product id written into every returned row. It was previously
            overwritten from a global ``item``, which in the scraping loop is a
            DataFrame, so a whole Series ended up in each row.

    Returns:
        DataFrame with the columns of the global ``df_v``: one row per swatch
        image holding the product id, the swatch group's label text, the image
        URL and an empty ``color`` placeholder.
    """
    data = []
    for f_group in driver.find_elements_by_css_selector('div.PdpLayout-main div.SwatchGroup'):
        finish = selenium_scrape('span', f_group, method='css')
        # selenium_scrape returns '' when nothing matches, so no rows are added then.
        imgs = selenium_scrape('img.Swatch-img', f_group, method='css', many=True, attribute='src')
        data.extend([p_id, finish, img, ''] for img in imgs)
    return pd.DataFrame(data, columns=df_v.columns)

In [11]:
def scrape_comments(p_id):
    """Scrape the reviews visible on the product page loaded in ``driver``.

    Page-wide lookups gather the per-review fields (author, body, title, date,
    vote counts, star rating); a second pass over each review summary block
    reads the optional reviewer details (location, skin tone, eye color, age,
    badge class), using ``''`` when a detail is absent.

    Args:
        p_id: product id stored alongside every review.

    Returns:
        DataFrame built from the collected fields, one column per field.
        NOTE(review): the per-field lists are gathered independently and are
        assumed to line up one-to-one — confirm against real pages.
    """
    def _all(query, method='xpath', attribute=None):
        # Page-wide lookup returning every match.
        return selenium_scrape(query, driver, method=method, many=True, attribute=attribute)

    def _optional_text(review, container_css, value_css):
        # Text of value_css inside the review, or '' when its container is absent.
        if review.find_elements_by_css_selector(container_css):
            return review.find_element_by_css_selector(value_css).text
        return ''

    # Key order is kept as-is because it determines the DataFrame column order.
    users = {
        'username': _all('//span[@itemprop="author"]'),
        'body': _all('//div[@itemprop="description"]'),
        'title': _all('span.BVRRReviewTitle', method='css'),
        'date': _all('span.BVRRReviewDate', method='css'),
        'not_helpful': _all('span.BVDI_FVNegative span.BVDILinkSpan span.BVDINumber', method='css'),
        'helpful': _all('span.BVDI_FVPositive span.BVDILinkSpan span.BVDINumber', method='css'),
        'star_count': _all('#BVRRDisplayContentID div.BVRRRatingNormalImage > img', method='css', attribute='alt'),
        'skin_tone': [],
        'eye_color': [],
        'status': [],
        'age': [],
        'location': [],
    }

    summaries = driver.find_elements_by_css_selector('#BVRRDisplayContentID .BVRRReviewDisplayStyle3Summary')
    for review in summaries:
        location = _optional_text(review, 'div.BVRRUserLocationContainer', 'span.BVRRUserLocation')
        skintone = _optional_text(review, 'div.BVRRContextDataValueskinToneContainer', 'span.BVRRContextDataValueskinTone')
        eye_color = _optional_text(review, 'div.BVRRContextDataValueeyeColorContainer', 'span.BVRRContextDataValueeyeColor')
        age = _optional_text(review, 'div.BVRRContextDataValueageContainer', 'span.BVRRContextDataValueage')

        # The badge is identified by the class attribute of the first child div.
        if review.find_elements_by_css_selector('div.BVRRBadges'):
            badges = review.find_element_by_css_selector('div.BVRRBadges > div').get_attribute('class')
        else:
            badges = ''

        users['skin_tone'].append(skintone)
        users['eye_color'].append(eye_color)
        users['status'].append(badges)
        users['age'].append(age)
        users['location'].append(location)

    users['product_id'] = [p_id] * len(users['username'])
    return pd.DataFrame(users)

In [20]:
# Scrape every collected product page: the product record, its swatches and all
# of its review pages.
# NOTE: save_progress() (defined in the "Save progress" section) must already be
# defined when this cell runs — the recorded run below stopped with a NameError
# because it was not.
for no, url in enumerate(hrefs):
    driver.get(url)
    random_sleep()
    print('scraping {}'.format(url))
    p_id, item = scrape_item()
    # pd.concat is used instead of DataFrame.append, which pandas 2.0 removed.
    df_p = pd.concat([df_p, item])
    df_v = pd.concat([df_v, scrape_swatches(p_id)])

    # Review frames for this product are gathered in a list and concatenated
    # once, rather than re-copying the ever-growing df_c for every review page.
    comment_frames = [scrape_comments(p_id)]
    try:
        page_numbers = selenium_scrape('.BVRRPageNumber', driver, method='css', many=True)
        comment_base_url = selenium_scrape('.BVRRPageLink a', driver, method='css', attribute='href')
        print(comment_base_url)
        # Both the link and at least one page number are needed to paginate;
        # indexing an empty result would raise IndexError.
        if comment_base_url and page_numbers:
            last_page = int(page_numbers[-1])
        else:
            last_page = 1
        print('last page is {}'.format(last_page))
        # Turn the concrete page number in the link into a format placeholder.
        # `or ''` guards against a link element without an href (None).
        comment_base_url = re.sub(r'page\=(\d+)', 'page={}', comment_base_url or '')
        for page in range(2, last_page + 1):
            print('scraping comment page {}/{}'.format(page, last_page))
            com_url = comment_base_url.format(page)
            com_page = requests.get(com_url)
            soup = BeautifulSoup(com_page.text, 'html.parser')
            # NOTE(review): scrape.scrape_users is assumed to return a DataFrame
            # compatible with df_c — confirm in libs/scrape.
            comment_frames.append(scrape.scrape_users(soup, p_id))
    finally:
        # Keep whatever was fetched even if a review page fails midway.
        df_c = pd.concat([df_c] + comment_frames)

    if no % 15 == 0:
        random_sleep(long=True)
        # Shut the old browser down before starting a new one; otherwise every
        # restart leaves a Chrome/chromedriver process running.
        driver.quit()
        driver = webdriver.Chrome(chromedriver)
        save_progress()
        

scraping http://www.sephora.com/brow-powder-duo-P69300?skuId=929778&icid2=products%20grid:p69300
http://reviews.sephora.com/8723abredes/P69300/reviews.htm?format=embedded&page=2&scrollToTop=true
last page is 745
scraping comment page 2/745
scraping comment page 3/745
scraping comment page 4/745
scraping comment page 5/745
scraping comment page 6/745
scraping comment page 7/745
scraping comment page 8/745
scraping comment page 9/745
scraping comment page 10/745
scraping comment page 11/745
scraping comment page 12/745
scraping comment page 13/745
scraping comment page 14/745
scraping comment page 15/745
scraping comment page 16/745
scraping comment page 17/745
scraping comment page 18/745
scraping comment page 19/745
scraping comment page 20/745
scraping comment page 21/745
scraping comment page 22/745
scraping comment page 23/745
scraping comment page 24/745
scraping comment page 25/745
scraping comment page 26/745
scraping comment page 27/745
scraping comment page 28/745
scraping comm

NameError: name 'save_progress' is not defined

In [18]:
from libs import scrape
from bs4 import BeautifulSoup
import requests

### Save progress

In [21]:
def save_progress():
    """Write the global comment, product and variation frames to CSV under ``data/``.

    The row index is not written to the files.
    """
    targets = (
        (df_c, 'data/comments.csv'),
        (df_p, 'data/products.csv'),
        (df_v, 'data/variations.csv'),
    )
    for frame, path in targets:
        frame.to_csv(path, index=None)

In [7]:
# df_c = pd.read_csv('data/comments.csv')
# df_p = pd.read_csv('data/products.csv')
# df_v = pd.read_csv('data/variations.csv')

In [22]:
# Write the current state of the three tables to disk.
save_progress()