# WebScraping-Sephora: Step 2. Get Product Reviews
NYCDSA web scraping project

---
## Project Description
The goal of this project is to explore the color spectrum of foundations and lipsticks in relation to reviewers' dominant colors (hair color, eye color, and skin tone, as entered by reviewers on Sephora) to see whether particular reviewer features are strongly correlated with the colors of the foundations and lipsticks they purchased and liked.

Please see Readme.md for more information including the Repository layout.


### Project Outline
- Step 1. Scrape product URLs
- Step 2. Scrape product reviews
- Step 3. Load all data and explore data

---
### Step 2. Get Product Reviews
The product information needed to pull the review JSON from Bazaarvoice was collected from each product page using Selenium. The relevant fields were then extracted into a data frame and saved to one CSV file per product.

In [None]:
import time
import json
import math
import pandas as pd
import numpy as np
from PIL import Image
from urllib.request import urlopen
from urllib.request import Request
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

## Define webdriver and its options
# Chrome is launched with an explicit desktop user-agent string instead of
# the driver's default one.
opts = Options()
opts.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36")
# `options=` is the supported keyword; `chrome_options=` is its deprecated
# alias and is rejected by newer Selenium releases.
driver = webdriver.Chrome("/usr/local/bin/chromedriver", options=opts)
# Maximum number of seconds WebDriverWait waits for a page element to appear.
timeout = 20

## Import URL directory
# The scraping loop reads the 'Category', 'Product' and 'URL' columns.
df_urls = pd.read_csv('./data/product_urls.csv')

## Scrape each product's URLs and its reviews
# For every product in the current batch:
#   1. open the product page and read the product ID and price,
#   2. page through the review JSON returned by the Bazaarvoice API,
#   3. keep one row per review that has a swatch ID and reviewer attributes,
#   4. attach the mean RGB of each reviewed swatch image,
#   5. write the rows with a non-zero R value to ./data/product_info_<i>.csv.

review_columns = ['Category', 'Product', 'Price', 'UserName', 'UserID',
                  'Rating', 'Eyecolor', 'Haircolor', 'Skintone',
                  'SwatchID', 'R', 'G', 'B']
page_size = 100     # number of reviews requested per API call


def _context_value(review, key):
    """Return review['ContextDataValues'][key]['Value'], or None if missing."""
    try:
        return review['ContextDataValues'][key]['Value']
    except (KeyError, TypeError):
        return None


def _mean_swatch_rgb(swatch):
    """Return the mean [R, G, B] of a swatch image, or [0, 0, 0] on failure.

    The image is downloaded to 'temp.jpg' and averaged over all pixels.
    """
    img_url = 'https://www.sephora.com/productimages/sku/s' + swatch + '+sw.jpg'
    req = Request(img_url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        with urlopen(req) as im_url:
            with open('temp.jpg', 'wb') as f:
                f.write(im_url.read())
        # The context manager closes the image file handle; converting to RGB
        # guarantees three channels even for grayscale or palette images.
        with Image.open('temp.jpg') as img:
            pixels = np.asarray(img.convert('RGB'), dtype=float)
        return pixels.reshape(-1, 3).mean(axis=0)
    except Exception as err:
        # Best effort: a zero vector marks the swatch as unusable, and rows
        # carrying it are filtered out before saving.
        print('swatch ' + str(swatch) + ' skipped: ' + repr(err))
        return [0, 0, 0]


i_start = 0
# Scrape 100 products at a time, without running past the end of the table.
i_stop = min(i_start + 100, len(df_urls))

try:
    for i in range(i_start, i_stop):
        # Load URL
        category = df_urls['Category'][i]
        product = df_urls['Product'][i]
        url = df_urls['URL'][i]
        driver.get(url)
        time.sleep(3)

        ## Extract product ID
        # The ID is the last whitespace-separated token of the sku_size text.
        p_id = driver.find_element(
            By.XPATH, '//*[@data-at="sku_size"]').text.split()[-1]

        ## Extract total number of reviews
        # wait until Review contents load and scroll to reviews
        wait_driver = WebDriverWait(driver, timeout)
        section_title = wait_driver.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="ratings-reviews-container"]')))
        driver.execute_script("arguments[0].scrollIntoView();", section_title)
        price = driver.find_element(By.XPATH, '//*[@data-at="price"]').text

        summary = wait_driver.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@class="css-960eb6"]')))
        # Tokens 2 and 5 of the summary text are summed as the review count.
        # NOTE(review): presumably two per-group counts -- confirm on the page.
        good_reviews = summary.text.split()[2:6:3]

        n_reviews = int(good_reviews[0]) + int(good_reviews[1])
        n_iter = math.ceil(n_reviews / page_size)

        # Rows are collected in a list and turned into a DataFrame once;
        # DataFrame.append copied the frame on every call and no longer exists
        # in pandas 2.x.
        rows = []

        # Scrape Sephora review data from bazaarvoice.com
        for ii in range(n_iter):
            reviewURL = 'https://api.bazaarvoice.com/data/reviews.json?' + \
                         'apiversion=5.4' + \
                         '&passkey=rwbw526r2e7spptqd2qzbkp7' + \
                         '&Filter=ProductId:' + p_id + \
                         '&Sort=Rating:desc' + \
                         '&Limit=100' + \
                         '&Offset=' + str(ii * page_size)
            with urlopen(reviewURL) as json_url:
                data = json.loads(json_url.read())

            # check the scraping progress
            print(reviewURL)

            for review in data['Results']:
                # Keep only reviews that name a swatch and carry reviewer
                # attributes (eye color, hair color, skin tone).
                if not (review['ProductId'] and review['ContextDataValues']):
                    continue

                rows.append({
                    'Category': category,
                    'Product': product,
                    'Price': float(price[1:]),   # drop the leading currency sign
                    'UserName': review['UserNickname'],
                    'UserID': review['AuthorId'],
                    'Rating': review['Rating'],
                    'Eyecolor': _context_value(review, 'eyeColor'),
                    'Haircolor': _context_value(review, 'hairColor'),
                    'Skintone': _context_value(review, 'skinTone'),
                    'SwatchID': review['ProductId'],
                    # placeholder RGB, filled in from the swatch image below
                    'R': 0.0,
                    'G': 0.0,
                    'B': 0.0,
                })

        df_reviews = pd.DataFrame(rows, columns=review_columns)

        # Extract average RGB vectors from the swatches in the reviews
        for swatch in sorted(set(df_reviews['SwatchID'])):
            RGB = _mean_swatch_rgb(swatch)
            # .loc writes into df_reviews itself; chained indexing
            # (df['R'][mask] = ...) may silently write to a temporary copy.
            same_swatch = df_reviews['SwatchID'] == swatch
            df_reviews.loc[same_swatch, 'R'] = RGB[0]
            df_reviews.loc[same_swatch, 'G'] = RGB[1]
            df_reviews.loc[same_swatch, 'B'] = RGB[2]

        # store only if RGB vector is nonzero
        df_reviews = df_reviews[df_reviews['R'] > 0]
        df_reviews.to_csv('./data/product_info_' + str(i) + '.csv', index=False)
finally:
    # Always release the browser, even when a product page fails to scrape;
    # quit() also shuts down the chromedriver process.
    driver.quit()