In [1]:
import pandas as pd
import numpy as np
import requests
from retrying import retry
import re
from bs4 import BeautifulSoup
import time
import math
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow,Flow
from google.auth.transport.requests import Request
import os
import pickle
import ulta_functions as ulta
import google_api_functions as gapi
import google_sheets_credentials as creds
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import copy
import concurrent.futures

In [2]:
print("\nstarting...\n")

# A single HTTP session is created once and handed to the ulta helpers below.
session = requests.Session()
# Empty containers: one for per-url metadata, one for the scraped products.
all_url_info, products = {}, {}


starting...



In [3]:
# Ask the ulta helper module for its url -> info mapping, then keep the urls
# themselves as a plain list so they can be iterated when scraping.
all_url_info = ulta.get_url_dict(session)
urls = [*all_url_info.keys()]

In [4]:
print('scraping ulta...')
# The urls are scraped on a pool of 10 worker threads so the requests overlap.
# NOTE(review): every worker is handed the same `session` and `products` objects -
# confirm that ulta.scrape_url is safe to run concurrently on shared state.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Map each submitted job back to its url so a failure can be reported by url.
    futures = {}
    for url in urls:
        futures[executor.submit(ulta.scrape_url, url, session, products, all_url_info)] = url
    # Handle the jobs in completion order rather than submission order.
    for future in concurrent.futures.as_completed(futures):
        url = futures[future]
        try:
            data = future.result()
        except Exception as exc:
            # A failed url is reported and skipped; the other jobs carry on.
            print(url, ':', exc)
            continue
        products = data

scraping ulta...
https://www.ulta.com/makeup-face?N=26y3&No=1500&Nrpp=500 198
https://www.ulta.com/skin-care-cleansers-makeup-remover?N=27gx&No=0&Nrpp=500 128
https://www.ulta.com/makeup-face?N=26y3&No=1500&Nrpp=500 198
https://www.ulta.com/skin-care-cleansers-makeup-remover?N=27gx&No=0&Nrpp=500 128
https://www.ulta.com/makeup-face?N=26y3&No=1500&Nrpp=500 198
https://www.ulta.com/skin-care-cleansers-makeup-remover?N=27gx&No=0&Nrpp=500 128
https://www.ulta.com/skin-care-cleansers-makeup-remover?N=27gx&No=0&Nrpp=500 128
https://www.ulta.com/makeup-face?N=26y3&No=1500&Nrpp=500 198
https://www.ulta.com/skin-care-cleansers-makeup-remover?N=27gx&No=0&Nrpp=500 128
https://www.ulta.com/skin-care-cleansers-makeup-remover?N=27gx&No=0&Nrpp=500 : name 'exc' is not defined
https://www.ulta.com/makeup-face?N=26y3&No=1500&Nrpp=500 198
https://www.ulta.com/makeup-face?N=26y3&No=1500&Nrpp=500 : name 'exc' is not defined
https://www.ulta.com/skin-care-cleansers?N=2794&No=1000&Nrpp=500 7
https://www.ulta

In [6]:
# Sanity check: number of entries collected in `products` by the scraping cell.
len(products)

16631

In [None]:
print('data cleaning...')
# DataFrame.from_dict puts every key of `products` in its own column, so the
# frame is transposed to end up with one row per key (i.e. per product).
ulta_df = pd.DataFrame.from_dict(products).transpose()
# Secret sales only: rows with secret_sale == 1 and sale == 0. A deep copy is
# taken so this frame is fully independent of ulta_df.
secret_sales_df = ulta_df.query('secret_sale == 1 & sale == 0').copy(deep=True)
# Rows whose 'options' value is missing are dropped here, leaving the products
# that list several options (their availability is checked option by option).
# Transposing before to_dict() gives a dict keyed by row label.
drop_na_options = secret_sales_df.dropna(subset=['options']).transpose().to_dict()
# The secret sales are also kept as a plain dict (same row-label keys) because
# a few products carry a wrong url and patching a dict entry is simpler than
# patching the dataframe.
secret_sales = secret_sales_df.transpose().to_dict()

In [None]:
# Scratch cell: clicks the button of a form located by an absolute xpath under
# the 'js-mobileHeader' element.
# NOTE(review): `driver` is only created in a later cell, so this cell fails on
# a fresh kernel when the notebook is run top to bottom - confirm it is still needed.
driver.find_element_by_xpath("//*[@id='js-mobileHeader']/div/div/div/div/nav/div/div/div[1]/div/div/div/div[1]/form/button").click()

In [None]:
# Scratch cell: True when the browser is currently on ulta's 404 page.
# NOTE(review): `driver` is only created in a later cell, so this cell fails on
# a fresh kernel when the notebook is run top to bottom.
driver.current_url == 'https://www.ulta.com/404.jsp'

In [None]:
def _url_product_id(url):
    """Return the text that follows the first 'productId=' in url, or None if the marker is absent."""
    parts = url.split('productId=')
    return parts[1] if len(parts) > 1 else None


def _variant_option(soup):
    """Return the label (color, else size) of the variant shown in soup, or 'NA' if none is found."""
    #color and size are in different locations; color is tried first
    color_tag = soup.find('meta', {'property' : 'product:color'})
    option = color_tag.get('content') if color_tag is not None else ''
    #if there's no color, checking if there's a size
    if not option:
        panel = soup.find('div', {'class' : 'ProductDetail__colorPanel'})
        spans = panel.find_all('span') if panel is not None else []
        if len(spans) > 1:
            option = spans[1].text
    #'NA' means a swatch exists but no information about it could be found
    return option or 'NA'


def get_products_in_stock(secret_sales, driver):
    """Open every secret sale product page and collect the variants that are in stock.

    Each product url is loaded in the browser, every swatch on the page is clicked, and
    the price/option displayed after the click is read from the page source. Only
    variants whose price ends in '.97' and that do not show the availability error
    section are kept.

    Parameters
    ----------
    secret_sales : dict
        Product key -> product record. The 'url' and 'id' entries of each record are
        read. 'url' is overwritten in place when the stored url leads to another
        product and a site search for the id lands on the matching page.
    driver : selenium webdriver
        Browser used to load and click through the pages. It is neither created nor
        closed here.

    Returns
    -------
    tuple
        (products_in_stock, secret_sales). products_in_stock maps the page title minus
        its last 14 characters to a dict of {price: "option, option, ..."}; the options
        of one price are sorted alphabetically. secret_sales is the same (possibly
        updated) dict that was passed in.
    """
    not_found_url = 'https://www.ulta.com/404.jsp'
    products_in_stock = {}
    for product in secret_sales:
        record = secret_sales[product]
        #opening product url in the driver/browser
        driver.get(record['url'])
        #a dead link has nothing to scrape, so go to the next product
        if driver.current_url == not_found_url:
            continue
        #making sure that the url is correct! it wasn't for a couple of the products, so in
        #that case the product id is looked up through the site search instead.
        if _url_product_id(driver.current_url) != record['id']:
            driver.find_element_by_xpath("//*[@id='navigation__wrapper--sticky']/div/div[1]/div[2]/div/a").click()
            driver.find_element_by_xpath("//*[@id='searchInput']").send_keys(record['id'])
            driver.find_element_by_xpath("//*[@id='js-mobileHeader']/div/div/div/div[1]/div/div[1]/form/button").click()
            if driver.current_url == not_found_url:
                continue
            found_id = _url_product_id(driver.current_url)
            if found_id is None:
                #the search did not end on a product page, so there is nothing reliable to scrape
                continue
            if found_id == record['id']:
                record['url'] = driver.current_url
            #any other id: the page is still scraped, exactly as before
        #if I don't add this sleep, the page doesn't finish loading. tried to use implicit waits but this just worked better.
        time.sleep(1)
        #option -> price for every in-stock secret sale variant of this product
        temp = {}
        #getting all the product variants from the page
        for product_variant in driver.find_elements_by_class_name('ProductSwatchImage__variantHolder'):
            try:
                #clicking on each variant at a time to get their price and availability
                product_variant.click()
            except Exception:
                #if I can't click on it I want to go to the next variant
                continue
            #if I don't add this sleep, the page doesn't finish loading. tried to use implicit waits but this just worked better.
            time.sleep(1)
            #creating a BeautifulSoup object to extract data
            soup = BeautifulSoup(driver.page_source, features="lxml")
            #getting price (a page without the price tag is skipped instead of crashing the run)
            price_tag = soup.find('meta', {'property' : 'product:price:amount'})
            price = price_tag.get('content') if price_tag is not None else None
            #only getting other information if it's a secret sale item
            if not price or not price.endswith('.97'):
                continue
            #only adding the product variant if it's available
            if soup.find('div', {'class' : 'ProductDetail__availabilitySection ProductDetail__availabilitySection--error'}) is None:
                temp[_variant_option(soup)] = price
        #if there aren't any product variants in stock, I don't want them in the document
        if not temp:
            continue
        #grouping the options by price so that, for each product, there is one entry per
        #price point holding a single string with every option available at that price
        options_by_price = {}
        for option, price in temp.items():
            options_by_price.setdefault(price, set()).add(option)
        variants_in_stock = {price: ", ".join(sorted(options)) for price, options in options_by_price.items()}
        #the last 14 characters of the title are dropped (presumably the site suffix - TODO confirm)
        products_in_stock[driver.title[:-14]] = variants_in_stock
    return(products_in_stock, secret_sales)

In [None]:
#chrome_options = Options()  
#chrome_options.add_argument("--headless")  
# The chromedriver location differs per machine: set the CHROMEDRIVER_PATH
# environment variable to override it. Without the variable the original
# hard-coded location is used, so existing setups keep working.
CHROMEDRIVER_PATH = os.environ.get('CHROMEDRIVER_PATH', r'C:\Users\elerm\Downloads\chromedriver_win32\chromedriver.exe')
driver = webdriver.Chrome(CHROMEDRIVER_PATH)
# Placeholder; replaced by the result of get_products_in_stock.
products_in_stock = {}

In [None]:
# Walk every secret sale product page with the browser. `secret_sales` is
# rebound to the dict returned by the function (urls may have been corrected).
products_in_stock, secret_sales = get_products_in_stock(secret_sales, driver)

In [None]:
# The browser is no longer needed once the stock check is done.
driver.close()
driver.quit()

In [None]:
# Display the result of the stock check: {product title: {price: "option, option, ..."}}.
products_in_stock

In [None]:
# Request body describing a filter view titled 'sale_filter' over `my_range`
# (presumably sent through the Google Sheets API helpers imported above - TODO confirm).
# NOTE(review): `my_range` is not defined in any cell above this one - confirm
# it is created before this cell runs.
addFilterViewRequest = {
    'addFilterView': {
        'filter': {
            'title': 'sale_filter',
            'range': my_range
        }
    }
}