In [200]:
import pandas as pd
import numpy as np
import requests
from retrying import retry
import re
from bs4 import BeautifulSoup
import time
import math
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow,Flow
from google.auth.transport.requests import Request
import os
import pickle
import ulta_functions as ulta
import google_api_functions as gapi
import google_sheets_credentials as creds
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import copy
import concurrent.futures
import json
import datetime
import psycopg2
from psycopg2.errors import UniqueViolation
import config

In [186]:
import sys
sys.setrecursionlimit(10000)

In [38]:
#using the ids to create real urls
def create_url_df(session):
    all_url_info = {}
    #I'm pulling the list of urls straight from ulta's sidebar
    front_page = session.get('https://www.ulta.com/')
    front_page_soup = BeautifulSoup(front_page.text, features="lxml")
    #anchors = list of links in the side bar
    anchors = front_page_soup.find_all('a', {'class' : 'Anchor'})
    for anchor in anchors:
        #make sure there's a description; I'm getting the categories from the description
        if anchor.get('data-nav-description') is not None and re.search(r'[a-z]*:[a-z]*', anchor.get('data-nav-description')) is not None:
            #split up url path into pieces
            url_path = anchor.get('data-nav-description')[4:].split(':')
            #I do not want urls from these anchors
            if url_path[0] not in ['shop by brand', 'new arrivals', 'ulta beauty collection', 'gifts', 'sale & coupons', 'beauty tips'] and url_path[1] != 'featured':
                page = session.get(anchor.get('href'))
                soup = BeautifulSoup(page.text, features="lxml")
                #get the number of total products from each id so we can create a different url for each set of 500 products in the url so there isn't too much data loaded into one url at once
                num_results = int(re.findall(r'\b\d+\b', soup.find('h2', {'class' : 'search-res-title'}).find('span', {'class' : 'sr-only'}).text)[0])
                for i in range(math.ceil(num_results / 500)):
                    #creating a dictionary to have each url be linked to its id, main category, and sub category
                    url_info = {}
                    url_info['main_category'] = url_path[0]
                    url_info['sub_category'] = url_path[1]
                    if len(url_path) == 2: #if the length != 2 then the url path has at least 3 parts which means we can get a sub sub sub category from it 
                        url_info['sub_sub_category'] = ' '
                    else:
                        url_info['sub_sub_category'] = url_path[2]
                    #the &No= tag is the number of products on that page starting from 0 and &Nrpp=500 means there will be at most 500 products on each page
                    url = anchor.get('href') + '&No=' + str(i * 500) + '&Nrpp=500'
                    all_url_info[url] = url_info
    url_df = (
        pd.DataFrame.from_dict(all_url_info)
        .transpose()
        .reset_index()
        .rename(columns={'index' : 'url'})
        .rename_axis('url_pkey')
    )
    url_df.to_csv('data/url_df.csv')
    
def get_url_df(session):
    #getting the last modified date of my url_df.csv file
    last_mod_time = os.path.getmtime('data/url_df.csv')
    #getting number of days since last file modification date
    days_since_urls_update = (datetime.datetime.today() - datetime.datetime.fromtimestamp(last_mod_time)).days
    #if it has been at least 5 days since the last time the all_url_info_dict.json file was modified, then update
    if days_since_urls_update >= 5:
        create_url_df(session)
    #return url_df
    url_df = pd.read_csv('data/url_df.csv')
    return(url_df)

def scrape_url(session, products, row):
    #going to the url
    page = session.get(row['url'])
    #getting the page's content and using the package BeautifulSoup to extract data from it
    soup = BeautifulSoup(page.text, features="lxml")
    #each product on ulta's website has a container with the class "productQvContainer" so I'm getting every element that has that as a class to pull every product
    product_containers = soup.find_all('div', {'class' : 'productQvContainer'})
    #applying the function get_single_product for each product in the url. if it throws an exception, I'm having it print the url and index so I can tell what product is having a problem.
    for product_container in product_containers:
        try:
            product, product_id = get_single_product(soup, product_container, row.name)
            products[product_id] = product
        except Exception as exc:
            print(row['url'], product_containers.index(product_container))
            print(exc, '\n')
    return(products)

In [None]:
def get_single_product(soup, product_container, url_pkey):
    product = {}
    #get general product data from each product
    product_id = product_container.find('span', {'class' : 'prod-id'}).text.strip()
    product['sku_id'] = str(product_container.find('a', {'class' : 'qShopbutton'}).get('data-skuidrr'))
    product['brand'] = product_container.find('h4', {'class' : 'prod-title'}).text.strip()
    #description is the name of the product. so if there's a product called "ULTA Fabulous Concealer", "ULTA" would be the brand and "Fabulous Concealer" would be the description.
    product['product'] = product_container.find('p', {'class' : 'prod-desc'}).text.strip()
    #sometimes the https://www.ulta.com is already in the url and sometimes (most of the time) it's not.
    if product_container.find('a', {'class' : 'product'}).get('href')[0] != '/':
        product_url = product_container.find('a', {'class' : 'product'}).get('href')
    else:
        product_url = 'https://www.ulta.com' + product_container.find('a', {'class' : 'product'}).get('href')
    #if the correct product id isn't in the url then the url is wrong. if it's wrong, then we need to fix it.
    if product_url.split('productId=')[1] != product_id:
        product_url = 'https://www.ulta.com/' + product['product'].replace(' ', '-').lower() + '?productId=' + product_id
    product['url'] = product_url
    #getting the rating information for each product; using if statements in case a product doesn't have a rating for whatever reason
    if product_container.find('label', {'class' : 'sr-only'}) is not None:
        rating = product_container.find('label', {'class' : 'sr-only'}).text.split(' ')[0]
        if rating == 'Price':
            rating = 0.00
        product['rating'] = rating
    if product_container.find('span', {'class' : 'prodCellReview'}) is not None:
        product['no_of_reviews'] = re.findall(r'\b\d+\b', product_container.find('span', {'class' : 'prodCellReview'}).text)[0]
    #the prices are labeled differently in the code depending on whether the product is for sale or not (for sale as in marked as sale not a secret sale)
    if product_container.find('div', {'class' : 'productSale'}) is None:
        product['sale'] = 0
        product['price'] = product_container.find('span', {'class' : 'regPrice'}).text.strip()
    else:
        product['sale'] = 1
        product['price'] = product_container.find('span', {'class' : 'pro-old-price'}).text.strip()
        product['sale_price'] = product_container.find('span', {'class' : 'pro-new-price'}).text.strip()
    #getting the available offers and number of options/colors of the product if they're listed
    if product_container.find('div', {'class' : 'product-detail-offers'}) is not None:
        product['offers'] = product_container.find('div', {'class' : 'product-detail-offers'}).text.strip()
    if product_container.find('span', {'class' : 'pcViewMore'}) is not None:
        product['options'] = re.sub('\xa0', ' ', product_container.find('span', {'class' : 'pcViewMore'}).text.strip())
    product['url_pkey_foreign'] = url_pkey
    return(product, product_id)

In [269]:
def get_product_in_stock(product_id, url):
    #start = time.perf_counter()
    #chrome_options = Options()
    #chrome_options.add_argument("--headless")
    #with webdriver.Chrome(r'C:\Users\elerm\Downloads\chromedriver_win32\chromedriver.exe', options = chrome_options) as driver:
    with webdriver.Chrome(r'C:\Users\elerm\Downloads\chromedriver_win32\chromedriver.exe') as driver:
        product_in_stock = {}
        variants_in_stock = {}
        wait = WebDriverWait(driver, 60)
        #opening product url in the driver/browser
        driver.get(url)
        #if the product doesn't exist anymore ulta wil take you to this site
        if driver.current_url == 'https://www.ulta.com/404.jsp':
            next
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'ProductSwatchImage__variantHolder')))
        element = driver.find_element_by_class_name('Reviews__container--list')
        wait.until(EC.visibility_of(element))
        product_variants = driver.find_elements_by_class_name('ProductSwatchImage__variantHolder')
        if len(product_variants) == 0:
            #products that only have one color or one size or whatever have their product variant information in a different lcoation
            product_variants = driver.find_elements_by_class_name('ProductDetail__productSwatches')
        print(len(product_variants))
        #getting all the product variants from the page
        #product_variants = get_product_variants(driver)
        for product_variant in product_variants:
            element_pic = driver.find_element_by_class_name("ProductDetail__productImage")
            try:
                product_variant.click() #clicking on each variant at a time to get their price and availability
            except:         
                next #if I can't click on it I want to go to the next variant
            else:
                wait.until(EC.visibility_of(element_pic))
                #creating a BeautifulSoup object to extract data
                soup = BeautifulSoup(driver.page_source, features="lxml")
                #there are products that only a couple of shades are labeled as sale so I'm removing those to make sure no sale items slip through
                if soup.find('img', {'src' : 'https://images.ulta.com/is/image/Ulta/badge-sale?fmt=png-alpha'}) is not None:
                    next
                #getting price
                price = soup.find('meta', {'property' : 'product:price:amount'}).get('content')
                option = ulta.get_option(soup)
                #only adding the product variant if it's available
                if soup.find('div', {'class' : 'ProductDetail__availabilitySection ProductDetail__availabilitySection--error'}) is None:
                    variants_in_stock[option] = price
        product_in_stock[product_id] = variants_in_stock
    finish = time.perf_counter()
    print(len(product_in_stock))
    print(f'{product_id} Finished in {round(finish-start, 2)} second(s)')
    return(product_in_stock)

In [271]:
#product variant page doesn't finish loading and variant information is not extracted


In [242]:
def get_product_variants(driver):
    product_variants1 = 0
    product_variants2 = -1
    while(product_variants1 != product_variants2):
        product_variants1 = driver.find_elements_by_class_name('ProductSwatchImage__variantHolder')
        if len(product_variants1) == 0:
            #products that only have one color or one size or whatever have their product variant information in a different lcoation
            product_variants1 = driver.find_elements_by_class_name('ProductDetail__productSwatches')    
        product_variants2 = driver.find_elements_by_class_name('ProductSwatchImage__variantHolder')
        if len(product_variants2) == 0:
            #products that only have one color or one size or whatever have their product variant information in a different lcoation
            product_variants2 = driver.find_elements_by_class_name('ProductDetail__productSwatches')
            print(len(product_variants1), len(product_variants2))
    return(product_variants1)

In [243]:
import itertools

In [244]:
products_t = dict(itertools.islice(products.items(), 6))

In [260]:
for key in products_t.keys():
    try:
        result = get_product_in_stock(key, products[key]['url'])
        print(result, '\n')
    except:
        print(key)

4
xlsImpprod5770263
44
xlsImpprod15711051
xlsImpprod14491009
2
xlsImpprod3590053
5
xlsImpprod10791925
0
1
xlsImpprod12011171 Finished in 9.88 second(s)
{'xlsImpprod12011171': {}} 



In [270]:
get_product_in_stock('xlsImpprod10791925', 'https://www.ulta.com/redness-solutions-makeup-broad-spectrum-spf-15-with-probiotic-technology?productId=xlsImpprod10791925')

5
1
xlsImpprod10791925 Finished in 146620.03 second(s)


{'xlsImpprod10791925': {'Calming Fair (very fair, cool-neutral undertones)': '31.00',
  'Calming Ivory (very fair, cool-neutral undertones)': '31.00'}}

In [56]:
def scrape_url2(row):
    products = {}
    #going to the url
    page = requests.get(row['url'])
    #getting the page's content and using the package BeautifulSoup to extract data from it
    soup = BeautifulSoup(page.text, features="lxml")
    #each product on ulta's website has a container with the class "productQvContainer" so I'm getting every element that has that as a class to pull every product
    product_containers = soup.find_all('div', {'class' : 'productQvContainer'})
    #applying the function get_single_product for each product in the url. if it throws an exception, I'm having it print the url and index so I can tell what product is having a problem.
    for product_container in product_containers:
        try:
            product, product_id = get_single_product(soup, product_container, row.name)
            products[product_id] = product
        except Exception as exc:
            print(row['url'], product_containers.index(product_container))
            print(exc, '\n')
    products_df = (
        pd.DataFrame.from_dict(products)
        .transpose()
    )
    return(products_df)

In [51]:
def get_soup(row):
    products = {}
    #going to the url
    page = requests.get(row['url'])
    #getting the page's content and using the package BeautifulSoup to extract data from it
    soup = BeautifulSoup(page.text, features="lxml")
    time.sleep(10)
    return(soup)

# step 1: scrape current data from ulta.com

In [60]:
import threading
import multiprocessing

In [23]:
session = requests.Session()
url_df = get_url_df(session).set_index('url_pkey')

In [167]:
url_df_t = url_df[0:2]

In [168]:
rows = []
for index, row in url_df_t.iterrows():
    rows.append(row)

In [169]:
start = time.perf_counter()
for index, row in url_df_t.iterrows():
    t = threading.Thread(target=get_soup, args=[row])
    t.start()

In [170]:
end = time.perf_counter()
print(end - start)

0.18887959999847226


In [182]:
start = time.perf_counter()
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = executor.map(get_soup, rows)

In [183]:
end = time.perf_counter()
print(end - start)

13.551527000003261


In [173]:
start = time.perf_counter()
for index, row in url_df_t.iterrows():
    p = multiprocessing.Process(target=get_soup, args=[row])
    p.start()

In [174]:
end = time.perf_counter()
print(end - start)

0.1754369999980554


In [178]:
start = time.perf_counter()
with concurrent.futures.ProcessPoolExecutor() as executor:
    results = executor.map(get_soup, rows)

In [179]:
end = time.perf_counter()
print(end - start)

0.6237601999891922


In [156]:
def do_something(seconds):
    print('sleeping...')
    time.sleep(seconds)
    return(seconds)

In [133]:
start = time.perf_counter()
for i in [5, 4, 3, 2, 1]:
    t = threading.Thread(target=do_something, args=[i])
    t.start()

sleeping...
sleeping...sleeping...

sleeping...
sleeping...


In [134]:
end = time.perf_counter()
print(end - start)

0.05658979999134317


In [135]:
start = time.perf_counter()
with concurrent.futures.ThreadPoolExecutor() as executor:
    secs = [5, 4, 3, 2, 1]
    results = executor.map(do_something, secs)

sleeping...
sleeping...
sleeping...
sleeping...
sleeping...


In [136]:
end = time.perf_counter()
print(end - start)

5.0343901999876834


In [137]:
start = time.perf_counter()
for i in [5, 4, 3, 2, 1]:
    p = multiprocessing.Process(target=do_something, args=[i])
    p.start()

In [138]:
end = time.perf_counter()
print(end - start)

0.19040530000347644


In [139]:
start = time.perf_counter()
with concurrent.futures.ProcessPoolExecutor() as executor:
    results = executor.map(do_something, secs)

In [140]:
end = time.perf_counter()
print(end - start)

0.778286200016737


In [188]:
current_ulta_df = (
    pd.DataFrame.from_dict(products)
    .transpose()
    .rename_axis('product_id')
)

In [191]:
current_ulta_df.to_csv('data/current_ulta_df.csv')

### dealing with sale prices

if there's a sale price, that's the price the product is currently being sold for, so I want the price of the product in the database to be the sale price. so if there's a sale price I'm replacing the price with the sale price.

In [None]:
def fix_price(row):
    if row['sale'] == 0:
        val = row['price']
    else:
        val = row['sale_price']
    return val

In [None]:
current_ulta_df['price_str'] = current_ulta_df.apply(fix_price, axis=1)
current_ulta_df['date'] = [datetime.date.today().strftime('%Y-%m-%d')] * len(current_ulta_df)
current_ulta_df = (
    current_ulta_df
    .drop(columns={'sale_price'})
    .fillna(value={'options' : ''})
)

In [None]:
current_ulta_df

# step 2: find secret sales

### connect to database

In [None]:
params = config.config()
conn = psycopg2.connect(**params)

In [None]:
cur = conn.cursor()

In [None]:
query = """
    SELECT DISTINCT
        product.product_id, tbl.max_date, price.price_str, price.sale
    FROM (
        SELECT 
            product_pkey_foreign, MAX(price_entry_date) as max_date
        FROM 
            price
        GROUP BY 
            product_pkey_foreign
        ) tbl
    LEFT JOIN 
        product
    ON
        tbl.product_pkey_foreign = product.product_pkey
    INNER JOIN
        price
    ON
        tbl.product_pkey_foreign = price.product_pkey_foreign and tbl.max_date = price.price_entry_date
"""

In [None]:
cur.execute(query)

In [None]:
r = cur.fetchall()

In [None]:
cur.close()

In [None]:
db_dat = pd.DataFrame(r, columns=['product_id', 'recent_date', 'recent_price_str', 'recent_sale'])

In [None]:
merged_dat = (
    current_ulta_df
    .pipe(pd.merge, db_dat, on='product_id', how='left')
)

In [None]:
(
    merged_dat
    .query("price != price_str")
    .loc
)

In [None]:
merged_dat

In [None]:
db_dat = pd.DataFrame(r, columns=['product_id', 'rating', 'no_of_reviews', 'offers', 'max_date', 'price_str', 'max_price', 'option'])

latest_ulta_dat = (
    db_dat
    .drop(columns={'max_price', 'option'})
    .pipe(pd.DataFrame.drop_duplicates)
    .sort_values(['product_id', 'max_date'])
    .pipe(pd.DataFrame.drop_duplicates, 'product_id', keep='last')
    .set_index('product_id')
)

In [None]:
latest_ulta_dat

In [None]:
merged_dat = (
    current_ulta_df
    .pipe(pd.merge, latest_ulta_dat, on='product_id', how='left')
)

new = []
for i in range(len(merged_dat)):
    if merged_dat.iloc[i]['price_y']

In [None]:
merged_dat.iloc[16879]

In [None]:
db_dat.query("product_id == 'pimprod2006617'")

In [None]:
not_in_db = (
    current_ulta_df
    .pipe(pd.merge, db_dat, on='product_id', how='left')
    .dropna(subset=['price_str'])
    .query('price != price_str | rating_x != rating_y')
    .drop(columns={'rating_y', 'no_of_reviews_y', 'offers_y', 'max_date', 'price_str'})
    .rename(columns={'rating_x' : 'rating', 'no_of_reviews_x' : 'no_of_reviews', 'offers_x' : 'offers'})
)

In [None]:
#if you make a mistake, execute following code afterwards
#cur.execute("rollback;")

In [None]:
conn.close()