In [1]:
import pandas as pd
import numpy as np
import requests
from retrying import retry
import re
from bs4 import BeautifulSoup
import time
import math
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow,Flow
from google.auth.transport.requests import Request
import os
import pickle
import ulta_functions as ulta
import google_api_functions as gapi
import google_sheets_credentials as creds
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import copy
import concurrent.futures
import json
import datetime
#import psycopg2
#from psycopg2.errors import UniqueViolation
import config

In [2]:
import sys
sys.setrecursionlimit(10000)

In [5]:
def get_product_in_stock(product_id, url):
    #start = time.perf_counter()
    #chrome_options = Options()
    #chrome_options.add_argument("--headless")
    #with webdriver.Chrome(r'/home/lermane/Downloads/chromedriver_linux64/chromedriver', options = chrome_options) as driver:
    with webdriver.Chrome(r'/home/lermane/Downloads/chromedriver_linux64/chromedriver') as driver:
        product_in_stock = {}
        variants_in_stock = {}
        wait = WebDriverWait(driver, 60)
        #opening product url in the driver/browser
        driver.get(url)
        #if the product doesn't exist anymore ulta wil take you to this site
        if driver.current_url == 'https://www.ulta.com/404.jsp':
            next
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'ProductSwatchImage__variantHolder')))
        element = driver.find_element_by_class_name('Reviews__container--list')
        wait.until(EC.visibility_of(element))
        product_variants = driver.find_elements_by_class_name('ProductSwatchImage__variantHolder')
        if len(product_variants) == 0:
            #products that only have one color or one size or whatever have their product variant information in a different lcoation
            product_variants = driver.find_elements_by_class_name('ProductDetail__productSwatches')
        print(len(product_variants))
        #getting all the product variants from the page
        #product_variants = get_product_variants(driver)
        for product_variant in product_variants:
            element_pic = driver.find_element_by_class_name("ProductDetail__productImage")
            try:
                product_variant.click() #clicking on each variant at a time to get their price and availability
            except:         
                next #if I can't click on it I want to go to the next variant
            else:
                wait.until(EC.visibility_of(element_pic))
                #creating a BeautifulSoup object to extract data
                soup = BeautifulSoup(driver.page_source, features="lxml")
                #there are products that only a couple of shades are labeled as sale so I'm removing those to make sure no sale items slip through
                if soup.find('img', {'src' : 'https://images.ulta.com/is/image/Ulta/badge-sale?fmt=png-alpha'}) is not None:
                    next
                #getting price
                price = soup.find('meta', {'property' : 'product:price:amount'}).get('content')
                option = ulta.get_option(soup)
                #only adding the product variant if it's available
                if soup.find('div', {'class' : 'ProductDetail__availabilitySection ProductDetail__availabilitySection--error'}) is None:
                    variants_in_stock[option] = price
        product_in_stock[product_id] = variants_in_stock
    finish = time.perf_counter()
    print(len(product_in_stock))
    print(f'{product_id} Finished in {round(finish-start, 2)} second(s)')
    return(product_in_stock)

In [271]:
#product variant page doesn't finish loading and variant information is not extracted


In [242]:
def get_product_variants(driver):
    product_variants1 = 0
    product_variants2 = -1
    while(product_variants1 != product_variants2):
        product_variants1 = driver.find_elements_by_class_name('ProductSwatchImage__variantHolder')
        if len(product_variants1) == 0:
            #products that only have one color or one size or whatever have their product variant information in a different lcoation
            product_variants1 = driver.find_elements_by_class_name('ProductDetail__productSwatches')    
        product_variants2 = driver.find_elements_by_class_name('ProductSwatchImage__variantHolder')
        if len(product_variants2) == 0:
            #products that only have one color or one size or whatever have their product variant information in a different lcoation
            product_variants2 = driver.find_elements_by_class_name('ProductDetail__productSwatches')
            print(len(product_variants1), len(product_variants2))
    return(product_variants1)

In [3]:
import itertools

In [244]:
products_t = dict(itertools.islice(products.items(), 6))

In [260]:
for key in products_t.keys():
    try:
        result = get_product_in_stock(key, products[key]['url'])
        print(result, '\n')
    except:
        print(key)

4
xlsImpprod5770263
44
xlsImpprod15711051
xlsImpprod14491009
2
xlsImpprod3590053
5
xlsImpprod10791925
0
1
xlsImpprod12011171 Finished in 9.88 second(s)
{'xlsImpprod12011171': {}} 



In [270]:
get_product_in_stock('xlsImpprod10791925', 'https://www.ulta.com/redness-solutions-makeup-broad-spectrum-spf-15-with-probiotic-technology?productId=xlsImpprod10791925')

5
1
xlsImpprod10791925 Finished in 146620.03 second(s)


{'xlsImpprod10791925': {'Calming Fair (very fair, cool-neutral undertones)': '31.00',
  'Calming Ivory (very fair, cool-neutral undertones)': '31.00'}}

In [56]:
def scrape_url2(row):
    products = {}
    #going to the url
    page = requests.get(row['url'])
    #getting the page's content and using the package BeautifulSoup to extract data from it
    soup = BeautifulSoup(page.text, features="lxml")
    #each product on ulta's website has a container with the class "productQvContainer" so I'm getting every element that has that as a class to pull every product
    product_containers = soup.find_all('div', {'class' : 'productQvContainer'})
    #applying the function get_single_product for each product in the url. if it throws an exception, I'm having it print the url and index so I can tell what product is having a problem.
    for product_container in product_containers:
        try:
            product, product_id = get_single_product(soup, product_container, row.name)
            products[product_id] = product
        except Exception as exc:
            print(row['url'], product_containers.index(product_container))
            print(exc, '\n')
    products_df = (
        pd.DataFrame.from_dict(products)
        .transpose()
    )
    return(products_df)

In [51]:
def get_soup(row):
    products = {}
    #going to the url
    page = requests.get(row['url'])
    #getting the page's content and using the package BeautifulSoup to extract data from it
    soup = BeautifulSoup(page.text, features="lxml")
    time.sleep(10)
    return(soup)

# step 1: scrape current data from ulta.com

In [9]:
current_ulta_df = pd.read_csv('data/current_ulta_df.csv').set_index('product_id')

### dealing with sale prices

if there's a sale price, that's the price the product is currently being sold for, so I want the price of the product in the database to be the sale price. so if there's a sale price I'm replacing the price with the sale price.

In [11]:
def fix_price(row):
    if row['sale'] == 0:
        val = row['price']
    else:
        val = row['sale_price']
    return val

In [12]:
current_ulta_df['price_str'] = current_ulta_df.apply(fix_price, axis=1)
current_ulta_df['date'] = [datetime.date.today().strftime('%Y-%m-%d')] * len(current_ulta_df)
current_ulta_df = (
    current_ulta_df
    .drop(columns={'sale_price'})
    .fillna(value={'options' : ''})
)

In [13]:
current_ulta_df

Unnamed: 0_level_0,sku_id,brand,product,url,rating,no_of_reviews,sale,price,offers,options,url_pkey_foreign,price_str,date
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
xlsImpprod5770263,2264077,It Cosmetics,Bye Bye Redness Neutralizing Color-Correcting ...,https://www.ulta.com/bye-bye-redness-neutraliz...,4.50,2859.0,0,$34.00,Free Gift with Purchase!,3 Colors,0,$34.00,2021-05-02
xlsImpprod15711051,2540112,L.A. Girl,HD Pro Concealer,https://www.ulta.com/hd-pro-concealer?productI...,4.10,1715.0,0,$4.99,"Buy 2, get 1 FREE - Add 3 items to qualify!",43 Colors,0,$4.99,2021-05-02
xlsImpprod14491009,2503509,Urban Decay Cosmetics,Naked Skin Color Correcting Fluid,https://www.ulta.com/naked-skin-color-correcti...,4.40,844.0,0,$29.00,Free Gift with Purchase!,3 Colors,0,$29.00,2021-05-02
xlsImpprod3590053,2233970,Smashbox,Photo Finish Reduce Redness Primer,https://www.ulta.com/photo-finish-reduce-redne...,4.10,854.0,0,$15.00 - $39.00,,2 Sizes,0,$15.00 - $39.00,2021-05-02
xlsImpprod10791925,2222453,Clinique,Redness Solutions Makeup Broad Spectrum SPF 15...,https://www.ulta.com/redness-solutions-makeup-...,4.60,193.0,0,$31.00,Free Gift with Purchase!,4 Colors,0,$31.00,2021-05-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...
xlsImpprod18411062,2524747,Redken,Travel Size Brews Grip Tight Holding Gel,https://www.ulta.com/travel-size-brews-grip-ti...,0.00,,0,$8.00,,,222,$8.00,2021-05-02
xlsImpprod17361149,2520342,Every Man Jack,Cedarwood Grooming Paste,https://www.ulta.com/cedarwood-grooming-paste?...,0.00,,0,$9.99,,,222,$9.99,2021-05-02
pimprod2016011,2563297,Frederick Benjamin,Crown Control Forming Creme,https://www.ulta.com/crown-control-forming-cre...,0.0,,0,$16.00,,,222,$16.00,2021-05-02
xlsImpprod18241089,2525333,American Crew,Techseries Boost Spray,https://www.ulta.com/techseries-boost-spray?pr...,0.0,,0,$17.95,"Buy 1, get 1 at 50% off!",,222,$17.95,2021-05-02


# step 2: find secret sales

### connect to database

In [None]:
params = config.config()
conn = psycopg2.connect(**params)

In [None]:
cur = conn.cursor()

In [None]:
query = """
    SELECT DISTINCT
        product.product_id, tbl.max_date, price.price_str, price.sale
    FROM (
        SELECT 
            product_pkey_foreign, MAX(price_entry_date) as max_date
        FROM 
            price
        GROUP BY 
            product_pkey_foreign
        ) tbl
    LEFT JOIN 
        product
    ON
        tbl.product_pkey_foreign = product.product_pkey
    INNER JOIN
        price
    ON
        tbl.product_pkey_foreign = price.product_pkey_foreign and tbl.max_date = price.price_entry_date
"""

In [None]:
cur.execute(query)

In [None]:
r = cur.fetchall()

In [None]:
cur.close()

In [None]:
db_dat = pd.DataFrame(r, columns=['product_id', 'recent_date', 'recent_price_str', 'recent_sale'])

In [None]:
merged_dat = (
    current_ulta_df
    .pipe(pd.merge, db_dat, on='product_id', how='left')
)

In [None]:
(
    merged_dat
    .query("price != price_str")
    .loc
)

In [None]:
merged_dat

In [None]:
db_dat = pd.DataFrame(r, columns=['product_id', 'rating', 'no_of_reviews', 'offers', 'max_date', 'price_str', 'max_price', 'option'])

latest_ulta_dat = (
    db_dat
    .drop(columns={'max_price', 'option'})
    .pipe(pd.DataFrame.drop_duplicates)
    .sort_values(['product_id', 'max_date'])
    .pipe(pd.DataFrame.drop_duplicates, 'product_id', keep='last')
    .set_index('product_id')
)

In [None]:
latest_ulta_dat

In [None]:
merged_dat = (
    current_ulta_df
    .pipe(pd.merge, latest_ulta_dat, on='product_id', how='left')
)

new = []
for i in range(len(merged_dat)):
    if merged_dat.iloc[i]['price_y']

In [None]:
merged_dat.iloc[16879]

In [None]:
db_dat.query("product_id == 'pimprod2006617'")

In [None]:
not_in_db = (
    current_ulta_df
    .pipe(pd.merge, db_dat, on='product_id', how='left')
    .dropna(subset=['price_str'])
    .query('price != price_str | rating_x != rating_y')
    .drop(columns={'rating_y', 'no_of_reviews_y', 'offers_y', 'max_date', 'price_str'})
    .rename(columns={'rating_x' : 'rating', 'no_of_reviews_x' : 'no_of_reviews', 'offers_x' : 'offers'})
)

In [None]:
#if you make a mistake, execute following code afterwards
#cur.execute("rollback;")

In [None]:
conn.close()