In [1]:
import concurrent.futures
import copy
import datetime
import json
import math
import os
import pickle
import re
import time

import numpy as np
import pandas as pd
import psycopg2
import requests
from bs4 import BeautifulSoup
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow,Flow
from googleapiclient.discovery import build
from psycopg2.errors import UniqueViolation
from retrying import retry
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import config
import google_api_functions as gapi
import google_sheets_credentials as creds
import ulta_functions as ulta

In [2]:
def get_products_in_stock(driver, secret_sales):
    """Collect the in-stock, non-sale variants of every product in `secret_sales`.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session used to open each product page and click its variants.
    secret_sales : dict
        Maps product_id to a dict that holds at least a 'url' entry.

    Returns
    -------
    dict
        Maps product_id to the result of ``ulta.rearrange_product_dict`` applied
        to that product's ``{option: price}`` mapping. Products whose page is
        gone, or that have no qualifying variant, are left out.
    """
    wait = WebDriverWait(driver, 30)
    products_in_stock = {}
    for product_id in secret_sales:
        temp = {}  # option -> price for the variants of this product that qualify
        # opening product url in the driver/browser
        driver.get(secret_sales[product_id]['url'])
        # if the product doesn't exist anymore ulta will take you to this site
        if driver.current_url == 'https://www.ulta.com/404.jsp':
            continue  # the original used `next`, which is a no-op in Python
        try:
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'ProductSwatchImage__variantHolder')))
        except TimeoutException:
            # No swatch holders appeared. Fall through to the single-variant lookup
            # below instead of aborting the whole run; without this the fallback
            # could never be reached. Such pages cost the full 30 s wait.
            pass
        # getting all the product variants from the page
        product_variants = driver.find_elements_by_class_name('ProductSwatchImage__variantHolder')
        if len(product_variants) == 0:
            # products that only have one color or one size or whatever have their
            # product variant information in a different location
            product_variants = driver.find_elements_by_class_name('ProductDetail__productSwatches')
        for product_variant in product_variants:
            try:
                # clicking on each variant at a time to get their price and availability
                product_variant.click()
            except Exception:
                # if I can't click on it I want to go to the next variant
                # (Exception rather than a bare except, so Ctrl-C still interrupts)
                continue
            wait.until(EC.presence_of_element_located((By.XPATH, "/html/head/meta[10]")))
            # creating a BeautifulSoup object to extract data
            soup = BeautifulSoup(driver.page_source, features="lxml")
            # there are products where only a couple of shades are labeled as sale,
            # so those are removed to make sure no sale items slip through
            if soup.find('img', {'src' : 'https://images.ulta.com/is/image/Ulta/badge-sale?fmt=png-alpha'}) is not None:
                continue
            # getting price
            price = soup.find('meta', {'property' : 'product:price:amount'}).get('content')
            option = ulta.get_option(soup)
            # only adding the product variant if it's available
            if soup.find('div', {'class' : 'ProductDetail__availabilitySection ProductDetail__availabilitySection--error'}) is None:
                temp[option] = price
        # products without any variant in stock are kept out of the result
        if temp:
            products_in_stock[product_id] = ulta.rearrange_product_dict(temp)
    return products_in_stock

In [3]:
def add_new_product_2(conn, product_id, brand, product, main_category, sub_category, sub_sub_category, url, rating, no_of_reviews, offers, date, price, options):
    """Insert a product plus its row in `extra`, skipping products already recorded.

    The product is inserted into `products`. If a row with a conflicting unique
    key already exists, its `products_pkey` is looked up by `product_id`. The
    `extra` row is written only when the product is new or has no `extra` row
    yet; otherwise nothing is changed.

    Parameters
    ----------
    conn : psycopg2 connection
        Open connection. Nothing is committed here; that is left to the caller.
    product_id, brand, product, main_category, sub_category, sub_sub_category, url
        Values for the matching columns of `products`.
    rating, no_of_reviews, offers, date, price, options
        Values for the matching columns of `extra`.

    Returns
    -------
    None

    Raises
    ------
    LookupError
        If the insert hit a conflict but no row with this `product_id` exists
        (i.e. the conflict was on some other unique constraint).

    Notes
    -----
    Uses ``ON CONFLICT DO NOTHING`` (PostgreSQL 9.5+) instead of catching
    ``UniqueViolation``: a failed INSERT aborts the surrounding transaction, so
    the follow-up SELECTs could not run. The previous ``return`` inside
    ``finally`` also silently discarded every exception; errors now propagate.
    """
    insert_product_sql = """INSERT INTO products (product_id, brand, product, main_category, sub_category, sub_sub_category, url) VALUES (%s, %s, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING RETURNING products_pkey"""
    insert_extra_sql = """INSERT INTO extra (rating, no_of_reviews, offers, date, price, options, products_pkey_foreign) VALUES (%s, %s, %s, %s, %s, %s, %s)"""
    with conn.cursor() as cur:  # closes the cursor even when a statement fails
        cur.execute(insert_product_sql, (product_id, brand, product, main_category, sub_category, sub_sub_category, url))
        inserted = cur.fetchone()  # None when the row already existed
        if inserted is not None:
            products_pkey = inserted[0]
        else:
            cur.execute("SELECT products_pkey FROM products WHERE product_id=%s", (product_id,))
            existing = cur.fetchone()
            if existing is None:
                raise LookupError("product %r conflicts with an existing row but was not found by product_id" % (product_id,))
            products_pkey = existing[0]
            cur.execute("SELECT extra_pkey FROM extra WHERE products_pkey_foreign=%s", (products_pkey,))
            if cur.fetchone() is not None:
                # the product already has data in both tables, so it's not actually new
                return
        # new product, or a product that has no row in the extra table yet
        cur.execute(insert_extra_sql, (rating, no_of_reviews, offers, date, price, options, products_pkey))

In [4]:
def add_new_product(conn, product_id, brand, product, main_category, sub_category, sub_sub_category, url, rating, no_of_reviews, offers, date):
    """Insert a product plus its first `ratings_and_offers` row, skipping known products.

    The product is inserted into `products`. If a row with a conflicting unique
    key already exists, its `products_pkey` is looked up by `product_id`. The
    `ratings_and_offers` row is written only when the product is new or has no
    such row yet; otherwise nothing is changed.

    Parameters
    ----------
    conn : psycopg2 connection
        Open connection. Nothing is committed here; that is left to the caller.
    product_id, brand, product, main_category, sub_category, sub_sub_category, url
        Values for the matching columns of `products`.
    rating, no_of_reviews, offers, date
        Values for the matching columns of `ratings_and_offers`.

    Returns
    -------
    None

    Raises
    ------
    LookupError
        If the insert hit a conflict but no row with this `product_id` exists
        (i.e. the conflict was on some other unique constraint).

    Notes
    -----
    Uses ``ON CONFLICT DO NOTHING`` (PostgreSQL 9.5+) instead of catching
    ``UniqueViolation``: a failed INSERT aborts the surrounding transaction, so
    the follow-up SELECTs could not run. The previous ``return`` inside
    ``finally`` also silently discarded every exception; errors now propagate.

    NOTE(review): the existence check used to query the `extra` table although
    this function writes to `ratings_and_offers`. It now checks the table it
    writes to -- confirm that `extra` was a leftover from `add_new_product_2`.
    """
    insert_product_sql = """INSERT INTO products (product_id, brand, product, main_category, sub_category, sub_sub_category, url) VALUES (%s, %s, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING RETURNING products_pkey"""
    insert_rao_sql = """INSERT INTO ratings_and_offers (rating, no_of_reviews, offers, date, products_pkey_foreign) VALUES (%s, %s, %s, %s, %s)"""
    with conn.cursor() as cur:  # closes the cursor even when a statement fails
        cur.execute(insert_product_sql, (product_id, brand, product, main_category, sub_category, sub_sub_category, url))
        inserted = cur.fetchone()  # None when the row already existed
        if inserted is not None:
            products_pkey = inserted[0]
        else:
            cur.execute("SELECT products_pkey FROM products WHERE product_id=%s", (product_id,))
            existing = cur.fetchone()
            if existing is None:
                raise LookupError("product %r conflicts with an existing row but was not found by product_id" % (product_id,))
            products_pkey = existing[0]
            cur.execute("SELECT ratings_and_offers_pkey FROM ratings_and_offers WHERE products_pkey_foreign=%s", (products_pkey,))
            if cur.fetchone() is not None:
                # the product already has data in both tables, so it's not actually new
                return
        # new product, or a product that has no ratings_and_offers row yet
        cur.execute(insert_rao_sql, (rating, no_of_reviews, offers, date, products_pkey))

In [5]:
def remove_padding(df, column_name):
    """Strip leading and trailing whitespace from the strings in one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. The column is overwritten in place.
    column_name : str
        Label of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, returned for convenience.

    Notes
    -----
    Values that are not strings (e.g. missing values) are left untouched
    instead of raising AttributeError. The column is iterated directly rather
    than building a full row Series via ``df.iloc[i]`` for every element.
    """
    df[column_name] = [
        value.strip() if isinstance(value, str) else value
        for value in df[column_name]
    ]
    return df

# step 1: scrape current data from ulta.com

In [6]:
# One HTTP session is created here and handed to the ulta helpers below.
session = requests.Session()
# NOTE(review): ulta.get_url_dict lives in ulta_functions (not shown here); its result is
# used like a dict keyed by listing URL -- confirm the value schema there.
all_url_info = ulta.get_url_dict(session)
# The keys are the URLs that the scraping cell iterates over.
urls = all_url_info.keys()

In [7]:
# Scrape every URL on a pool of 10 worker threads; each task calls ulta.scrape_url.
products = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Map each Future back to the URL it was submitted for, so failures can be reported.
    futures = {executor.submit(ulta.scrape_url, url, session, products, all_url_info): url for url in urls}
    for future in concurrent.futures.as_completed(futures):
        url = futures[future]
        try:
            data = future.result()
        except Exception as exc:
            # A failing URL is printed and skipped; the remaining futures are still consumed.
            print(url, ':', exc)
        else:
            # NOTE(review): every task receives the same `products` dict and `products` is
            # rebound to whatever each successful task returns. This only keeps all results
            # if scrape_url mutates and returns that shared dict -- confirm in ulta_functions.
            products = data

https://www.ulta.com/mens-fragrance-cologne?N=26wg&No=0&Nrpp=500 131
list index out of range 

https://www.ulta.com/mens-fragrance?N=26wf&No=0&Nrpp=500 135
list index out of range 

https://www.ulta.com/men-cologne?N=1wrfdjd&No=0&Nrpp=500 107
list index out of range 



In [14]:
# Release the HTTP session opened for the scraping step.
session.close()

In [21]:
# Turn the scraped dict into a frame. from_dict puts the outer keys of `products` in the
# columns, so the transpose moves them to the index, which is then named 'product_id'.
current_ulta_df = (
    pd.DataFrame.from_dict(products)
    .transpose()
    .rename_axis('product_id')
)

In [22]:
# Save a snapshot of the scrape to disk.
# NOTE(review): the output path is hard-coded and relative; the 'data' directory must
# already exist, and a re-run overwrites the file.
current_ulta_df.to_csv('data/ulta_df_909.csv')

### dealing with sale prices

If there's a sale price, that is the price the product is currently being sold for, so I want the price stored in the database to be the sale price. Whenever a product is on sale, I therefore replace its price with the sale price.

In [16]:
def fix_price(row):
    """Pick the price a product is currently sold for.

    `row` is looked up by the keys 'sale', 'price' and 'sale_price'. When the
    'sale' flag equals 0 the regular 'price' is returned, otherwise the
    'sale_price'. Only the chosen entry is read.
    """
    return row['price'] if row['sale'] == 0 else row['sale_price']

In [19]:
# price_str holds the price the product currently sells for (see fix_price).
current_ulta_df['price_str'] = current_ulta_df.apply(fix_price, axis=1)
# Stamp every row with today's date as a 'YYYY-MM-DD' string.
current_ulta_df['date'] = [datetime.date.today().strftime('%Y-%m-%d')] * len(current_ulta_df)
# NOTE(review): this cell is not idempotent. 'sale_price' is dropped from the frame bound
# to the same name, so running the cell a second time makes fix_price fail for any row
# whose 'sale' flag is not 0.
current_ulta_df = (
    current_ulta_df
    .drop(columns={'sale_price'})
    .fillna(value={'options' : ''})
)

In [20]:
# Display the frame to inspect the columns added above.
current_ulta_df

Unnamed: 0_level_0,sku_id,brand,product,url,rating,no_of_reviews,sale,price,offers,options,main_category,sub_category,sub_sub_category,price_str,date
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
xlsImpprod5770263,2264077,It Cosmetics,Bye Bye Redness Neutralizing Color-Correcting ...,https://www.ulta.com/bye-bye-redness-neutraliz...,4.50,2858,0,$34.00,Free Gift with Purchase!,3 Colors,makeup,face,,$34.00,2020-09-09
xlsImpprod15711051,2540112,L.A. Girl,HD Pro Concealer,https://www.ulta.com/hd-pro-concealer?productI...,4.10,1715,0,$4.99,"Buy 2, get 1 FREE - Add 3 items to qualify!",43 Colors,makeup,face,,$4.99,2020-09-09
xlsImpprod14491009,2503509,Urban Decay Cosmetics,Naked Skin Color Correcting Fluid,https://www.ulta.com/naked-skin-color-correcti...,4.40,844,0,$29.00,Free Gift with Purchase!,3 Colors,makeup,face,,$29.00,2020-09-09
xlsImpprod3590053,2233970,Smashbox,Photo Finish Reduce Redness Primer,https://www.ulta.com/photo-finish-reduce-redne...,4.10,854,0,$15.00 - $39.00,,2 Sizes,makeup,face,,$15.00 - $39.00,2020-09-09
xlsImpprod10791925,2222453,Clinique,Redness Solutions Makeup Broad Spectrum SPF 15...,https://www.ulta.com/redness-solutions-makeup-...,4.60,193,0,$31.00,Free Gift with Purchase!,4 Colors,makeup,face,,$31.00,2020-09-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xlsImpprod18411062,2524747,Redken,Travel Size Brews Grip Tight Holding Gel,https://www.ulta.com/travel-size-brews-grip-ti...,0.00,,0,$8.00,,,men,hair,,$8.00,2020-09-09
xlsImpprod17361149,2520342,Every Man Jack,Cedarwood Grooming Paste,https://www.ulta.com/cedarwood-grooming-paste?...,0.00,,0,$9.99,,,men,hair,,$9.99,2020-09-09
pimprod2016011,2563297,Frederick Benjamin,Crown Control Forming Creme,https://www.ulta.com/crown-control-forming-cre...,0,,0,$16.00,,,men,hair,,$16.00,2020-09-09
xlsImpprod18241089,2525333,American Crew,Techseries Boost Spray,https://www.ulta.com/techseries-boost-spray?pr...,0,,0,$17.95,"Buy 1, get 1 at 50% off!",,men,hair,,$17.95,2020-09-09


# step 2: find secret sales

### connect to database

In [None]:
# Connection settings come from the local `config` module and are passed to
# psycopg2.connect as keyword arguments.
params = config.config()
conn = psycopg2.connect(**params)

In [None]:
# Cursor used to run the lookup query below; it is closed again once the rows are fetched.
cur = conn.cursor()

In [None]:
# Inner query `tbl`: one row per (product, option) with the highest price and the latest
# ratings_and_offers date found across that product's joined rows.
# Outer query: joins back to ratings_and_offers on that latest date, and to prices on the
# same option, to pick up the rating, review count, offers and price_str recorded then.
# All joins are LEFT joins, so products without ratings/prices still appear (with NULLs).
query = """
    SELECT 
        tbl.product_id, rao.rating, rao.no_of_reviews, rao.offers, tbl.max_date, prices.price_str, tbl.max_price, tbl.option
    FROM (
        SELECT 
            products.products_pkey, products.product_id, prices.option, MAX(prices.price) as max_price, MAX(rao.date) as max_date
        FROM products
        LEFT JOIN ratings_and_offers rao ON products.products_pkey = rao.products_pkey_foreign
        LEFT JOIN prices ON rao.ratings_and_offers_pkey = prices.ratings_and_offers_pkey_foreign
        GROUP BY products.products_pkey, products.product_id, prices.option
        ) tbl
    LEFT JOIN ratings_and_offers rao ON rao.products_pkey_foreign = tbl.products_pkey AND rao.date = tbl.max_date
    LEFT JOIN prices ON prices.ratings_and_offers_pkey_foreign = rao.ratings_and_offers_pkey AND prices.option = tbl.option
    ORDER BY tbl.product_id
"""

In [None]:
# Run the lookup query; the rows are fetched in the next cell.
cur.execute(query)

In [None]:
# Pull the complete result set into memory; `r` feeds the db_dat frame below.
r = cur.fetchall()

In [None]:
# The rows are already in `r`, so the cursor is no longer needed.
cur.close()

In [None]:
# Column labels follow the order of the SELECT list in `query`.
db_dat = pd.DataFrame(
    r,
    columns=[
        'product_id', 'rating', 'no_of_reviews', 'offers',
        'max_date', 'price_str', 'max_price', 'option',
    ],
)

# One row per product: drop the per-option columns, collapse exact duplicates, then keep
# the row with the latest max_date for every product_id.
latest_ulta_dat = (
    db_dat
    .drop(columns=['max_price', 'option'])
    .drop_duplicates()
    .sort_values(by=['product_id', 'max_date'])
    .drop_duplicates(subset='product_id', keep='last')
    .set_index('product_id')
)

In [None]:
# Display the per-product snapshot built from the database rows.
latest_ulta_dat

In [None]:
# Left-join the fresh scrape onto the latest database snapshot, so every scraped product is
# kept and products unknown to the database get NaN in the database columns. Columns that
# exist in both frames (rating, no_of_reviews, offers, price_str) receive _x/_y suffixes.
merged_dat = (
    current_ulta_df
    .pipe(pd.merge, latest_ulta_dat, on='product_id', how='left')
)

# TODO: a row-by-row comparison was started here but never finished. The draft line
# `if merged_dat.iloc[i]['price_y']` had no condition body, which is a SyntaxError that
# stopped the whole cell from running. It also read a 'price_y' column that this merge
# does not create: 'price' exists only in current_ulta_df, so it keeps its plain name.
new = []

In [None]:
# Spot-check a single merged row.
# NOTE(review): 16879 is a hard-coded position that only makes sense for one particular
# scrape -- looks like leftover debugging.
merged_dat.iloc[16879]

In [None]:
# Look at every database row stored for one specific product (ad-hoc check).
db_dat.query("product_id == 'pimprod2006617'")

In [None]:
# Scraped rows whose price or rating differs from what the database holds.
#
# Both frames have rating, no_of_reviews, offers and price_str columns. With the default
# merge suffixes the database price became 'price_str_y', so the dropna/query/drop steps
# that referred to plain 'price_str' raised KeyError. Here the scraped columns keep their
# names and the database copies get a '_db' suffix.
# NOTE(review): the comparison is still scraped `price` vs. database `price_str`, as in the
# original expression -- confirm that `price_str` (the sale-adjusted price) was not meant.
not_in_db = (
    current_ulta_df
    .pipe(pd.merge, db_dat, on='product_id', how='left', suffixes=('', '_db'))
    # products without any database row have NaN here and are dropped
    .dropna(subset=['price_str_db'])
    .query('price != price_str_db | rating != rating_db')
    .drop(columns=['rating_db', 'no_of_reviews_db', 'offers_db', 'max_date', 'price_str_db'])
)

In [None]:
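#if a statement fails, the transaction stays aborted until it is rolled back; run the following afterwards
#(`cur` was closed a few cells above, so roll back through the connection rather than cur.execute("rollback;"))
#conn.rollback()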

In [None]:
# Close the database connection opened at the start of step 2.
conn.close()