In [1]:
import pandas as pd
import numpy as np
import requests
import datetime
import copy
import time
import math
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import concurrent.futures
import psycopg2
from psycopg2.errors import UniqueViolation
import config
import ulta_functions as ulta

In [2]:
def get_driver(headless, old_driver = '', driver_path = r'C:\Users\elerm\Downloads\chromedriver_win32\chromedriver.exe'):
    """Start a fresh Chrome WebDriver, shutting down a previous one first.

    Parameters
    ----------
    headless : int
        When equal to 1 Chrome is started with the ``--headless`` flag;
        any other value starts a regular browser window.
    old_driver : WebDriver or '', optional
        A previously created driver. If given and its ``session_id`` is not
        None, it is closed and quit before the new driver is created.
    driver_path : str, optional
        Location of the chromedriver executable. Defaults to the path that
        was previously hard-coded in this function.

    Returns
    -------
    WebDriver
        The newly created driver. Its session id is printed as a side effect.
    """
    # Shut down the previous browser (if any) so sessions do not pile up.
    if old_driver != '':
        session_id = old_driver.session_id
        if session_id is not None:
            old_driver.close()
            old_driver.quit()
    # Create the new driver and return it.
    if headless == 1:
        # Options is not imported at the top of the notebook, so import it
        # here; without this the headless branch raised NameError.
        from selenium.webdriver.chrome.options import Options
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        new_driver = webdriver.Chrome(driver_path, options = chrome_options)
    else:
        new_driver = webdriver.Chrome(driver_path)
    print(new_driver.session_id)
    return(new_driver)

In [3]:
def get_products_in_stock(secret_sales):
    """Visit each product page and collect the price of every clickable variant.

    Parameters
    ----------
    secret_sales : dict
        Mapping of product_id -> dict that contains at least a ``'url'`` key.

    Returns
    -------
    dict
        ``{product_id: {option: price}}`` where ``option`` is whatever
        ``ulta.get_option`` returns for the page after the variant was clicked
        and ``price`` is the ``content`` of the ``product:price:amount`` meta
        tag. Variants whose swatch could not be clicked are left out.

    Notes
    -----
    The browser is restarted (followed by a 60 second pause) on every 100th
    product, and is always quit before the function returns or raises.
    """
    driver = get_driver(0)
    products_in_stock = {}
    try:
        wait = WebDriverWait(driver, 30)
        for counter, product_id in enumerate(secret_sales, start=1):
            if counter % 100 == 0:
                # Periodically recycle the browser and pause before continuing.
                driver = get_driver(0, driver)
                wait = WebDriverWait(driver, 30)
                time.sleep(60)
            variants_in_stock = {}
            driver.get(secret_sales[product_id]['url'])
            time.sleep(1)
            product_variants = driver.find_elements(By.CLASS_NAME, 'ProductSwatchImage__variantHolder')
            if len(product_variants) == 0:
                # Fall back to the alternative swatch container class.
                product_variants = driver.find_elements(By.CLASS_NAME, 'ProductDetail__productSwatches')
            for product_variant in product_variants:
                try:
                    product_variant.click()
                except Exception:
                    # Swatch could not be clicked; skip this variant.
                    continue
                wait.until(EC.presence_of_element_located((By.XPATH, "/html/head/meta[10]")))
                soup = BeautifulSoup(driver.page_source, features="lxml")
                price = soup.find('meta', {'property' : 'product:price:amount'}).get('content')
                option = ulta.get_option(soup)
                variants_in_stock[option] = price
            products_in_stock[product_id] = variants_in_stock
    finally:
        # Always release the browser; previously it was left running.
        try:
            driver.quit()
        except Exception:
            # Best-effort cleanup: never mask an error raised by the scrape itself.
            pass
    return(products_in_stock)

In [4]:
def add_row_product_tbl(conn, product_id, sku_id, brand, product_name, url):
    """Insert a product row and return its ``product_pkey``.

    If the insert hits a unique constraint, the existing row's
    ``product_pkey`` is looked up by ``product_id`` and returned instead.

    Parameters
    ----------
    conn : psycopg2 connection
        Open database connection. Any uncommitted work on it is rolled back
        on entry (as before), and a successful insert is committed
        immediately.
    product_id, sku_id, brand, product_name, url
        Column values for the new ``product`` row.

    Returns
    -------
    The ``product_pkey`` of the inserted or already existing row.

    Raises
    ------
    Any database error other than ``UniqueViolation`` propagates to the
    caller. Previously a ``return`` inside ``finally`` replaced such errors
    with an UnboundLocalError.
    """
    insert_sql = """INSERT INTO product (product_id, sku_id, brand, product_name, url) VALUES (%s, %s, %s, %s, %s) RETURNING product_pkey"""
    select_sql = """SELECT product_pkey FROM product WHERE (product_id = %s)"""
    # Start from a clean transaction state, as the original did.
    conn.rollback()
    cur = conn.cursor()
    try:
        try:
            cur.execute(insert_sql, (product_id, sku_id, brand, product_name, url))
            product_pkey = cur.fetchone()[0]
            # Commit right away: the rollback at the start of the next call
            # would otherwise discard this insert while its pkey is still
            # being used by the caller.
            conn.commit()
        except UniqueViolation:
            # The failed insert aborted the transaction; reset it before querying.
            conn.rollback()
            cur.execute(select_sql, (product_id,))
            product_pkey = cur.fetchone()[0]
    finally:
        cur.close()
    return(product_pkey)

In [9]:
# Print the name of every file under ./data whose name contains 'ulta_df_'.
for _dirpath, _dirnames, filenames in os.walk("./data"):
    matching_names = [name for name in filenames if 'ulta_df_' in name]
    for name in matching_names:
        print(name)

ulta_df_813.csv
ulta_df_823.csv
ulta_df_827.csv
ulta_df_829.csv
ulta_df_831.csv
ulta_df_901.csv
ulta_df_903.csv
ulta_df_904.csv
ulta_df_905.csv
ulta_df_909.csv
ulta_df_910.csv
ulta_df_911.csv


## 2 tables: product and price

In [10]:
# Load the ulta_df_911.csv snapshot, name the row index 'ulta_id' and
# rename the 'skuid' column to 'sku_id'.
ulta_df = (
    pd.read_csv('data/ulta_df_911.csv')
    .rename_axis('ulta_id')
    .rename(columns={'skuid' : 'sku_id'})
)

### product

In [12]:
# Columns destined for the `product` table.
product_df = ulta_df.loc[:, ['product_id', 'sku_id', 'brand', 'product', 'url']]

### price

In [61]:
# Columns needed to derive the rows of the `price` table.
price_df = (
    ulta_df
    .loc[:, ['product_id', 'url', 'price', 'options', 'sale', 'sale_price']]
)

In [62]:
def fix_price(row):
    """Return the price that applies to a row.

    The regular ``'price'`` is used when ``row['sale']`` equals 0; otherwise
    the ``'sale_price'`` is returned.
    """
    source_column = 'price' if row['sale'] == 0 else 'sale_price'
    return row[source_column]

In [63]:
# price_str holds the regular price when sale == 0, otherwise the sale price.
price_df['price_str'] = price_df.apply(fix_price, axis=1)

In [64]:
# sale_price is no longer needed on its own once price_str has been derived.
price_df = price_df.drop(columns=['sale_price'])

In [65]:
# Replace missing 'options' with '' so the substring checks in set_type work on strings.
price_df = price_df.fillna(value={'options' : ''})

In [66]:
def set_type(row):
    """Classify a row by its ``'options'`` text.

    Returns 1 when the text mentions 'Colors' or 'Scents', 2 when it is the
    empty string, and 3 for anything else.
    """
    description = row['options']
    if any(keyword in description for keyword in ('Colors', 'Scents')):
        return 1
    return 2 if description == '' else 3

In [67]:
# type: 1 = colors/scents, 2 = no options, 3 = anything else (see set_type).
price_df['type'] = price_df.apply(set_type, axis=1)

In [68]:
def set_option(row):
    """Return the option label for a type 1 or type 2 row.

    Returns
    -------
    str
        'colors' or 'scents' for a type 1 row (depending on which word
        appears in ``row['options']``), 'single' for a type 2 row.

    Raises
    ------
    ValueError
        If the row is neither type 2 nor a type 1 row mentioning 'Colors' or
        'Scents'. Previously this case failed with an UnboundLocalError.
    """
    if row['type'] == 1:
        if 'Colors' in row['options']:
            return 'colors'
        if 'Scents' in row['options']:
            return 'scents'
    elif row['type'] == 2:
        return 'single'
    raise ValueError(
        "set_option only handles type 1 (Colors/Scents) and type 2 rows; got type={!r}, options={!r}".format(
            row['type'], row['options']
        )
    )

In [69]:
# Independent copy of the type 1 and type 2 rows; these are not passed to the scraper.
no_search_df = price_df.loc[price_df['type'].isin([1, 2])].copy()

In [70]:
# option is 'colors', 'scents' or 'single' for these rows (see set_option).
no_search_df['option'] = no_search_df.apply(set_option, axis=1)

In [71]:
# The raw options text is kept under the name option_desc.
no_search_df = no_search_df.rename(columns={'options' : 'option_desc'})

In [72]:
def get_price(row):
    """Parse ``row['price_str']`` into a float.

    For a range such as ``'$15.00 - $39.00'`` the first (left-hand) amount is
    returned. The dash may or may not be surrounded by spaces, a leading
    ``'$'`` is optional, and thousands separators (``'$1,299.00'``) are
    accepted.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    # Everything before the first dash is the lower bound (or the only price).
    first_amount = row['price_str'].split('-')[0]
    return float(first_amount.strip().lstrip('$').replace(',', ''))

In [73]:
# Numeric price parsed from price_str (first amount when it is a range).
no_search_df['price_num'] = no_search_df.apply(get_price, axis=1)
# Drop the columns that are not written to the price table.
no_search_df = no_search_df.drop(columns={'price', 'url', 'type'})

In [74]:
# Independent copy of the type 3 rows; these are the products handed to the scraper.
search_df = price_df.loc[price_df['type'] == 3].copy()

# {product_id: {column: value}} lookup built from those rows.
search = search_df.set_index('product_id').T.to_dict()

In [29]:
# Scrape every type 3 product page with Selenium. The function sleeps after each
# page load and pauses 60 seconds on every 100th product, so this cell is slow.
products_in_stock = get_products_in_stock(search)

74b0ec201f2cb319662d21a0dcfe1d0e
cac30e8c8149a6bdea280541220b42b4
a1976d5726deb4b7d0f2da2079f1174c
67f6a4eac14d946b2f6069b4fbad748e
9aca2dbc0264898fa3cbab7f024100bc
41951a95e72abc38613b98093d04428f
b64f04ad500143108fd687bb38936647
85392e5ed329e52d9fb15a83fbe05139
520f9df2ad7dba17f2f44bc48c67ddd8
c19f705853b4f288c38b363230d06bf1
10a795fa11b400fe4f1026891a6a2477


In [75]:
# Turn the scraped {product_id: {option: price}} mapping into one row per
# (product_id, option) pair and attach the original listing columns.
search_df = (
    pd.DataFrame.from_dict(products_in_stock)
    .transpose()
    .reset_index()
    .rename(columns={'index' : 'product_id'})
    # wide -> long: one row per product/option combination
    .pipe(pd.melt, id_vars=['product_id'], var_name='option', value_name='price_num')
    # drop product/option combinations for which no price was scraped
    .dropna()
    .set_index('product_id')
    # left-join the pre-scrape type 3 rows (the previous search_df) on product_id
    .pipe(pd.merge, search_df, on='product_id', how='left')
    .drop(columns={'price', 'url', 'type'})
    .rename(columns={'options' : 'option_desc'})
)

In [76]:
# Inspect the per-variant rows built from the scrape.
search_df

Unnamed: 0,product_id,option,price_num,option_desc,sale,price_str
0,xlsImpprod3590053,0.41 oz,15.00,2 Sizes,0,$15.00 - $39.00
1,xlsImpprod10991261,0.41 oz,15.00,2 Sizes,0,$15.00 - $39.00
2,pimprod2014100,0.41 oz,15.00,2 Sizes,0,$15.00 - $39.00
3,xlsImpprod3590053,1.0 oz,39.00,2 Sizes,0,$15.00 - $39.00
4,xlsImpprod3590041,1.0 oz,37.00,2 Sizes,0,$15.00 - $37.00
...,...,...,...,...,...,...
1873,prod5022101,18.0 oz,11.99,3 Sizes,0,$8.99 - $11.99
1874,VP00254,21.0 oz,33.00,2 Sizes,0,$33.00 - $44.00
1875,1887,21.0 oz,33.00,2 Sizes,0,$33.00 - $44.00
1876,pimprod2011277,XS/S,14.99,2 Sizes,0,$14.99


In [77]:
# Stack the scraped (type 3) rows on top of the type 1/2 rows; product_id ends up as a regular column again.
price_df = pd.concat([search_df.set_index('product_id'), no_search_df.set_index('product_id')]).reset_index()

In [78]:
# Inspect the combined price rows.
price_df

Unnamed: 0,product_id,option,price_num,option_desc,sale,price_str
0,xlsImpprod3590053,0.41 oz,15.00,2 Sizes,0,$15.00 - $39.00
1,xlsImpprod10991261,0.41 oz,15.00,2 Sizes,0,$15.00 - $39.00
2,pimprod2014100,0.41 oz,15.00,2 Sizes,0,$15.00 - $39.00
3,xlsImpprod3590053,1.0 oz,39.00,2 Sizes,0,$15.00 - $39.00
4,xlsImpprod3590041,1.0 oz,37.00,2 Sizes,0,$15.00 - $37.00
...,...,...,...,...,...,...
17741,xlsImpprod18411062,single,8,,0,$8.00
17742,xlsImpprod17361149,single,9.99,,0,$9.99
17743,pimprod2016011,single,16,,0,$16.00
17744,xlsImpprod18241089,single,17.95,,0,$17.95


In [79]:
# Stamp every price row with the entry date (scalar is broadcast to all rows).
price_df['date'] = '09/11/2020'

In [80]:
# Cast the sale flag to str.
price_df = price_df.astype({'sale': 'str'})

In [81]:
# Cast sku_id to str.
product_df = product_df.astype({'sku_id' : 'str'})

In [82]:
# Inspect the product rows.
product_df

Unnamed: 0_level_0,product_id,sku_id,brand,product,url
ulta_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,xlsImpprod5770263,2264077,It Cosmetics,Bye Bye Redness Neutralizing Color-Correcting ...,https://www.ulta.com/bye-bye-redness-neutraliz...
1,xlsImpprod15711051,2540112,L.A. Girl,HD Pro Concealer,https://www.ulta.com/hd-pro-concealer?productI...
2,xlsImpprod14491009,2503509,Urban Decay Cosmetics,Naked Skin Color Correcting Fluid,https://www.ulta.com/naked-skin-color-correcti...
3,xlsImpprod3590053,2233970,Smashbox,Photo Finish Reduce Redness Primer,https://www.ulta.com/photo-finish-reduce-redne...
4,xlsImpprod10791925,2222453,Clinique,Redness Solutions Makeup Broad Spectrum SPF 15...,https://www.ulta.com/redness-solutions-makeup-...
...,...,...,...,...,...
16900,xlsImpprod18411062,2524747,Redken,Travel Size Brews Grip Tight Holding Gel,https://www.ulta.com/travel-size-brews-grip-ti...
16901,xlsImpprod17361149,2520342,Every Man Jack,Cedarwood Grooming Paste,https://www.ulta.com/cedarwood-grooming-paste?...
16902,pimprod2016011,2563297,Frederick Benjamin,Crown Control Forming Creme,https://www.ulta.com/crown-control-forming-cre...
16903,xlsImpprod18241089,2525333,American Crew,Techseries Boost Spray,https://www.ulta.com/techseries-boost-spray?pr...


In [83]:
# Inspect the price rows, now including the date column.
price_df

Unnamed: 0,product_id,option,price_num,option_desc,sale,price_str,date
0,xlsImpprod3590053,0.41 oz,15.00,2 Sizes,0,$15.00 - $39.00,09/11/2020
1,xlsImpprod10991261,0.41 oz,15.00,2 Sizes,0,$15.00 - $39.00,09/11/2020
2,pimprod2014100,0.41 oz,15.00,2 Sizes,0,$15.00 - $39.00,09/11/2020
3,xlsImpprod3590053,1.0 oz,39.00,2 Sizes,0,$15.00 - $39.00,09/11/2020
4,xlsImpprod3590041,1.0 oz,37.00,2 Sizes,0,$15.00 - $37.00,09/11/2020
...,...,...,...,...,...,...,...
17741,xlsImpprod18411062,single,8,,0,$8.00,09/11/2020
17742,xlsImpprod17361149,single,9.99,,0,$9.99,09/11/2020
17743,pimprod2016011,single,16,,0,$16.00,09/11/2020
17744,xlsImpprod18241089,single,17.95,,0,$17.95,09/11/2020


## add tables to database

### connect to database

In [90]:
# Connection parameters come from the local config module.
params = config.config()
conn = psycopg2.connect(**params)

### make sure product isn't already in database by adding product_pkey column where value is -1 if it does not exist

In [91]:
# Fetch the (product_id, product_pkey) pairs already stored in the database.
cur = conn.cursor()
cur.execute("""SELECT product_id, product_pkey FROM product""")
r = cur.fetchall()
cur.close()

# Left-join them onto product_df; products without a match get product_pkey = -1.
merged_product_df = (
    product_df
    .merge(
        pd.DataFrame(r, columns=['product_id', 'product_pkey']),
        on = 'product_id',
        how = 'left'
    )
    .fillna(value = {'product_pkey' : -1})
    .astype({'product_pkey' : 'float64'})
)

In [92]:
# Inspect the products together with their product_pkey (-1 where not yet in the database).
merged_product_df

Unnamed: 0,product_id,sku_id,brand,product,url,product_pkey
0,xlsImpprod5770263,2264077,It Cosmetics,Bye Bye Redness Neutralizing Color-Correcting ...,https://www.ulta.com/bye-bye-redness-neutraliz...,85557.0
1,xlsImpprod15711051,2540112,L.A. Girl,HD Pro Concealer,https://www.ulta.com/hd-pro-concealer?productI...,85558.0
2,xlsImpprod14491009,2503509,Urban Decay Cosmetics,Naked Skin Color Correcting Fluid,https://www.ulta.com/naked-skin-color-correcti...,85559.0
3,xlsImpprod3590053,2233970,Smashbox,Photo Finish Reduce Redness Primer,https://www.ulta.com/photo-finish-reduce-redne...,85560.0
4,xlsImpprod10791925,2222453,Clinique,Redness Solutions Makeup Broad Spectrum SPF 15...,https://www.ulta.com/redness-solutions-makeup-...,85561.0
...,...,...,...,...,...,...
16900,xlsImpprod18411062,2524747,Redken,Travel Size Brews Grip Tight Holding Gel,https://www.ulta.com/travel-size-brews-grip-ti...,102414.0
16901,xlsImpprod17361149,2520342,Every Man Jack,Cedarwood Grooming Paste,https://www.ulta.com/cedarwood-grooming-paste?...,102421.0
16902,pimprod2016011,2563297,Frederick Benjamin,Crown Control Forming Creme,https://www.ulta.com/crown-control-forming-cre...,102416.0
16903,xlsImpprod18241089,2525333,American Crew,Techseries Boost Spray,https://www.ulta.com/techseries-boost-spray?pr...,102423.0


### write them to database

In [93]:
# Build the parameter tuples for the price INSERT.
# For every product: reuse its product_pkey when one was found in the database,
# otherwise insert the product first; then emit one tuple per price row.

# Group the price rows once instead of running a string-formatted query per
# product (which was O(products x prices) and broke on quotes in product_id).
price_rows_by_product = {
    product_id: group for product_id, group in price_df.groupby('product_id', sort=False)
}

collected_rows = []
for product in merged_product_df.itertuples(index=False):
    if product.product_pkey != -1:
        product_pkey = product.product_pkey
    else:
        product_pkey = add_row_product_tbl(conn, product.product_id, product.sku_id, product.brand, product.product, product.url)
    # product_pkey is float64 in merged_product_df; pass a plain int as the foreign key.
    product_pkey = int(product_pkey)
    product_prices = price_rows_by_product.get(product.product_id)
    if product_prices is None:
        # No price rows for this product.
        continue
    for price_row in product_prices.itertuples(index=False):
        collected_rows.append(
            (price_row.option, price_row.option_desc, price_row.price_num, price_row.price_str, price_row.sale, price_row.date, product_pkey)
        )

# The original prepended each tuple, so the final order is the reverse of the
# order in which rows were produced; keep that order and the tuple type.
row_tuples = tuple(reversed(collected_rows))

In [94]:
# Insert all price rows. The cursor is used as a context manager so it is
# closed even when executemany raises.
sql = """INSERT INTO price (option, option_desc, price_num, price_str, sale, price_entry_date, product_pkey_foreign) VALUES (%s, %s, %s, %s, %s, %s, %s)"""
with conn.cursor() as cur:
    cur.executemany(sql, row_tuples)

### commit the changes and close connection

In [95]:
# Persist the inserted rows, then release the connection.
conn.commit()
conn.close()

In [None]:
# Problem: the original idea was to group products by 'option' so that when a color is added and option_desc changes from '22 Colors' to '23 Colors', the product is still treated
# as the same one. For the type 3 products, my idea was to set 'option' to 'sizes' and put the value scraped with selenium (e.g. '4 oz') in 'option_desc', but then 'option' would be the
# same for every variation of a product, which isn't accurate. So I think each product variation needs its own entry in the price table, with 'option' set to the selenium
# value and 'option_desc' set to the original options text ('2 Sizes', etc.).