In [1]:
import pandas as pd
import numpy as np
import requests
from retrying import retry
import re
from bs4 import BeautifulSoup
import time
import math
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow,Flow
from google.auth.transport.requests import Request
import os
import pickle
import ulta_functions as ulta
import google_api_functions as gapi
import google_sheets_credentials as creds
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import copy
import concurrent.futures
import json

In [436]:
# url the site redirects to when a product page no longer exists
_ULTA_NOT_FOUND_URL = 'https://www.ulta.com/404.jsp'


def _product_id_from_url(url):
    """Return the text that follows 'productId=' in ``url``, or None when the marker is absent."""
    parts = url.split('productId=')
    return parts[1] if len(parts) > 1 else None


def _extract_option_label(soup):
    """Return the label of the variant shown in ``soup``, or 'NA' when no label can be found."""
    #the option is sometimes in different locations; the meta tag is tried first
    color_meta = soup.find('meta', {'property' : 'product:color'})
    option = (color_meta.get('content') if color_meta is not None else None) or ''
    #checking other possible locations of option
    if option == '':
        color_panel = soup.find('div', {'class' : 'ProductDetail__colorPanel'})
        if color_panel is not None:
            spans = color_panel.find_all('span')
            #the label is read from the second span, so at least two spans are required
            if len(spans) > 1:
                option = spans[1].text
    if option == '':
        description = soup.find('span', {'class' : 'ProductVariantSelector__description'})
        if description is not None:
            option = description.text
    #putting the option as 'NA' if its label can't be found
    return option if option != '' else 'NA'


def _group_options_by_price(price_by_option):
    """Invert {option: price} into {price: 'option a, option b'} with the options sorted."""
    options_by_price = {}
    for option, price in price_by_option.items():
        options_by_price.setdefault(price, set()).add(option)
    #sorting makes the joined string deterministic (iteration order of a set is not)
    return {price: ", ".join(sorted(options)) for price, options in options_by_price.items()}


def get_products_in_stock(secret_sales, driver, page_load_delay=1):
    """Visit every product page and collect the in-stock variants whose price looks like a secret sale.

    Parameters
    ----------
    secret_sales : dict
        Maps a product key to a dict that has at least the keys 'url' and 'id'.
        A product's 'url' is overwritten in place when the site search lands on a
        page whose productId matches the product's 'id'.
    driver : selenium webdriver
        Browser used to open the pages (the Selenium 3 ``find_element(s)_by_*`` API is used).
    page_load_delay : int or float, optional
        Seconds to sleep after opening a page and after clicking a variant. Defaults to 1.

    Returns
    -------
    tuple
        ``(products_in_stock, secret_sales)`` where ``products_in_stock`` maps a product id to
        ``{price: 'option, option, ...'}`` and ``secret_sales`` is the (possibly updated) input.
        Products without any qualifying in-stock variant are left out.
    """
    products_in_stock = {}
    for product in secret_sales:
        product_id = secret_sales[product]['id']
        price_by_option = {}
        #opening product url in the driver/browser
        driver.get(secret_sales[product]['url'])
        #if the product doesn't exist anymore ulta will take you to this site; skip the product
        if driver.current_url == _ULTA_NOT_FOUND_URL:
            continue
        #making sure that the url is correct; when it is not, the product id is typed into the site search
        if _product_id_from_url(driver.current_url) != product_id:
            driver.find_element_by_xpath("//*[@id='navigation__wrapper--sticky']/div/div[1]/div[2]/div/a").click()
            driver.find_element_by_xpath("//*[@id='searchInput']").send_keys(product_id)
            driver.find_element_by_xpath("//*[@id='js-mobileHeader']/div/div/div/div[1]/div/div[1]/form/button").click()
            if driver.current_url == _ULTA_NOT_FOUND_URL:
                continue
            if _product_id_from_url(driver.current_url) == product_id:
                secret_sales[product]['url'] = driver.current_url
        #without this sleep the page doesn't finish loading
        time.sleep(page_load_delay)
        #getting all the product variants from the page
        product_variants = driver.find_elements_by_class_name('ProductSwatchImage__variantHolder')
        if len(product_variants) == 0:
            #products that only have one color or one size have their variant information in a different location
            product_variants = driver.find_elements_by_class_name('ProductDetail__productSwatches')
        for product_variant in product_variants:
            try:
                #clicking on each variant at a time to get their price and availability
                product_variant.click()
            except Exception:
                #if the variant can't be clicked, go to the next variant
                continue
            #without this sleep the page doesn't finish loading
            time.sleep(page_load_delay)
            #creating a BeautifulSoup object to extract data
            soup = BeautifulSoup(driver.page_source, features="lxml")
            #getting price; a page without the price meta tag cannot be evaluated
            price_meta = soup.find('meta', {'property' : 'product:price:amount'})
            price = price_meta.get('content') if price_meta is not None else None
            if not price:
                continue
            #attempting to catch other secret sale items that don't end with .97:
            #only prices whose last character is neither '0' nor '9' are kept
            if price.endswith(('0', '9')):
                continue
            #only adding the product variant if it's available
            if soup.find('div', {'class' : 'ProductDetail__availabilitySection ProductDetail__availabilitySection--error'}) is None:
                price_by_option[_extract_option_label(soup)] = price
        #products without any variant in stock are left out of the result
        if price_by_option:
            #grouping variants with the same price together so that each price point maps to a single
            #string listing the options (colors, sizes) available for it
            products_in_stock[product_id] = _group_options_by_price(price_by_option)
    return(products_in_stock, secret_sales)

In [2]:
print("\nstarting...\n")

# state shared by the cells below: one HTTP session plus two empty dicts
session = requests.Session()
all_url_info, products = {}, {}


starting...



In [3]:
# read the saved url metadata; the `with` block closes the file even when the JSON is malformed
with open("data/all_url_info_dict.json", "r") as f:
    all_url_info = json.load(f)
# the keys of the loaded dict are the urls passed to the scraper
urls = list(all_url_info.keys())

In [5]:
print('scraping ulta...')
# a pool of 10 threads runs ulta.scrape_url once per url
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # remember which url every submitted task belongs to, for error reporting
    futures = {}
    for url in urls:
        futures[executor.submit(ulta.scrape_url, url, session, products, all_url_info)] = url
    for future in concurrent.futures.as_completed(futures):
        url = futures[future]
        try:
            data = future.result()
        except Exception as exc:
            # a failing url is reported and the remaining results are still collected
            print(url, ':', exc)
            continue
        # NOTE(review): each finished task rebinds `products` to its return value;
        # presumably scrape_url returns the shared dict it was given - confirm
        products = data

scraping ulta...


In [274]:
# one row per entry of `products`, with the index labelled 'name'
ulta_df = pd.DataFrame.from_dict(products).transpose().rename_axis('name')
len(ulta_df)

16623

In [363]:
# previous snapshot, indexed by id; its columns get an 'old_' prefix so they can sit next to the new ones
old_column_names = {'price' : 'old_price', 'sale' : 'old_sale', 'secret_sale' : 'old_secret_sale'}
old_ulta_df = (
    pd.read_csv('data/ulta_df.csv')
    .rename(columns=old_column_names)
    .set_index('id')
)

In [364]:
# keep only the three renamed columns of the previous snapshot
old_ulta_df = old_ulta_df.loc[:, ['old_price', 'old_sale', 'old_secret_sale']]

In [395]:
# products present in both snapshots whose price text changed while neither snapshot marks them as on sale
changed_prices_df = (
    ulta_df
    .reset_index()
    .rename(columns={'index' : 'name'})
    .set_index('id')
    .merge(old_ulta_df, on='id', how='inner')
    .query('price != old_price')
    .dropna(subset=['price', 'old_price'])
    .query('sale == 0 & old_sale == 0')
)

In [396]:
def _price_to_float(price_text):
    """Turn one price string such as '$12.50' into a float so prices are compared as numbers."""
    # assumes a leading '$' (the original code skipped the first character) - TODO confirm
    return float(price_text.strip().lstrip('$').replace(',', ''))


def _is_not_price_drop(price, old_price):
    """Return True when a changed price should be discarded.

    ``price`` and ``old_price`` are price strings; a string containing '-' is
    treated as a range written as 'low - high'.
    """
    price_is_range = '-' in price
    old_is_range = '-' in old_price
    if old_is_range and not price_is_range:
        # the new single price equals the upper end of the old range
        return old_price.split(' - ')[1] == price
    if price_is_range and not old_is_range:
        # upper end of the new range is above the old single price;
        # compared as numbers, because as strings '$9.00' > '$10.00'
        return _price_to_float(price.split(' - ')[1]) > _price_to_float(old_price)
    if not price_is_range and not old_is_range:
        # plain price that did not go down
        return _price_to_float(price) >= _price_to_float(old_price)
    # both are ranges: the row is kept
    return False


# index labels of the rows that are not price drops
labels_to_drop = [
    label
    for label, price, old_price in zip(changed_prices_df.index, changed_prices_df['price'], changed_prices_df['old_price'])
    if _is_not_price_drop(price, old_price)
]
# `df` keeps the old_* columns, as before; every row carrying a flagged label is removed
df = changed_prices_df.loc[~changed_prices_df.index.isin(labels_to_drop)].copy()
changed_prices_df = df.drop(columns=['old_price', 'old_sale', 'old_secret_sale'])

In [407]:
print('data cleaning...')
#cleaning the data
#df built from everything in `products`
ulta_df = pd.DataFrame.from_dict(products).transpose()
#df of only the secret sales: rows flagged secret_sale == 1 and sale == 0, plus the changed-price rows
secret_sales_df = pd.concat([
    ulta_df.query('secret_sale == 1 & sale == 0')
    .reset_index()
    .rename(columns={'index' : 'name'})
    .set_index('id'),
    changed_prices_df,
]).drop_duplicates()
#secret_sales is a dictionary keyed by product name so that a product's url can be fixed later if needed
secret_sales = secret_sales_df.reset_index().set_index('name').transpose().to_dict()

data cleaning...


In [437]:
# location of the chromedriver binary; set the CHROMEDRIVER_PATH environment variable to
# override the default, which is the path this notebook was originally written with
chromedriver_path = os.environ.get('CHROMEDRIVER_PATH', r'C:\Users\elerm\Downloads\chromedriver_win32\chromedriver.exe')
driver = webdriver.Chrome(chromedriver_path)

In [438]:
# visit every secret-sale product page with the browser; `secret_sales` is rebound to the
# returned dict because get_products_in_stock may rewrite a product's 'url'
products_in_stock, secret_sales = get_products_in_stock(secret_sales, driver)

In [456]:
# release the HTTP session and the browser; the finally clause makes sure the browser
# is shut down even if closing the session raises
try:
    session.close()
finally:
    # quit() closes every window and ends the driver session, so no separate close() is needed
    driver.quit()

In [450]:
#more data cleaning...
products_in_stock_df = pd.DataFrame.from_dict(products_in_stock).transpose().reset_index().rename(columns={'index' : 'id'})
products_in_stock_df = pd.melt(products_in_stock_df, id_vars=['id'], var_name='price2', value_name='options2').dropna().set_index('id')
secret_sales_df = pd.DataFrame.from_dict(secret_sales).transpose().reset_index().rename(columns={'index' : 'name'}).set_index('id')

In [451]:
# attach the product details to every in-stock (id, price) row; the left join keeps all in-stock rows
secret_sales_in_stock = products_in_stock_df.merge(secret_sales_df, on='id', how='left')

In [452]:
# the scraped price2/options2 columns replace the original price/options columns
secret_sales_in_stock = (
    secret_sales_in_stock
    .drop(columns=['price', 'options'])
    .reset_index()
    .rename(columns={'price2' : 'price', 'options2' : 'options', 'desc' : 'product', 'index' : 'id'})
)
# columns in the order they are exported; missing values become a single space
export_columns = [
    'main_category', 'sub_category', 'sub_sub_category', 'name', 'brand', 'product',
    'price', 'options', 'offers', 'rating', 'number_of_reviews', 'url',
]
df = secret_sales_in_stock[export_columns].fillna(' ')

In [454]:
# the same refresh sequence runs for both spreadsheets; each entry pairs the key passed to
# the `creds` helpers with the progress message printed for that spreadsheet
sheet_targets = [
    ('main_mod', "updating sheet hosted on mod's google drive..."),
    ('main_local', 'updating sheet hosted on my google drive...'),
]
for target, progress_message in sheet_targets:
    print(progress_message)
    gapi.Create_Service(creds.get_credentials_file(target), creds.get_token_write_file(target), 'sheets', 'v4', ['https://www.googleapis.com/auth/spreadsheets'])
    sheet_id = creds.get_sheet_id(target)
    gapi.Clear_Sheet(sheet_id)
    gapi.Export_Data_To_Sheets(sheet_id, df)
    # the filter call receives the exported frame's row and column counts
    gapi.Update_Filter(sheet_id, creds.get_filter_id(target), len(df), len(df.columns))

updating sheet hosted on mod's google drive...
sheets service created successfully
Sheet successfully cleared
Sheet successfully updated
Filter successfully updated
updating sheet hosted on my google drive...
sheets service created successfully
Sheet successfully cleared
Sheet successfully updated
Filter successfully updated


In [None]:
def Export_Data_To_Sheets(gsheetId, df):
    """Write ``df`` to the spreadsheet ``gsheetId`` as raw values.

    Uses the module-level ``service`` and ``SAMPLE_RANGE_NAME``, which are not defined
    in this cell. Prints a confirmation once the request has executed; returns None.
    """
    # transpose / reset_index / transpose moves the column labels into the first row of values
    rows_with_header = df.T.reset_index().T.values.tolist()
    payload = dict(majorDimension='ROWS', values=rows_with_header)
    update_request = service.spreadsheets().values().update(
        spreadsheetId=gsheetId,
        valueInputOption='RAW',
        range=SAMPLE_RANGE_NAME,
        body=payload,
    )
    update_request.execute()
    print('Sheet successfully updated')

In [None]:
def Update_Filter(gsheetId, filterId, rows, cols):
    """Set the range of filter view ``filterId`` in spreadsheet ``gsheetId``.

    The range starts at row 0 / column 0 of the sheet with sheetId 0 and ends at row
    ``rows + 1`` and column ``cols``. Uses the module-level ``service``, which is not
    defined in this cell. Prints a confirmation once the request has executed; returns None.
    """
    update_request = {
        'updateFilterView': {
            'filter': {
                'filterViewId': filterId,
                'range': {
                    'sheetId': 0,
                    'startRowIndex': 0,
                    'startColumnIndex': 0,
                    # presumably one extra row for the header line written by the export - confirm
                    'endRowIndex': rows + 1,
                    'endColumnIndex': cols,
                },
            },
            # only the 'range' field of the filter view is sent for update
            'fields': {
                'paths': 'range'
            },
        }
    }
    service.spreadsheets().batchUpdate(
        spreadsheetId=gsheetId,
        body={'requests': [update_request]},
    ).execute()
    print('Filter successfully updated')

In [None]:
def addHyperlink(hyperlink, text, sheetId, rowIndex, colIndex, spreadsheetId=None, sheets_service=None):
    """Write a ``=HYPERLINK(...)`` formula into one cell of a spreadsheet.

    Parameters
    ----------
    hyperlink : str
        Target url; it is wrapped in double quotes inside the formula.
    text : str
        Label of the link; it is wrapped in double quotes inside the formula.
    sheetId, rowIndex, colIndex
        Sent as the ``start`` coordinate (sheetId / rowIndex / columnIndex) of the updateCells request.
    spreadsheetId : str, optional
        Id of the spreadsheet to update. It must be supplied: the previous code read it from
        ``self``, which does not exist in a plain function.
    sheets_service : optional
        Object exposing ``spreadsheets().batchUpdate(...)``; when omitted, the module-level
        ``service`` is used.

    Returns
    -------
    Whatever ``execute()`` of the batchUpdate request returns.

    Raises
    ------
    ValueError
        If ``spreadsheetId`` is not given.
    """
    if spreadsheetId is None:
        raise ValueError('spreadsheetId is required')

    def _quoted(value):
        # string arguments of a formula are double-quoted; embedded double quotes are doubled
        return '"{}"'.format(str(value).replace('"', '""'))

    update_cells_request = {
        "updateCells": {
            "rows": [
                {
                    "values": [{
                        "userEnteredValue": {
                            "formulaValue": "=HYPERLINK({},{})".format(_quoted(hyperlink), _quoted(text))
                        }
                    }]
                }
            ],
            "fields": "userEnteredValue",
            "start": {
                "sheetId": sheetId,
                "rowIndex": rowIndex,
                "columnIndex": colIndex
            }
        }}
    body = {
        "requests": [update_cells_request]
    }
    client = service if sheets_service is None else sheets_service
    request = client.spreadsheets().batchUpdate(spreadsheetId=spreadsheetId, body=body)
    return request.execute()