In [1]:
import pandas as pd
import numpy as np
import requests
from retrying import retry
import re
from bs4 import BeautifulSoup
import time
import math
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow,Flow
from google.auth.transport.requests import Request
import os
import pickle
import ulta_functions as ulta
from selenium import webdriver
import copy
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException

In [3]:
session = requests.Session()

In [237]:
all_url_info = {}
front_page = session.get('https://www.ulta.com/')
front_page_soup = BeautifulSoup(front_page.content)
for anchor in front_page_soup.find_all('a', {'class' : 'Anchor'}):
    if anchor.get('data-nav-description') is not None and anchor.get('data-nav-description').startswith('m') and re.search(r'[a-z]*:[a-z]*', anchor.get('data-nav-description')) is not None:
        directories = anchor.get('data-nav-description')[4:].split(':')
        if 'featured' not in directories[1] and directories[0] not in ['new arrivals', 'ulta beauty collection', 'gifts', 'sale & coupons', 'beauty tips']:
            base_url = anchor.get('href')
            base_page = session.get(base_url)
            base_page_soup = BeautifulSoup(base_page.content)
            num_results = int(re.findall(r'\b\d+\b', base_page_soup.find('h2', {'class' : 'search-res-title'}).find('span', {'class' : 'sr-only'}).text)[0])
            for i in range(math.ceil(num_results / 500)):
                url_info = {}
                url_info['main_category'] = directories[0]
                url_info['sub_category'] = directories[1]
                if len(directories) == 2:
                    url_info['sub_sub_category'] = '_'
                else:
                    url_info['sub_sub_category'] = directories[2]
                url = base_url + '&No=' + str(j * 500) + '&Nrpp=500'
                all_url_info[url] = url_info    

In [239]:
@retry(wait_fixed=2000, stop_max_attempt_number=10)
def scrape_url(url, session, products, all_url_info):
    page = session.get(url)
    soup = BeautifulSoup(page.content)
    product_containers = soup.find_all('div', {'class' : 'productQvContainer'})
    main_category = all_url_info[url]['main_category']
    sub_category = all_url_info[url]['sub_category']
    sub_sub_category = all_url_info[url]['sub_sub_category']
    for product_container in product_containers:
        try:
            product, product_name = get_single_product(soup, product_container, main_category, sub_category, sub_sub_category)
            products[product_name] = product
        except Exception as exc:
            print(url, product_containers.index(product_container))
            print(exc, '\n')
    return(products)

In [240]:
def get_single_product(soup, product_container, main_category, sub_category, sub_sub_category):
    product = {}
    product['url'] = 'https://www.ulta.com' + product_container.find('a', {'class' : 'product'}).get('href')
    product['id'] = product_container.find('span', {'class' : 'prod-id'}).text.strip()
    product['brand'] = product_container.find('h4', {'class' : 'prod-title'}).text.strip()
    product['desc'] = product_container.find('p', {'class' : 'prod-desc'}).text.strip()
    product_name = product['brand'] + ' ' + product['desc']
    if product_container.find('label', {'class' : 'sr-only'}) is not None:
        product['rating'] = product_container.find('label', {'class' : 'sr-only'}).text.split(' ')[0]
    if product_container.find('span', {'class' : 'prodCellReview'}) is not None:
        product['number_of_reviews'] = re.findall(r'\b\d+\b', product_container.find('span', {'class' : 'prodCellReview'}).text)[0]
    if product_container.find('div', {'class' : 'productSale'}) is None:
        product['sale'] = 0
        product['price'] = product_container.find('span', {'class' : 'regPrice'}).text.strip()
    else:
        product['sale'] = 1
        product['price'] = product_container.find('span', {'class' : 'pro-old-price'}).text.strip()
        product['sale_price'] = product_container.find('span', {'class' : 'pro-new-price'}).text.strip()
    if '.97' in product['price']:
        product['secret_sale'] = 1
    else:
        product['secret_sale'] = 0
    if product_container.find('div', {'class' : 'product-detail-offers'}) is not None:
        product['offers'] = product_container.find('div', {'class' : 'product-detail-offers'}).text.strip()
    product['main_category'] = main_category
    product['sub_category'] = sub_category
    product['sub_sub_category'] = sub_sub_category
    return(product, product_name)

In [256]:
products = {}
for url in list(all_url_info.keys()):
    products = scrape_url(url, session, products, all_url_info)

In [257]:
len(products)

16656

In [246]:
products2 = {}
for url in all_url_info_old.keys():
    products2 = scrape_url(url, session, products2, all_url_info_old)

In [277]:
all_url_info

{'https://www.ulta.com/makeup-face?N=26y3&No=0&Nrpp=500': {'main_category': 'makeup',
  'sub_category': 'face',
  'sub_sub_category': '_'},
 'https://www.ulta.com/makeup-face-foundation?N=26y5&No=0&Nrpp=500': {'main_category': 'makeup',
  'sub_category': 'face',
  'sub_sub_category': 'foundation'},
 'https://www.ulta.com/makeup-face-powder?N=26y8&No=0&Nrpp=500': {'main_category': 'makeup',
  'sub_category': 'face',
  'sub_sub_category': 'face powder'},
 'https://www.ulta.com/makeup-face-concealer?N=26y6&No=0&Nrpp=500': {'main_category': 'makeup',
  'sub_category': 'face',
  'sub_sub_category': 'concealer'},
 'https://www.ulta.com/makeup-face-color-correcting?N=uo37yr&No=0&Nrpp=500': {'main_category': 'makeup',
  'sub_category': 'face',
  'sub_sub_category': 'color correcting'},
 'https://www.ulta.com/makeup-face-primer?N=26y4&No=0&Nrpp=500': {'main_category': 'makeup',
  'sub_category': 'face',
  'sub_sub_category': 'face primer'},
 'https://www.ulta.com/makeup-face-bb-cc-creams?N=277u

In [275]:
products['Maybelline Expert Wear Eyeshadow']

KeyError: 'Maybelline Expert Wear Eyeshadow'

In [273]:
product

'Delectable Triple Coconut Cream'

In [274]:
for product in products2:
    if product not in products:
        print(product)
        print(products2[product]['main_category'], products2[product]['sub_category'], products2[product]['sub_sub_category'])

Dionis Goat Milk Beauty Balm
makeup face _
Tarte Sugar Rush - Lid Poppers Glitter & Adhesive
makeup eyes _
Maybelline Expert Wear Eyeshadow
makeup eyes _
Ofra Cosmetics Bo$$y Eyes Liquid Eyeshadow
makeup eyes _
Makeup Revolution Eye Chrome
makeup eyes _
e.l.f. Cosmetics Best Friend Eyeshadow Duo
makeup eyes _
e.l.f. Cosmetics Runway Ready Lip Palette 2
makeup eyes _
ICONIC LONDON Chrome Flash Eye Pot
makeup eyes _
Smashbox Crystalized Always On Liquid Eyeshadow
makeup eyes _
Buxom Empty Single Eyeshadow Compact
makeup eyes _
Holika Holika Foil Shock Shadow
makeup eyes _
Wet n Wild MegaJelly Eyeshadow
makeup eyes _
Touch In Sol Metallist Liquid Topper Trio #2 Minette Look
makeup eyes _
CoverGirl Exhibitionist Liquid Glitter Shadow
makeup eyes _
Wunder2 Pure Pigments
makeup eyes _
e.l.f. Cosmetics Sculpting Silk Eyeshadow
makeup eyes _
L'Oréal Chromatic Bronze Loose Pigments
makeup eyes _
Touch In Sol Metallist Liquid Topper Trio #1 Romantic Rose Look
makeup eyes _
e.l.f. Cosmetics Aqua 

In [11]:
ulta_df = pd.DataFrame.from_dict(products).transpose()

In [12]:
secret_sales = copy.deepcopy(ulta_df.query('secret_sale == 1 & sale == 0'))

In [13]:
len(secret_sales)

220

In [16]:
lowest_price = []
get_available_options = []
i = 0
for item in secret_sales['price'].tolist():
    lowest_price.append(item.split(' - ')[0])
    if len(item.split(' - ')) != 1:
        get_available_options.append(secret_sales['url'].tolist()[i])
    i = i + 1

In [51]:
driver = webdriver.Chrome(r'C:\Users\elerm\Downloads\chromedriver_win32\chromedriver.exe')

In [104]:
products_in_stock = {}
for url in get_available_options:
    variants_in_stock = {}
    temp = {}
    driver.get(url)
    for product_variant in driver.find_elements_by_class_name('ProductSwatchImage__variantHolder'):
        try:
            product_variant.click()
        except:
            next
        else:
            time.sleep(1)
            soup = BeautifulSoup(driver.page_source)
            price = soup.find('meta', {'property' : 'product:price:amount'}).get('content')
            if price.endswith('.97'):
                option = soup.find('meta', {'property' : 'product:color'}).get('content')
                if option == '':
                    option_tag = soup.find('div', {'class' : 'ProductDetail__colorPanel'}).find_all('span')[1]
                    if option_tag is not None:
                        option = option_tag.text
                if option == '':
                    option = 'NA'
                if soup.find('div', {'class' : 'ProductDetail__availabilitySection ProductDetail__availabilitySection--error'}) is None:
                    temp[option] = price
    if bool(temp):
        print('temp not empty!')
        for key, value in temp.items():
            variants_in_stock.setdefault(value, set()).add(key)
        for key, value in variants_in_stock.items():
            new_value = ", ".join(value)
            variants_in_stock[key] = new_value
        products_in_stock[driver.title[:-14]] = variants_in_stock
    else:
        next

temp not empty!


In [105]:
products_in_stock

{'ARMANI Armani Code Profumo Parfum': {'64.97': '2.0 oz', '80.97': '3.7 oz'}}

In [68]:
variants_in_stock.items()

dict_items([('2.0 oz', '64.97'), ('3.7 oz', '80.97')])

In [76]:
d = {'option1' : 'price1', 'option2' : 'price2', 'option3' : 'price1', 'option4' : 'price2', 'option5' : 'price3'}

In [77]:
v = {}

for key, value in d.items():
    v.setdefault(value, set()).add(key)

In [83]:
for key, value in v.items():
    new_value = ", ".join(value)
    v[key] = new_value

In [84]:
v

{'price1': 'option1, option3',
 'price2': 'option4, option2',
 'price3': 'option5'}

In [80]:
a = {'a', 'b', 'c'}

In [None]:
    else:
        products_in_stock[driver.title[:-14]] = ", ".join(options_in_stock)

In [63]:
bool(products_in_stock)

False

In [62]:
products_in_stock.keys()

dict_keys([])

In [48]:
def check_equal(iterator):
    iterator = iter(iterator)
    try:
        first = next(iterator)
    except StopIteration:
        return True
    return all(first == rest for rest in iterator)

In [56]:
check_equal(prices_in_stock)

False

In [47]:
products_in_stock

{'ARMANI Armani Code Profumo Parfum': '2.0 oz, 3.7 oz'}

In [None]:
driver.close()
driver.quit()

In [None]:
products_and_stock

In [None]:
stock_df = pd.DataFrame({'name' : list(products_and_stock.keys()), 'options_available' : list(products_and_stock.values())}).set_index('name')

In [None]:
df = secret_sales.join(stock_df)

In [None]:
df['price'] = lowest_price

In [None]:
df = df.fillna(' ').reset_index().rename(columns={'index' : 'name', 'desc' : 'product'})

In [None]:
df = df[['main_category', 'sub_category', 'name', 'brand', 'product', 'price', 'offers', 'options', 'options_available', 'rating', 'number_of_reviews', 'url']]

In [None]:
sheet_id = '16-aJIGT4NZAxVfcix5whLo95WuChzfO9itpuS75rlB4'
#sheet_id = '1UoUZGJbZA_HUVGGPoirnkANGt9dRLa9rx-uz6yCDwws'

In [None]:
#change this by your sheet ID
gsheetId = '16-aJIGT4NZAxVfcix5whLo95WuChzfO9itpuS75rlB4'
#'1xjdPm0k3qvNR5LgbdxUmMqnSkGT5DYnGxNlZc6swPOU'

#change the range if needed
SAMPLE_RANGE_NAME = 'A1:AA20000'

def Create_Service(client_secret_file, api_service_name, api_version, *scopes):
    global service
    SCOPES = [scope for scope in scopes[0]]
    #print(SCOPES)
    
    cred = None

    if os.path.exists('token_write.pickle'):
        with open('token_write.pickle', 'rb') as token:
            cred = pickle.load(token)

    if not cred or not cred.valid:
        if cred and cred.expired and cred.refresh_token:
            cred.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(client_secret_file, SCOPES)
            cred = flow.run_local_server()

        with open('token_write.pickle', 'wb') as token:
            pickle.dump(cred, token)

    try:
        service = build(api_service_name, api_version, credentials=cred)
        print(api_service_name, 'service created successfully')
        #return service
    except Exception as e:
        print(e)
        #return None
        
# change 'my_json_file.json' by your downloaded JSON file.
Create_Service('unmindful_credentials.json', 'sheets', 'v4', ['https://www.googleapis.com/auth/spreadsheets'])

def Clear_Sheet():
    result_clear = service.spreadsheets().values().clear(
        spreadsheetId=gsheetId,
        range=SAMPLE_RANGE_NAME,
        body = {}
    ).execute()
    print('Sheet successfully cleared')

def Export_Data_To_Sheets():
    response_date = service.spreadsheets().values().update(
        spreadsheetId=gsheetId,
        valueInputOption='RAW',
        range=SAMPLE_RANGE_NAME,
        body=dict(
            majorDimension='ROWS',
            values=df.T.reset_index().T.values.tolist())
    ).execute()
    print('Sheet successfully updated')

Clear_Sheet()
Export_Data_To_Sheets()

In [None]:
my_range = {
    'sheetId': 0,
    'startRowIndex': 0,
    'startColumnIndex': 0,
    'endRowIndex': len(df) + 1,
    'endColumnIndex': 11
}

In [None]:
updateFilterViewRequest = {
    'updateFilterView': {
        'filter': {
            'filterViewId': '1349307930',
            'range': my_range
        },
        'fields': {
            'paths': 'range'
        }
    }
}

In [None]:
addFilterViewRequest = {
    'addFilterView': {
        'filter': {
            'title': 'sale_filter',
            'range': my_range
        }
    }
}

In [None]:
body = {'requests': [addFilterViewRequest]}
service.spreadsheets().batchUpdate(spreadsheetId=gsheetId, body=body).execute()

In [None]:
body = {'requests': [updateFilterViewRequest]}
service.spreadsheets().batchUpdate(spreadsheetId=gsheetId, body=body).execute()