In [1]:
import selenium
import pandas as pd
from datetime import datetime

In [2]:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from twilio.rest import Client

In [3]:
# Accumulator for everything scraped during this run; starts with no rows and
# no columns and is extended by the scraping cell below.
df: pd.DataFrame = pd.DataFrame()

In [4]:
# Stores to scrape. Each entry is matched against the `aria-label` of the
# store's entry in the store-selection modal. Uncomment a line to include
# that store in the run.
store_list = [
    # 'Zipps Liquor - Conroe Walden',
    'Zipps Liquor - FM 1488',
    # 'Zipps Liquor - Willis',
    # 'Zipps Liquor - Conroe HWY 242',
    # 'Zipps Liquor - Conroe N Frazier',
    # 'Zipps Liquor - Magnolia',
    # 'Zipps Liquor - Houston Grant Rd',
    # 'Zipps Liquor - Navasota',
]

In [5]:
# Scrape the first two result pages of the whiskey listing for every store in
# `store_list` and add the scraped rows (bottles, price, store, timestamp) to `df`.

# One timestamp for the whole run, stamped on every row scraped below.
start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

url = 'https://shop.zippsliquor.com/shop/?subtype=whiskey&order=price+desc'

# Locators for the page elements this cell interacts with.
CHANGE_STORE_XPATH = "//button[@class='button ch-button']"
CURRENT_STORE_XPATH = "(//div[@class='current-store-info']/strong)[1]"
PRODUCT_NAME_XPATH = "//div[@class='ch-product-name']"
PRODUCT_PRICE_XPATH = "//span[@class='ch-single-product-price'] | //div[@class='price-range']"
NEXT_PAGE_XPATH = "//a[@data-hook='search-results-next-page']"

WAIT_SECONDS = 20          # max wait for an element we need to interact with
REFRESH_WAIT_SECONDS = 10  # max best-effort wait for the old listing to go away


def _first_product(driver):
    """Return the first product-name element currently on the page, or None."""
    found = driver.find_elements(By.XPATH, PRODUCT_NAME_XPATH)
    return found[0] if found else None


def _wait_for_listing_refresh(driver, stale_marker):
    """Best-effort wait for the product listing to be replaced after a click.

    `stale_marker` is a product element captured *before* the click. We first
    wait for it to be detached from the DOM, then for product names to be
    present again. Both waits are best-effort: on timeout we carry on and
    scrape whatever the page shows (the listing may have been updated in
    place, or may legitimately be empty).
    """
    if stale_marker is not None:
        try:
            WebDriverWait(driver, REFRESH_WAIT_SECONDS).until(EC.staleness_of(stale_marker))
        except TimeoutException:
            pass
    try:
        WebDriverWait(driver, WAIT_SECONDS).until(
            EC.presence_of_element_located((By.XPATH, PRODUCT_NAME_XPATH))
        )
    except TimeoutException:
        pass


def _parse_price(text):
    """Convert a displayed price to float.

    Only the first space-separated token is used, with '$' and ',' removed;
    for a price-range element that is the leading amount (presumably the low
    end of the range - TODO confirm). Raises ValueError if the token is not
    numeric.
    """
    return float(text.split(' ')[0].replace('$', '').replace(',', ''))


def _scrape_page(driver, store_name, timestamp):
    """Return a fresh DataFrame with one row per product on the current page.

    A new frame is built for every page; reusing one frame across pages fails
    as soon as two pages hold a different number of products.
    Raises ValueError if the number of names and prices on the page differ.
    """
    names = [element.text for element in driver.find_elements(By.XPATH, PRODUCT_NAME_XPATH)]
    prices = [_parse_price(element.text) for element in driver.find_elements(By.XPATH, PRODUCT_PRICE_XPATH)]
    if len(names) != len(prices):
        raise ValueError(f'found {len(names)} product names but {len(prices)} prices')

    page = pd.DataFrame({'bottles': names, 'price': prices})
    page['store'] = store_name
    page['timestamp'] = timestamp
    return page


# Set up Selenium WebDriver (headless Chrome)
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)

# Every successfully scraped page, across all stores. Pages scraped before a
# store fails are kept, so partial results are not lost.
scraped_frames = []

try:
    for counter, store in enumerate(store_list, start=1):
        print(store, f": ({counter}/{len(store_list)})")

        try:
            # Start every store from page 1 of the listing. Reloading the URL
            # replaces the fragile "previous page" clicks between stores.
            driver.get(url)

            # -------------------------- NAVIGATE TO STORE -------------------------------

            # Open the store selection modal
            change_button = WebDriverWait(driver, WAIT_SECONDS).until(
                EC.element_to_be_clickable((By.XPATH, CHANGE_STORE_XPATH))
            )
            stale_marker = _first_product(driver)
            change_button.click()

            # Pick the requested store in the modal
            store_button = WebDriverWait(driver, WAIT_SECONDS).until(
                EC.element_to_be_clickable(
                    (By.XPATH, f"//div[@aria-label='{store}']//button[@aria-label='Select business']")
                )
            )
            store_button.click()
            _wait_for_listing_refresh(driver, stale_marker)

            # Name of the store the page now reports; fall back to the requested
            # name so rows are never labelled with a previous store's name.
            shown = driver.find_elements(By.XPATH, CURRENT_STORE_XPATH)
            store_name = shown[-1].text if shown else store

            # -------------------------- PAGE 1 -------------------------------

            scraped_frames.append(_scrape_page(driver, store_name, start_time))

            # -------------------------- PAGE 2 -------------------------------

            stale_marker = _first_product(driver)
            next_button = WebDriverWait(driver, WAIT_SECONDS).until(
                EC.element_to_be_clickable((By.XPATH, NEXT_PAGE_XPATH))
            )
            next_button.click()
            _wait_for_listing_refresh(driver, stale_marker)

            scraped_frames.append(_scrape_page(driver, store_name, start_time))

        except Exception as exc:
            # Keep going with the next store, but say why this one failed.
            # Selenium messages can span many lines; the first one is enough.
            summary = (str(exc).strip().splitlines() or [''])[0]
            print(f'{store} failed -------- {type(exc).__name__}: {summary}')
finally:
    # Close the browser even if the run is interrupted
    driver.quit()

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    # Empty frames are skipped so they cannot influence the resulting dtypes.
    frames = [frame for frame in [df, *scraped_frames] if not frame.empty]
    if frames:
        df = pd.concat(frames)

Zipps Liquor - FM 1488 : (1/1)
Zipps Liquor - FM 1488 failed --------


### Check for Bottles

In [6]:
# Number of scraped rows per store - a quick check of what the run collected.
store_column = df['store']
store_column.value_counts()

Zipps Liquor - FM 1488    36
Name: store, dtype: int64

In [None]:
# Search terms for the bottles of interest. Each term is matched
# case-insensitively as a substring of the scraped bottle name.
bottles_to_find = [
    'colonel',
    'caribou',
    'midwinter',
    'russel',
    'elijah',
    'craig 18',
    'blanton',
    'weller',
    'EH taylor',
    'heritage',
    'blood oath',
    'Jameson Gold Reserve',
    'Four Roses Single Barrel Straight Bourbon Whiskey',
    'Henry McKenna 10',
    'King Ranch',
    'Michter',
    'Fitzgerald',
]

In [None]:
# Show, for each search term, the scraped rows whose bottle name contains it.

# Rows matching any of these patterns are never shown. The mask does not
# depend on the search term, so it is built once rather than once per term.
negative = ~df['bottles'].str.contains('larue|warehouse|Taylor Small|Craig Small', na=False, case=False)

for bottle in bottles_to_find:
    # regex=False: search terms are plain text, so a term containing regex
    # metacharacters (e.g. '.' or '(') is still matched literally.
    positive = df['bottles'].str.contains(bottle, na=False, case=False, regex=False)
    to_show = (
        df[positive & negative]
        .sort_values('store')
        .drop_duplicates(subset=['bottles', 'price', 'store'])
    )
    # Only render a table when the term actually matched something
    if len(to_show) > 0:
        display(to_show)

### Records

In [None]:
# Load the rows saved by earlier runs.
# NOTE(review): absolute, user-specific path - the notebook only runs on a
# machine that has this exact location.
records_csv = '/Users/john.odonnell/Python/web_scraping/Zipps/zipps_records.csv'
records = pd.read_csv(records_csv)

In [None]:
# Bottle names in this scrape that do not appear anywhere in the saved records.
# The recorded names are collected into a set once; the original rebuilt the
# unique-array and scanned it linearly for every scraped name.
recorded_bottles = set(records['bottles'].unique())
new_bottles = [name for name in df['bottles'].unique() if name not in recorded_bottles]
new_bottles

In [None]:
# Rows of the latest recorded run, i.e. those carrying the maximum timestamp.
most_recent_records = records[records['timestamp'] == records['timestamp'].max()]

# Bottle names in this scrape that were absent from that latest run. The
# previous names go into a set once instead of being recomputed per name.
previous_run_bottles = set(most_recent_records['bottles'].unique())
new_bottles_since_last_run = [
    name for name in df['bottles'].unique() if name not in previous_run_bottles
]
new_bottles_since_last_run

In [None]:
# Bottle names present in the latest recorded run but missing from this scrape.
# The scraped names go into a set once instead of being recomputed per name.
scraped_bottles = set(df['bottles'].unique())
gone_since_last_run = [
    name for name in most_recent_records['bottles'].unique() if name not in scraped_bottles
]
gone_since_last_run

In [None]:
# Stack the saved records and this run's rows into one frame with a fresh
# 0..n-1 index.
frames_to_save = [records, df]
output = pd.concat(frames_to_save, ignore_index=True)

In [None]:
# Write the combined history back to the records file (index column omitted).
# NOTE(review): this overwrites the same file the records were loaded from.
output.to_csv(
    '/Users/john.odonnell/Python/web_scraping/Zipps/zipps_records.csv',
    index=False,
)

### View Top Bottles

In [None]:
# Lift the row-display limit (stays in effect for the rest of the session),
# then list every scraped row from most to least expensive.
pd.options.display.max_rows = None
df.sort_values(by='price', ascending=False)