In [2]:
import json
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, ElementNotInteractableException, NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from wakepy import keepawake

from scraper_utils import *

In [3]:
# Start a Chrome session and open Aldi's Instacart storefront.
storefront_url = 'https://www.instacart.com/store/aldi/storefront'

driver = webdriver.Chrome()

# The window must be maximized before anything is looked up on the page;
# per the original author's note, find_element will not work otherwise.
driver.maximize_window()

driver.get(storefront_url)

In [5]:
# Initialize the df to store all of our data
df = pd.DataFrame(columns=["Store", "Item Name", "Price", "Units", "Unit Price", "Amount", "Tags", "In Stock", "Image"])

wait = WebDriverWait(driver, 5)  # Maximum wait time of 5 seconds

# Remove modals that block the website.
# wait.until raises TimeoutException when no matching element shows up within the
# timeout. In that case there is nothing to remove, so continue with an empty list
# instead of aborting the cell.
try:
    elements_to_delete = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'ReactModalPortal')))
except TimeoutException:
    elements_to_delete = []

for element in elements_to_delete:
    driver.execute_script("arguments[0].remove();", element)

# Skip over "Express" stores: the page is only prepared for scraping when the
# "Need a few things fast?" popup button is absent.
try:
    express_popup = driver.find_element(By.CSS_SELECTOR, 'div[aria-label="Need a few things fast?"] div:nth-child(3) button')
except NoSuchElementException:
    express_popup = None

if express_popup is None:
    # Resetting the body's overflow style allows us to scroll and click again
    body_element = driver.find_element(By.TAG_NAME, 'body')
    driver.execute_script("arguments[0].style.overflow = 'visible';", body_element)

    # Scroll slowly to the bottom
    slow_scroll_to_bottom(driver)

    # Wait 2 seconds to allow for final loading
    time.sleep(2)


In [6]:
# Every element labelled "item carousel" is a product grouping we want to jump into.
carousels = driver.find_elements(By.CSS_SELECTOR, '[aria-label="item carousel"]')
carousels, mapping = carousel_filter(carousels)

# Labels of reakit portals that do not correspond to a carousel.
fake_labels = [
    'Welcome',
    'Join an Instacart+ family account dialog',
    'Cart',
    'Coupon',
    'Use backup payment method for added items?',
    '100% satisfaction guarantee',
    '',
]

# The reakit portal divs contain the buttons we need to click on within each item carousel.
portals = driver.find_elements(By.CLASS_NAME, '__reakit-portal')
reakit_portals = reakit_filter(portals, fake_labels, mapping)

# Per-product dataframes accumulate here and are concatenated later.
to_concat = []



In [8]:
# Click on the first carousel and process its items.

# Store label written to the "Store" column. The page being scraped is Aldi's
# storefront, so the rows must be tagged accordingly.
STORE_NAME = 'Aldi'

# Selector for the <li> entries that hold the individual grocery items.
PRODUCT_SELECTOR = 'div:nth-child(1) ul li'


def _append_product_rows(product_list, rows, columns, store=STORE_NAME):
    """Append one single-row dataframe per product to ``rows``.

    Parameters
    ----------
    product_list : iterable
        Product elements (the ``li`` items found in a carousel or modal).
    rows : list
        List that receives the dataframes; it is extended in place, so rows
        collected before a failure are kept.
    columns : sequence
        Column labels for each single-row dataframe (nine labels, matching
        the order of the values built below).
    store : str
        Value written to the first ("Store") column.

    Notes
    -----
    ``get_image``, ``get_text_data`` and ``parse_text_data`` are not defined in
    this notebook (presumably they come from ``scraper_utils`` -- confirm).
    They are used exactly as the original cell did.
    """
    for product in product_list:
        # Get the image representing the item
        src_value = get_image(product)

        # Get the other textual data about the item
        text_list, unit_price, individual_weight, stock = get_text_data(product)

        # Parse the text data into useful data
        tags, price, name, units, src = parse_text_data(text_list, src_value)

        # One dataframe per product; they are concatenated to the full df later.
        rows.append(pd.DataFrame(
            data=[[store, name, price, units, unit_price, individual_weight, tags, stock, src]],
            columns=columns,
        ))


# Index into reakit_portals.
# If we remove the useless portals, we can start from 0
j = 0

carousel = carousels[0]
print(j)
try:
    product_page = carousel.find_element(By.CSS_SELECTOR, '[aria-label="page 1 of 1"]')
except NoSuchElementException:
    product_page = None

# Print the carousel heading (both branches below did this first).
print(carousel.find_element(By.CSS_SELECTOR, 'h2').text)

if product_page is not None:
    # A carousel labelled "page 1 of 1" is scraped in place, without opening a modal.
    product_list = carousel.find_elements(By.CSS_SELECTOR, PRODUCT_SELECTOR)
    _append_product_rows(product_list, to_concat, df.columns)
else:
    # Scroll to where the carousel is located and click to expand its items
    try:
        scroll_to_carousel_and_click(carousel, driver)
    except NoSuchElementException:
        # The carousel could not be opened, so there is no modal to scrape.
        pass
    else:
        # Set a variable equal to the modal that opens
        reakit_id = reakit_portals[j].find_element(By.CSS_SELECTOR, 'div:nth-child(1) div:nth-child(1)').get_attribute('id')

        modal = driver.find_element(By.CSS_SELECTOR, f'div#{reakit_id} > div:nth-child(2)')

        # Scroll to the bottom of the modal to load all of its elements
        # Careful here: may not load all products. We may have to add slower scrolling
        scroll_to_bottom_of_modal(modal, driver)

        # Get all li objects from the list in the modal. These are the grocery items.
        product_list = modal.find_elements(By.CSS_SELECTOR, PRODUCT_SELECTOR)
        _append_product_rows(product_list, to_concat, df.columns)

        # close out of current carousel
        close_button = reakit_portals[j].find_element(By.CSS_SELECTOR, '[aria-label="Close"]')
        close_button.click()
    # Advance the portal index whether or not the carousel could be opened,
    # as the original cell did.
    j += 1


0
Fresh Fruit


In [None]:
# Add all of our results to the full dataframe, filter it and save it to csv.
# The browser is closed in a finally clause so that a failure while concatenating,
# filtering or writing the file does not leave the Chrome session running.
try:
    # Add all of our results to the full dataframe
    df = pd.concat([df] + to_concat, ignore_index=True)

    # Filter out missing data
    df = df_filter(df)

    # Save the dataframe to csv
    df.to_csv("demo.csv", index=False)
finally:
    # Close the browser
    driver.quit()