In [326]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import StaleElementReferenceException, ElementNotInteractableException, NoSuchElementException
import time
import re
import json
from wakepy import keepawake
import pandas as pd

In [327]:
# Initialize the df to store all of our data
df = pd.DataFrame(columns=["Store", "Item Name", "Price", "Units", "Unit Price", "Amount", "Tags", "In Stock", "Image"])

In [328]:
# Get the image representing the object
def get_image(ele):
    src_value = ""
    try:
        img_tag = ele.find_element(By.CSS_SELECTOR, 'img')
        src_value = img_tag.get_attribute('srcset')
    except NoSuchElementException:
        src_value = ""
        
    return src_value

# Get the text data that stores name, price, etc.
def get_text_data(ele):
    # Initialize the data to return as empty strings. If the desired data doesn't exist,
    # return nothing for it.
    text_list = []
    unit_price = ""
    individual_weight = ""
    stock = ""
    
    try:
        # Find all leaf-node spans (no children spans)
        text_data = ele.find_elements(By.CSS_SELECTOR, 'span:not(:has(span))')

        # Filter by length > 2. For some reason, price is stored as "$0.18",
        # and then also "$", "0", "1", "8", so we want to ignore these
        for span in text_data:
            html = span.text
            if len(html) > 2:
                text_list.append(html)                    

        # Get the unit price. Not all items have this attribute
        try:
            unit_price = ele.find_element(By.CSS_SELECTOR, 'div[title]').text
        except NoSuchElementException:
            unit_price = ""

            
        # There are at most 4 text objects below the item, stored in divs.
        # The bottom-most one will tell us the weight. We start at 4 (the lowest)
        # which will give us the "in stock" information (if any).
        # So long as "stock" isn't mentioned, it's the weight, and we can exit the loop.
        k = 4
        while k > 0:
            try:
                # Try to find the element with the specified CSS selector
                individual_weight_element = ele.find_element(By.CSS_SELECTOR, f'div[aria-label="Product"] a div:nth-child(2) div:nth-child({k})')

                if 'stock' in individual_weight_element.text:
                    stock = individual_weight_element.text
                    k -= 1
                    continue


                else:
                    individual_weight = individual_weight_element.text
                    k = 0
            except NoSuchElementException:
                # If not found, set individual_weight to an empty string
                individual_weight = ""
                k -= 1

    except NoSuchElementException:
        return text_list, unit_price, individual_weight, stock
        
    return text_list, unit_price, individual_weight, stock

# Scroll to the desired carousel within the web page and click on the button
# to expand the listed items.
def scroll_to_carousel_and_click(car):
    # Locate the desired div
    first_div_child = car.find_element(By.CSS_SELECTOR, 'div:nth-child(1)')
    second_div = first_div_child.find_element(By.CSS_SELECTOR, 'div:nth-child(2)')
    
#     # Scroll to the div containing our carousel of interest
#     ActionChains(driver).move_to_element(second_div).perform()

    # Scroll to put element in middle of screen

    element_y = second_div.location['y']

    # Get the height of the browser window
    window_height = driver.execute_script('return window.innerHeight;')

    # Calculate the offset to scroll the element to the middle of the page
    scroll_offset = element_y - (window_height / 2)

    # Use JavaScript to scroll the page
    driver.execute_script(f'window.scrollTo(0, {scroll_offset});')

    # Expand the items so we can view them all
    first_button = second_div.find_element(By.CSS_SELECTOR, 'button:nth-child(1)')
    first_button.click()
    
# Scroll to the bottom of the given modal
def scroll_to_bottom_of_modal(mod):
    time.sleep(0.5)
    # Scrolling to the bottom may take multiple tries as elements load slowly.
    # Two scrolls works for now but may need to add more
    driver.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight);", mod)
    time.sleep(1)
    driver.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight);", mod)
    
# Parse the text into a useful format.
def parse_text_data(text_data_list, src_data):
    # Initialize the data to return as empty strings. If the desired data doesn't exist,
    # return nothing for it.
    tags = []
    price = ""
    name = ""
    units = ""

    # The src is a list of links. We just get the first one.
    src = src_value.split()
    if len(src) != 0:
        src = src[0]
    else:
        src = ""

    # Remove empty strings.
    text_data_list = [element for element in text_data_list if element.strip() != ""]

    # The price will be the only element to start with "$". This line finds the index of that element
    price_index = next(i for i, element in enumerate(text_data_list) if element.startswith("$"))
    
    try:
        # If there are units for the item, it starts with a '/' or 'each'. Not all items have this. But, if
        # it exists, we store it and then remove it from the list of data.
        units_index = next(i for i, element in enumerate(text_data_list) if element.startswith("/") or element.startswith("each"))
        units = text_data_list.pop(units_index)
    except StopIteration:
        units_index = None 

    # Loop through the text data
    for i in range(len(text_data_list)):
        # If an item is before the price index, it is a "tag". Things like "organic", "non-gmo"
        if i < price_index:
            tags.append(text_data_list[i])
        #  If an item is at the price index, store it as the price
        elif i == price_index:
            price = text_data_list[i]
        # Whatever is right after the price will be the item name
        elif i == price_index + 1:
            name = text_data_list[i]
            
    return tags, price, name, units, src

In [329]:
# driver = webdriver.Chrome(ChromeDriverManager().install())
driver = webdriver.Chrome(executable_path='/Users/jakesanghavi/Downloads/chromedriver-mac-arm64/chromedriver')
driver.maximize_window()

# Open Instacart to your desired store
driver.get('https://www.instacart.com/aldi')

wait = WebDriverWait(driver, 5)  # Maximum wait time of 5 seconds

In [330]:
# Remove modals that block website
elements_to_delete = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'ReactModalPortal')))

for element in elements_to_delete:
    driver.execute_script("arguments[0].remove();", element)

# This allows us to scroll and click again
body_element = driver.find_element_by_tag_name('body')
driver.execute_script("arguments[0].style.overflow = 'visible';", body_element)

In [331]:
# Function to scroll slowly to the bottom
def slow_scroll_to_bottom():
    actions = ActionChains(driver)
    t = 0
    
    # If it's been longer than 10 seconds, something has likely gone wrong.
    # So, cut off the scroll early and go back to the top
    while t < 10:
        actions.send_keys(Keys.PAGE_DOWN).perform()
        # Adjust the sleep time based on your needs
        time.sleep(1)
        # Check if you've reached the bottom of the page
        if driver.execute_script("return (window.innerHeight + window.scrollY) >= document.body.scrollHeight;"):
            break
        
        t += 1

# Scroll slowly to the bottom
slow_scroll_to_bottom()

# Wait 2 seconds to allow for final loading
time.sleep(2)

# Find all elements with aria-label="item carousel"
# These are the product groupings we want to jump into
carousels = driver.find_elements(By.CSS_SELECTOR, '[aria-label="item carousel"]')

In [332]:
# These divs contain the buttons we need to click on within each item carousel
portals = driver.find_elements(By.CLASS_NAME, '__reakit-portal')

# Some of the reakit portals do not correspond to carousels
fake_labels = ['Welcome', 'Join an Instacart+ family account dialog', 'Cart', 'Coupon', 
               'Use backup payment method for added items?', '100% satisfaction guarantee', '']
reakit_portals = []

# If the portal has one of the above labels, don't add it to our portal list.
for p in portals:
    ele = p.find_element(By.CSS_SELECTOR, 'div:first-child div:first-child')
    label = ele.get_attribute('aria-label')
    if label not in fake_labels and label is not None:
        reakit_portals.append(p)
        
# List of dataframes to be concatenated later.
to_concat = []

# We want to start from the second reakit-portal. The first one is not useful.
# We will then iterate through each of the associated carousels.
# j=3
# j = 2

# If we remove the useless portals, we can start from 0
j = 0

# Iterate through each div
for carousel in carousels:
    print(j)
    # Scroll to where the carousel is located and click to expand its tems
    scroll_to_carousel_and_click(carousel)
    
    # Set a variable equal to the modal that opens
#     modal = reakit_portals[j].find_element(By.CSS_SELECTOR, 'div:nth-child(1) div:nth-child(1) div:nth-child(2)')
    reakit_id = reakit_portals[j].find_element(By.CSS_SELECTOR, 'div:nth-child(1) div:nth-child(1)').get_attribute('id')
    
    modal = driver.find_element(By.CSS_SELECTOR, f'div#{reakit_id} > div:nth-child(2)')
    
    # Scroll to the bottom of the modal to load all of its elements
    # Careful here: may not load all products. We may have to add slower scrolling
    scroll_to_bottom_of_modal(modal)

    # Get all li objects from the list in the modal. These are the grocery items.
    product_list = modal.find_elements(By.CSS_SELECTOR, 'div:nth-child(1) ul li')
    
    if len(product_list) == 0:
        print(modal.get_attribute('class'))
        
    # Iterate through the listed products
    for product in product_list:
        # Get the image representing the item
        src_value = get_image(product)
              
        # Get the other textual data about the item
        text_list, unit_price, individual_weight, stock = get_text_data(product)

        # Parse the text data into useful data
        tags, price, name, units, src = parse_text_data(text_list, src_value)

        # Add our dataframe of data to a list of dataframes. These will be concatenated to the full df later.
        cols = df.columns
        row = pd.DataFrame(data=[["Aldi", name, price, units, unit_price, individual_weight, tags, stock, src]], columns=cols)

        to_concat.append(row)
                
    close_button = reakit_portals[j].find_element(By.CSS_SELECTOR, '[aria-label="Close"]')
    close_button.click()
#     driver.execute_script("arguments[0].click();", close_button);
    j += 1
    
# Process
## In the text_data, remove data like 'starts with each' or 'Add'.
## Tags are whatever come before $
## Price is whatever starts with $
# Item name is whatever comes right after $
# Amount or unit price is whatever comes right after Name
# Make sure to ignore empty strings

Fresh Fruit
Fresh Vegetables
Best Sellers
Snacks
Frozen Snacks
Cheese
Milk
Condiments
Baking and Cooking
Bread
Breakfast
Buy any 2, save $1
Buy any 2, save $1
Buy any 2, save $1
Canned Goods
Beef
Deli Meats
Yogurt
Frozen Entrees
Ready Meals
Poultry
Customer Favorites
Spend $25, save $5
23
0
id-12
1
id-43
2
id-76
3
id-139
4
id-173
5
id-179
6
id-239


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"div#id-239 > div:nth-child(2)"}
  (Session info: chrome=121.0.6167.160)


In [None]:
# Close the browser
driver.quit()

In [333]:
# Add all of our results to the full dataframe
df = pd.concat([df] + to_concat, ignore_index=True)

# Sometime there is no unit price, but it gets set anyways
# It always gets set the the amount/weight in this case
# So, if these two are the same, then set unit price 
# to be an empty string
mask = df['Unit Price'] == df['Amount']
df.loc[mask, 'Unit Price'] = ""
df

Unnamed: 0,Store,Item Name,Price,Units,Unit Price,Amount,Tags,In Stock,Image
0,Aldi,Bananas,$0.16,each (est.),$0.49 / lb,About 0.33 lb each,[],,https://www.instacart.com/image-server/257x257...
1,Aldi,"Strawberries, Package",$2.99,,,1 lb,[],,https://www.instacart.com/image-server/257x257...
2,Aldi,Blueberries Package,$3.19,,,1 pint container,[],Many in stock,https://www.instacart.com/image-server/257x257...
3,Aldi,"Mandarins, Bag",$3.95,,,3 lb,[In Season],Many in stock,https://www.instacart.com/image-server/257x257...
4,Aldi,Organic Bananas,$0.25,each (est.),$0.75 / lb,About 0.33 lb each,"[Organic, Non-GMO]",,https://www.instacart.com/image-server/257x257...
...,...,...,...,...,...,...,...,...,...
233,Aldi,Emporium Selection Specialty Shredded Gouda Ch...,$3.25,,,8 oz,[],,https://www.instacart.com/image-server/257x257...
234,Aldi,Happy Farms Deli Sliced Mozzarella Cheese,$2.09,,,8 oz,[],Many in stock,https://www.instacart.com/image-server/257x257...
235,Aldi,Happy Farms Mild Cheddar Snack Stick,$3.55,,,9 oz,[],Many in stock,https://www.instacart.com/image-server/257x257...
236,Aldi,Daisy Small Curd Cottage Cheese,$3.95,,,24 oz,"[Cow Milk, Keto]",Many in stock,https://www.instacart.com/image-server/257x257...


In [None]:
# Save the dataframe to csv
df.to_csv("instacart_scraper_v1_results.csv")

In [None]:
print(modal)