In [35]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import StaleElementReferenceException, ElementNotInteractableException, NoSuchElementException
import time
import re
import json
from wakepy import keepawake
import pandas as pd

In [36]:
# Initialize the df to store all of our data
df = pd.DataFrame(columns=["Store", "Item Name", "Price", "Units", "Unit Price", "Amount", "Tags", "In Stock", "Image"])

In [37]:
# Get the image representing the object
def get_image(ele):
    src_value = ""
    try:
        img_tag = ele.find_element(By.CSS_SELECTOR, 'img')
        src_value = img_tag.get_attribute('srcset')
    except NoSuchElementException:
        src_value = ""
        
    return src_value

# Get the text data that stores name, price, etc.
def get_text_data(ele):
    # Initialize the data to return as empty strings. If the desired data doesn't exist,
    # return nothing for it.
    text_list = []
    unit_price = ""
    individual_weight = ""
    stock = ""
    
    try:
        # Find all leaf-node spans (no children spans)
        text_data = ele.find_elements(By.CSS_SELECTOR, 'span:not(:has(span))')

        # Filter by length > 2. For some reason, price is stored as "$0.18",
        # and then also "$", "0", "1", "8", so we want to ignore these
        for span in text_data:
            html = span.text
            if len(html) > 2:
                text_list.append(html)                    

        # Get the unit price. Not all items have this attribute
        try:
            unit_price = ele.find_element(By.CSS_SELECTOR, 'div[title]').text
        except NoSuchElementException:
            unit_price = ""

            
        # There are at most 4 text objects below the item, stored in divs.
        # The bottom-most one will tell us the weight. We start at 4 (the lowest)
        # which will give us the "in stock" information (if any).
        # So long as "stock" isn't mentioned, it's the weight, and we can exit the loop.
        k = 4
        while k > 0:
            try:
                # Try to find the element with the specified CSS selector
                individual_weight_element = ele.find_element(By.CSS_SELECTOR, f'div[aria-label="Product"] a div:nth-child(2) div:nth-child({k})')

                if 'stock' in individual_weight_element.text:
                    stock = individual_weight_element.text
                    k -= 1
                    continue


                else:
                    individual_weight = individual_weight_element.text
                    k = 0
            except NoSuchElementException:
                # If not found, set individual_weight to an empty string
                individual_weight = ""
                k -= 1

    except NoSuchElementException:
        return text_list, unit_price, individual_weight, stock
        
    return text_list, unit_price, individual_weight, stock

# Scroll to the desired carousel within the web page and click on the button
# to expand the listed items.
def scroll_to_carousel_and_click(car):
    # Locate the desired div
    first_div_child = car.find_element(By.CSS_SELECTOR, 'div:nth-child(1)')
    second_div = first_div_child.find_element(By.CSS_SELECTOR, 'div:nth-child(2)')
    
#     # Scroll to the div containing our carousel of interest
#     ActionChains(driver).move_to_element(second_div).perform()

    # Scroll to put element in middle of screen

    element_y = second_div.location['y']

    # Get the height of the browser window
    window_height = driver.execute_script('return window.innerHeight;')

    # Calculate the offset to scroll the element to the middle of the page
    scroll_offset = element_y - (window_height / 2)

    # Use JavaScript to scroll the page
    driver.execute_script(f'window.scrollTo(0, {scroll_offset});')

    # Expand the items so we can view them all
    first_button = second_div.find_element(By.CSS_SELECTOR, 'button:nth-child(1)')
    first_button.click()
    
# Scroll to the bottom of the given modal
def scroll_to_bottom_of_modal(mod):
    time.sleep(0.5)
    # Scrolling to the bottom may take multiple tries as elements load slowly.
    # Two scrolls works for now but may need to add more
    driver.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight);", mod)
    time.sleep(1)
    driver.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight);", mod)
    
# Parse the text into a useful format.
def parse_text_data(text_data_list, src_data):
    # Initialize the data to return as empty strings. If the desired data doesn't exist,
    # return nothing for it.
    tags = []
    price = ""
    name = ""
    units = ""

    # The src is a list of links. We just get the first one.
    src = src_value.split()
    if len(src) != 0:
        src = src[0]
    else:
        src = ""

    # Remove empty strings.
    text_data_list = [element for element in text_data_list if element.strip() != ""]
    
    print(text_data_list)

    # The price will be the only element to start with "$". This line finds the index of that element
    price_index = next(i for i, element in enumerate(text_data_list) if element.startswith("$"))
    
    try:
        # If there are units for the item, it starts with a '/' or 'each'. Not all items have this. But, if
        # it exists, we store it and then remove it from the list of data.
        units_index = next(i for i, element in enumerate(text_data_list) if element.startswith("/") or element.startswith("each"))
        units = text_data_list.pop(units_index)
    except StopIteration:
        units_index = None 

    # Loop through the text data
    for i in range(len(text_data_list)):
        # If an item is before the price index, it is a "tag". Things like "organic", "non-gmo"
        if i < price_index:
            tags.append(text_data_list[i])
        #  If an item is at the price index, store it as the price
        elif i == price_index:
            price = text_data_list[i]
        # Whatever is right after the price will be the item name
        elif i == price_index + 1:
            name = text_data_list[i]
            
    return tags, price, name, units, src

In [38]:
# driver = webdriver.Chrome(ChromeDriverManager().install())
driver = webdriver.Chrome(executable_path='/Users/jakesanghavi/Downloads/chromedriver-mac-arm64/chromedriver')
driver.maximize_window()

# Open Instacart to your desired store
driver.get('https://www.instacart.com/aldi')

wait = WebDriverWait(driver, 5)  # Maximum wait time of 5 seconds

In [39]:
# Remove modals that block website
elements_to_delete = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'ReactModalPortal')))

for element in elements_to_delete:
    driver.execute_script("arguments[0].remove();", element)

# This allows us to scroll and click again
body_element = driver.find_element_by_tag_name('body')
driver.execute_script("arguments[0].style.overflow = 'visible';", body_element)

In [40]:
# Function to scroll slowly to the bottom
def slow_scroll_to_bottom():
    actions = ActionChains(driver)
    t = 0
    
    # If it's been longer than 10 seconds, something has likely gone wrong.
    # So, cut off the scroll early and go back to the top
    while t < 10:
        actions.send_keys(Keys.PAGE_DOWN).perform()
        # Adjust the sleep time based on your needs
        time.sleep(1)
        # Check if you've reached the bottom of the page
        if driver.execute_script("return (window.innerHeight + window.scrollY) >= document.body.scrollHeight;"):
            break
        
        t += 1

# Scroll slowly to the bottom
slow_scroll_to_bottom()

# Wait 2 seconds to allow for final loading
time.sleep(2)

# Find all elements with aria-label="item carousel"
# These are the product groupings we want to jump into
carousels = driver.find_elements(By.CSS_SELECTOR, '[aria-label="item carousel"]')

mapping = []
final_carousels = []
for x in range(len(carousels)):
    try:
        category_label = carousels[x].find_element(By.CSS_SELECTOR, 'div:nth-child(1) div:nth-child(1) h2 div:nth-child(1)')
    except NoSuchElementException:
        continue
    category_text = category_label.text
    mapping.append(category_text)
    final_carousels.append(carousels[x])

carousels = final_carousels

In [41]:
# These divs contain the buttons we need to click on within each item carousel
portals = driver.find_elements(By.CLASS_NAME, '__reakit-portal')

# Some of the reakit portals do not correspond to carousels
fake_labels = ['Welcome', 'Join an Instacart+ family account dialog', 'Cart', 'Coupon', 
               'Use backup payment method for added items?', '100% satisfaction guarantee', '']
reakit_portals = []

# If the portal has one of the above labels, don't add it to our portal list.
for p in portals:
    ele = p.find_element(By.CSS_SELECTOR, 'div:first-child div:first-child')
    label = ele.get_attribute('aria-label')
    if label not in fake_labels and label is not None:
        reakit_portals.append(p)

# Defines the way to sort the reakit portals in the same order as the carousels
def get_mapping_index(element):
    try:
        return next(
            i
            for i, item in enumerate(mapping)
            if item == element.find_element(By.CSS_SELECTOR, 'div:first-child div:first-child').get_attribute('aria-label')
        )
    except StopIteration:
        return None  # Return None if no match is found

# Filter out elements with no match in mapping
filtered_portals = [portal for portal in reakit_portals if get_mapping_index(portal) is not None]

# Sort the filtered portals based on the index in the mapping list
reakit_portals = sorted(filtered_portals, key=lambda x: get_mapping_index(x))
        
# List of dataframes to be concatenated later.
to_concat = []

# We want to start from the second reakit-portal. The first one is not useful.
# We will then iterate through each of the associated carousels.
# j=3
# j = 2

# If we remove the useless portals, we can start from 0
j = 0

# Iterate through each div
for carousel in carousels:
    print(j)
    # Scroll to where the carousel is located and click to expand its tems
    scroll_to_carousel_and_click(carousel)
    
    # Set a variable equal to the modal that opens
#     modal = reakit_portals[j].find_element(By.CSS_SELECTOR, 'div:nth-child(1) div:nth-child(1) div:nth-child(2)')
    reakit_id = reakit_portals[j].find_element(By.CSS_SELECTOR, 'div:nth-child(1) div:nth-child(1)').get_attribute('id')
        
    modal = driver.find_element(By.CSS_SELECTOR, f'div#{reakit_id} > div:nth-child(2)')
    
    # Scroll to the bottom of the modal to load all of its elements
    # Careful here: may not load all products. We may have to add slower scrolling
    scroll_to_bottom_of_modal(modal)

    # Get all li objects from the list in the modal. These are the grocery items.
    product_list = modal.find_elements(By.CSS_SELECTOR, 'div:nth-child(1) ul li')
        
    # Iterate through the listed products
    for product in product_list:
        # Get the image representing the item
        src_value = get_image(product)
              
        # Get the other textual data about the item
        text_list, unit_price, individual_weight, stock = get_text_data(product)

        # Parse the text data into useful data
        tags, price, name, units, src = parse_text_data(text_list, src_value)

        # Add our dataframe of data to a list of dataframes. These will be concatenated to the full df later.
        cols = df.columns
        row = pd.DataFrame(data=[["Aldi", name, price, units, unit_price, individual_weight, tags, stock, src]], columns=cols)

        to_concat.append(row)
                
    close_button = reakit_portals[j].find_element(By.CSS_SELECTOR, '[aria-label="Close"]')
    close_button.click()
#     driver.execute_script("arguments[0].click();", close_button);
    j += 1
    
# Process
## In the text_data, remove data like 'starts with each' or 'Add'.
## Tags are whatever come before $
## Price is whatever starts with $
# Item name is whatever comes right after $
# Amount or unit price is whatever comes right after Name
# Make sure to ignore empty strings

0
id-12
['$0.16', 'each (est.)', 'Bananas', 'Add']
['$2.99', 'Strawberries, Package', 'Add']
['$3.19', 'Blueberries Package', 'Add']
['$3.95', 'Mandarins, Bag', 'Add']
['$0.65', 'Avocado', 'Add']
['$2.75', 'Raspberries Package', 'Add']
['$2.55', 'Mission Mini Avocados, Bag', 'Add']
['$3.85', 'Lemons', 'Add']
['$4.93', '/pkg (est.)', 'Red Grapes', 'Add']
['$2.19', 'Blackberries', 'Add']
['$3.29', 'Honeycrisp Apples, Bag', 'Add']
['$2.65', 'Limes, Bag', 'Add']
['Organic', '$0.25', 'each (est.)', 'Organic Bananas', 'Add']
['$2.19', 'Pineapple', 'Add']
['$3.29', 'Gala Apple Bag', 'Add']
['Organic', '$3.19', 'Organic Avocado Bag', 'Add']
['$4.75', 'Cotton Candy Grapes, Package', 'Add']
['$2.75', 'Cantaloupe', 'Add']
['Organic', '$4.95', 'Organic Strawberries Package', 'Add']
['$5.67', 'each (est.)', 'Honeycrisp Apple', 'Add']
['Organic', '$3.65', 'Organic Farms Organic Blueberries, Package', 'Add']
['$4.24', 'each (est.)', 'Red Cherries, Bag', 'Add']
['$3.85', 'Cara Cara Oranges, Bag', 'Add

['$2.75', "Benton's Premium Vanilla Wafers", 'Add']
['$2.19', "Benton's Peanut Butter Filled Cookies", 'Add']
['$2.35', "Benton's Graham Crackers", 'Add']
['$3.09', 'Specially Selected Parmesan, Garlic & Herb Pita Chips', 'Add']
['$4.09', 'Takis Fuego Rolled Tortilla Chips Bag', 'Save $0.75', 'Add']
['Gluten-Free', 'Vegan', '$1.95', 'excitemint Sugar-free Gum, Arctic Blast', 'Add to cart']
4
id-171
['$2.75', "Season's Choice Seasoned French Fries", 'Add']
['$5.75', 'Kirkwood Chicken Fries', 'Add']
['$3.09', "Season's Choice Crinkle Cut Potatoes", 'Add']
['$3.29', 'Hot Pockets Hot Pocket Pepperoni Pizza', 'Add']
['$3.75', 'Bremer Regular Corn Dogs', 'Add']
['$6.95', 'Mamma Cozi Pepperoni Pizza Snacks', 'Add']
['$2.75', "Season's Choice Steak Fries", 'Add']
['$2.29', "Season's Choice Potato Puffs/Crispy Tots", 'Add']
['$3.49', 'Appetitos Cream Cheese Stuffed Jalapenos', 'Add']
['$4.39', 'Fusia Chicken Potstickers', 'Add']
['$3.49', 'Appetitos Mozzarella Sticks', 'Add']
['$5.49', 'Casa Ma

['$2.39', 'Brookdale Pork Luncheon Meat', 'Add']
['$1.45', 'Sweet Harvest Mandarin Oranges in Light Syrup', 'Add']
['$1.39', 'Sweet Harvest Pinaepple Chunks in Pineapple Juice', 'Add']
['$2.45', 'Lunch Buddies Strawberry Applesauce Cups', 'Add']
['$1.35', 'Happy Harvest Canned Mushrooms', 'Add']
['$1.35', "Baker's Corner Canned Pumpkin", 'Add']
['$2.09', "Dakota's Pride Maple Baked Beans", 'Add']
['$1.09', 'Casa Mamita Fat Free Refried Beans', 'Add']
['$0.89', 'Pueblo Lindo Chopped Green Chiles', 'Add']
['$1.29', 'Happy Harvest Fire Roasted Diced Tomatoes', 'Add']
['$0.99', 'Happy Harvest Diced Tomatoes with Basil, Garlic and Oregano', 'Add']
['$0.89', "Dakota's Pride Mild Chili Beans", 'Add']
['$2.65', 'Tuscan Garden Banana Peppers', 'Add']
['$1.45', 'Sweet Harvest Peach Slices in 100% Juice', 'Add']
['Organic', '$1.09', 'Simply Nature Organic Black Beans', 'Add']
['$2.75', 'Sweet Harvest Cinnamon Applesauce', 'Add']
['$2.85', 'Tuscan Garden Marinated Artichoke Quarters', 'Add']
['$1.

['$2.55', 'Friendly Farms Original Sweetened Almondmilk', 'Add']
['$2.55', 'Friendly Farms Vanilla Sweetened Almondmilk', 'Add']
['Organic', '$2.65', 'Simply Nature Organic Original Soymilk', 'Add']
11
id-435
['$12.45', '/pkg (est.)', 'Kirkwood Fresh Family Pack Chicken Breasts', 'Add']
['$7.18', 'each (est.)', 'Kirkwood Fresh Chicken Tenderloins', 'Add']
['$4.75', 'Kirkwood Fresh 93% Lean Ground Turkey', 'Add']
['$6.11', '/pkg (est.)', 'Kirkwood Fresh Chicken Breasts', 'Add']
['$3.85', 'Appleton Farms Turkey Bacon', 'Add']
['$5.59', '/pkg (est.)', 'Kirkwood Fresh Thin Sliced Chicken Breasts', 'Add']
['$5.36', '/pkg (est.)', 'Kirkwood Fresh Family Pack Chicken Drumsticks', 'Add']
['$8.19', '/pkg (est.)', 'Kirkwood Family Pack Chicken Thighs', 'Add']
['$10.82', '/pkg (est.)', 'Family Pack Antibiotic Free Boneless Skinless Chicken Breasts', 'Add']
['$7.69', 'Kirkwood 85/15 Fresh Ground Turkey', 'Add']
['$6.24', '/pkg (est.)', 'Kirkwood Fresh Chicken Drumsticks', 'Add']
['$11.16', '/pkg (

14
id-537
['$2.15', 'Goldhen Grade A Large Eggs', 'Add']
['$2.99', 'Strawberries, Package', 'Add']
['$3.65', 'PurAqua Purified Water', 'Add']
['$2.89', 'Friendly Farms Whole Milk', 'Add']
['$1.45', "L'oven Fresh White Bread", 'Add']
['$0.16', 'each (est.)', 'Bananas', 'Add']
['$9.19', '/pkg (est.)', 'Sea Queen Fresh Never Frozen Atlantic Salmon', 'Add']
['$6.75', 'Boulder 10" Ultra Heavy Duty Paper Plate', 'Add']
['$4.75', 'Kirkwood Fresh 93% Lean Ground Turkey', 'Add']
['$6.05', 'Little Journey Sensitive Baby Wipes', 'Add']
['$5.49', 'Breakfast Best Breakfast Croissant', 'Add']
['$3.65', "Nature's Nectar Premium Orange Juice Not From Concentrate No Pulp", 'Add']
['$7.25', 'Boulder Lavender Scent Flex Odor Control Kitchen Bag', 'Add']
['$2.89', 'Friendly Farms 2% Milk', 'Add']
['$4.39', 'Radiance Triple Chamber Pacs Ultra Dishwasher', 'Add']
['$2.05', "Baker's Treat Glazed Honey Buns", 'Add']
['$1.99', 'Little Salad Bar Garden Salad', 'Add']
['$1.65', "Breakfast Best Heat 'N Serve Saus

In [42]:
# Close the browser
driver.quit()

In [43]:
# Add all of our results to the full dataframe
df = pd.concat([df] + to_concat, ignore_index=True)

# Sometime there is no unit price, but it gets set anyways
# It always gets set the the amount/weight in this case
# So, if these two are the same, then set unit price 
# to be an empty string
mask = df['Unit Price'] == df['Amount']
df.loc[mask, 'Unit Price'] = ""
df

Unnamed: 0,Store,Item Name,Price,Units,Unit Price,Amount,Tags,In Stock,Image
0,Aldi,Bananas,$0.16,each (est.),$0.49 / lb,About 0.33 lb each,[],,https://www.instacart.com/image-server/257x257...
1,Aldi,"Strawberries, Package",$2.99,,,1 lb,[],,https://www.instacart.com/image-server/257x257...
2,Aldi,Blueberries Package,$3.19,,,1 pint container,[],Many in stock,https://www.instacart.com/image-server/257x257...
3,Aldi,"Mandarins, Bag",$3.95,,,3 lb,[],Many in stock,https://www.instacart.com/image-server/257x257...
4,Aldi,Avocado,$0.65,,,1 each,[],,https://www.instacart.com/image-server/257x257...
...,...,...,...,...,...,...,...,...,...
572,Aldi,Radiance Heavy Duty Scrub Sponges,$1.75,,,3 ct,[],,https://www.instacart.com/image-server/257x257...
573,Aldi,Little Journey Baby Wipes Bundle,$5.19,,,216 ct,[],Many in stock,https://www.instacart.com/image-server/257x257...
574,Aldi,Simply Nature Organic Grass Fed 85/15 Ground Beef,$5.19,,,1 lb,[Organic],,https://www.instacart.com/image-server/257x257...
575,Aldi,L'oven Fresh Honey Wheat Bread,$1.95,,,20 oz,[],Many in stock,https://www.instacart.com/image-server/257x257...


In [44]:
# Save the dataframe to csv
df.to_csv("instacart_scraper_v1_results.csv")

In [45]:
print(modal)

<selenium.webdriver.remote.webelement.WebElement (session="1a6e0a90b5b40870c71e75bcaadbf9c5", element="CE876AB70E11608222546BB14E9908B8_element_6347")>
