In [95]:
# Imports
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
import urllib.parse
import undetected_chromedriver as uc

import os
import time
from bs4 import BeautifulSoup
import re
import pandas as pd
import requests

In [96]:
# Configure Chrome Webdriver

chrome_install = ChromeDriverManager().install()

In [97]:
# Initialize Chrome WebDriver
options = webdriver.ChromeOptions()
#options.add_argument('--headless')

options.add_argument(
    "--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) "
    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1"
)

browser = webdriver.Chrome(options=options)

In [98]:
# Setup search parameters
city = "montreal"
product = "Gaming PC"
min_price = 0
max_price = 5000
days_listed = 7
MAX_SCROLLS = 4

In [99]:
# Setup base URL
url = f'https://www.facebook.com/marketplace/{city}/search?query={product}&minPrice={min_price}&maxPrice={max_price}&daysSinceListed={days_listed}&sortBy=creation_time_descend&exact=false'

# Visit website
browser.get(url)

# Close login pop-up
try:
    close_button = browser.find_element(By.XPATH, "//div[@aria-label='Close']")
    close_button.click()
    print("Login prompt closed successfully...")
except:
    print("Login prompt not closed correctly...")
    pass

Login prompt not closed correctly...


In [None]:
# Scroll down to load all results
wait = WebDriverWait(browser, 10) # pause execution until condition is true w/ timotu limit

last_height = int(browser.execute_script("return document.body.scrollHeight"))
MIN_DELTA = 50  # minimum height change to count as "new content"

for i in range(MAX_SCROLLS):
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.1)

    try:
        wait.until(
            lambda d: int(d.execute_script("return document.body.scrollHeight")) - last_height > MIN_DELTA
        )
        new_height = int(browser.execute_script("return document.body.scrollHeight"))
        print("Scroller successful:", new_height)
        last_height = new_height

    except TimeoutException:
        print("No further increase in scrollHeight. Stopping.")
        break

print("Finished Scrolling...")


Scroller successful: 5856


KeyboardInterrupt: 

In [None]:
# Retrieve the HTML
html = browser.page_source

soup = BeautifulSoup(html, 'html.parser')

browser.close()

In [None]:
# Find link elements
links = soup.find_all('a')

# Filter on product keyword only
product_links = [
    link for link in links
    if product.lower() in link.text.lower()
    #if city_link.lower() in link.text.lower()
]

product_data = []

for product_link in product_links:
    url = product_link.get('href')
    if not url:
        continue
    text = '\n'.join(product_link.stripped_strings)
    product_data.append({'url': url, 'text': text})

#print(len(product_links), "links matched")
#print(len(product_data), "items collected")

product_data

[{'url': '/marketplace/item/1936327400277652/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD',
  'text': '$800\nSelling a Gaming PC!\nOakland, CA'},
 {'url': '/marketplace/item/901321479028474/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD',
  'text': '$800\nGaming pc\nSan Francisco, CA'},
 {'url': '/marketplace/item/810831765100576/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD',
  'text': '$800\nGaming Pc\nSan Francisco, CA'},
 {'url': '/marketplace/item/4211399239137993/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD',
  'text': '$649\nGaming PC\nSan Francisco, CA'},
 {'url': '/marketplace/item/826444880106126/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD',
  'text': '$250\nDell OptiPlex Gaming PC\nSan Francisco, CA'},
 {'url': '/marketplace/item/2347540209049347/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD',
  'text': '$900\n$1,000\nGaming PC with a 3

In [None]:
# Create an empty list to store product data
extracted_data = []

for item in product_data:
    lines = item['text'].split('\n')

    # Regular expression to find numeric values
    numeric_pattern = re.compile('\d[\d,.]*')
    
    # Extracting prices
    # Iterate through lines to find the first line with numbers
    for line in lines:
        match = numeric_pattern.search(line)
        if match:    
            # Extract the first numeric value found
            price_str = match.group()
            # Convert price to float (handle commas)
            price = float(price_str.replace(',',''))
            break
    '''
    if price:
        print(f"Price extracted: {price}")
    else:
        print("price not found")
    '''
    # Extract title
    title = lines[-2]

    # Extract location
    location = lines[-1]

    # Add extracted data to a list of dictionaries
    extracted_data.append({
        'title': title,
        'price': price,
        'location': location,
        'url': re.sub(r'\?.*', '', item['url'])
    })

  numeric_pattern = re.compile('\d[\d,.]*')


In [None]:
extracted_data

[{'title': 'Selling a Gaming PC!',
  'price': 800.0,
  'location': 'Oakland, CA',
  'url': '/marketplace/item/1936327400277652/'},
 {'title': 'Gaming pc',
  'price': 800.0,
  'location': 'San Francisco, CA',
  'url': '/marketplace/item/901321479028474/'},
 {'title': 'Gaming Pc',
  'price': 800.0,
  'location': 'San Francisco, CA',
  'url': '/marketplace/item/810831765100576/'},
 {'title': 'Gaming PC',
  'price': 649.0,
  'location': 'San Francisco, CA',
  'url': '/marketplace/item/4211399239137993/'},
 {'title': 'Dell OptiPlex Gaming PC',
  'price': 250.0,
  'location': 'San Francisco, CA',
  'url': '/marketplace/item/826444880106126/'},
 {'title': 'Gaming PC with a 3070ti',
  'price': 900.0,
  'location': 'San Francisco, CA',
  'url': '/marketplace/item/2347540209049347/'},
 {'title': 'High-End Gaming PC – RTX 5070, i7-12700KF, 32GB DDR5, 3TB SSD, RGB',
  'price': 1900.0,
  'location': 'San Francisco, CA',
  'url': '/marketplace/item/1132546485748395/'},
 {'title': 'GAMING PC',
  'pri