In [101]:
# Imports
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
import urllib.parse
import undetected_chromedriver as uc

import os
import time
from bs4 import BeautifulSoup
import re
import pandas as pd
import requests

In [102]:
# Configure Chrome Webdriver

chrome_install = ChromeDriverManager().install()

In [103]:
# Initialize Chrome WebDriver
options = webdriver.ChromeOptions()
#options.add_argument('--headless')

options.add_argument(
    "--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) "
    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1"
)

browser = webdriver.Chrome(options=options)

In [104]:
# Setup search parameters
city = "montreal"
product = "Gaming PC"
min_price = 0
max_price = 5000
days_listed = 7
MAX_SCROLLS = 4

In [105]:
# Setup base URL
url = f'https://www.facebook.com/marketplace/{city}/search?query={product}&minPrice={min_price}&maxPrice={max_price}&daysSinceListed={days_listed}&sortBy=creation_time_descend&exact=false'

# Visit website
browser.get(url)

# Close login pop-up
try:
    close_button = browser.find_element(By.XPATH, "//div[@aria-label='Close']")
    close_button.click()
    print("Login prompt closed successfully...")
except:
    print("Login prompt not closed correctly...")
    pass

Login prompt not closed correctly...


In [106]:
# Scroll down to load all results
wait = WebDriverWait(browser, 10) # pause execution until condition is true w/ timotu limit

last_height = int(browser.execute_script("return document.body.scrollHeight"))
MIN_DELTA = 50  # minimum height change to count as "new content"

for i in range(MAX_SCROLLS):
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.1)

    try:
        wait.until(
            lambda d: int(d.execute_script("return document.body.scrollHeight")) - last_height > MIN_DELTA
        )
        new_height = int(browser.execute_script("return document.body.scrollHeight"))
        print("Scroller successful:", new_height)
        last_height = new_height

    except TimeoutException:
        print("No further increase in scrollHeight. Stopping.")
        break

print("Finished Scrolling...")


Scroller successful: 5848
Scroller successful: 8839
Scroller successful: 11789
Scroller successful: 14800
Finished Scrolling...


In [107]:
# Retrieve the HTML
html = browser.page_source

soup = BeautifulSoup(html, 'html.parser')

browser.close()

In [None]:
# Find link elements
links = soup.find_all('a')

# Filter on product keyword only
product_links = [
    link for link in links
    if product.lower() in link.text.lower()
    #if city_link.lower() in link.text.lower()
]

product_data = []

for product_link in product_links:
    url = product_link.get('href')
    if not url:
        continue
    text = '\n'.join(product_link.stripped_strings)
    product_data.append({'url': url, 'text': text})

[{'url': '/marketplace/item/870259832329433/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD',
  'text': 'CA$469\nBudget Gaming PC with Master Roshi\nMontréal, QC'},
 {'url': '/marketplace/item/1352520299338565/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD',
  'text': 'CA$650\nGaming pc\nMont-Royal, QC'},
 {'url': '/marketplace/item/825359173598814/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD',
  'text': 'CA$710\nRTX 2080Ti/ i7-7700 Gaming PC\nMontréal, QC'},
 {'url': '/marketplace/item/1991714498056219/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD',
  'text': 'CA$400\nEntry level gaming pc\nMontréal, QC'},
 {'url': '/marketplace/item/2070007520494657/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD',
  'text': 'CA$419\nBudget Gaming PC with Goku\nMontréal, QC'},
 {'url': '/marketplace/item/730707916057808/?ref=search&referral_code=null&referral_story_type=post&__tn__=!%3AD',
 

In [None]:
# Create an empty list to store product data
extracted_data = []

for item in product_data:
    lines = item['text'].split('\n')

    # Regular expression to find numeric values
    numeric_pattern = re.compile('\d[\d,.]*')
    
    # Extracting prices
    # Iterate through lines to find the first line with numbers
    for line in lines:
        match = numeric_pattern.search(line)
        if match:    
            # Extract the first numeric value found
            price_str = match.group()
            # Convert price to float (handle commas)
            price = float(price_str.replace(',',''))
            break
        
    # Extract title
    title = lines[-2]

    # Extract location
    location = lines[-1]

    # Add extracted data to a list of dictionaries
    extracted_data.append({
        'title': title,
        'price': price,
        'location': location,
        'url': re.sub(r'\?.*', '', item['url'])
    })

  numeric_pattern = re.compile('\d[\d,.]*')


In [110]:
extracted_data

[{'title': 'Budget Gaming PC with Master Roshi',
  'price': 469.0,
  'location': 'Montréal, QC',
  'url': '/marketplace/item/870259832329433/'},
 {'title': 'Gaming pc',
  'price': 650.0,
  'location': 'Mont-Royal, QC',
  'url': '/marketplace/item/1352520299338565/'},
 {'title': 'RTX 2080Ti/ i7-7700 Gaming PC',
  'price': 710.0,
  'location': 'Montréal, QC',
  'url': '/marketplace/item/825359173598814/'},
 {'title': 'Entry level gaming pc',
  'price': 400.0,
  'location': 'Montréal, QC',
  'url': '/marketplace/item/1991714498056219/'},
 {'title': 'Budget Gaming PC with Goku',
  'price': 419.0,
  'location': 'Montréal, QC',
  'url': '/marketplace/item/2070007520494657/'},
 {'title': 'GAMING PC (VENTE RAPIDE/NEGOCIABLE)',
  'price': 400.0,
  'location': 'Montréal, QC',
  'url': '/marketplace/item/730707916057808/'},
 {'title': 'Gaming PC',
  'price': 550.0,
  'location': 'Montréal-Ouest, QC',
  'url': '/marketplace/item/1698603451524564/'},
 {'title': 'Gaming Pc 5600x / 3070 Ti',
  'price