Well, the structure of everything changed: the script now takes arguments from the user via the command line rather than requiring you to edit the file itself. It ignores robots.txt, uses selenium-stealth, and rotates through a good pool of user agents while gathering data. Here's to being able to compile a free price history for you folks.
1 parent 7d4cdfb · commit 117e965 · 1 changed file with 165 additions and 0 deletions
@@ -0,0 +1,165 @@
import argparse
import csv
import logging
import os
import random
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium_stealth import stealth
from webdriver_manager.chrome import ChromeDriverManager

# Log to both a file and the console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('scraper.log'), logging.StreamHandler()]
)

# Pool of User-Agent strings for rotation; one is picked at random per session
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0"
]

# Scroll to the bottom repeatedly until the page height stops growing,
# so dynamically loaded content has a chance to render
def scroll_and_wait(driver):
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)  # Wait for new content to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

# Scrape a single search-results page with Selenium, with explicit waits and scrolling
def scrape_page_with_selenium(driver, url):
    driver.get(url)
    time.sleep(5)  # Initial wait for page load

    # Scroll and wait for dynamic content
    scroll_and_wait(driver)

    # Save the page source for inspection
    script_directory = os.path.dirname(os.path.abspath(__file__))
    page_source_path = os.path.join(script_directory, "page_source.html")
    with open(page_source_path, "w", encoding="utf-8") as file:
        file.write(driver.page_source)
    logging.info(f"Page source saved to '{page_source_path}' for inspection.")

    try:
        # Wait for the listing elements to appear
        WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 's-item')))
    except Exception as e:
        logging.warning(f"Error waiting for page elements: {e}")

    # Parse the page source with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    items = soup.find_all('li', class_='s-item')

    # Log the number of items found
    logging.info(f"Number of items found: {len(items)}")
    if not items:
        logging.warning("No items found on the page. The page structure may have changed.")
        return []

    # Extract item details
    page_results = []
    for item in items:
        title = item.find('div', class_='s-item__title')
        price = item.find('span', class_='s-item__price')
        shipping = item.find('span', class_='s-item__shipping')
        condition = item.find('span', class_='SECONDARY_INFO')
        listing_type = item.find('span', class_='s-item__purchase-options')

        if title and price:
            # Prices can be ranges like "19.99 to 29.99"; convert only the first token
            price_text = price.text.replace('$', '').replace(',', '').strip()
            price_token = price_text.split()[0] if price_text else ''
            price_value = float(price_token) if price_token.replace('.', '', 1).isdigit() else 0.0

            # Shipping appears as e.g. "+$5.99 shipping" or "Free shipping"
            shipping_text = (shipping.text.replace('$', '').replace(',', '').strip()
                             if shipping else '0.00')
            shipping_token = shipping_text.split()[0].lstrip('+') if shipping_text else ''
            shipping_value = (
                float(shipping_token)
                if 'Free' not in shipping_text and shipping_token.replace('.', '', 1).isdigit()
                else 0.0
            )

            total_cost = price_value + shipping_value

            page_results.append({
                'Title': title.text.strip(),
                'Price': f"${price_value:.2f}",
                'Shipping Cost': f"${shipping_value:.2f}",
                'Total Cost': f"${total_cost:.2f}",
                'Condition': condition.text.strip() if condition else 'N/A',
                'Listing Type': listing_type.text.strip() if listing_type else 'N/A'
            })

    return page_results

# Search eBay for an item, scrape result pages, and save them to CSV,
# using Selenium with stealth integration
def scrape_ebay_prices_to_csv(search_query, output_file='ebay_prices.csv', delay=1, max_pages=1):
    base_url = "https://www.ebay.com/sch/i.html?_nkw={}&_pgn={}"
    search_query_formatted = search_query.replace(' ', '+')

    # Set up Chrome options for headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920x1080")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument(f"user-agent={random.choice(user_agents)}")  # Rotate User-Agent

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    # Apply stealth settings so the headless browser is harder to fingerprint
    stealth(driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
            )

    results = []
    try:
        for page in range(1, max_pages + 1):
            url = base_url.format(search_query_formatted, page)
            logging.info(f"Scraping page {page}...")
            page_results = scrape_page_with_selenium(driver, url)
            if page_results:
                results.extend(page_results)
            time.sleep(random.uniform(delay, delay + 5))  # Randomized delay between pages
    finally:
        driver.quit()  # Always close the browser, even if a page fails

    # Save the results to a CSV file
    if results:
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Title', 'Price', 'Shipping Cost', 'Total Cost', 'Condition', 'Listing Type']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            writer.writeheader()
            for result in results:
                writer.writerow(result)

        logging.info(f"Data has been saved to {output_file}")
    else:
        logging.warning("No data to save. Check 'page_source.html' for the page structure.")

# Handle command-line arguments
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Search eBay for an item and save details to a CSV file using Selenium with stealth integration.")
    parser.add_argument('search_query', type=str, help='The search query for the item on eBay')
    parser.add_argument('--output_file', type=str, default='ebay_prices.csv', help='Output CSV file name (default: ebay_prices.csv)')
    parser.add_argument('--delay', type=float, default=1, help='Delay between requests in seconds (default: 1)')
    parser.add_argument('--max_pages', type=int, default=1, help='Number of pages to scrape (default: 1)')

    args = parser.parse_args()

    # Run the scraper with the provided arguments
    scrape_ebay_prices_to_csv(args.search_query, args.output_file, args.delay, args.max_pages)
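For reference, a minimal usage sketch. The filename ebay_scraper.py is an assumption for illustration; the diff does not show the name of the changed file.

# Assumed module name: ebay_scraper.py (not shown in the diff)
# From the shell:
#   python ebay_scraper.py "rtx 3080" --output_file gpu_prices.csv --delay 2 --max_pages 3
#
# Or programmatically, importing the function defined above:
from ebay_scraper import scrape_ebay_prices_to_csv

scrape_ebay_prices_to_csv("rtx 3080", output_file="gpu_prices.csv", delay=2, max_pages=3)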