Commit

Add files via upload
Well, the structure of everything changed: the script now takes its arguments from the user via the command line rather than making you edit the file to point it at your input. It ignores robots.txt, uses selenium-stealth, and rotates through a good pool of user agents while it gathers listings. Here's to being able to compile a free price history for you folks.
kythrasuntamer authored Nov 16, 2024
1 parent 7d4cdfb commit 117e965
Showing 1 changed file with 165 additions and 0 deletions.
165 changes: 165 additions & 0 deletions eBay.py
@@ -0,0 +1,165 @@
import csv
import time
import logging
import argparse
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium_stealth import stealth
import os

# Setup enhanced logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.FileHandler('scraper.log'), logging.StreamHandler()])

# User-Agent rotation
user_agents = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0"
]

# Function to ensure complete loading of dynamic content
def scroll_and_wait(driver):
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)  # Wait for new content to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

# Function to scrape a single page using Selenium with enhanced waiting and scrolling
def scrape_page_with_selenium(driver, url):
    driver.get(url)
    time.sleep(5)  # Initial wait for page load

    # Scroll and wait for dynamic content
    scroll_and_wait(driver)

    # Save the page source for inspection
    script_directory = os.path.dirname(os.path.abspath(__file__))
    page_source_path = os.path.join(script_directory, "page_source.html")
    with open(page_source_path, "w", encoding="utf-8") as file:
        file.write(driver.page_source)
    logging.info(f"Page source saved to '{page_source_path}' for inspection.")

    try:
        # Wait for elements to load
        WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 's-item')))
    except Exception as e:
        logging.warning(f"Error waiting for page elements: {e}")

    # Parse the page source with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    items = soup.find_all('li', class_='s-item')

    # Log the number of items found
    logging.info(f"Number of items found: {len(items)}")
    if not items:
        logging.warning("No items found on the page. The page structure may have changed.")
        return []

    # Extract item details
    page_results = []
    for item in items:
        title = item.find('div', class_='s-item__title')
        price = item.find('span', class_='s-item__price')
        shipping = item.find('span', class_='s-item__shipping')
        condition = item.find('span', class_='SECONDARY_INFO')
        listing_type = item.find('span', class_='s-item__purchase-options')
        # Both tags must exist before we can parse numeric values
        if title and price:
            # Keep only the leading numeric token so ranges like "10.00 to 15.00" still parse
            price_text = price.text.replace('$', '').replace(',', '').strip()
            price_token = price_text.split()[0] if price_text else ''
            price_value = float(price_token) if price_token.replace('.', '', 1).isdigit() else 0.0

            # "Free shipping" (or a missing shipping tag) counts as $0.00
            shipping_text = (shipping.text.replace('+', '').replace('$', '').replace(',', '').strip()
                             if shipping else '0.00')
            shipping_token = shipping_text.split()[0] if shipping_text else '0.00'
            shipping_value = (
                float(shipping_token)
                if 'Free' not in shipping_text and shipping_token.replace('.', '', 1).isdigit()
                else 0.0
            )

            total_cost = price_value + shipping_value

            page_results.append({
                'Title': title.text.strip(),
                'Price': f"${price_value:.2f}",
                'Shipping Cost': f"${shipping_value:.2f}",
                'Total Cost': f"${total_cost:.2f}",
                'Condition': condition.text.strip() if condition else 'N/A',
                'Listing Type': listing_type.text.strip() if listing_type else 'N/A'
            })

    return page_results

# Main function to search for an item, scrape eBay prices, and save to CSV using Selenium with stealth integration
def scrape_ebay_prices_to_csv(search_query, output_file='ebay_prices.csv', delay=1, max_pages=1):
base_url = "https://www.ebay.com/sch/i.html?_nkw={}&_pgn={}"
search_query_formatted = search_query.replace(' ', '+')

    # Setup Chrome options for headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920x1080")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument(f"user-agent={random.choice(user_agents)}")  # Rotate User-Agent

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    # Apply stealth settings to the driver
    stealth(driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
            )

    results = []
    for page in range(1, max_pages + 1):
        url = base_url.format(search_query_formatted, page)
        logging.info(f"Scraping page {page}...")
        page_results = scrape_page_with_selenium(driver, url)
        if page_results:
            results.extend(page_results)
        time.sleep(random.uniform(delay, delay + 5))  # Randomized delay

    driver.quit()

    # Save the results to a CSV file
    if results:
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Title', 'Price', 'Shipping Cost', 'Total Cost', 'Condition', 'Listing Type']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            writer.writeheader()
            for result in results:
                writer.writerow(result)

        logging.info(f"Data has been saved to {output_file}")
    else:
        logging.warning("No data to save. Check 'page_source.html' for the page structure.")

# Main block to handle command-line arguments
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Search eBay for an item and save details to a CSV file using Selenium with stealth integration.")
    parser.add_argument('search_query', type=str, help='The search query for the item on eBay')
    parser.add_argument('--output_file', type=str, default='ebay_prices.csv', help='Output CSV file name (default: ebay_prices.csv)')
    parser.add_argument('--delay', type=float, default=1, help='Delay between requests in seconds (default: 1)')
    parser.add_argument('--max_pages', type=int, default=1, help='Number of pages to scrape (default: 1)')

    args = parser.parse_args()

    # Run the scraper with provided arguments
    scrape_ebay_prices_to_csv(args.search_query, args.output_file, args.delay, args.max_pages)
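
For reference, a run against the new command-line interface might look like the following; the search query and output file name here are just hypothetical examples:

    python eBay.py "mechanical keyboard" --output_file keyboard_prices.csv --delay 2 --max_pages 3

Each scraped listing is then written to the CSV with the Title, Price, Shipping Cost, Total Cost, Condition, and Listing Type columns that scrape_ebay_prices_to_csv defines.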
