In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [15]:
def extract_listing_data_autotrader(listing):
    # Extract the individual elements from each listing based on AutoTrader's structure
    ad_link_element = listing.find('a', {'class': 'inner-link'})
    title_element = listing.find('span', {'class': 'title-with-trim'})
    price_element = listing.find('span', {'class': 'price-amount'})
    mileage_element = listing.find('span', {'class': 'odometer-proximity'})
    location_element = listing.find('span', {'class': 'proximity-text'})
    dealer_element = listing.find('div', {'class': 'seller-name'})

    # Extract the description if available
    description_element = listing.find('p', {'class': 'details used'})  # This may need adjustment if not matching

    # Populate the dictionary with extracted values for the current listing
    extracted_data = {
        'ad_link': 'https://www.autotrader.ca' + ad_link_element['href'] if ad_link_element else None,
        'title': title_element.text.strip() if title_element else None,
        'price': price_element.text.strip() if price_element else None,
        'description': description_element.text.strip() if description_element else None,
        'mileage': mileage_element.text.strip() if mileage_element else None,
        'location': location_element.text.strip() if location_element else None,
        'dealer': dealer_element.text.strip() if dealer_element else None,
    }

    return extracted_data

In [16]:
def scrape_autotrader_cars(base_url, total_pages, results_per_page):
    all_data = []
    session = requests.Session()

    # Define headers to better simulate a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Referer': 'https://www.autotrader.ca/',
        'Upgrade-Insecure-Requests': '1',
        'DNT': '1',  # Do Not Track request header
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
    }

    for page_number in range(total_pages):
        offset = page_number * results_per_page
        url = f"{base_url}&rcs={offset}"
        print(f"Scraping URL: {url}")

        # Make the HTTP request with headers and session to maintain cookies
        response = session.get(url, headers=headers)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")

            # Find all elements that contain individual listings
            listings = soup.find_all('div', {'class': 'dealer-split-wrapper'})

            # Loop through each listing and extract data
            for listing in listings:
                data = extract_listing_data_autotrader(listing)
                all_data.append(data)

        else:
            print(f"Failed to retrieve page {page_number + 1}. Status code: {response.status_code}")

        # Pause for 10 seconds to avoid overwhelming the server
        time.sleep(10)

    # Convert the list of dictionaries to a pandas DataFrame
    df = pd.DataFrame(all_data)

    # Save the DataFrame to a CSV file
    csv_file_path = 'autotrader_car_listings.csv'
    df.to_csv(csv_file_path, index=False)

    return csv_file_path

In [17]:
# usage
base_url = "https://www.autotrader.ca/cars/?rcp=15&rcs=0&srt=35&prx=-1&loc=l4e%204z6&fuel=Electric&hprc=True&wcp=True&inMarket=advancedSearch"
total_pages = 1  # Test with a smaller number first
results_per_page = 15
csv_path = scrape_autotrader_cars(base_url, total_pages, results_per_page)
csv_path

Scraping URL: https://www.autotrader.ca/cars/?rcp=15&rcs=0&srt=35&prx=-1&loc=l4e%204z6&fuel=Electric&hprc=True&wcp=True&inMarket=advancedSearch&rcs=0


'autotrader_car_listings.csv'