different strategy idea for getting listing urls in this notebook:

This page has every craigslist website location: https://geo.craigslist.org/iso/us

Loop through all of these with the base query: <city>.craigslist.org/search/cta?bundleDuplicates=1&postedToday=1&purveyor=owner#search=1~gallery~0~0 (this filters to listings that were posted today and are being sold by owner; if we are running constantly, this seems like the way to go?)

Within each loop, loop through every gallery~x~ page index to get the listings from the pages after the first page for that city

create dictionary with url list as value per city key

create a dictionary with df as value per city key

I guess the only real benefit of this approach is not having to worry about crafting search queries with make and model?



The first cell gets a list of all Craigslist cities and sets a base_query. The search filters are part of the URL and can be customized further.

In [52]:
import json
import re
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# URL of the page that lists every US Craigslist site
base_url = "https://geo.craigslist.org/iso/us"

# Fetch the index page. The timeout keeps a stalled connection from hanging the
# notebook, and raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(base_url, timeout=30)
response.raise_for_status()

# Parse the page with BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Find the container with all city links
container = soup.find('div', class_='geo-site-list-container')
if container is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError(
        f"Could not find the 'geo-site-list-container' div on {base_url}; "
        "the page layout may have changed."
    )

# Every <a href=...> inside the container is the root URL of one Craigslist site
city_urls = [element["href"] for element in container.find_all("a", href=True)]

print(city_urls)

# Query to append to each city URL (cars & trucks, posted today, by owner, first gallery page)
base_query = "/search/cta?bundleDuplicates=1&postedToday=1&purveyor=owner#search=1~gallery~0~0"


['https://abilene.craigslist.org', 'https://akroncanton.craigslist.org', ... 'https://yubasutter.craigslist.org', 'https://yuma.craigslist.org', 'https://zanesville.craigslist.org']


The second cell defines the scrape_pages function, which uses Selenium to loop through all pages of a city + base-query combination and returns the listing URLs it finds. The loop below it stores each city's URLs in a dictionary that eventually includes all cities.

In [None]:
def scrape_pages(base_url):
    """Collect the listing URLs from every results page of one Craigslist search.

    Result pages are requested by appending ``#search=1~gallery~<page>~0`` to
    ``base_url`` and are loaded in headless Chrome. Paging stops at the first
    page that contributes no URL that has not been collected already (going
    past the last page redirects to the last page with results), or at the
    first error, which is printed.

    Args:
        base_url: Search URL for one city without the ``#search=...``
            fragment, e.g. ``https://abilene.craigslist.org/search/cta?...``.

    Returns:
        A list of listing URLs in the order they were first seen, without
        duplicates.
    """
    # The subdomain between "//" and the first "." identifies the city.
    city_name = base_url.split("//")[1].split(".")[0]

    # Run Chrome without a visible window.
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    # NOTE(review): passing the driver path positionally is the older Selenium
    # calling convention; newer releases expect a Service object - confirm
    # against the installed Selenium version.
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)

    all_urls = []
    seen = set()  # same contents as all_urls, kept for O(1) membership tests
    page = 0

    try:
        while True:
            # Load the requested results page and parse the rendered HTML.
            driver.get(f"{base_url}#search=1~gallery~{page}~0")
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Listing links are <a class="titlestring" href=...>; links that do
            # not contain the city name belong to the 'search wider area'
            # section and are ignored.
            elements = soup.find_all("a", class_="titlestring", href=True)
            urls = [element["href"] for element in elements if city_name in element["href"]]

            # Keep only URLs not collected yet (dict.fromkeys also removes
            # repeats within the page while preserving order).
            new_urls = [href for href in dict.fromkeys(urls) if href not in seen]
            if not new_urls:
                break  # empty page, or a redirect back to an already-scraped page

            all_urls.extend(new_urls)
            seen.update(new_urls)
            page += 1

            # Sleep briefly to avoid making too many requests in a short time.
            time.sleep(1)
    except Exception as e:
        # Best effort: report the problem and return what was collected so far.
        print(f"An error occurred: {e}")
    finally:
        # Always shut the browser down, even on KeyboardInterrupt.
        driver.quit()

    return all_urls


# Maps each Craigslist subdomain (e.g. "abilene") to the listing URLs scraped for it
city_url_dict = {}

for city_url in city_urls:
    # The subdomain between "//" and the first "." names the city
    city_name = city_url.split("//")[1].split(".")[0]

    # Full search URL for this city: cars & trucks, posted today, sold by owner
    base_query = f"{city_url}/search/cta?bundleDuplicates=1&postedToday=1&purveyor=owner"

    # Scrape every results page for this city and file the URLs under its name
    city_url_dict[city_name] = scrape_pages(base_query)

# Show what was collected, one line per city
for city, urls in city_url_dict.items():
    print(f"{city}: {urls}")


From here on it is mostly code for parsing listing-page HTML. This cell defines some functions for parsing different sections.

In [54]:
def parse_name(name):
    """Extract make, model and year from a listing's vehicle name.

    The year is the first standalone four-digit number in ``name``. Make and
    model are the first alphabetic word followed by whitespace and a token of
    letters, digits or hyphens (so "2015 Ford F-150" gives "Ford", "F-150").

    Args:
        name: Vehicle name text, e.g. "2015 Toyota Camry". May be ``None`` or
            empty when the listing has no name.

    Returns:
        A ``(make, model, year)`` tuple of strings; any part that cannot be
        found is ``None``.
    """
    # A listing without a name would make re.search raise TypeError.
    if not name:
        return None, None, None

    year_pattern = r'\b\d{4}\b'
    make_model_pattern = r'\b([A-Za-z]+)\s+([A-Za-z0-9-]+)'

    year_match = re.search(year_pattern, name)
    make_model_match = re.search(make_model_pattern, name)

    year = year_match.group(0) if year_match else None
    make, model = make_model_match.groups() if make_model_match else (None, None)

    return make, model, year

def parse_attrgroup(soup):
    """Parse the ``<p class="attrgroup">`` sections of a listing page.

    The vehicle name is read from the ``<b>`` element of the first attrgroup
    paragraph. Every ``<span>`` of the form "key: value" in any attrgroup
    paragraph is collected as an attribute.

    Args:
        soup: Parsed listing page (BeautifulSoup object).

    Returns:
        A dict with the keys 'Title Status', 'Paint Color', 'Odometer',
        'Drive', 'Condition', 'Make', 'Model' and 'Year'. 'Odometer' is an int
        when it can be parsed; any value missing from the page is ``None``.
    """
    groups = soup.find_all('p', class_='attrgroup')

    # Vehicle name: bold text of the first attribute group, when present.
    car_name = None
    if groups:
        bold = groups[0].find('b')
        if bold is not None:
            car_name = bold.text.strip()

    # Collect "key: value" spans. Splitting on the first colon only keeps
    # values that contain a colon themselves intact.
    attributes = {}
    for group in groups:
        for span in group.find_all('span'):
            key, sep, value = span.text.partition(':')
            if sep:
                attributes[key.strip()] = value.strip()

    # Extract make, model, and year; skip the call when there is no name.
    if car_name:
        make, model, year = parse_name(car_name)
    else:
        make, model, year = None, None, None

    # The odometer may be absent or not a plain integer; fall back to None
    # rather than failing the whole listing.
    odometer = None
    odometer_text = attributes.get('odometer')
    if odometer_text:
        try:
            odometer = int(odometer_text.replace(',', ''))
        except ValueError:
            odometer = None

    parsed_data = {
        'Title Status': attributes.get('title status'),
        'Paint Color': attributes.get('paint color'),
        'Odometer': odometer,
        'Drive': attributes.get('drive'),
        'Condition': attributes.get('condition'),
        'Make': make,
        'Model': model,
        'Year': year,
    }

    return parsed_data

def parse_ld_posting_data(soup):
    """Parse the JSON in the page's ``<script id="ld_posting_data">`` tag.

    Args:
        soup: Parsed listing page (BeautifulSoup object).

    Returns:
        A dict with the keys 'Price', 'Description', 'Latitude' and
        'Longitude'. A field that is absent from the JSON is ``None``.

    Raises:
        AttributeError: If the page has no ``ld_posting_data`` script tag.
    """
    script_tag = soup.find('script', {'id': 'ld_posting_data'})
    json_data = json.loads(script_tag.string)

    # Walk the nested structure defensively: a listing without an offer or a
    # location should yield None fields, not a KeyError.
    offers = json_data.get('offers') or {}
    place = offers.get('availableAtOrFrom') or {}
    geo = place.get('geo') or {}

    parsed_listing = {
        'Price': offers.get('price'),
        'Description': json_data.get('description'),
        'Latitude': geo.get('latitude'),
        'Longitude': geo.get('longitude'),
    }
    return parsed_listing


This cell calls the parsing function on the massive list of urls, organizing resulting dfs into a dictionary with city names as keys

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import re


# Column order shared by every per-city DataFrame
columns = ['Make', 'Model', 'Year', 'Miles', 'Price', 'Title', 'Paint', 'Drive', 'Condition', 'Description', 'Latitude', 'Longitude']

# Empty DataFrame so that `df` exists even when no city was scraped
df = pd.DataFrame(columns=columns)

# Dictionary holding one DataFrame per city, keyed by city name
city_df_dict = {}

for city, urls in city_url_dict.items():
    # Rows are collected as plain dicts and turned into a DataFrame once per
    # city; concatenating inside the loop copies the whole frame on every row.
    rows = []

    for url in urls:
        try:
            # The timeout keeps one stalled listing from hanging the whole run.
            response = requests.get(url, timeout=30)
            soup = BeautifulSoup(response.content, 'html.parser')
            script_tag = soup.find('script', {'id': 'ld_posting_data'})

            if script_tag is None:
                print(f"Skipping URL {url} - script_tag not found")
                continue

            # Extract relevant fields via dictionaries which are outputs from functions above
            parsed_data = parse_attrgroup(soup)
            parsed_listing = parse_ld_posting_data(soup)
        except (requests.RequestException, AttributeError, KeyError, TypeError, ValueError) as e:
            # Best effort: a listing that cannot be fetched or parsed is
            # skipped instead of aborting the scrape of every remaining city.
            print(f"Skipping URL {url} - Error: {e}")
            continue

        rows.append({
            'Make': parsed_data['Make'],
            'Model': parsed_data['Model'],
            'Year': parsed_data['Year'],
            'Miles': parsed_data['Odometer'],
            'Price': parsed_listing['Price'],
            'Title': parsed_data['Title Status'],
            'Paint': parsed_data['Paint Color'],
            'Drive': parsed_data['Drive'],
            'Condition': parsed_data['Condition'],
            'Description': parsed_listing['Description'],
            'Latitude': parsed_listing['Latitude'],
            'Longitude': parsed_listing['Longitude'],
        })

    # Build this city's DataFrame (empty, with the right columns, when no
    # listing could be parsed) and save it to the dictionary
    df = pd.DataFrame(rows, columns=columns)
    city_df_dict[city] = df
