In [1]:
import re
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Wikipedia overview article on U.S. counties; later cells query the
# parsed document (`main_wiki`) with CSS selectors.
main_request = requests.get(
    "https://en.wikipedia.org/wiki/County_(United_States)",
    timeout=30,  # without a timeout a stalled connection would block the kernel indefinitely
)
# Fail here with the HTTP status instead of silently parsing an error page,
# which would only show up later as an empty or malformed selection.
main_request.raise_for_status()
# Name the parser explicitly: otherwise BeautifulSoup picks whichever optional parser
# happens to be installed (and warns), so the parse tree could differ between machines.
main_wiki = BeautifulSoup(main_request.text, "html.parser")

In [3]:
# Within first-column cells of sortable wikitables, pick every <a> that follows a
# `.flagicon` sibling. Each resulting anchor supplies `.text` and `href` to later cells.
state_link_selector = '.wikitable.sortable tbody tr td:first-child .flagicon ~ a'
states = main_wiki.select(state_link_selector)

In [4]:
# Parallel accumulators: position i in all four lists describes the same county row.
# They are re-created on every run of this cell, so re-executing it does not duplicate rows.
county_states = []
counties = []
county_populations = []
county_land_areas_km = []

# Captures the text in front of a trailing "County" / "Borough" / "Parish" designation.
county_name_pattern = re.compile(r"(.*) (County|Borough|Parish)")
# Captures the whole-number part of the parenthesised km figure, e.g. "(3,999 km2)" -> "3,999".
# `\d[\d,]*` accepts single-digit values and any number of thousands separators; the previous
# `\d+,?\d+` form needed at least two digits and silently truncated figures with two commas.
# A fractional part, if present, is not captured (the value is truncated to an integer).
land_area_pattern = re.compile(r"\((\d[\d,]*).*km.*\)")

# NOTE(review): the slice presumably limits the selection to the 50 states plus DC,
# dropping later matches of the selector -- confirm against the live page.
for state in states[:51]:
    state_name = state.text

    # District of Columbia is explicitly excluded from the scrape.
    if state_name in ['District of Columbia']:
        continue

    state_link = "https://en.wikipedia.org" + state.attrs['href']

    # Bound the wait so one stalled response cannot hang the whole run.
    state_request = requests.get(state_link, timeout=30)
    # Stop with the HTTP status rather than an opaque IndexError on an error page.
    state_request.raise_for_status()
    # Explicit parser keeps the parse tree independent of which optional parsers are installed.
    state_page = BeautifulSoup(state_request.text, "html.parser")
    # Only the first sortable wikitable on the state page is read.
    county_rows = state_page.select('.wikitable.sortable')[0].select('tbody tr')

    # The first row is skipped (assumed to be the table header -- TODO confirm).
    for county in county_rows[1:]:
        name = county.select('th')[0].text.replace("\n", "")

        # Strip the designation suffix when present; otherwise keep the header text as-is.
        name_match = county_name_pattern.search(name)
        if name_match:
            name = name_match.group(1)

        cells = county.select('td')
        # Columns are addressed from the right: population is third from last and
        # land area second from last (assumes every state table shares that layout -- TODO confirm).
        population = int(cells[-3].text.replace(',', ''))

        area_match = land_area_pattern.search(cells[-2].text)
        if area_match is None:
            # Report which row broke instead of failing with a bare IndexError.
            raise ValueError(
                f"No km land area found for {name!r} ({state_name}): {cells[-2].text!r}"
            )
        land_area = int(area_match.group(1).replace(',', ''))

        county_states.append(state_name)
        counties.append(name)
        county_populations.append(population)
        county_land_areas_km.append(land_area)

In [5]:
# Assemble the four parallel lists into one table, one row per county.
county_columns = {
    'state': county_states,
    'county': counties,
    'population': county_populations,
    'land_area_km': county_land_areas_km,
}
county_df = pd.DataFrame(data=county_columns)

In [6]:
# Population divided by land area, rounded to three decimal places.
population_per_km = county_df['population'] / county_df['land_area_km']
county_df['density_km'] = population_per_km.round(3)

In [7]:
# Persist the table as CSV without the index column.
output_path = Path('./relevant_data/county-population.csv')
# `to_csv` does not create missing parent directories, so make sure the target folder
# exists; on a fresh checkout the write would otherwise fail after the long scrape.
output_path.parent.mkdir(parents=True, exist_ok=True)
county_df.to_csv(output_path, index=False)