# Web Scraping for Washington State Demographic Data

In [1]:
# Loads all packages necessary for this script
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Census QuickFacts URLs, one per Washington State county.
# Every page shares the same address pattern and differs only in the county slug
# (lower-case county name with spaces removed), so the list is generated from a
# template instead of being written out by hand.
QUICKFACTS_URL_TEMPLATE = (
    'https://www.census.gov/quickfacts/fact/table/{slug}countywashington/PST045217'
)

# All 39 Washington counties in alphabetical order. The previous hand-written list
# had 37 entries and was missing Clallam and Grays Harbor.
# NOTE(review): the 'clallam' and 'graysharbor' slugs follow the same pattern as the
# existing entries (e.g. 'pendoreille', 'wallawalla') -- confirm both pages resolve.
county_slugs = [
    'adams', 'asotin', 'benton', 'chelan', 'clallam', 'clark', 'columbia',
    'cowlitz', 'douglas', 'ferry', 'franklin', 'garfield', 'grant',
    'graysharbor', 'island', 'jefferson', 'king', 'kitsap', 'kittitas',
    'klickitat', 'lewis', 'lincoln', 'mason', 'okanogan', 'pacific',
    'pendoreille', 'pierce', 'sanjuan', 'skagit', 'skamania', 'snohomish',
    'spokane', 'stevens', 'thurston', 'wahkiakum', 'wallawalla', 'whatcom',
    'whitman', 'yakima',
]

urls = [QUICKFACTS_URL_TEMPLATE.format(slug=slug) for slug in county_slugs]

In [3]:
# Seconds to wait for a response before giving up on a page; without a timeout a
# stalled connection would hang the cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 30


def _extract_county_rows(page):
    """Collect the race-breakdown rows from one parsed county page.

    Parameters
    ----------
    page : BeautifulSoup
        Parsed HTML of a single county page.

    Returns
    -------
    list of tuple
        One ``(county, population, race, percentage)`` tuple per ``tr`` with
        ``data-unit="PCT"`` found inside a ``tbody`` whose ``data-topic`` is
        ``"Race and Hispanic Origin"``. ``county`` and ``population`` are ``None``
        when the corresponding element is absent from the page. All values are
        the raw text found in the markup; nothing is converted to numbers.
    """
    # County name: text of the <h2> inside the title bar. When several title bars
    # match, the last one wins (same outcome as the original overwrite-in-a-loop).
    title_bars = page.find_all('div', attrs={'class': 'qf-titlebar'})
    heading = title_bars[-1].find('h2') if title_bars else None
    county = heading.text if heading is not None else None

    # Population: taken from the row whose class attribute is "fact selected".
    # contents[3] is the row's fourth child node -- presumably the value cell
    # for the county; verify against the page markup if the layout changes.
    selected_rows = page.find_all('tr', attrs={'class': 'fact selected'})
    population = selected_rows[-1].contents[3].text if selected_rows else None

    rows = []
    race_sections = page.find_all('tbody', attrs={'data-topic': 'Race and Hispanic Origin'})
    for section in race_sections:
        # Only rows flagged with data-unit="PCT" are kept.
        for fact_row in section.find_all('tr', attrs={'data-unit': 'PCT'}):
            race = fact_row.find('a')['data-title']
            percent = fact_row.contents[3].text
            rows.append((county, population, race, percent))
    return rows


demo_info = []

# Downloads and parses every county page, accumulating one record per race row
for county_url in urls:
    response = requests.get(county_url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on an HTTP error instead of parsing an error page, which would
    # silently contribute zero rows for that county.
    response.raise_for_status()

    county_page = BeautifulSoup(response.text, 'html.parser')
    demo_info.extend(_extract_county_rows(county_page))

In [5]:
# Builds a four-column table from the scraped (county, population, race, percentage) tuples
column_names = ['county', 'population', 'race', 'percentage']
demo_df = pd.DataFrame(data=demo_info, columns=column_names)

# Writes the table to disk as CSV, leaving out the row index
output_path = 'demographics_rawData.csv'
demo_df.to_csv(output_path, index=False)