# Census Data Import and Cleaning
This notebook uses a Selenium script to scrape county-level census (ACS) statistics for every county in the 50 US states and the District of Columbia; US territories are excluded.

## Import county code list

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the county code list from a CSV file, using a path relative to the
# notebook's working directory.
df_counties = pd.read_csv('data/us_county_codes.csv')

In [3]:
df_counties.tail()

Unnamed: 0,FIPS,Name,State
3227,72151,Yabucoa,PR
3228,72153,Yauco,PR
3229,78010,St. Croix,VI
3230,78020,St. John,VI
3231,78030,St. Thomas,VI


In [4]:
df_counties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3232 entries, 0 to 3231
Data columns (total 3 columns):
FIPS     3232 non-null int64
Name     3232 non-null object
State    3232 non-null object
dtypes: int64(1), object(2)
memory usage: 75.8+ KB


In [5]:
df_counties['State'].unique()

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
       'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
       'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
       'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'AS', 'MP', 'GU', 'PR',
       'VI'], dtype=object)

In [6]:
# remove US territory data
# Territory codes present in the county list that are not US states or DC.
us_territory_codes = ['AS', 'MP', 'GU', 'PR', 'VI']

# Keep every row whose State is not a territory code.
# reset_index(drop=True) returns a fresh frame with a 0..n-1 index, so:
#   * adding columns to df_counties later operates on an independent frame
#     rather than on a slice of the unfiltered one (the usual source of
#     pandas' SettingWithCopyWarning), and
#   * index labels are guaranteed to equal row positions for any later code
#     that mixes the two.
df_counties = (
    df_counties[~df_counties['State'].isin(us_territory_codes)]
    .reset_index(drop=True)
)

In [7]:
df_counties.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3142 entries, 0 to 3141
Data columns (total 3 columns):
FIPS     3142 non-null int64
Name     3142 non-null object
State    3142 non-null object
dtypes: int64(1), object(2)
memory usage: 98.2+ KB


In [8]:
df_counties.tail()

Unnamed: 0,FIPS,Name,State
3137,56037,Sweetwater,WY
3138,56039,Teton,WY
3139,56041,Uinta,WY
3140,56043,Washakie,WY
3141,56045,Weston,WY


In [9]:
df_counties['State'].unique()

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
       'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
       'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
       'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'], dtype=object)

In [10]:
# Mapping from two-letter postal code to full name for the 50 states plus
# the District of Columbia (despite the name, this is a dict, not a list).
state_name_list = {
    'AL': 'Alabama',
    'AK': 'Alaska',
    'AZ': 'Arizona',
    'AR': 'Arkansas',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DE': 'Delaware',
    'DC': 'District of Columbia',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'IA': 'Iowa',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'ME': 'Maine',
    'MD': 'Maryland',
    'MA': 'Massachusetts',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MS': 'Mississippi',
    'MO': 'Missouri',
    'MT': 'Montana',
    'NE': 'Nebraska',
    'NV': 'Nevada',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NY': 'New York',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VT': 'Vermont',
    'VA': 'Virginia',
    'WA': 'Washington',
    'WV': 'West Virginia',
    'WI': 'Wisconsin',
    'WY': 'Wyoming',
}

In [11]:
# create function to assign state name for state code input
def state_name(state_code):
    """Return the full state name stored for ``state_code``.

    The code is looked up as-is in the module-level ``state_name_list``
    mapping; a code that is not a key of that mapping raises ``KeyError``.
    """
    full_name = state_name_list[state_code]
    return full_name

In [12]:
# remove leading and trailing whitespace from column titles
# (str.strip() trims both ends), so later cells can reference columns by
# their exact, unpadded names
df_counties.columns = df_counties.columns.str.strip()

In [13]:
# create new column for call in correct format for scraping ACS data from website
# The search string has the form "<Name> County, <State name>".
# The county name is stripped and the space before "County" is written
# explicitly, so the result has exactly one separating space whether or not
# the raw Name values carry trailing whitespace (previously the literal was
# 'County, ' and the space had to come from the data itself).
df_counties['Call'] = (
    df_counties['Name'].map(str).str.strip()
    + ' County, '
    + df_counties['State'].apply(state_name)
)

In [14]:
df_counties.head()

Unnamed: 0,FIPS,Name,State,Call
0,1001,Autauga,AL,"Autauga County, Alabama"
1,1003,Baldwin,AL,"Baldwin County, Alabama"
2,1005,Barbour,AL,"Barbour County, Alabama"
3,1007,Bibb,AL,"Bibb County, Alabama"
4,1009,Blount,AL,"Blount County, Alabama"


## Scrape ACS data

In [15]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

import os
# Path to the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to point at a different install; the original location is kept as
# the default so existing setups keep working.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver

# Start a Chrome session and open the county profile search page.
driver = webdriver.Chrome(chromedriver)
driver.get("http://www.statsamerica.org/USCP/")

# Fixed pause after navigation -- presumably to let the page finish loading
# before the first search is typed.
time.sleep(6)


In [16]:
# collect data for all counties in counties dataframe

# ID items
state = []
county_num = []
county_name = []

# items to collect
county_name_re = []
population = []
growth = []
households = []
labor = []
unemployment = []
per_capita = []
median_income = []
poverty = []
high_school = []
college = []

# Placeholder stored in every scraped field when a county cannot be read.
SCRAPE_FAILED = 'Error!'

# Each statistic list paired with the label text used to find its row in the
# county table: the value is taken from the <td> following the first <td>
# whose text contains the label.
STAT_TARGETS = [
    (population, "Population"),
    (growth, "Growth"),
    (households, "Households"),
    (labor, "Labor Force"),
    (unemployment, "Unemployment Rate"),
    (per_capita, "Per Capita"),
    (median_income, "Median"),
    (poverty, "Poverty"),
    (high_school, "High School"),
    (college, "Bachelor"),
]


def _search_county(search_text, settle_seconds):
    """Reload the page and submit ``search_text`` through the search box.

    Types the text into the element with id "zt", waits one second, then
    sends DOWN and RETURN (presumably choosing the first autocomplete
    suggestion) and finally sleeps ``settle_seconds`` before returning.
    Selenium errors are not caught here.
    """
    driver.refresh()
    query = driver.find_element_by_id("zt")
    query.send_keys(search_text)
    time.sleep(1)
    query.send_keys(Keys.DOWN)
    query.send_keys(Keys.RETURN)
    time.sleep(settle_seconds)


def _read_county_profile():
    """Return ``[heading, stat_1, ..., stat_n]`` read from the current page.

    Statistics are returned in ``STAT_TARGETS`` order. Any Selenium error
    (e.g. a missing element) propagates to the caller. Nothing is appended to
    the result lists here, so a lookup that fails partway through leaves them
    untouched and a retry cannot produce duplicated or misaligned entries.
    """
    heading = driver.find_element_by_xpath('//h2[@class="no_gap"]').text
    stats = [
        driver.find_element_by_xpath(
            '//tr/td[contains(text(), "{}")]/following-sibling::td'.format(label)
        ).text
        for _, label in STAT_TARGETS
    ]
    return [heading] + stats


# Iterate over rows by position. The first four columns are FIPS, Name, State
# and Call, so unpacking positionally avoids feeding index *labels* to .iloc.
for fips, name, state_code, call in df_counties.iloc[:, :4].itertuples(index=False, name=None):

    # enter formatted county name in search bar
    # (errors here are not caught: a dead browser session should stop the run)
    _search_county(call, 2)

    try:
        # first attempt: read the table right away
        values = _read_county_profile()
    except Exception:
        # second attempt: same page, after giving it three more seconds
        time.sleep(3)
        try:
            values = _read_county_profile()
        except Exception:
            try:
                # third attempt: search again using the bare county name
                _search_county(name, 4)
                values = _read_county_profile()
            except Exception:
                # county could not be scraped: fill in variables as "Error!"
                values = [SCRAPE_FAILED] * (1 + len(STAT_TARGETS))

    # Record the county exactly once, keeping every list the same length.
    county_name_re.append(values[0])
    for (target_list, _), value in zip(STAT_TARGETS, values[1:]):
        target_list.append(value)

    # populate ID data for county
    state.append(state_code)
    county_num.append(fips)
    county_name.append(name)

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=66.0.3359.139)
  (Driver info: chromedriver=2.36.540469 (1881fd7f8641508feb5166b7cae561d87723cfa8),platform=Mac OS X 10.13.4 x86_64)


In [17]:
# Assemble the scraped lists into one table with a column per field.
# Keyword order below is the column order handed to pandas.
df_county_data = pd.DataFrame(dict(
    number=county_num,
    name=county_name,
    state=state,
    combine=county_name_re,
    total_population=population,
    growth_rate=growth,
    num_households=households,
    labor_force=labor,
    unemployment_rate=unemployment,
    per_capita_income=per_capita,
    median_hh_income=median_income,
    poverty_rate=poverty,
    hs_deg_rate=high_school,
    college_deg_rate=college,
))

In [18]:
# output county data to csv file

# df_county_data.to_csv('data/sel_county_out.csv')