# Census Data Import and Cleaning

## Import county code list

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# load the county code list from a CSV file located via a relative path
df_counties = pd.read_csv('us_county_codes.csv')

In [3]:
# inspect the last rows of the raw county list
df_counties.tail()

Unnamed: 0,FIPS,Name,State
3227,72151,Yabucoa,PR
3228,72153,Yauco,PR
3229,78010,St. Croix,VI
3230,78020,St. John,VI
3231,78030,St. Thomas,VI


In [4]:
# check row count, column dtypes and non-null counts
df_counties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3232 entries, 0 to 3231
Data columns (total 3 columns):
FIPS     3232 non-null int64
Name     3232 non-null object
State    3232 non-null object
dtypes: int64(1), object(2)
memory usage: 75.8+ KB


In [5]:
# list the distinct codes present in the State column
df_counties['State'].unique()

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
       'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
       'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
       'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'AS', 'MP', 'GU', 'PR',
       'VI'], dtype=object)

In [6]:
# remove US territory data
# State codes of the territories to drop from the county list.
US_TERRITORY_CODES = ['AS', 'MP', 'GU', 'PR', 'VI']

# Keep only rows whose State is not one of the territory codes.
# .copy() makes the filtered result an independent DataFrame, so adding
# columns to it later does not raise pandas' SettingWithCopyWarning.
df_counties = df_counties[~df_counties['State'].isin(US_TERRITORY_CODES)].copy()

In [7]:
# re-check the row count after filtering
df_counties.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3142 entries, 0 to 3141
Data columns (total 3 columns):
FIPS     3142 non-null int64
Name     3142 non-null object
State    3142 non-null object
dtypes: int64(1), object(2)
memory usage: 98.2+ KB


In [8]:
# inspect the last rows again after filtering
df_counties.tail()

Unnamed: 0,FIPS,Name,State
3137,56037,Sweetwater,WY
3138,56039,Teton,WY
3139,56041,Uinta,WY
3140,56043,Washakie,WY
3141,56045,Weston,WY


In [9]:
# confirm which State codes remain after filtering
df_counties['State'].unique()

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
       'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
       'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
       'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'], dtype=object)

In [10]:
# Mapping from two-letter state code to full state name
# (the 50 states plus the District of Columbia).
state_name_list = {
    'AL': 'Alabama',
    'AK': 'Alaska',
    'AZ': 'Arizona',
    'AR': 'Arkansas',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DE': 'Delaware',
    'DC': 'District of Columbia',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'IA': 'Iowa',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'ME': 'Maine',
    'MD': 'Maryland',
    'MA': 'Massachusetts',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MS': 'Mississippi',
    'MO': 'Missouri',
    'MT': 'Montana',
    'NE': 'Nebraska',
    'NV': 'Nevada',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NY': 'New York',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VT': 'Vermont',
    'VA': 'Virginia',
    'WA': 'Washington',
    'WV': 'West Virginia',
    'WI': 'Wisconsin',
    'WY': 'Wyoming',
}

In [11]:
# create function to assign state name for state code input
def state_name(state_code):
    """Look up the full state name for a state code in state_name_list.

    Raises KeyError when state_code is not a key of state_name_list.
    """
    full_name = state_name_list[state_code]
    return full_name

In [12]:
# strip leading and trailing whitespace from the column labels
df_counties.columns = df_counties.columns.str.strip()

In [13]:
# create new column for call in correct format for scraping ACS data from website
# Format: "<Name> County, <full state name>".
# The name is stripped and the separating space is added explicitly, so the
# result no longer depends on the CSV values carrying a trailing space
# (the original literal 'County, ' had no leading space of its own).
df_counties['Call'] = (df_counties['Name'].map(str).str.strip() + ' County, '
                       + df_counties['State'].apply(state_name))

In [14]:
# preview the first rows, including the new Call column
df_counties.head()

Unnamed: 0,FIPS,Name,State,Call
0,1001,Autauga,AL,"Autauga County, Alabama"
1,1003,Baldwin,AL,"Baldwin County, Alabama"
2,1005,Barbour,AL,"Barbour County, Alabama"
3,1007,Bibb,AL,"Bibb County, Alabama"
4,1009,Blount,AL,"Blount County, Alabama"


## Scrape ACS data

In [111]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

import os
# path to the chromedriver executable; set the CHROMEDRIVER_PATH environment
# variable to override the default location on machines where it differs
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
# NOTE(review): kept from the original; confirm that the Selenium Python
# bindings actually read this variable — the path is also passed to
# webdriver.Chrome() directly below.
os.environ["webdriver.chrome.driver"] = chromedriver

# start a Chrome session and open the page that the scraping loop queries
# NOTE(review): passing the driver path positionally depends on the installed
# Selenium version — confirm against the version in use.
driver = webdriver.Chrome(chromedriver)
driver.get("http://www.statsamerica.org/USCP/")


In [106]:
# define function to reformat monetary and count values

def clean_num(num_str):
    """Convert a formatted count or dollar string (e.g. '$54,487') to an int.

    Every '$' and ',' character is dropped before parsing; int() raises
    ValueError if the remaining text is not an integer literal.
    """
    digits_only = num_str.translate(str.maketrans('', '', '$,'))
    return int(digits_only)

In [107]:
# define function to reformat rate values

def clean_rate(rate_str):
    """Convert a percentage string (e.g. '5.2%') to a float.

    Every '%' character is removed and surrounding whitespace trimmed before
    parsing; float() raises ValueError if the remaining text is not numeric.
    """
    without_percent = rate_str.replace('%', '')
    trimmed = without_percent.strip()
    return float(trimmed)

In [112]:
# collect data for the counties in the counties dataframe
# (the [0:5] slice below limits the run to the first five rows)

# items to retrieve
state = []
county_num = []
county_name = []
county_name_re = []

population = []
growth = []
households = []
labor = []
unemployment = []
per_capita = []
median_income = []
poverty = []
high_school = []
college = []


def _table_value(label):
    """Return the text of the cell that follows the first table cell whose text contains `label`."""
    xpath = '//tr/td[contains(text(), "' + label + '")]/following-sibling::td'
    # find_element(by, value) is used instead of find_element_by_xpath because
    # the find_element_by_* helpers are not available in newer Selenium releases
    return driver.find_element("xpath", xpath).text


for index in df_counties.index[0:5]:

    # `index` is a row label, so values are looked up with .loc and column
    # names rather than positional .iloc offsets, which only give the right
    # row while labels happen to equal positions
    call = df_counties.loc[index, 'Call']

    # enter formatted county name in search bar
    query = driver.find_element("id", "zt")
    query.clear()
    query.send_keys(call)
    time.sleep(1)  # short pause before selecting with DOWN + RETURN
    query.send_keys(Keys.DOWN)
    query.send_keys(Keys.RETURN)

    # retrieve data from populated table
    time.sleep(3)  # fixed wait before reading the page
    heading = driver.find_element("xpath", '//h2[@class="no_gap"]').text

    # read and clean every value before touching the result lists, so that a
    # failed lookup or parse cannot leave the parallel lists with different lengths
    pop = clean_num(_table_value("Population"))
    growth_rate = clean_rate(_table_value("Growth"))
    hhs = clean_num(_table_value("Households"))
    labor_force = clean_num(_table_value("Labor Force"))
    unempl_rate = clean_rate(_table_value("Unemployment Rate"))
    pcap_inc = clean_num(_table_value("Per Capita"))
    median_hh_inc = clean_num(_table_value("Median"))
    pov_rate = clean_rate(_table_value("Poverty"))
    hschool_rate = clean_rate(_table_value("High School"))
    bach_rate = clean_rate(_table_value("Bachelor"))

    # append identifying fields and cleaned values to lists
    state.append(df_counties.loc[index, 'State'])
    county_num.append(df_counties.loc[index, 'FIPS'])
    county_name.append(df_counties.loc[index, 'Name'])
    county_name_re.append(heading)

    population.append(pop)
    growth.append(growth_rate)
    households.append(hhs)
    labor.append(labor_force)
    unemployment.append(unempl_rate)
    per_capita.append(pcap_inc)
    median_income.append(median_hh_inc)
    poverty.append(pov_rate)
    high_school.append(hschool_rate)
    college.append(bach_rate)

In [113]:
# show the population values collected by the scraping loop
population

[55504, 212628, 25270, 22668, 58013]

In [114]:
# show the median income values collected by the scraping loop
median_income

[54487, 56460, 32884, 43079, 47213]