In [1]:
import requests
from pprint import pprint
from bs4 import BeautifulSoup
from time import sleep
import json
import os
import pandas as pd

## Web Scraping for COVID Data: USAFacts.org

In [2]:
# USAFacts COVID-19 spread map: the landing page that is scraped for the state table.
url = 'https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/'
# A timeout keeps the cell from hanging indefinitely if the site stops responding.
# The HTTP status is inspected separately in the next cell.
state_covid_page = requests.get(url, timeout=30)

In [3]:
# HTTP status code of the landing-page request (200 indicates success).
state_covid_page.status_code

200

In [6]:
# Parse the downloaded HTML with the lxml parser so it can be searched by tag.
state_covid_soup = BeautifulSoup(state_covid_page.content, 'lxml')
# Debug aid (disabled): dump the formatted HTML to inspect the page structure.
#print(state_covid_soup.prettify())

In [7]:
# Sanity check: show the page <title> to confirm the expected page was parsed.
title = state_covid_soup.find('title')
print(title.text)

Coronavirus Locations: COVID-19 Map by County and State | USAFacts


In [8]:
# We want to get data from the table for each state.
# Collect every <table> element in the parsed page.
all_tables = state_covid_soup.find_all('table')
# all_tables
# Show how many tables were found.
len(all_tables)

1

In [10]:
# Extract the table: take the first <table> match from the page.
the_table = all_tables[0]
# type(the_table)
#the_table

In [11]:
# Read the date shown on the landing page.
# NOTE(review): 'jss26' looks like an auto-generated CSS class name and may change
# when the site is rebuilt -- confirm the selector still matches the date paragraph.
p = state_covid_soup.find('p', class_='jss26')
if p is None:
    # Fail with a clear message instead of an opaque AttributeError on `p.text`.
    raise ValueError("Date paragraph <p class='jss26'> not found; the page layout may have changed.")

# Split the paragraph on whitespace and pick month, day and year by position.
string_list = p.text.split()
month = string_list[2]
day = string_list[3].strip()
year = string_list[-1]
# Splitting on ',' and keeping the first piece drops anything from the first comma on
# (presumably the comma that follows the day in the page text).
the_date = f'{year}-{month}-{day}'.split(',')
date = the_date[0]
date

'2020-Aug-31'

In [12]:
# Lets get the name of each state.
#states = [a.string for a in the_table.find_all('a')]
#len(states)

In [13]:
# Walk the table rows (all but the first one) and collect, per state,
# its name, confirmed cases, deaths and the date parsed from the page.
all_trs = list(the_table.find_all('tr'))

state_list = []
confirmed_state_list = []
deaths_state_list = []
date_list = []

for row in all_trs[1:]:
    # The state name is the text of the row's link.
    state_list.append(row.a.text)

    # The first two <td> cells are read as confirmed cases and deaths, in that order.
    cells = row.find_all('td')
    confirmed_text, deaths_text = cells[0].text, cells[1].text
    confirmed_state_list.append(confirmed_text)
    deaths_state_list.append(deaths_text)

    # Every row is stamped with the same page-level date.
    date_list.append(date)

In [14]:
# Bundle the scraped lists into a dict keyed by the future DataFrame column names.
covid_state_dict = {
    "state": state_list,
    "confirmed": confirmed_state_list,
    "deaths": deaths_state_list,
    "date": date_list,
}

## COVID: State Level

In [15]:
# Create the DataFrame: columns come from the dict keys, one row per scraped table row.
covid_state_df = pd.DataFrame(data = covid_state_dict)
# Preview the first five rows.
covid_state_df.head()

Unnamed: 0,state,confirmed,deaths,date
0,Alabama,85762,1565,2020-Aug-31
1,Alaska,2879,23,2020-Aug-31
2,Arizona,170798,3626,2020-Aug-31
3,Arkansas,41759,442,2020-Aug-31
4,California,492310,9008,2020-Aug-31


## Now scrape the URL for each county in the US

In [16]:
# Build one absolute URL per state by prefixing the site root to each link
# found in the state table; these pages hold the county-level data.
base_url = 'https://usafacts.org'
state_links = the_table.find_all('a')
all_state_url = [base_url + link['href'] for link in state_links]

In [17]:
# Initialize the lists for the Pandas DataFrame (one entry per county row).
date_county_list = []
county_list = []
the_state_list = []
confirmed_county_list = []
deaths_county_list = []
confirmed_per_100k_list = []

# Visit every state page and read its county table.
for state_url in all_state_url:
    # A timeout prevents one stalled request from hanging the whole loop, and
    # raise_for_status() surfaces HTTP errors instead of parsing an error page.
    state_county_page = requests.get(state_url, timeout=30)
    state_county_page.raise_for_status()
    state_county_soup = BeautifulSoup(state_county_page.content, 'lxml')

    # The date that data was extracted, parsed from this state's own page.
    # NOTE(review): assumes the first <p> on the state page is the date paragraph and
    # uses the same word layout as the landing page -- confirm against a live page.
    string_county_list = state_county_soup.find('p').text.split()
    month_county = string_county_list[2]
    day_county = string_county_list[3].strip()
    year_county = string_county_list[-1]
    # Build the date from the values parsed above (not from the state-level cell's
    # `year`/`month`/`day`), then drop anything from the first comma on.
    the_date_county = f'{year_county}-{month_county}-{day_county}'.split(',')
    date_county = the_date_county[0]

    # Parse the state_county_soup for table information and take the first table.
    all_county_tables = state_county_soup.find_all('table')
    the_county_table = all_county_tables[0]

    # Only the rows inside <tbody> are read.
    all_county_trs = the_county_table.tbody.find_all('tr')

    # Extract the state name from the first county link: third path segment from the end.
    # NOTE(review): capitalize() only upper-cases the first character, so a
    # multi-word slug would not come out in title case -- verify against real hrefs.
    state_of_county = all_county_trs[0].a['href'].split('/')[-3]

    for county_tr in all_county_trs:
        # Name of each county (text of the row's link).
        county = county_tr.a.text
        county_list.append(county)

        # County COVID cases, deaths and confirmed per 100k: first three <td> cells.
        all_county_tds = county_tr.find_all('td')
        county_confirmed = all_county_tds[0].text
        county_deaths = all_county_tds[1].text
        county_confirmed_per_100k = all_county_tds[2].text
        confirmed_county_list.append(county_confirmed)
        deaths_county_list.append(county_deaths)
        confirmed_per_100k_list.append(county_confirmed_per_100k)

        the_state_list.append(state_of_county.capitalize())
        date_county_list.append(date_county)

# Create a dictionary with the information, once, after all states are scraped.
covid_county_dict = {
    "county": county_list,
    "state": the_state_list,
    "confirmed": confirmed_county_list,
    "deaths": deaths_county_list,
    "confirmed_per_100k": confirmed_per_100k_list,
    "date": date_county_list,
}

## COVID: County Level

In [18]:
# Create the DataFrame from the county-level dict built by the scraping loop.
# NOTE(review): the scraped values are element text, so the count columns hold
# strings until converted -- confirm whether downstream code expects numbers.
covid_county_df = pd.DataFrame(data = covid_county_dict)
# Preview the first five rows.
covid_county_df.head()

Unnamed: 0,county,state,confirmed,deaths,confirmed_per_100k,date
0,Autauga County,Alabama,1002,21,1793.5,2020-Aug-31
1,Baldwin County,Alabama,3028,21,1356.4,2020-Aug-31
2,Barbour County,Alabama,585,5,2369.8,2020-Aug-31
3,Bibb County,Alabama,352,2,1571.8,2020-Aug-31
4,Blount County,Alabama,731,3,1264.1,2020-Aug-31


In [19]:
# Summarize the county frame: row count, column dtypes and non-null counts.
covid_county_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3142 entries, 0 to 3141
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   county              3142 non-null   object
 1   state               3142 non-null   object
 2   confirmed           3142 non-null   object
 3   deaths              3142 non-null   object
 4   confirmed_per_100k  3142 non-null   object
 5   date                3142 non-null   object
dtypes: object(6)
memory usage: 147.4+ KB


## COVID: Race/Ethnicity Demographics
### The COVID Tracking Project:  The COVID Racial Data Tracker.

In [20]:
# Load the demographic data Race_Data _Entry _CRDT_utf_8 .csv file.
# The path is relative to the notebook's working directory (./Resources);
# the spaces in the file name are part of the name.
the_filepath = os.path.join(".", "Resources", "Race_Data _Entry _CRDT_utf_8 .csv")

# Read the file into a Pandas DataFrame, decoding it as UTF-8.
covid_race_df = pd.read_csv(the_filepath, encoding = 'utf-8')

# Preview the first five rows.
covid_race_df.head()

Unnamed: 0,date,state,cases_total,cases_white,cases_black,cases_latinX,cases_asian,cases_AIAN,cases_NHPI,cases_multiracial,...,deaths_latinX,deaths_asian,deaths_AIAN,deaths_NHPI,deaths_multiracial,deaths_other,deaths_unknown,deaths_ethnicity_hispanic,deaths_ethnicity_non_hispanic,deaths_ethnicity_unknown
0,20200729,AK,2797.0,852.0,81.0,,83.0,366.0,103.0,68.0,...,,2.0,8.0,1.0,0.0,0.0,0.0,0.0,22.0,0.0
1,20200729,AL,83782.0,25915.0,23239.0,,309.0,,,,...,,4.0,,,,29.0,103.0,48.0,1267.0,223.0
2,20200729,AR,40968.0,20527.0,8673.0,,587.0,120.0,2103.0,,...,,6.0,2.0,31.0,,30.0,0.0,43.0,394.0,0.0
3,20200729,AS,,,,,,,,,...,,,,,,,,,,
4,20200729,AZ,168273.0,29227.0,3822.0,38887.0,1309.0,9631.0,,,...,946.0,42.0,430.0,,,57.0,503.0,946.0,2005.0,503.0


In [37]:
# Cases by race/ethnicity: keep the date/state keys plus every case-count column.
case_columns = [
    'date', 'state', 'cases_total', 'cases_white', 'cases_black',
    'cases_latinX', 'cases_asian', 'cases_AIAN', 'cases_NHPI',
    'cases_multiracial', 'cases_other', 'cases_unknown',
    'cases_ethnicity_hispanic', 'cases_ethnicity_nonHispanic',
    'cases_ethnicity_unknown',
]
cases_ethnicity_df = covid_race_df.filter(items=case_columns)
cases_ethnicity_df

Unnamed: 0,date,state,cases_total,cases_white,cases_black,cases_latinX,cases_asian,cases_AIAN,cases_NHPI,cases_multiracial,cases_other,cases_unknown,cases_ethnicity_hispanic,cases_ethnicity_nonHispanic,cases_ethnicity_unknown
0,20200729,AK,2797,852,81.0,,83.0,366.0,103.0,68.0,48.0,1196.0,150.0,1215.0,1432.0
1,20200729,AL,83782,25915,23239.0,,309.0,,,,4527.0,29789.0,5832.0,40720.0,37228.0
2,20200729,AR,40968,20527,8673.0,,587.0,120.0,2103.0,,5456.0,3502.0,9493.0,31475.0,0.0
3,20200729,AS,,,,,,,,,,,,,
4,20200729,AZ,168273,29227,3822.0,38887.0,1309.0,9631.0,,,3978.0,81419.0,38887.0,47967.0,81419.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1787,20200412,VT,,,,,,,,,,,,,
1788,20200412,WA,10411,2903,289.0,1180.0,451.0,41.0,61.0,112.0,104.0,5270.0,1180.0,3961.0,5270.0
1789,20200412,WI,3341,1680,857.0,,81.0,28.0,,,74.0,621.0,361.0,2261.0,718.0
1790,20200412,WV,,,,,,,,,,,,,


In [38]:
# Deaths by race/ethnicity: keep the date/state keys plus every death-count column.
death_columns = [
    'date', 'state', 'deaths_total', 'deaths_white',
    'deaths_black', 'deaths_latinX', 'deaths_asian', 'deaths_AIAN',
    'deaths_NHPI', 'deaths_multiracial', 'deaths_other', 'deaths_unknown',
    'deaths_ethnicity_hispanic', 'deaths_ethnicity_non_hispanic',
    'deaths_ethnicity_unknown',
]
deaths_ethnicity_df = covid_race_df.filter(items=death_columns)
deaths_ethnicity_df

Unnamed: 0,date,state,deaths_total,deaths_white,deaths_black,deaths_latinX,deaths_asian,deaths_AIAN,deaths_NHPI,deaths_multiracial,deaths_other,deaths_unknown,deaths_ethnicity_hispanic,deaths_ethnicity_non_hispanic,deaths_ethnicity_unknown
0,20200729,AK,22.0,11.0,0.0,,2.0,8.0,1.0,0.0,0.0,0.0,0.0,22.0,0.0
1,20200729,AL,1538.0,779.0,623.0,,4.0,,,,29.0,103.0,48.0,1267.0,223.0
2,20200729,AR,434.0,255.0,113.0,,6.0,2.0,31.0,,30.0,0.0,43.0,394.0,0.0
3,20200729,AS,,,,,,,,,,,,,
4,20200729,AZ,3454.0,1372.0,104.0,946.0,42.0,430.0,,,57.0,503.0,946.0,2005.0,503.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1787,20200412,VT,,,,,,,,,,,,,
1788,20200412,WA,508.0,290.0,9.0,24.0,36.0,5.0,1.0,3.0,7.0,133.0,24.0,351.0,133.0
1789,20200412,WI,144.0,77.0,61.0,,4.0,1.0,,0.0,0.0,1.0,4.0,138.0,2.0
1790,20200412,WV,,,,,,,,,,,,,


## Using World Population Review API: JSON

In [21]:
# URL used to retrieve the state name / abbreviation list as JSON.
query_url = "https://worldpopulationreview.com/static/states/abbr-name-list.json"

# Get data from API: World Population Review.
# The timeout avoids hanging on an unresponsive server, and raise_for_status()
# reports HTTP errors before an error body is decoded as JSON.
response = requests.get(query_url, timeout=30)
response.raise_for_status()
response_json = response.json()

In [22]:
# Number of entries returned by the API.
len(response_json)

51

In [23]:
# Pull the full name and the abbreviation out of every API entry.
state_name_list = [entry['name'] for entry in response_json]
abbr_list = [entry['abbreviation'] for entry in response_json]

# Column name -> values mapping for the lookup table.
abbr_dict = {
    "state": state_name_list,
    "abbr": abbr_list,
}

# Build the state/abbreviation DataFrame and preview it.
abbreviation_df = pd.DataFrame(abbr_dict)
abbreviation_df.head()

Unnamed: 0,state,abbr
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


## United States Census Bureau

In [34]:
# Load the Census_Data_utf.csv file (Census Bureau data) from ./Resources.
filepath = os.path.join(".", "Resources", "Census_Data_utf.csv")

# Read the file into a Pandas DataFrame, decoding it as UTF-8.
census_data_df = pd.read_csv(filepath, encoding = 'utf-8')

# Preview the first five rows.
census_data_df.head()

Unnamed: 0,FIPS,Geog_Level,State_Code,State_name,County_code,County_name,Lat_State,Lon_State,LAND_AREA,Med_HHD_Inc_ACS_14_18($),...,pct_Pov_Univ_ACS_14_18,pct_Prs_Blw_Pov_Lev_ACS_14_18,pct_One_Health_Ins_ACS_14_18,pct_No_Health_Ins_ACS_14_18,pct_NoHealthIns_65P_ACS_14_18,pct_Pop_NoCompDevic_ACS_14_18,pct_Pop_w_BroadComp_ACS_14_18,pct_HHD_NoCompDevic_ACS_14_18,pct_HHD_No_Internet_ACS_14_18,pct_HHD_w_Broadband_ACS_14_18
0,1000,State,1,Alabama,0,All counties,32.7794,-86.8287,50645.319,48486.0,...,97.48,17.49,69.93,9.82,0.41,11.71,77.81,16.61,22.4,56.41
1,1001,County,1,Alabama,1,Autauga County,,,594.437,58786.0,...,99.21,15.38,72.36,7.02,0.0,8.18,84.0,13.01,19.03,61.97
2,1003,County,1,Alabama,3,Baldwin County,,,1589.786,55962.0,...,98.47,10.57,69.54,10.03,0.41,7.21,83.03,11.43,16.83,57.24
3,1005,County,1,Alabama,5,Barbour County,,,884.875,34186.0,...,88.65,28.86,58.07,9.92,0.29,18.02,64.92,23.95,34.11,38.84
4,1007,County,1,Alabama,7,Bibb County,,,622.581,45340.0,...,90.86,13.99,65.9,7.19,0.0,17.2,72.83,23.73,29.24,33.42


In [35]:
# Drop the numeric state/county codes and the state centroid coordinates, then
# rename the remaining descriptive columns.
#
# Both steps are written to be safe to re-run: drop() ignores columns that are
# already gone, and rename() uses an explicit old -> new mapping (labels that are
# absent are skipped) rather than a positional list, so a change in CSV column
# order cannot silently mislabel data. The 'pct_*' columns keep their names.
census_renames = {
    'FIPS': 'fips',
    'Geog_Level': 'level',
    'State_name': 'state',
    'County_name': 'county',
    'LAND_AREA': 'land_area',
    'Med_HHD_Inc_ACS_14_18($)': 'med_HHD_Inc_ACS_14_18($)',
    'Aggregate_HH_INC_ACS_14_18($)': 'aggregate_HH_INC_ACS_14_18($)',
    'Tot_Pop_CEN_2010': 'tot_Pop_CEN_2010',
    'Tot_Pop_ACS_14_18': 'tot_Pop_ACS_14_18',
    'Pop_65plus_ACS_14_18': 'pop_65plus_ACS_14_18',
    'NH_Blk_alone_ACS_14_18': 'nh_Blk_alone_ACS_14_18',
    'Pov_Univ_ACS_14_18': 'pov_Univ_ACS_14_18',
    'Prs_Blw_Pov_Lev_ACS_14_18': 'prs_Blw_Pov_Lev_ACS_14_18',
    'One_Health_Ins_ACS_14_18': 'one_Health_Ins_ACS_14_18',
    'No_Health_Ins_ACS_14_18': 'no_Health_Ins_ACS_14_18',
}
census_data_df = (
    census_data_df
    .drop(columns=["State_Code", "County_code", 'Lat_State', 'Lon_State'], errors="ignore")
    .rename(columns=census_renames)
)
# Preview the first five rows.
census_data_df.head()

Unnamed: 0,fips,level,state,county,land_area,med_HHD_Inc_ACS_14_18($),aggregate_HH_INC_ACS_14_18($),tot_Pop_CEN_2010,tot_Pop_ACS_14_18,pop_65plus_ACS_14_18,...,pct_Pov_Univ_ACS_14_18,pct_Prs_Blw_Pov_Lev_ACS_14_18,pct_One_Health_Ins_ACS_14_18,pct_No_Health_Ins_ACS_14_18,pct_NoHealthIns_65P_ACS_14_18,pct_Pop_NoCompDevic_ACS_14_18,pct_Pop_w_BroadComp_ACS_14_18,pct_HHD_NoCompDevic_ACS_14_18,pct_HHD_No_Internet_ACS_14_18,pct_HHD_w_Broadband_ACS_14_18
0,1000,State,Alabama,All counties,50645.319,48486.0,125091000000.0,4779736.0,4864680.0,783832.0,...,97.48,17.49,69.93,9.82,0.41,11.71,77.81,16.61,22.4,56.41
1,1001,County,Alabama,Autauga County,594.437,58786.0,1594492000.0,54571.0,55200.0,8050.0,...,99.21,15.38,72.36,7.02,0.0,8.18,84.0,13.01,19.03,61.97
2,1003,County,Alabama,Baldwin County,1589.786,55962.0,6070565000.0,182265.0,208107.0,40665.0,...,98.47,10.57,69.54,10.03,0.41,7.21,83.03,11.43,16.83,57.24
3,1005,County,Alabama,Barbour County,884.875,34186.0,440096500.0,27457.0,25782.0,4634.0,...,88.65,28.86,58.07,9.92,0.29,18.02,64.92,23.95,34.11,38.84
4,1007,County,Alabama,Bibb County,622.581,45340.0,400338900.0,22915.0,22527.0,3661.0,...,90.86,13.99,65.9,7.19,0.0,17.2,72.83,23.73,29.24,33.42


In [32]:
# List the current column labels of the census frame.
print(census_data_df.columns)

Index(['FIPS', 'Geog_Level', 'State_name', 'County_name', 'LAND_AREA',
       'Med_HHD_Inc_ACS_14_18($)', 'Aggregate_HH_INC_ACS_14_18($)',
       'Tot_Pop_CEN_2010', 'Tot_Pop_ACS_14_18', 'Pop_65plus_ACS_14_18',
       'NH_Blk_alone_ACS_14_18', 'Pov_Univ_ACS_14_18',
       'Prs_Blw_Pov_Lev_ACS_14_18', 'One_Health_Ins_ACS_14_18',
       'No_Health_Ins_ACS_14_18', 'pct_Pov_Univ_ACS_14_18',
       'pct_Prs_Blw_Pov_Lev_ACS_14_18', 'pct_One_Health_Ins_ACS_14_18',
       'pct_No_Health_Ins_ACS_14_18', 'pct_NoHealthIns_65P_ACS_14_18',
       'pct_Pop_NoCompDevic_ACS_14_18', 'pct_Pop_w_BroadComp_ACS_14_18',
       'pct_HHD_NoCompDevic_ACS_14_18', 'pct_HHD_No_Internet_ACS_14_18',
       'pct_HHD_w_Broadband_ACS_14_18'],
      dtype='object')


## FIPS State

In [26]:
# Load the state_centroid_utf_8.csv file from ./Resources.
state_filepath = os.path.join(".", "Resources", "state_centroid_utf_8.csv")

# Read the file into a Pandas DataFrame, decoding it as UTF-8.
fips_state_df = pd.read_csv(state_filepath, encoding = 'utf-8')

# Preview the first five rows.
fips_state_df.head()

Unnamed: 0,state,fips_state,latitude,longitude
0,Alabama,1.0,32.7794,-86.8287
1,Alaska,2.0,64.0685,-152.2782
2,Arizona,4.0,34.2744,-111.6602
3,Arkansas,5.0,34.8938,-92.4426
4,California,6.0,37.1841,-119.4696


## FIPS County

In [27]:
# Load the county_centroid_utf_8.csv file from ./Resources.
county_filepath = os.path.join(".", "Resources", "county_centroid_utf_8.csv")

# Read the file into a Pandas DataFrame, decoding it as UTF-8.
fips_county_df = pd.read_csv(county_filepath, encoding = 'utf-8')

# Preview the first five rows.
fips_county_df.head()


Unnamed: 0,fips_county,state,county,latitude,longitude
0,1001,AL,Autauga,32.536382,-86.6445
1,1003,AL,Baldwin,30.659218,-87.7461
2,1005,AL,Barbour,31.87067,-85.4055
3,1007,AL,Bibb,33.015893,-87.1271
4,1009,AL,Blount,33.977448,-86.5672


In [29]:
# Relabel the columns positionally; the list must have one label per existing column.
# The only label that differs from the loaded names is the first ('fips_county' -> 'fips').
fips_county_df.columns = ['fips', 'state', 'county', 'latitude', 'longitude']
# Display the relabeled frame.
fips_county_df

Unnamed: 0,fips,state,county,latitude,longitude
0,1001,AL,Autauga,32.536382,-86.6445
1,1003,AL,Baldwin,30.659218,-87.7461
2,1005,AL,Barbour,31.870670,-85.4055
3,1007,AL,Bibb,33.015893,-87.1271
4,1009,AL,Blount,33.977448,-86.5672
...,...,...,...,...,...
3138,56037,WY,Sweetwater,41.660339,-108.8757
3139,56039,WY,Teton,44.049321,-110.5881
3140,56041,WY,Uinta,41.284726,-110.5589
3141,56043,WY,Washakie,43.878831,-107.6691


In [28]:
# Show the current column labels of the county centroid frame.
fips_county_df.columns

Index(['fips_county', 'state', 'county', 'latitude', 'longitude'], dtype='object')