In [21]:
import requests
from pprint import pprint
from bs4 import BeautifulSoup
from time import sleep
import json
import os
import pandas as pd

## Web Scraping for COVID Data: USAFacts.org

In [3]:
# USAFacts COVID-19 map page; its HTML is parsed for the state-level table below.
url = 'https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/'
# Download the page; the response object is kept so its status and content can be inspected.
state_covid_page = requests.get(url)

In [4]:
# HTTP status of the download; 200 means the request succeeded.
state_covid_page.status_code

200

In [5]:
# Parse the downloaded HTML with the lxml parser so it can be searched by tag.
state_covid_soup = BeautifulSoup(state_covid_page.content, 'lxml')

In [6]:
# Preview only the beginning of the parsed HTML. Printing the whole page
# floods the notebook output and bloats the saved file; the first characters
# are enough to confirm that the expected document was parsed.
print(state_covid_soup.prettify()[:1500])

<!DOCTYPE html>
<html lang="en">
 <head>
  <title data-react-helmet="true">
   Coronavirus Locations: COVID-19 Map by County and State | USAFacts
  </title>
  <meta content="IE=edge" data-react-helmet="true" http-equiv="X-UA-Compatible"/>
  <meta charset="utf-8" data-react-helmet="true"/>
  <meta content="width=device-width, initial-scale=1" data-react-helmet="true" name="viewport"/>
  <meta content="en" data-react-helmet="true" property="og:locale"/>
  <meta content="#015193" data-react-helmet="true" name="theme-color"/>
  <meta content="Follow new cases found each day and the total number of cases and deaths in the US. The county-level tracker makes it easy to follow COVID-19 cases on a granular level, as does the ability to break down infections per 100,000 people." data-react-helmet="true" name="description"/>
  <meta content="" data-react-helmet="true" name="keywords"/>
  <meta content="summary_large_image" data-react-helmet="true" name="twitter:card"/>
  <meta content="@usafacts"

In [7]:
# Show the text of the page's <title> tag as a quick check of what was parsed.
title = state_covid_soup.title
print(title.get_text())

Coronavirus Locations: COVID-19 Map by County and State | USAFacts


In [8]:
# We want to get data from the table for each state.
all_tables = state_covid_soup.find_all('table')
# all_tables
len(all_tables)

1

In [9]:
# Extract the table.
the_table = all_tables[0]
# type(the_table)
the_table

<table aria-label="a table of data" class="MuiTable-root-45 jss42"><thead class="MuiTableHead-root-47"><tr class="MuiTableRow-root-48 MuiTableRow-head-50"><th class="MuiTableCell-root-52 MuiTableCell-head-53 jss43 MuiTableCell-sizeSmall-56" scope="col">State</th><th class="MuiTableCell-root-52 MuiTableCell-head-53 jss43 MuiTableCell-sizeSmall-56" scope="col">Confirmed</th><th class="MuiTableCell-root-52 MuiTableCell-head-53 jss43 MuiTableCell-sizeSmall-56" scope="col">Deaths</th></tr></thead><tbody class="MuiTableBody-root-64"><tr class="MuiTableRow-root-48"><th class="MuiTableCell-root-52 MuiTableCell-body-54 jss43 MuiTableCell-sizeSmall-56" role="cell" scope="row"><a aria-current="page" class="jss41" href="/visualizations/coronavirus-covid-19-spread-map/state/alabama">Alabama</a></th><td class="MuiTableCell-root-52 MuiTableCell-body-54 MuiTableCell-sizeSmall-56" role="cell" scope="col">82,366</td><td class="MuiTableCell-root-52 MuiTableCell-body-54 MuiTableCell-sizeSmall-56" role="ce

In [10]:
# Locate the paragraph that carries the page's date text and split it into
# whitespace-separated tokens.
# NOTE(review): 'jss26' looks like an auto-generated class name and may change
# when the site is rebuilt - confirm before relying on it.
p = state_covid_soup.find('p', class_='jss26')
string_list = p.text.split()

# Token 2 is used as the month, token 3 as the day, and the last token as the year.
month, day, year = string_list[2], string_list[3].strip(), string_list[-1]

# Join as year-month-day, then split on ',' and keep the first piece so that
# any comma left on the day token is dropped.
the_date = f'{year}-{month}-{day}'.split(',')
date = the_date[0]
date

'2020-Jul-29'

In [11]:
# Lets get the name of each state.
#states = [a.string for a in the_table.find_all('a')]
#len(states)

In [12]:
# Extract the row of data
date_list = []
state_list = []
confirmed_state_list = []
deaths_state_list = []

all_trs = [tr for tr in the_table.find_all('tr')]

for tr in all_trs[1:]:
    # Name of state
    state = tr.a.text
    state_list.append(state)
    
    # COVID cases and deaths
    all_tds = [td for td in tr.find_all('td')]
    confirmed = all_tds[0].text
    deaths = all_tds[1].text
    confirmed_state_list.append(confirmed)
    deaths_state_list.append(deaths)
    
    date_list.append(date)

In [13]:
# Create a dictionary with the information.
covid_state_dict = {}
covid_state_dict["state"] = state_list
covid_state_dict["confirmed"] = confirmed_state_list
covid_state_dict["deaths"] = deaths_state_list
covid_state_dict["date"] = date_list

## COVID: State Level

In [14]:
# Create the DataFrame.
covid_state_df = pd.DataFrame(data = covid_state_dict)
covid_state_df.head()

Unnamed: 0,state,confirmed,deaths,date
0,Alabama,82366,1491,2020-Jul-29
1,Alaska,2730,22,2020-Jul-29
2,Arizona,165934,3408,2020-Jul-29
3,Arkansas,40181,428,2020-Jul-29
4,California,471437,8714,2020-Jul-29


## Now scrape the URL for each county in the US

In [15]:
# These are the url's for each state to link to the HTML for the county level data.
base_url = 'https://usafacts.org'
all_state_url = [base_url+a['href'] for a in the_table.find_all('a')]
#all_state_url

In [None]:
# Initialize the lists for the Pandas DataFrame (one list per column).
date_county_list = []
county_list = []
the_state_list = []
confirmed_county_list = []
deaths_county_list = []
confirmed_per_100k_list = []

# Visit each state's page and collect one row of data per county.
for state_url in all_state_url:
    state_county_page = requests.get(state_url)
    # Stop on an HTTP error instead of trying to parse an error page.
    state_county_page.raise_for_status()
    state_county_soup = BeautifulSoup(state_county_page.content, 'lxml')

    # The date shown on THIS state's page. It is built from the *_county
    # values parsed here, not from the state-level year/month/day variables
    # left over from the earlier cell.
    # NOTE(review): assumes the first <p> on the page holds the date text with
    # the month at token 2, the day at token 3 and the year last - confirm.
    string_county_list = state_county_soup.find('p').text.split()
    month_county = string_county_list[2]
    day_county = string_county_list[3].strip()
    year_county = string_county_list[-1]
    # Split on ',' and keep the first piece so a comma on the day token is dropped.
    the_date_county = f'{year_county}-{month_county}-{day_county}'.split(',')
    date_county = the_date_county[0]

    # Parse the state_county_soup for table information and take the first table.
    all_county_tables = state_county_soup.find_all('table')
    the_county_table = all_county_tables[0]

    # Body rows only; the header row lives outside <tbody>.
    all_county_trs = list(the_county_table.tbody.find_all('tr'))

    # Extract the state name from the first county link: it is the third
    # path segment from the end of the href.
    state_of_county = all_county_trs[0].a['href'].split('/')[-3]
    # NOTE(review): capitalize() upper-cases only the first character, so a
    # multi-word name would not be fully title-cased - confirm the href format.
    state_label = state_of_county.capitalize()

    for county_tr in all_county_trs:
        # Name of each county (text of the row's link).
        county = county_tr.a.text
        county_list.append(county)

        # County COVID cases, deaths and confirmed per 100k, in cell order.
        all_county_tds = county_tr.find_all('td')
        county_confirmed = all_county_tds[0].text
        county_deaths = all_county_tds[1].text
        county_confirmed_per_100k = all_county_tds[2].text
        confirmed_county_list.append(county_confirmed)
        deaths_county_list.append(county_deaths)
        confirmed_per_100k_list.append(county_confirmed_per_100k)

        the_state_list.append(state_label)
        date_county_list.append(date_county)

# Create the dictionary once, after every state has been processed; it maps
# each column name to the list accumulated above.
covid_county_dict = {
    "county": county_list,
    "state": the_state_list,
    "confirmed": confirmed_county_list,
    "deaths": deaths_county_list,
    "confirmed_per_100k": confirmed_per_100k_list,
    "date": date_county_list,
}

## COVID: County Level

In [16]:
# Create the DataFrame.
covid_county_df = pd.DataFrame(data = covid_county_dict)
covid_county_df.head()

Unnamed: 0,county,state,confirmed,deaths,confirmed_per_100k,date
0,Autauga County,Alabama,974,21,1743.4,2020-Jul-29
1,Baldwin County,Alabama,2835,18,1270.0,2020-Jul-29
2,Barbour County,Alabama,575,4,2329.3,2020-Jul-29
3,Bibb County,Alabama,338,2,1509.3,2020-Jul-29
4,Blount County,Alabama,675,1,1167.3,2020-Jul-29


In [17]:
# Summarize the county frame: row count, non-null counts and dtype per column.
covid_county_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3142 entries, 0 to 3141
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   county              3142 non-null   object
 1   state               3142 non-null   object
 2   confirmed           3142 non-null   object
 3   deaths              3142 non-null   object
 4   confirmed_per_100k  3142 non-null   object
 5   date                3142 non-null   object
dtypes: object(6)
memory usage: 147.4+ KB


## COVID: Race/Ethnicity Demographics
### The COVID Tracking Project:  The COVID Racial Data Tracker.

In [32]:
# Load the demographic data Race_Data _Entry _CRDT_utf_8 .csv file.
# The path is built relative to the current working directory.
the_filepath = os.path.join(".", "Resources", "Race_Data _Entry _CRDT_utf_8 .csv")

# Read the file into a Pandas DataFrame.
covid_race_df = pd.read_csv(the_filepath, encoding = 'utf-8')

# Preview the first rows.
covid_race_df.head()

Unnamed: 0,date,state,cases_total,cases_white,cases_black,cases_latinX,cases_asian,cases_AIAN,cases_NHPI,cases_multiracial,...,deaths_latinX,deaths_asian,deaths_AIAN,deaths_NHPI,deaths_multiracial,deaths_other,deaths_unknown,deaths_ethnicity_hispanic,deaths_ethnicity_non_hispanic,deaths_ethnicity_unknown
0,20200729,AK,2797.0,852.0,81.0,,83.0,366.0,103.0,68.0,...,,2.0,8.0,1.0,0.0,0.0,0.0,0.0,22.0,0.0
1,20200729,AL,83782.0,25915.0,23239.0,,309.0,,,,...,,4.0,,,,29.0,103.0,48.0,1267.0,223.0
2,20200729,AR,40968.0,20527.0,8673.0,,587.0,120.0,2103.0,,...,,6.0,2.0,31.0,,30.0,0.0,43.0,394.0,0.0
3,20200729,AS,,,,,,,,,...,,,,,,,,,,
4,20200729,AZ,168273.0,29227.0,3822.0,38887.0,1309.0,9631.0,,,...,946.0,42.0,430.0,,,57.0,503.0,946.0,2005.0,503.0


## Using World Population Review API: JSON

In [23]:
# Set up the URL used to retrieve the information as JSON.
query_url = "https://worldpopulationreview.com/static/states/abbr-name-list.json"

# Get data from API: World Population Review
response = requests.get(query_url)
# Decode the response body as JSON.
response_json = response.json()

In [24]:
# Number of records returned by the API.
len(response_json)

51

In [30]:
# Initialize the lists
# Pull the name and the abbreviation out of every JSON record.
state_name_list = [record['name'] for record in response_json]
abbr_list = [record['abbreviation'] for record in response_json]

# Column name -> values, in the column order wanted for the DataFrame.
abbr_dict = {"state": state_name_list, "abbr": abbr_list}

# Build the lookup DataFrame and preview it.
abbreviation_df = pd.DataFrame(abbr_dict)
abbreviation_df.head()

Unnamed: 0,state,abbr
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


## United States Census Bureau

In [31]:
# Load the Census_Data_utf.csv file (Census Bureau data).
# The path is built relative to the current working directory.
filepath = os.path.join(".", "Resources", "Census_Data_utf.csv")

# Read the file into a Pandas DataFrame.
census_data_df = pd.read_csv(filepath, encoding = 'utf-8')

# Preview the first rows.
census_data_df.head()

Unnamed: 0,FIPS,Geog_Level,State_Code,State_name,County_code,County_name,Lat_State,Lon_State,LAND_AREA,Med_HHD_Inc_ACS_14_18($),...,pct_Pov_Univ_ACS_14_18,pct_Prs_Blw_Pov_Lev_ACS_14_18,pct_One_Health_Ins_ACS_14_18,pct_No_Health_Ins_ACS_14_18,pct_NoHealthIns_65P_ACS_14_18,pct_Pop_NoCompDevic_ACS_14_18,pct_Pop_w_BroadComp_ACS_14_18,pct_HHD_NoCompDevic_ACS_14_18,pct_HHD_No_Internet_ACS_14_18,pct_HHD_w_Broadband_ACS_14_18
0,1000,State,1,Alabama,0,All counties,32.7794,-86.8287,50645.319,48486.0,...,97.48,17.49,69.93,9.82,0.41,11.71,77.81,16.61,22.4,56.41
1,1001,County,1,Alabama,1,Autauga County,,,594.437,58786.0,...,99.21,15.38,72.36,7.02,0.0,8.18,84.0,13.01,19.03,61.97
2,1003,County,1,Alabama,3,Baldwin County,,,1589.786,55962.0,...,98.47,10.57,69.54,10.03,0.41,7.21,83.03,11.43,16.83,57.24
3,1005,County,1,Alabama,5,Barbour County,,,884.875,34186.0,...,88.65,28.86,58.07,9.92,0.29,18.02,64.92,23.95,34.11,38.84
4,1007,County,1,Alabama,7,Bibb County,,,622.581,45340.0,...,90.86,13.99,65.9,7.19,0.0,17.2,72.83,23.73,29.24,33.42


In [None]:
# Scratch check: re-parse the county table of the page currently held in
# `state_county_soup` and preview the rows it yields.
#
# The rows are collected into a separate preview frame. Appending them to the
# shared *_list variables would add these counties a second time, and
# rebuilding `covid_county_df` from those lists would leave duplicated rows
# in the full county frame.
all_county_tables = state_county_soup.find_all('table')

# Extract the first table and its body rows.
the_county_table = all_county_tables[0]
all_county_trs = list(the_county_table.tbody.find_all('tr'))

# Extract the state name from the first county link: it is the third path
# segment from the end of the href.
state_of_county = all_county_trs[0].a['href'].split('/')[-3]

county_preview_rows = []
for county_tr in all_county_trs:
    # County COVID cases, deaths and confirmed per 100k, in cell order.
    all_county_tds = county_tr.find_all('td')
    county_preview_rows.append({
        "county": county_tr.a.text,
        "state": state_of_county.capitalize(),
        "confirmed": all_county_tds[0].text,
        "deaths": all_county_tds[1].text,
        "confirmed_per_100k": all_county_tds[2].text,
        "date": date_county,
    })

# Explicit column order so the preview matches the full county frame's layout.
county_preview_df = pd.DataFrame(
    county_preview_rows,
    columns=["county", "state", "confirmed", "deaths", "confirmed_per_100k", "date"],
)

# Show only the first rows rather than the whole frame.
county_preview_df.head()


In [None]:
# Take the state segment from the first county link and title-case it;
# title() capitalizes every word, whereas capitalize() only changes the first character.
all_county_trs[0].a['href'].split('/')[-3].title()

In [None]:
# Print the first <tr> of the county table to inspect its markup.
print(the_county_table.find('tr'))