In [1]:
#First, import the libraries we will be using for scraping
from urllib.request import urlopen
from bs4 import BeautifulSoup

Now, we will scrape https://www.health.pa.gov/topics/disease/Pages/Coronavirus.aspx to get numbers on confirmed cases and deaths in PA by county.

Changed to this page as of 3/19: https://www.health.pa.gov/topics/disease/coronavirus/Pages/Cases.aspx

Logic changed on 4/21 because of additional changes made to the PA website and because all county names are now present in the table on the health.pa.gov coronavirus web page.

In [2]:
#Download the PA Department of Health cases page and parse it with the lxml parser.
#The response is opened in a `with` block so the HTTP connection is closed as soon as
#the body has been read, instead of staying open for the life of the kernel.
with urlopen('https://www.health.pa.gov/topics/disease/coronavirus/Pages/Cases.aspx') as html_confirmed:
    bysc_confirmed = BeautifulSoup(html_confirmed.read(), "lxml")

In [3]:
#One list per output column (county name, case count, death count).
#All three start out empty and are independent list objects.
counties_with_confirmed_cases, confirmed_case_count, confirmed_death_count = [], [], []

In [4]:
#Needed to adjust to handle changes made to the PA website. There are now multiple tables on their COVID-19 page
all_tables = bysc_confirmed.find_all('table')

#Grab the table with the information we want: the fourth table on the page (index 3).
#If the page layout changes and fewer tables are present, fail with a message that says so
#instead of a bare "list index out of range".
if len(all_tables) < 4:
    raise IndexError("Expected at least 4 tables on the PA cases page, found " + str(len(all_tables)))
confirmed_table = all_tables[3]

#isolate the information on counties and confirmed cases within the table
#(every <td> cell of the chosen table, in document order)
table_tags = confirmed_table.find_all('td')

In [5]:
#Find the confirmed cases and deaths by county below

#Start from empty lists so that re-running this cell does not append every row a second time
counties_with_confirmed_cases.clear()
confirmed_case_count.clear()
confirmed_death_count.clear()

col_idx = 1 #For isolating which column I'm iterating through

#First 4 items are just headings, skip past them and iterate through all other rows
#col_idx is used to isolate the column: 1 -> county, 2 -> case count, 4 -> death count
#(column 3 is read but not stored)
for tag in table_tags[4:]:
    #Remove zero-width spaces and non-breaking spaces BEFORE stripping: str.strip() does not
    #treat "\u200b" as whitespace, so stripping first leaves any ordinary whitespace that
    #sits next to a zero-width space in the stored value
    item = tag.text.replace("\u200b", "").replace("\xa0", "").strip()
    #Make additions to appropriate list
    if col_idx == 1:
        counties_with_confirmed_cases.append(item)
    elif col_idx == 2:
        confirmed_case_count.append(item)
    elif col_idx == 4:
        confirmed_death_count.append(item)
        #Last column of the row: reset so the increment below starts the next row at 1
        col_idx = 0
    #increment my column identifier
    col_idx += 1

In [6]:
#Now, import numpy and pandas for joining all this information into dataframe
import numpy as np
import pandas as pd
import datetime

In [7]:
#Build the dataframe from the scraped lists, one row per county
df = pd.DataFrame(
    {
        'County': counties_with_confirmed_cases,
        'State': ['PA'] * len(counties_with_confirmed_cases),
        'Cumulative Confirmed Cases': confirmed_case_count,
        'Cumulative Confirmed Deaths': confirmed_death_count,
    }
)

#Add today's date (YYYY-MM-DD) as the last column, repeated on every row
df.insert(len(df.columns), "As of", pd.Timestamp.now().strftime('%Y-%m-%d'))


In [8]:
#Check the contents of the dataframe
#(a bare expression on the last line of a cell is rendered as that cell's output)
df

Unnamed: 0,County,State,Cumulative Confirmed Cases,Cumulative Confirmed Deaths,As of
0,Adams,PA,85,1,2020-04-21
1,Allegheny,PA,1042,55,2020-04-21
2,Armstrong,PA,38,2,2020-04-21
3,Beaver,PA,298,36,2020-04-21
4,Bedford,PA,15,1,2020-04-21
5,Berks,PA,1945,74,2020-04-21
6,Blair,PA,14,0,2020-04-21
7,Bradford,PA,25,1,2020-04-21
8,Bucks,PA,1812,90,2020-04-21
9,Butler,PA,161,5,2020-04-21


In [9]:
#Export the dataframe to csv
#The file name uses the date held in the 'As of' cell of the row labelled 0
date = df.loc[0, 'As of']
#The export call itself is currently commented out:
#df.to_csv('data/confirmed_cases_by_county_' + date + '.csv', header=False)