# Selenium DOC dashboard scraper

THis notebook documents and executes the collection of daily COVID-19 data from the Pennsylvania DOC COVID-19 dashboard. Data is scraped from the DOC dashboard using Selenium and is then processed to conform to a standard dataframe format. 

Data fields collected in this notebook are only for incarcerated people (page 5 of the DOC dashboard), including positive tests, negative tests, pending tests, recovered cases and incarcerated person deaths. 


### Data collection procedure:
1. route selenium driver to DOC dashboard url
2. navigate driver to page 5 of dashboard
3. select date and SCI from dropdown menus
4. isolate and scrape relevant data fields
5. format and store relevant data fields

**NOTE** functions are defined out of order to allow for proper order of operations in loops

In [589]:
# import libraries

import pandas as pd
from bs4 import BeautifulSoup as bs
from urllib import request
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys

from xml.etree.ElementTree import XML, fromstring

import time
import re

### 1. route selenium driver to DOC dashboard URL

In [665]:
# start up selenium
dash_url = "https://app.powerbigov.us/view?r=eyJrIjoiNWQ5YTQ4ZWUtY2NjMi00ZWRhLTgyNWQtYzAzNzc5NmYwMGIyIiwidCI6IjQxOGUyODQxLTAxMjgtNGRkNS05YjZjLTQ3ZmM1YTlhMWJkZSJ9"
driver = webdriver.Safari()
driver.get(dash_url)

# some div selectors

# change 12/21/20 - for sci_dropdown to "nth-child(49)" from "nth-child(46)" ??

sci_dropdown_scroll_area = 'document.querySelector("body > div:nth-child(49) > div.slicer-dropdown-content > div > div.slicerBody > div > div.scrollbar-inner.scroll-content.scroll-scrolly_visible")'
date_dropdown_scroll_area = 'document.querySelector("body > div:nth-child(47) > div.slicer-dropdown-content > div > div.slicerBody > div > div.scrollbar-inner.scroll-content.scroll-scrolly_visible")'

In [666]:
# tracking data
all_data = []
bad_dates = []
date_list = []

### 2. navigate driver to page 5 of dashboard

In [429]:
def go_to_page_5():
    for i in range(4):
        driver.execute_script("document.querySelector('[title=\"Next Page\"]').click()")


### 4. Isolate and scrape content of svg data

In [430]:
def get_svg_content():
    result = None
    while result is None:
        try:
            d = driver.execute_script(' return document.querySelector("#pvExplorationHost > div > div > exploration > div > explore-canvas-modern > div > div.canvasFlexBox > div > div.displayArea.disableAnimations.fitToPage > div.visualContainerHost > visual-container-repeat > visual-container-modern:nth-child(6) > transform > div > div:nth-child(3) > div > visual-modern > div > svg")')
            content = [i.get_attribute('innerHTML') for i in d.find_elements_by_tag_name("*")]
            return content
        except:
             pass
    

### 3. select date and SCI from dropdown menus

In [431]:
# open dropdowns

def open_dropdowns():
    # open date dropdown
    driver.execute_script('document.querySelector("#pvExplorationHost > div > div > exploration > div > explore-canvas-modern > div > div.canvasFlexBox > div > div.displayArea.disableAnimations.fitToPage > div.visualContainerHost > visual-container-repeat > visual-container-modern:nth-child(7) > transform > div > div:nth-child(3) > div > visual-modern > div > div > div.slicer-content-wrapper > div > i").click()')
    # open sci dropdown
    driver.execute_script('document.querySelector("#pvExplorationHost > div > div > exploration > div > explore-canvas-modern > div > div.canvasFlexBox > div > div.displayArea.disableAnimations.fitToPage > div.visualContainerHost > visual-container-repeat > visual-container-modern:nth-child(5) > transform > div > div:nth-child(3) > div > visual-modern > div > div > div.slicer-content-wrapper > div").click()')

In [432]:
def roll_through_SCI(date):

    time.sleep(0.2)
    
    height = 200
    sci_list = []
    for d_ in range(4):
        
        # scroll to next portion
        driver.execute_script(f'{sci_dropdown_scroll_area}.scrollTop = {height*d_}')
        
        time.sleep(0.1)
        
        # parent node
        sci_scroll_area = driver.execute_script(f'return {sci_dropdown_scroll_area}')
        
        # sometimes it misses the selection, so try again
        try:
            current_sci = sci_scroll_area.find_elements_by_class_name('slicerText')
            current_sci[0].get_attribute('title')
        except:
            current_sci = sci_scroll_area.find_elements_by_class_name('slicerText')

        # for ever sci click it and run the svg collector
        for i in current_sci:
            sci = i.get_attribute('title')
            if sci not in sci_list:
                sci_list.append(sci)
                driver.execute_script(f"document.querySelector('[title=\"{sci}\"]').click()")
                
                # get data
                sci_dict = {"content":get_svg_content()}
                sci_dict['sci'] = sci
                sci_dict['date'] = date
                
                # save data
                all_data.append(sci_dict)


In [433]:

def roll_through_dates():
    height = 180
    for d in range(50):
        # 50 seems to capture everything
        
        # scroll to the next portion
        driver.execute_script(f'{date_dropdown_scroll_area}.scrollTop = {height*d}')
        
        # parent node
        date_scroll_area = driver.execute_script(f'return {date_dropdown_scroll_area}')

        # sometimes it misses the selection, so try again
        try:
            current_dates = date_scroll_area.find_elements_by_class_name('slicerText')
            current_dates[0].get_attribute('title')
        except:

            current_dates = date_scroll_area.find_elements_by_class_name('slicerText')
        
        # go through every date and click if it isn't already recorded
        for i in current_dates:
            date = i.get_attribute('title')
            if date not in date_list:
                date_list.append(date)
                driver.execute_script(f"document.querySelector('[title=\"{date}\"]').click()")

                # roll through all sci
                roll_through_SCI(date)
            if i == "(Blank)": # stop everything for messy code below
                return True


In [667]:
go_to_page_5() # go to page 5

In [668]:
open_dropdowns() # open the dropdown menus

### DANGER, this is bad code and is embarrassing. It was stopped and started multiple times to get all data. bad bad bad 

In [549]:
done = None
cnt=0
while not done:
    try:
        val = roll_through_dates()
        cnt+=1
        if cnt>10:
            done = True
    except Exception as E:
        print(E)
        pass


HTTPConnectionPool(host='127.0.0.1', port=62051): Max retries exceeded with url: /session/8688FAF8-CCD2-4AA2-A230-2362EA50F04E/execute/sync (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x114e08908>: Failed to establish a new connection: [Errno 61] Connection refused'))
HTTPConnectionPool(host='127.0.0.1', port=62051): Max retries exceeded with url: /session/8688FAF8-CCD2-4AA2-A230-2362EA50F04E/execute/sync (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x11532c320>: Failed to establish a new connection: [Errno 61] Connection refused'))
HTTPConnectionPool(host='127.0.0.1', port=62051): Max retries exceeded with url: /session/8688FAF8-CCD2-4AA2-A230-2362EA50F04E/execute/sync (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x11532c940>: Failed to establish a new connection: [Errno 61] Connection refused'))
HTTPConnectionPool(host='127.0.0.1', port=62051): Max retries exceeded with url: /session/8688FAF

HTTPConnectionPool(host='127.0.0.1', port=62051): Max retries exceeded with url: /session/8688FAF8-CCD2-4AA2-A230-2362EA50F04E/execute/sync (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x115600470>: Failed to establish a new connection: [Errno 61] Connection refused'))
HTTPConnectionPool(host='127.0.0.1', port=62051): Max retries exceeded with url: /session/8688FAF8-CCD2-4AA2-A230-2362EA50F04E/execute/sync (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x115717320>: Failed to establish a new connection: [Errno 61] Connection refused'))
HTTPConnectionPool(host='127.0.0.1', port=62051): Max retries exceeded with url: /session/8688FAF8-CCD2-4AA2-A230-2362EA50F04E/execute/sync (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x115340da0>: Failed to establish a new connection: [Errno 61] Connection refused'))
HTTPConnectionPool(host='127.0.0.1', port=62051): Max retries exceeded with url: /session/8688FAF

HTTPConnectionPool(host='127.0.0.1', port=62051): Max retries exceeded with url: /session/8688FAF8-CCD2-4AA2-A230-2362EA50F04E/execute/sync (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x11532cc88>: Failed to establish a new connection: [Errno 61] Connection refused'))
HTTPConnectionPool(host='127.0.0.1', port=62051): Max retries exceeded with url: /session/8688FAF8-CCD2-4AA2-A230-2362EA50F04E/execute/sync (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x112dd72b0>: Failed to establish a new connection: [Errno 61] Connection refused'))
HTTPConnectionPool(host='127.0.0.1', port=62051): Max retries exceeded with url: /session/8688FAF8-CCD2-4AA2-A230-2362EA50F04E/execute/sync (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1023cd400>: Failed to establish a new connection: [Errno 61] Connection refused'))
HTTPConnectionPool(host='127.0.0.1', port=62051): Max retries exceeded with url: /session/8688FAF

KeyboardInterrupt: 

In [669]:
date='1/8/2021'

In [633]:
driver.execute_script(f"document.querySelector('[title=\"{date}\"]').click()")

In [438]:
sci_list=['ALBION',
 'BENNER TOWNSHIP',
 'CAMBRIDGE SPRINGS',
 'CAMP HILL',
 'CENTRAL OFFICE',
 'CHESTER',
 'COAL TOWNSHIP',
 'DALLAS',
 'FAYETTE',
 'FOREST',
 'FRACKVILLE',
 'GREENE',
 'HOUTZDALE',
 'HUNTINGDON',
 'LAUREL HIGHLANDS',
 'MAHANOY',
 'MERCER',
 'MUNCY',
 'PHOENIX',
 'PINE GROVE',
 'QUEHANNA BOOTCAMP',
 'RETREAT',
 'ROCKVIEW',
 'SMITHFIELD',
 'SOMERSET',
 'WAYMART']

In [331]:
'''
for sci in sci_list:
    driver.execute_script(f"document.querySelector('[title=\"{sci}\"]').click()")
    # get data
    sci_dict = {"content":get_svg_content()}
    sci_dict['sci'] = sci
    sci_dict['date'] = date

    # save data
    all_data.append(sci_dict)
'''

'\nfor sci in sci_list:\n    driver.execute_script(f"document.querySelector(\'[title="{sci}"]\').click()")\n    # get data\n    sci_dict = {"content":get_svg_content()}\n    sci_dict[\'sci\'] = sci\n    sci_dict[\'date\'] = date\n\n    # save data\n    all_data.append(sci_dict)\n'

In [670]:
roll_through_SCI(date)

In [671]:
for sci in all_data:
    print(sci['sci'])
    elem=fromstring('<svg>{}</svg>'.format(sci['content'][1]))
    for p in elem.findall('g/path'):
        c, v = p.attrib['aria-label'].split()[:2]
        v=v.replace(',','')
        sci[c]=v
    print()

ALBION

BENNER TOWNSHIP

CAMBRIDGE SPRINGS

CAMP HILL

CENTRAL OFFICE

CHESTER

COAL TOWNSHIP

DALLAS

FAYETTE

FOREST

FRACKVILLE

GREENE

HOUTZDALE

HUNTINGDON

LAUREL HIGHLANDS

MAHANOY

MERCER

MUNCY

PHOENIX

PINE GROVE

QUEHANNA BOOTCAMP

RETREAT

ROCKVIEW

SMITHFIELD

SOMERSET

WAYMART



In [672]:
update_df=pd.DataFrame(all_data)

In [673]:
update_df=update_df.drop(columns='content')
update_df['sci']

0                ALBION
1       BENNER TOWNSHIP
2     CAMBRIDGE SPRINGS
3             CAMP HILL
4        CENTRAL OFFICE
5               CHESTER
6         COAL TOWNSHIP
7                DALLAS
8               FAYETTE
9                FOREST
10           FRACKVILLE
11               GREENE
12            HOUTZDALE
13           HUNTINGDON
14     LAUREL HIGHLANDS
15              MAHANOY
16               MERCER
17                MUNCY
18              PHOENIX
19           PINE GROVE
20    QUEHANNA BOOTCAMP
21              RETREAT
22             ROCKVIEW
23           SMITHFIELD
24             SOMERSET
25              WAYMART
Name: sci, dtype: object

In [674]:
update_df['sci']=update_df['sci'].str.title()

In [675]:
update_df

Unnamed: 0,sci,date,NEGATIVE,POSITIVE,PENDING,RECOVERED,DEATH
0,Albion,1/8/2021,652.0,14.0,66.0,11.0,
1,Benner Township,1/8/2021,354.0,173.0,81.0,145.0,3.0
2,Cambridge Springs,1/8/2021,1895.0,688.0,31.0,670.0,1.0
3,Camp Hill,1/8/2021,5400.0,241.0,4405.0,246.0,3.0
4,Central Office,1/8/2021,,,,,
5,Chester,1/8/2021,691.0,192.0,98.0,173.0,8.0
6,Coal Township,1/8/2021,4705.0,374.0,4842.0,248.0,
7,Dallas,1/8/2021,1338.0,1277.0,132.0,259.0,9.0
8,Fayette,1/8/2021,523.0,33.0,159.0,21.0,1.0
9,Forest,1/8/2021,413.0,57.0,394.0,25.0,3.0


In [676]:
latest_df = pd.read_csv('https://raw.githubusercontent.com/jmparelman/PA-SCI_COVID19/main/data/latest_data/PA_DOC_testing_data.csv')

In [677]:
latest_df.columns

Index(['date', 'date.1', 'SCI', 'staff_positive', 'staff_negative',
       'staff_pending', 'staff_death', 'staff_recovered',
       'incarcerated_person_positive', 'incarcerated_person_negative',
       'incarcerated_person_pending', 'incarcerated_person_death',
       'incarcerated_person_recovered', 'test_transfer',
       'test_transfer_positive', 'test_release', 'test_release_positive',
       'test_hospital', 'test_hospital_positive', 'test_surveilance',
       'test_surveilance_positive', 'test_symptomatic',
       'test_symptomatic_positive', 'test_miscellaneous', 'date.1.1',
       'staff_positive_new', 'staff_negative_new', 'staff_pending_new',
       'staff_death_new', 'staff_recovered_new',
       'incarcerated_person_positive_new', 'incarcerated_person_negative_new',
       'incarcerated_person_pending_new', 'incarcerated_person_death_new',
       'incarcerated_person_recovered_new', 'test_transfer_new',
       'test_transfer_positive_new', 'test_release_new',
       'test

In [678]:
update_df.columns

Index(['sci', 'date', 'NEGATIVE', 'POSITIVE', 'PENDING', 'RECOVERED', 'DEATH'], dtype='object')

In [679]:
mapcols = {
    'sci': 'SCI', 
    
    'POSITIVE':'incarcerated_person_positive', 
    'NEGATIVE':'incarcerated_person_negative',
    'PENDING':'incarcerated_person_pending', 
    'DEATH':'incarcerated_person_death',
    'RECOVERED':'incarcerated_person_recovered'
    
}

In [680]:
update_df=update_df.rename(columns=mapcols)

In [681]:
def f(s):
    p=s.split('/') 
    return f"{p[2]}-{p[0]}-{p[1]}"

update_df['date']=update_df['date'].apply(f)

In [682]:
l2 = latest_df.copy()

In [683]:
for c in mapcols.values():
    if c!='SCI':
        update_df[c]=update_df[c].astype(float)

In [684]:
l2

Unnamed: 0,date,date.1,SCI,staff_positive,staff_negative,staff_pending,staff_death,staff_recovered,incarcerated_person_positive,incarcerated_person_negative,...,test_transfer_positive_new,test_release_new,test_release_positive_new,test_hospital_new,test_hospital_positive_new,test_surveilance_new,test_surveilance_positive_new,test_symptomatic_new,test_symptomatic_positive_new,test_miscellaneous_new
0,2020-04-07,2020-04-07,Rockview,,1.0,2.0,,,,,...,,,,,,,,,,
1,2020-04-07,2020-04-07,Albion,,5.0,2.0,,,,,...,,,,,,,,,,
2,2020-04-07,2020-04-07,Benner Township,,4.0,1.0,,,,2.0,...,,,,,,,,,,
3,2020-04-07,2020-04-07,Cambridge Springs,,,2.0,,,,2.0,...,,,,,,,,,,
4,2020-04-07,2020-04-07,Camp Hill,2.0,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3104,2021-01-07,2021-01-07,Retreat,4.0,10.0,1.0,,4.0,13.0,1048.0,...,,,,,,,,,,
3105,2021-01-07,2021-01-07,Rockview,181.0,124.0,31.0,,197.0,116.0,788.0,...,,,,,,,,,,
3106,2021-01-07,2021-01-07,Smithfield,175.0,194.0,19.0,,155.0,459.0,728.0,...,,,,,,,,,,
3107,2021-01-07,2021-01-07,Somerset,163.0,186.0,15.0,,140.0,604.0,1418.0,...,,,,,,,,,,


In [685]:
new_df=pd.concat([l2,update_df])

In [650]:
new_df.columns

Index(['date', 'date.1', 'SCI', 'staff_positive', 'staff_negative',
       'staff_pending', 'staff_death', 'staff_recovered',
       'incarcerated_person_positive', 'incarcerated_person_negative',
       'incarcerated_person_pending', 'incarcerated_person_death',
       'incarcerated_person_recovered', 'test_transfer',
       'test_transfer_positive', 'test_release', 'test_release_positive',
       'test_hospital', 'test_hospital_positive', 'test_surveilance',
       'test_surveilance_positive', 'test_symptomatic',
       'test_symptomatic_positive', 'test_miscellaneous', 'date.1.1',
       'staff_positive_new', 'staff_negative_new', 'staff_pending_new',
       'staff_death_new', 'staff_recovered_new',
       'incarcerated_person_positive_new', 'incarcerated_person_negative_new',
       'incarcerated_person_pending_new', 'incarcerated_person_death_new',
       'incarcerated_person_recovered_new', 'test_transfer_new',
       'test_transfer_positive_new', 'test_release_new',
       'test

In [686]:
new_df

Unnamed: 0,date,date.1,SCI,staff_positive,staff_negative,staff_pending,staff_death,staff_recovered,incarcerated_person_positive,incarcerated_person_negative,...,test_transfer_positive_new,test_release_new,test_release_positive_new,test_hospital_new,test_hospital_positive_new,test_surveilance_new,test_surveilance_positive_new,test_symptomatic_new,test_symptomatic_positive_new,test_miscellaneous_new
0,2020-04-07,2020-04-07,Rockview,,1.0,2.0,,,,,...,,,,,,,,,,
1,2020-04-07,2020-04-07,Albion,,5.0,2.0,,,,,...,,,,,,,,,,
2,2020-04-07,2020-04-07,Benner Township,,4.0,1.0,,,,2.0,...,,,,,,,,,,
3,2020-04-07,2020-04-07,Cambridge Springs,,,2.0,,,,2.0,...,,,,,,,,,,
4,2020-04-07,2020-04-07,Camp Hill,2.0,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21,2021-1-8,,Retreat,,,,,,13.0,1048.0,...,,,,,,,,,,
22,2021-1-8,,Rockview,,,,,,117.0,788.0,...,,,,,,,,,,
23,2021-1-8,,Smithfield,,,,,,459.0,728.0,...,,,,,,,,,,
24,2021-1-8,,Somerset,,,,,,604.0,1418.0,...,,,,,,,,,,


In [687]:
def add_deltas(all_data):
    doc2_df = all_data.copy()
    doc2_df=doc2_df.drop(columns='date').reset_index()

    print(doc2_df.head())

    exclude_cols = ['SCI', 'date', 'date.1', 'date.1.1']

    cols_to_use = [c for c in doc2_df.columns if c not in exclude_cols]

    for col in cols_to_use:
       print('Calculating delta for', col)
       doc2_df[f'{col}_new'] = doc2_df.groupby('SCI')[col].diff()
    
    doc2_df=doc2_df.set_index(doc2_df['date'])

    return doc2_df

In [688]:
new_df=new_df.set_index(pd.DatetimeIndex(new_df['date']))

#new2_df = add_deltas(new_df)

In [689]:
new_df2 = add_deltas(new_df.drop(columns=['date.1']))

        date                SCI  staff_positive  staff_negative  \
0 2020-04-07           Rockview             NaN             1.0   
1 2020-04-07             Albion             NaN             5.0   
2 2020-04-07    Benner Township             NaN             4.0   
3 2020-04-07  Cambridge Springs             NaN             NaN   
4 2020-04-07          Camp Hill             2.0             1.0   

   staff_pending  staff_death  staff_recovered  incarcerated_person_positive  \
0            2.0          NaN              NaN                           NaN   
1            2.0          NaN              NaN                           NaN   
2            1.0          NaN              NaN                           NaN   
3            2.0          NaN              NaN                           NaN   
4            NaN          NaN              NaN                           NaN   

   incarcerated_person_negative  incarcerated_person_pending  ...  \
0                           NaN                

In [690]:
agg=latest_df.copy()

In [691]:
agg

Unnamed: 0,date,date.1,SCI,staff_positive,staff_negative,staff_pending,staff_death,staff_recovered,incarcerated_person_positive,incarcerated_person_negative,...,test_transfer_positive_new,test_release_new,test_release_positive_new,test_hospital_new,test_hospital_positive_new,test_surveilance_new,test_surveilance_positive_new,test_symptomatic_new,test_symptomatic_positive_new,test_miscellaneous_new
0,2020-04-07,2020-04-07,Rockview,,1.0,2.0,,,,,...,,,,,,,,,,
1,2020-04-07,2020-04-07,Albion,,5.0,2.0,,,,,...,,,,,,,,,,
2,2020-04-07,2020-04-07,Benner Township,,4.0,1.0,,,,2.0,...,,,,,,,,,,
3,2020-04-07,2020-04-07,Cambridge Springs,,,2.0,,,,2.0,...,,,,,,,,,,
4,2020-04-07,2020-04-07,Camp Hill,2.0,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3104,2021-01-07,2021-01-07,Retreat,4.0,10.0,1.0,,4.0,13.0,1048.0,...,,,,,,,,,,
3105,2021-01-07,2021-01-07,Rockview,181.0,124.0,31.0,,197.0,116.0,788.0,...,,,,,,,,,,
3106,2021-01-07,2021-01-07,Smithfield,175.0,194.0,19.0,,155.0,459.0,728.0,...,,,,,,,,,,
3107,2021-01-07,2021-01-07,Somerset,163.0,186.0,15.0,,140.0,604.0,1418.0,...,,,,,,,,,,


In [692]:
cols_to_drop = [c for c in agg.columns if c.endswith('_new') or c.endswith('_D') or c.startswith('date.1')]

agg = agg.drop(columns=cols_to_drop)

new_df = pd.concat([agg,update_df])

new_df=new_df.set_index(pd.DatetimeIndex(new_df['date']))

new2_df = add_deltas(new_df)


        date                SCI  staff_positive  staff_negative  \
0 2020-04-07           Rockview             NaN             1.0   
1 2020-04-07             Albion             NaN             5.0   
2 2020-04-07    Benner Township             NaN             4.0   
3 2020-04-07  Cambridge Springs             NaN             NaN   
4 2020-04-07          Camp Hill             2.0             1.0   

   staff_pending  staff_death  staff_recovered  incarcerated_person_positive  \
0            2.0          NaN              NaN                           NaN   
1            2.0          NaN              NaN                           NaN   
2            1.0          NaN              NaN                           NaN   
3            2.0          NaN              NaN                           NaN   
4            NaN          NaN              NaN                           NaN   

   incarcerated_person_negative  incarcerated_person_pending  ...  \
0                           NaN                

In [693]:
new2_df.columns

Index(['date', 'SCI', 'staff_positive', 'staff_negative', 'staff_pending',
       'staff_death', 'staff_recovered', 'incarcerated_person_positive',
       'incarcerated_person_negative', 'incarcerated_person_pending',
       'incarcerated_person_death', 'incarcerated_person_recovered',
       'test_transfer', 'test_transfer_positive', 'test_release',
       'test_release_positive', 'test_hospital', 'test_hospital_positive',
       'test_surveilance', 'test_surveilance_positive', 'test_symptomatic',
       'test_symptomatic_positive', 'test_miscellaneous', 'staff_positive_new',
       'staff_negative_new', 'staff_pending_new', 'staff_death_new',
       'staff_recovered_new', 'incarcerated_person_positive_new',
       'incarcerated_person_negative_new', 'incarcerated_person_pending_new',
       'incarcerated_person_death_new', 'incarcerated_person_recovered_new',
       'test_transfer_new', 'test_transfer_positive_new', 'test_release_new',
       'test_release_positive_new', 'test_hospit

In [694]:
new2_df.shape

(3135, 44)

In [695]:
latest_df.shape

(3109, 46)

In [696]:
new2_df.head()

Unnamed: 0_level_0,date,SCI,staff_positive,staff_negative,staff_pending,staff_death,staff_recovered,incarcerated_person_positive,incarcerated_person_negative,incarcerated_person_pending,...,test_transfer_positive_new,test_release_new,test_release_positive_new,test_hospital_new,test_hospital_positive_new,test_surveilance_new,test_surveilance_positive_new,test_symptomatic_new,test_symptomatic_positive_new,test_miscellaneous_new
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-07,2020-04-07,Rockview,,1.0,2.0,,,,,,...,,,,,,,,,,
2020-04-07,2020-04-07,Albion,,5.0,2.0,,,,,,...,,,,,,,,,,
2020-04-07,2020-04-07,Benner Township,,4.0,1.0,,,,2.0,,...,,,,,,,,,,
2020-04-07,2020-04-07,Cambridge Springs,,,2.0,,,,2.0,,...,,,,,,,,,,
2020-04-07,2020-04-07,Camp Hill,2.0,1.0,,,,,,1.0,...,,,,,,,,,,


In [697]:
new2_df.tail()

Unnamed: 0_level_0,date,SCI,staff_positive,staff_negative,staff_pending,staff_death,staff_recovered,incarcerated_person_positive,incarcerated_person_negative,incarcerated_person_pending,...,test_transfer_positive_new,test_release_new,test_release_positive_new,test_hospital_new,test_hospital_positive_new,test_surveilance_new,test_surveilance_positive_new,test_symptomatic_new,test_symptomatic_positive_new,test_miscellaneous_new
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-08,2021-01-08,Retreat,,,,,,13.0,1048.0,,...,,,,,,,,,,
2021-01-08,2021-01-08,Rockview,,,,,,117.0,788.0,164.0,...,,,,,,,,,,
2021-01-08,2021-01-08,Smithfield,,,,,,459.0,728.0,695.0,...,,,,,,,,,,
2021-01-08,2021-01-08,Somerset,,,,,,604.0,1418.0,956.0,...,,,,,,,,,,
2021-01-08,2021-01-08,Waymart,,,,,,740.0,3136.0,73.0,...,,,,,,,,,,


In [698]:
new2_df.to_csv('/Users/mattodonnell/Downloads/PA_DOC_testing_data_SCRAPED.csv')

In [423]:
new2_df.columns

Index(['date', 'SCI', 'staff_positive', 'staff_negative', 'staff_pending',
       'staff_death', 'staff_recovered', 'incarcerated_person_positive',
       'incarcerated_person_negative', 'incarcerated_person_pending',
       'incarcerated_person_death', 'incarcerated_person_recovered',
       'test_transfer', 'test_transfer_positive', 'test_release',
       'test_release_positive', 'test_hospital', 'test_hospital_positive',
       'test_surveilance', 'test_surveilance_positive', 'test_symptomatic',
       'test_symptomatic_positive', 'test_miscellaneous', 'staff_positive_new',
       'staff_negative_new', 'staff_pending_new', 'staff_death_new',
       'staff_recovered_new', 'incarcerated_person_positive_new',
       'incarcerated_person_negative_new', 'incarcerated_person_pending_new',
       'incarcerated_person_death_new', 'incarcerated_person_recovered_new',
       'test_transfer_new', 'test_transfer_positive_new', 'test_release_new',
       'test_release_positive_new', 'test_hospit

In [424]:
new2_df.loc['2020-12-28'][['SCI','incarcerated_person_positive','incarcerated_person_positive_new']]

Unnamed: 0_level_0,SCI,incarcerated_person_positive,incarcerated_person_positive_new
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-12-28,Albion,10.0,0.0
2020-12-28,Benner Township,143.0,2.0
2020-12-28,Cambridge Springs,679.0,5.0
2020-12-28,Camp Hill,200.0,2.0
2020-12-28,Central Office,,
2020-12-28,Chester,168.0,1.0
2020-12-28,Coal Township,196.0,1.0
2020-12-28,Dallas,1180.0,145.0
2020-12-28,Fayette,21.0,5.0
2020-12-28,Forest,24.0,0.0
