# Selenium DOC dashboard scraper

This notebook documents and executes the collection of daily COVID-19 data from the Pennsylvania DOC COVID-19 dashboard. Data is scraped from the DOC dashboard using Selenium and is then processed to conform to a standard dataframe format. 

Data fields collected in this notebook are only for incarcerated people (page 5 of the DOC dashboard), including positive tests, negative tests, pending tests, recovered cases and incarcerated person deaths. 


### Data collection procedure:
1. route selenium driver to DOC dashboard url
2. navigate driver to page 5 of dashboard
3. select date and SCI from dropdown menus
4. isolate and scrape relevant data fields
5. format and store relevant data fields

**NOTE:** the functions below are not defined in step order (step 4 appears before step 3) so that each helper is already defined before the loops that call it.

In [206]:
# import libraries

import pandas as pd
from bs4 import BeautifulSoup as bs
from urllib import request
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys

import time
import re

### 1. route selenium driver to DOC dashboard URL

In [210]:
# start up selenium
# URL of the DOC dashboard report that the driver loads
dash_url = "https://app.powerbigov.us/view?r=eyJrIjoiNWQ5YTQ4ZWUtY2NjMi00ZWRhLTgyNWQtYzAzNzc5NmYwMGIyIiwidCI6IjQxOGUyODQxLTAxMjgtNGRkNS05YjZjLTQ3ZmM1YTlhMWJkZSJ9"
# opens a Safari WebDriver session; the scraping functions in this notebook
# all act on this module-level `driver`
driver = webdriver.Safari()
driver.get(dash_url)

# some div selectors
# JavaScript expressions, kept as strings, that locate the scrollable list of
# each slicer dropdown. They are interpolated into execute_script() calls to
# set `.scrollTop` and to fetch the list's parent node.
# NOTE(review): the nth-child(46)/(47) positions are hard-coded; presumably they
# depend on the dropdowns having been opened in a particular order — confirm.
sci_dropdown_scroll_area = 'document.querySelector("body > div:nth-child(46) > div.slicer-dropdown-content > div > div.slicerBody > div > div.scrollbar-inner.scroll-content.scroll-scrolly_visible")'
date_dropdown_scroll_area = 'document.querySelector("body > div:nth-child(47) > div.slicer-dropdown-content > div > div.slicerBody > div > div.scrollbar-inner.scroll-content.scroll-scrolly_visible")'

In [211]:
# tracking data
all_data = []
bad_dates = []
date_list = []

### 2. navigate driver to page 5 of dashboard

In [None]:
def go_to_page_5():
    """Click the dashboard's "Next Page" control four times.

    Uses the module-level ``driver``. Starting from the first page of the
    report this lands on page 5 (assumes the driver is currently on page 1).
    """
    clicks_remaining = 4
    while clicks_remaining > 0:
        driver.execute_script("document.querySelector('[title=\"Next Page\"]').click()")
        clicks_remaining -= 1


### 4. Isolate and scrape content of svg data

In [None]:
def get_svg_content():
    """Return the innerHTML of every element nested in the chart's <svg> node.

    The <svg> element is looked up through the module-level ``driver`` with a
    fixed CSS selector. The lookup is retried until it succeeds, because the
    node may not be available yet right after a slicer selection (in which
    case the query or the attribute reads raise).

    Returns:
        list: one innerHTML string per descendant element of the <svg> node.
    """
    while True:
        try:
            d = driver.execute_script(' return document.querySelector("#pvExplorationHost > div > div > exploration > div > explore-canvas-modern > div > div.canvasFlexBox > div > div.displayArea.disableAnimations.fitToPage > div.visualContainerHost > visual-container-repeat > visual-container-modern:nth-child(6) > transform > div > div:nth-child(3) > div > visual-modern > div > svg")')
            return [i.get_attribute('innerHTML') for i in d.find_elements_by_tag_name("*")]
        except Exception:
            # Retry on any scraping error. `Exception` (rather than a bare
            # `except`) lets KeyboardInterrupt through, so this otherwise
            # unbounded loop can still be interrupted from the notebook.
            continue
    

### 3. select date and SCI from dropdown menus

In [None]:
# open dropdowns

def open_dropdowns():
    """Open the date slicer dropdown, then the SCI slicer dropdown.

    Each dropdown is opened by running a JavaScript click on its slicer
    element through the module-level ``driver``.
    """
    click_scripts = (
        # date dropdown
        'document.querySelector("#pvExplorationHost > div > div > exploration > div > explore-canvas-modern > div > div.canvasFlexBox > div > div.displayArea.disableAnimations.fitToPage > div.visualContainerHost > visual-container-repeat > visual-container-modern:nth-child(7) > transform > div > div:nth-child(3) > div > visual-modern > div > div > div.slicer-content-wrapper > div > i").click()',
        # sci dropdown
        'document.querySelector("#pvExplorationHost > div > div > exploration > div > explore-canvas-modern > div > div.canvasFlexBox > div > div.displayArea.disableAnimations.fitToPage > div.visualContainerHost > visual-container-repeat > visual-container-modern:nth-child(5) > transform > div > div:nth-child(3) > div > visual-modern > div > div > div.slicer-content-wrapper > div").click()',
    )
    for script in click_scripts:
        driver.execute_script(script)

In [7]:
def roll_through_SCI(date):
    """Click through every SCI in the open SCI dropdown and record its chart.

    The dropdown list is scrolled in four steps of 200px. At each step every
    visible entry that has not been handled yet is clicked, the chart's SVG
    content is scraped with ``get_svg_content()``, and a record
    ``{"content": ..., "sci": ..., "date": ...}`` is appended to the
    module-level ``all_data`` list.

    Args:
        date: the date label currently selected in the date dropdown; it is
            stored unchanged on every record.
    """
    time.sleep(0.2)

    height = 200
    sci_list = []  # titles already clicked during this call
    for d_ in range(4):

        # scroll to next portion
        driver.execute_script(f'{sci_dropdown_scroll_area}.scrollTop = {height*d_}')

        time.sleep(0.1)

        # parent node
        sci_scroll_area = driver.execute_script(f'return {sci_dropdown_scroll_area}')

        # sometimes it misses the selection, so try again: reading the first
        # title is only a probe, and if it fails the entries are looked up anew
        try:
            current_sci = sci_scroll_area.find_elements_by_class_name('slicerText')
            current_sci[0].get_attribute('title')
        except Exception:
            # `Exception` rather than a bare `except`, so that a
            # KeyboardInterrupt is not swallowed by the retry
            current_sci = sci_scroll_area.find_elements_by_class_name('slicerText')

        # for every sci, click it and run the svg collector
        for entry in current_sci:
            sci = entry.get_attribute('title')
            if sci in sci_list:
                continue
            sci_list.append(sci)
            driver.execute_script(f"document.querySelector('[title=\"{sci}\"]').click()")

            # get data and save it
            all_data.append({"content": get_svg_content(), 'sci': sci, 'date': date})


In [8]:

def roll_through_dates():
    """Click through every date in the open date dropdown and scrape each one.

    The dropdown list is scrolled in up to 50 steps of 180px. At each step
    every visible date label not yet in the module-level ``date_list`` is
    recorded there, clicked, and passed to ``roll_through_SCI`` so that all
    SCIs are scraped for that date.

    Returns:
        True as soon as the "(Blank)" entry is encountered, which signals the
        caller to stop; None if all 50 scroll steps finish without seeing it.
    """
    height = 180
    for d in range(50):
        # 50 seems to capture everything

        # scroll to the next portion
        driver.execute_script(f'{date_dropdown_scroll_area}.scrollTop = {height*d}')

        # parent node
        date_scroll_area = driver.execute_script(f'return {date_dropdown_scroll_area}')

        # sometimes it misses the selection, so try again: reading the first
        # title is only a probe, and if it fails the entries are looked up anew
        try:
            current_dates = date_scroll_area.find_elements_by_class_name('slicerText')
            current_dates[0].get_attribute('title')
        except Exception:
            # `Exception` rather than a bare `except`, so that a
            # KeyboardInterrupt is not swallowed by the retry
            current_dates = date_scroll_area.find_elements_by_class_name('slicerText')

        # go through every date and click if it isn't already recorded
        for entry in current_dates:
            date = entry.get_attribute('title')
            if date not in date_list:
                date_list.append(date)
                driver.execute_script(f"document.querySelector('[title=\"{date}\"]').click()")

                # roll through all sci
                roll_through_SCI(date)
            # Compare the entry's title, not the WebElement itself: an element
            # never equals a string, so the stop condition could never fire.
            if date == "(Blank)":
                return True


In [212]:
go_to_page_5() # go to page 5

In [213]:
open_dropdowns() # open the dropdown menus

### DANGER: the retry loop below is brittle, admittedly bad code. It had to be stopped and restarted by hand multiple times to collect all of the data.

In [None]:
# Run passes over the date dropdown until roll_through_dates() reports that it
# reached the end of the list, or until more than 10 passes have completed.
# A pass that raises is reported and retried; it does not count towards the cap.
done = None
cnt = 0
while not done:
    try:
        val = roll_through_dates()
        cnt += 1
        # roll_through_dates() returns True as its stop signal; honour it
        # instead of always running up to the pass cap
        if val or cnt > 10:
            done = True
    except Exception as E:
        print(E)


Message: A JavaScript exception occured: null is not an object (evaluating 'document.querySelector('[title="BENNER TOWNSHIP"]').click')

Message: 

Message: 



In [45]:
pd.DataFrame(all_data).to_csv('../data/scraped_Dashboard_incarceratedData.csv')

In [14]:
all_data

[]

In [214]:
date='12/14/2020'

In [215]:
driver.execute_script(f"document.querySelector('[title=\"{date}\"]').click()")

In [216]:
roll_through_SCI(date)

In [49]:
from xml.etree.ElementTree import XML, fromstring

In [179]:
'<svg>{}</svg>'.format(all_data[6]['content'][1])

'<svg><g class="slices"><path class="slice setFocusRing" tabindex="0" focusable="true" aria-label="NEGATIVE 2,785 (50.1%)." role="option" d="M1.1401684039411191e-14,-186.2036310771316A186.2036310771316,186.2036310771316,0,1,1,-1.1575272727592003,186.20003318184814L-0.5787636363796002,93.10001659092407A93.1018155385658,93.1018155385658,0,1,0,5.700842019705596e-15,-93.1018155385658Z" style="fill: rgb(0, 177, 255); fill-opacity: 1; stroke-opacity: 1; stroke: rgb(255, 255, 255); stroke-width: 0px; stroke-dasharray: 0px, 972.3020568263667px, 93.1018155385658px;"></path><path class="slice setFocusRing" tabindex="0" focusable="true" aria-label="POSITIVE 189 (3.4%)." role="option" d="M-1.1575272727592003,186.20003318184814A186.2036310771316,186.2036310771316,0,0,1,-40.60571303511799,181.72222840153094L-20.302856517558993,90.86111420076547A93.1018155385658,93.1018155385658,0,0,0,-0.5787636363796002,93.10001659092407Z" style="fill: rgb(214, 69, 80); fill-opacity: 1; stroke-opacity: 1; stroke: rg

In [91]:
elem=fromstring('<svg>{}</svg>'.format(all_data[6]['content'][6]))

In [87]:
for p in elem.findall('g/path'):
    print(p.attrib['aria-label'])

In [224]:
# Turn each scraped record's chart markup into plain count fields.
# The second entry of a record's 'content' list is wrapped in an <svg> root so
# it parses as XML (in the captured output that entry is the
# <g class="slices"> group), and every <path> under a <g> is read.
for sci in all_data:
    print(sci['sci'])
    elem=fromstring('<svg>{}</svg>'.format(sci['content'][1]))
    for p in elem.findall('g/path'):
        # aria-label looks like 'NEGATIVE 471 (69.26%).': the first token is
        # the category, the second the count; the percentage is discarded
        c, v = p.attrib['aria-label'].split()[:2]
        # strip thousands separators ('2,785' -> '2785'); the value stays a string
        v=v.replace(',','')
        # store the count on the record itself under the category name
        sci[c]=v
    print()

ALBION

BENNER TOWNSHIP

CAMBRIDGE SPRINGS

CAMP HILL

CENTRAL OFFICE

CHESTER

COAL TOWNSHIP

DALLAS

FAYETTE

FOREST

FRACKVILLE

GREENE

HOUTZDALE

HUNTINGDON

LAUREL HIGHLANDS

MAHANOY

MERCER

MUNCY

PHOENIX

PINE GROVE

QUEHANNA BOOTCAMP

RETREAT

ROCKVIEW

SMITHFIELD

SOMERSET

WAYMART



In [133]:
all_data=[]

In [69]:
p.attrib['aria-label']

'RECOVERED 6 (0.88%).'

In [41]:
all_data[0]['content'][1]

'<g class="slices"><path class="slice setFocusRing" tabindex="0" focusable="true" aria-label="NEGATIVE 471 (69.26%)." role="option" d="M1.1401684039411191e-14,-186.2036310771316A186.2036310771316,186.2036310771316,0,1,1,-174.24381681142125,65.65732654697726L-87.12190840571063,32.82866327348863A93.1018155385658,93.1018155385658,0,1,0,5.700842019705596e-15,-93.1018155385658Z" style="fill: rgb(0, 177, 255); fill-opacity: 1; stroke-opacity: 1; stroke: rgb(255, 255, 255); stroke-width: 0px; stroke-dasharray: 0px, 1308.6474489534962px, 93.1018155385658px;"></path><path class="slice setFocusRing" tabindex="0" focusable="true" aria-label="POSITIVE 10 (1.47%)." role="option" d="M-174.24381681142125,65.65732654697726A186.2036310771316,186.2036310771316,0,0,1,-179.5586172144593,49.30005284317459L-89.77930860722965,24.650026421587295A93.1018155385658,93.1018155385658,0,0,0,-87.12190840571063,32.82866327348863Z" style="fill: rgb(214, 69, 80); fill-opacity: 1; stroke-opacity: 1; stroke: rgb(255, 255

In [28]:
all_data[0]['content'][0].text

MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=49593): Max retries exceeded with url: /session/62AE3A97-D714-46F0-B5E7-33559256358F/element/node-9A98FF36-58E7-4E2B-8262-BED93AA12308/text (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x11d553ba8>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [105]:
df121220=pd.DataFrame(all_data)

In [225]:
df121420=pd.DataFrame(all_data)

In [226]:
df121420

Unnamed: 0,content,sci,date,NEGATIVE,POSITIVE,PENDING,RECOVERED,DEATH
0,"[, <g class=""slices""><path class=""slice setFoc...",ALBION,12/14/2020,471.0,10.0,193.0,6.0,
1,"[, <g class=""slices""><path class=""slice setFoc...",BENNER TOWNSHIP,12/14/2020,392.0,125.0,120.0,7.0,1.0
2,"[, <g class=""slices""><path class=""slice setFoc...",CAMBRIDGE SPRINGS,12/14/2020,556.0,601.0,164.0,4.0,
3,"[, <g class=""slices""><path class=""slice setFoc...",CAMP HILL,12/14/2020,4481.0,149.0,3563.0,77.0,1.0
4,"[, <g class=""slices""></g><use href=""#currentFo...",CENTRAL OFFICE,12/14/2020,,,,,
5,"[, <g class=""slices""><path class=""slice setFoc...",CHESTER,12/14/2020,720.0,168.0,210.0,127.0,4.0
6,"[, <g class=""slices""><path class=""slice setFoc...",COAL TOWNSHIP,12/14/2020,2785.0,190.0,2441.0,144.0,
7,"[, <g class=""slices""><path class=""slice setFoc...",DALLAS,12/14/2020,554.0,484.0,303.0,64.0,9.0
8,"[, <g class=""slices""><path class=""slice setFoc...",FAYETTE,12/14/2020,466.0,9.0,241.0,,
9,"[, <g class=""slices""><path class=""slice setFoc...",FOREST,12/14/2020,386.0,13.0,446.0,,1.0


In [227]:
update_df=df121420

In [228]:
update_df=update_df.drop(columns='content')
update_df['sci']

0                ALBION
1       BENNER TOWNSHIP
2     CAMBRIDGE SPRINGS
3             CAMP HILL
4        CENTRAL OFFICE
5               CHESTER
6         COAL TOWNSHIP
7                DALLAS
8               FAYETTE
9                FOREST
10           FRACKVILLE
11               GREENE
12            HOUTZDALE
13           HUNTINGDON
14     LAUREL HIGHLANDS
15              MAHANOY
16               MERCER
17                MUNCY
18              PHOENIX
19           PINE GROVE
20    QUEHANNA BOOTCAMP
21              RETREAT
22             ROCKVIEW
23           SMITHFIELD
24             SOMERSET
25              WAYMART
Name: sci, dtype: object

In [229]:
update_df['sci']=update_df['sci'].str.title()

In [230]:
update_df

Unnamed: 0,sci,date,NEGATIVE,POSITIVE,PENDING,RECOVERED,DEATH
0,Albion,12/14/2020,471.0,10.0,193.0,6.0,
1,Benner Township,12/14/2020,392.0,125.0,120.0,7.0,1.0
2,Cambridge Springs,12/14/2020,556.0,601.0,164.0,4.0,
3,Camp Hill,12/14/2020,4481.0,149.0,3563.0,77.0,1.0
4,Central Office,12/14/2020,,,,,
5,Chester,12/14/2020,720.0,168.0,210.0,127.0,4.0
6,Coal Township,12/14/2020,2785.0,190.0,2441.0,144.0,
7,Dallas,12/14/2020,554.0,484.0,303.0,64.0,9.0
8,Fayette,12/14/2020,466.0,9.0,241.0,,
9,Forest,12/14/2020,386.0,13.0,446.0,,1.0


In [269]:
latest_df = pd.read_csv('https://raw.githubusercontent.com/jmparelman/PA-SCI_COVID19/main/data/latest_data/PA_DOC_testing_data_TEMP.csv')

In [251]:
latest_df.columns

Index(['date', 'date.1', 'SCI', 'staff_positive', 'staff_negative',
       'staff_pending', 'staff_death', 'staff_recovered',
       'incarcerated_person_positive', 'incarcerated_person_negative',
       'incarcerated_person_pending', 'incarcerated_person_death',
       'incarcerated_person_recovered', 'test_transfer',
       'test_transfer_positive', 'test_release', 'test_release_positive',
       'test_hospital', 'test_hospital_positive', 'test_surveilance',
       'test_surveilance_positive', 'test_symptomatic',
       'test_symptomatic_positive', 'test_miscellaneous', 'date.1.1',
       'staff_positive_new', 'staff_negative_new', 'staff_pending_new',
       'staff_death_new', 'staff_recovered_new',
       'incarcerated_person_positive_new', 'incarcerated_person_negative_new',
       'incarcerated_person_pending_new', 'incarcerated_person_death_new',
       'incarcerated_person_recovered_new', 'test_transfer_new',
       'test_transfer_positive_new', 'test_release_new',
       'test

In [252]:
update_df.columns

Index(['SCI', 'date', 'incarcerated_person_negative',
       'incarcerated_person_positive', 'incarcerated_person_pending',
       'incarcerated_person_recovered', 'incarcerated_person_death'],
      dtype='object')

In [270]:
mapcols = {
    'sci': 'SCI', 
    
    'POSITIVE':'incarcerated_person_positive', 
    'NEGATIVE':'incarcerated_person_negative',
    'PENDING':'incarcerated_person_pending', 
    'DEATH':'incarcerated_person_death',
    'RECOVERED':'incarcerated_person_recovered'
    
}

In [271]:
update_df=update_df.rename(columns=mapcols)

In [282]:
def f(s):
    """Convert a 'month/day/year' date string to 'YYYY-MM-DD'.

    Month and day are zero-padded to two digits, so '4/7/2020' becomes
    '2020-04-07' rather than '2020-4-7'. Already padded input such as
    '12/14/2020' is converted exactly as before.

    Args:
        s: date string whose first three '/'-separated parts are month, day
            and year.

    Returns:
        str: the date formatted as 'year-month-day'.

    Raises:
        IndexError: if ``s`` has fewer than three '/'-separated parts.
    """
    p = s.split('/')
    return f"{p[2]}-{p[0].zfill(2)}-{p[1].zfill(2)}"

update_df['date']=update_df['date'].apply(f)

In [272]:
l2 = latest_df.copy()

In [273]:
# cast every renamed count column to float; the SCI name column is left as is
for c in mapcols.values():
    if c == 'SCI':
        continue
    update_df[c] = update_df[c].astype(float)

In [274]:
l2

Unnamed: 0,date,date.1,SCI,staff_positive,staff_negative,staff_pending,staff_death,staff_recovered,incarcerated_person_positive,incarcerated_person_negative,...,test_transfer_positive_new,test_release_new,test_release_positive_new,test_hospital_new,test_hospital_positive_new,test_surveilance_new,test_surveilance_positive_new,test_symptomatic_new,test_symptomatic_positive_new,test_miscellaneous_new
0,2020-04-07,2020-04-07,Rockview,,1.0,2.0,,,,,...,,,,,,,,,,
1,2020-04-07,2020-04-07,Albion,,5.0,2.0,,,,,...,,,,,,,,,,
2,2020-04-07,2020-04-07,Benner Township,,4.0,1.0,,,,2.0,...,,,,,,,,,,
3,2020-04-07,2020-04-07,Cambridge Springs,,,2.0,,,,2.0,...,,,,,,,,,,
4,2020-04-07,2020-04-07,Camp Hill,2.0,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2678,2020-12-11,2020-12-11,Retreat,4.0,10.0,1.0,,4.0,13.0,1048.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2679,2020-12-11,2020-12-11,Rockview,132.0,46.0,21.0,,30.0,94.0,763.0,...,0.0,4.0,4.0,0.0,0.0,1.0,1.0,2.0,2.0,
2680,2020-12-11,2020-12-11,Smithfield,124.0,133.0,20.0,,51.0,86.0,511.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,7.0,8.0,
2681,2020-12-11,2020-12-11,Somerset,112.0,47.0,22.0,,37.0,289.0,645.0,...,0.0,0.0,0.0,0.0,0.0,5.0,5.0,9.0,10.0,


In [257]:
new_df=pd.concat([l2,update_df])

In [258]:
new_df.columns

Index(['date', 'date.1', 'SCI', 'staff_positive', 'staff_negative',
       'staff_pending', 'staff_death', 'staff_recovered',
       'incarcerated_person_positive', 'incarcerated_person_negative',
       'incarcerated_person_pending', 'incarcerated_person_death',
       'incarcerated_person_recovered', 'test_transfer',
       'test_transfer_positive', 'test_release', 'test_release_positive',
       'test_hospital', 'test_hospital_positive', 'test_surveilance',
       'test_surveilance_positive', 'test_symptomatic',
       'test_symptomatic_positive', 'test_miscellaneous', 'date.1.1',
       'staff_positive_new', 'staff_negative_new', 'staff_pending_new',
       'staff_death_new', 'staff_recovered_new',
       'incarcerated_person_positive_new', 'incarcerated_person_negative_new',
       'incarcerated_person_pending_new', 'incarcerated_person_death_new',
       'incarcerated_person_recovered_new', 'test_transfer_new',
       'test_transfer_positive_new', 'test_release_new',
       'test

In [259]:
new_df

Unnamed: 0,date,date.1,SCI,staff_positive,staff_negative,staff_pending,staff_death,staff_recovered,incarcerated_person_positive,incarcerated_person_negative,...,test_transfer_positive_new,test_release_new,test_release_positive_new,test_hospital_new,test_hospital_positive_new,test_surveilance_new,test_surveilance_positive_new,test_symptomatic_new,test_symptomatic_positive_new,test_miscellaneous_new
0,2020-04-07,2020-04-07,Rockview,,1.0,2.0,,,,,...,,,,,,,,,,
1,2020-04-07,2020-04-07,Albion,,5.0,2.0,,,,,...,,,,,,,,,,
2,2020-04-07,2020-04-07,Benner Township,,4.0,1.0,,,,2.0,...,,,,,,,,,,
3,2020-04-07,2020-04-07,Cambridge Springs,,,2.0,,,,2.0,...,,,,,,,,,,
4,2020-04-07,2020-04-07,Camp Hill,2.0,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21,12/14/2020,,Retreat,,,,,,13.0,1048.0,...,,,,,,,,,,
22,12/14/2020,,Rockview,,,,,,101.0,763.0,...,,,,,,,,,,
23,12/14/2020,,Smithfield,,,,,,103.0,511.0,...,,,,,,,,,,
24,12/14/2020,,Somerset,,,,,,465.0,645.0,...,,,,,,,,,,


In [265]:
def add_deltas(all_data):
    """Return a copy of ``all_data`` with per-SCI difference columns added.

    The ``date`` column is dropped and the index is moved back into the
    columns with ``reset_index()``; the index must therefore be named
    ``date``, because that is the only source of the ``date`` column used at
    the end. For every column other than ``SCI``, ``date``, ``date.1`` and
    ``date.1.1`` a ``<column>_new`` column is added, holding the difference
    between each row and the previous row of the same ``SCI`` in the frame's
    existing row order. The head of the working frame and one progress line
    per column are printed.

    Args:
        all_data: DataFrame with a ``date`` column, an ``SCI`` column and an
            index named ``date``; it is not modified.

    Returns:
        DataFrame indexed by its ``date`` column, which is also kept as a
        regular column.
    """
    frame = all_data.copy().drop(columns='date').reset_index()

    print(frame.head())

    skipped = ('SCI', 'date', 'date.1', 'date.1.1')
    # fixed before the loop, so the *_new columns added below are not revisited
    value_cols = [name for name in frame.columns if name not in skipped]

    for name in value_cols:
        print('Calculating delta for', name)
        frame[f'{name}_new'] = frame.groupby('SCI')[name].diff()

    return frame.set_index(frame['date'])

In [260]:
new_df=new_df.set_index(pd.DatetimeIndex(new_df['date']))

#new2_df = add_deltas(new_df)

In [266]:
new_df2 = add_deltas(new_df.drop(columns=['date.1']))

        date                SCI  staff_positive  staff_negative  \
0 2020-04-07           Rockview             NaN             1.0   
1 2020-04-07             Albion             NaN             5.0   
2 2020-04-07    Benner Township             NaN             4.0   
3 2020-04-07  Cambridge Springs             NaN             NaN   
4 2020-04-07          Camp Hill             2.0             1.0   

   staff_pending  staff_death  staff_recovered  incarcerated_person_positive  \
0            2.0          NaN              NaN                           NaN   
1            2.0          NaN              NaN                           NaN   
2            1.0          NaN              NaN                           NaN   
3            2.0          NaN              NaN                           NaN   
4            NaN          NaN              NaN                           NaN   

   incarcerated_person_negative  incarcerated_person_pending  ...  \
0                           NaN                

In [283]:
agg=latest_df.copy()

In [284]:
agg

Unnamed: 0,date,date.1,SCI,staff_positive,staff_negative,staff_pending,staff_death,staff_recovered,incarcerated_person_positive,incarcerated_person_negative,...,test_transfer_positive_new,test_release_new,test_release_positive_new,test_hospital_new,test_hospital_positive_new,test_surveilance_new,test_surveilance_positive_new,test_symptomatic_new,test_symptomatic_positive_new,test_miscellaneous_new
0,2020-04-07,2020-04-07,Rockview,,1.0,2.0,,,,,...,,,,,,,,,,
1,2020-04-07,2020-04-07,Albion,,5.0,2.0,,,,,...,,,,,,,,,,
2,2020-04-07,2020-04-07,Benner Township,,4.0,1.0,,,,2.0,...,,,,,,,,,,
3,2020-04-07,2020-04-07,Cambridge Springs,,,2.0,,,,2.0,...,,,,,,,,,,
4,2020-04-07,2020-04-07,Camp Hill,2.0,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2678,2020-12-11,2020-12-11,Retreat,4.0,10.0,1.0,,4.0,13.0,1048.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2679,2020-12-11,2020-12-11,Rockview,132.0,46.0,21.0,,30.0,94.0,763.0,...,0.0,4.0,4.0,0.0,0.0,1.0,1.0,2.0,2.0,
2680,2020-12-11,2020-12-11,Smithfield,124.0,133.0,20.0,,51.0,86.0,511.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,7.0,8.0,
2681,2020-12-11,2020-12-11,Somerset,112.0,47.0,22.0,,37.0,289.0,645.0,...,0.0,0.0,0.0,0.0,0.0,5.0,5.0,9.0,10.0,


In [285]:
# drop the previously computed delta columns and the duplicate date columns,
# so the deltas can be recalculated once the new rows are appended
cols_to_drop = [
    c for c in agg.columns
    if c.endswith(('_new', '_D')) or c.startswith('date.1')
]
agg = agg.drop(columns=cols_to_drop)

# append the newly scraped rows and index the combined frame by date
new_df = pd.concat([agg, update_df])
new_df = new_df.set_index(pd.DatetimeIndex(new_df['date']))

new2_df = add_deltas(new_df)


        date                SCI  staff_positive  staff_negative  \
0 2020-04-07           Rockview             NaN             1.0   
1 2020-04-07             Albion             NaN             5.0   
2 2020-04-07    Benner Township             NaN             4.0   
3 2020-04-07  Cambridge Springs             NaN             NaN   
4 2020-04-07          Camp Hill             2.0             1.0   

   staff_pending  staff_death  staff_recovered  incarcerated_person_positive  \
0            2.0          NaN              NaN                           NaN   
1            2.0          NaN              NaN                           NaN   
2            1.0          NaN              NaN                           NaN   
3            2.0          NaN              NaN                           NaN   
4            NaN          NaN              NaN                           NaN   

   incarcerated_person_negative  incarcerated_person_pending  ...  \
0                           NaN                

In [286]:
new2_df.columns

Index(['date', 'SCI', 'staff_positive', 'staff_negative', 'staff_pending',
       'staff_death', 'staff_recovered', 'incarcerated_person_positive',
       'incarcerated_person_negative', 'incarcerated_person_pending',
       'incarcerated_person_death', 'incarcerated_person_recovered',
       'test_transfer', 'test_transfer_positive', 'test_release',
       'test_release_positive', 'test_hospital', 'test_hospital_positive',
       'test_surveilance', 'test_surveilance_positive', 'test_symptomatic',
       'test_symptomatic_positive', 'test_miscellaneous', 'staff_positive_new',
       'staff_negative_new', 'staff_pending_new', 'staff_death_new',
       'staff_recovered_new', 'incarcerated_person_positive_new',
       'incarcerated_person_negative_new', 'incarcerated_person_pending_new',
       'incarcerated_person_death_new', 'incarcerated_person_recovered_new',
       'test_transfer_new', 'test_transfer_positive_new', 'test_release_new',
       'test_release_positive_new', 'test_hospit

In [246]:
new2_df.shape

(2684, 44)

In [172]:
latest_df.shape

(2658, 45)

In [173]:
2710-2658

52

In [287]:
new2_df.head()

Unnamed: 0_level_0,date,SCI,staff_positive,staff_negative,staff_pending,staff_death,staff_recovered,incarcerated_person_positive,incarcerated_person_negative,incarcerated_person_pending,...,test_transfer_positive_new,test_release_new,test_release_positive_new,test_hospital_new,test_hospital_positive_new,test_surveilance_new,test_surveilance_positive_new,test_symptomatic_new,test_symptomatic_positive_new,test_miscellaneous_new
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-07,2020-04-07,Rockview,,1.0,2.0,,,,,,...,,,,,,,,,,
2020-04-07,2020-04-07,Albion,,5.0,2.0,,,,,,...,,,,,,,,,,
2020-04-07,2020-04-07,Benner Township,,4.0,1.0,,,,2.0,,...,,,,,,,,,,
2020-04-07,2020-04-07,Cambridge Springs,,,2.0,,,,2.0,,...,,,,,,,,,,
2020-04-07,2020-04-07,Camp Hill,2.0,1.0,,,,,,1.0,...,,,,,,,,,,


In [288]:
new2_df.tail()

Unnamed: 0_level_0,date,SCI,staff_positive,staff_negative,staff_pending,staff_death,staff_recovered,incarcerated_person_positive,incarcerated_person_negative,incarcerated_person_pending,...,test_transfer_positive_new,test_release_new,test_release_positive_new,test_hospital_new,test_hospital_positive_new,test_surveilance_new,test_surveilance_positive_new,test_symptomatic_new,test_symptomatic_positive_new,test_miscellaneous_new
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-12-14,2020-12-14,Retreat,,,,,,13.0,1048.0,,...,,,,,,,,,,
2020-12-14,2020-12-14,Rockview,,,,,,101.0,763.0,118.0,...,,,,,,,,,,
2020-12-14,2020-12-14,Smithfield,,,,,,103.0,511.0,195.0,...,,,,,,,,,,
2020-12-14,2020-12-14,Somerset,,,,,,465.0,645.0,700.0,...,,,,,,,,,,
2020-12-14,2020-12-14,Waymart,,,,,,349.0,1000.0,396.0,...,,,,,,,,,,


In [289]:
# write the combined frame (existing data + scraped rows + deltas) to CSV
# NOTE(review): absolute, user-specific output path; this cell only works on a
# machine that has this directory — confirm the intended destination.
new2_df.to_csv('/Users/mattodonnell/Downloads/PA_DOC_testing_data_SCRAPED.csv')

In [249]:
new2_df

Unnamed: 0_level_0,date,SCI,staff_positive,staff_negative,staff_pending,staff_death,staff_recovered,incarcerated_person_positive,incarcerated_person_negative,incarcerated_person_pending,...,test_transfer_positive_new,test_release_new,test_release_positive_new,test_hospital_new,test_hospital_positive_new,test_surveilance_new,test_surveilance_positive_new,test_symptomatic_new,test_symptomatic_positive_new,test_miscellaneous_new
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NaT,NaT,Rockview,,1.0,2.0,,,,,,...,,,,,,,,,,
NaT,NaT,Albion,,5.0,2.0,,,,,,...,,,,,,,,,,
NaT,NaT,Benner Township,,4.0,1.0,,,,2.0,,...,,,,,,,,,,
NaT,NaT,Cambridge Springs,,,2.0,,,,2.0,,...,,,,,,,,,,
NaT,NaT,Camp Hill,2.0,1.0,,,,,,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-14,2020-12-14,Retreat,,,,,,13.0,1048.0,,...,,,,,,,,,,
2020-12-14,2020-12-14,Rockview,,,,,,101.0,763.0,118.0,...,,,,,,,,,,
2020-12-14,2020-12-14,Smithfield,,,,,,103.0,511.0,195.0,...,,,,,,,,,,
2020-12-14,2020-12-14,Somerset,,,,,,465.0,645.0,700.0,...,,,,,,,,,,
