In [76]:
import camelot
import pandas as pd
import re
import os
import warnings

# Suppress every Python warning for the rest of the session; this also hides
# any warnings raised by pandas or camelot in the cells that follow.
warnings.filterwarnings('ignore')

In [128]:
def process_dailycount_pdf(fname, data_dir='../data/DOC_daily_counts'):
    """Parse one PA DOC COVID-19 daily-count PDF into a tidy DataFrame.

    The first table camelot finds in the PDF is used. Its first row must
    contain the report date (m/d/202x), its second row the column headings,
    and the rows labelled 2 through 25 the per-facility figures.

    Parameters
    ----------
    fname : str
        File name of the PDF inside ``data_dir``.
    data_dir : str, optional
        Directory holding the PDFs. Defaults to the location the notebook
        has always read from.

    Returns
    -------
    pandas.DataFrame
        The table rows labelled 2..25 (original row labels kept), with the
        headings renamed through ``colmap``, facility names normalised
        through ``sci_map``, and a ``date`` column holding the report date
        as a Timestamp. Count columns are returned as extracted, without
        numeric conversion.

    Raises
    ------
    ValueError
        If camelot finds no table, or the first cell holds no date.
    KeyError
        If no heading maps to ``SCI``.
    """
    date_RE = re.compile(r'((\d{1,2})/(\d{1,2})/(202\d))')

    # Short facility names used in the PDF -> names used in the output.
    sci_map = {
        'Quehanna': 'Quehanna BC',
        'Coal Twnp': 'Coal Township',
        'Benner': 'Benner Township',
        'Laurel High': 'Laurel Highlands',
    }

    # Title-cased PDF headings -> output column names. Both spellings of the
    # asymptomatic heading are listed so either maps to the same column.
    colmap = {
        'Inmate Active Positive Cases': 'incarcerated_person_active_cases',
        'Asymptomatic Inmate Positive': 'incarcerated_person_active_cases_asymptomatic',
        'Asymptomatic Inmate Positive Cases': 'incarcerated_person_active_cases_asymptomatic',
        'Inmate Deaths To Date': 'incarcerated_person_deaths',
        'Employee Active Positive Cases': 'staff_active_cases',
        'Employee Deaths To Date': 'staff_deaths',
        'Sci': 'SCI',
    }

    # 1. process PDF file with camelot
    fpath = os.path.join(data_dir, fname)
    tlist = camelot.read_pdf(fpath)
    if len(tlist) == 0:
        raise ValueError(f'no tables found in {fpath}')
    raw = tlist[0].df

    # 2. extract date from first row heading
    heading = str(raw.loc[0, 0])
    date_match = date_RE.search(heading)
    if date_match is None:
        raise ValueError(f'no report date found in heading {heading!r} of {fpath}')
    cdate = date_match.group()

    # 3. get column names from 2nd row (title-cased so they match colmap keys)
    colnames = [c.replace('\n', '').title() for c in raw.loc[1].to_list()]

    # 4. create data frame
    # Label slice 2..25 inclusive; anything after label 25 is dropped.
    # NOTE(review): presumably this excludes trailing non-facility rows - confirm.
    # .copy() keeps every later assignment off camelot's own frame.
    tdf = raw.loc[2:25].copy()
    tdf.columns = colnames
    tdf = tdf.rename(columns=colmap)
    tdf['date'] = pd.to_datetime(cdate)

    tdf['SCI'] = (
        tdf['SCI']
        .str.replace('\n', '', regex=False)
        .str.title()
        .replace(sci_map)
    )

    return tdf

In [103]:
# Run the parser on a single report.
tdf=process_dailycount_pdf('PA-DOC-COVID-19-Daily-Count-Mon_Feb__1_12:59:53_EST_2021.pdf')

In [104]:
# List the distinct facility names in the parsed frame.
tdf['SCI'].unique()

array(['Albion', 'Benner Township', 'Cambridge Springs', 'Camp Hill',
       'Chester', 'Coal Township', 'Dallas', 'Fayette', 'Forest',
       'Frackville', 'Greene', 'Houtzdale', 'Huntingdon',
       'Laurel Highlands', 'Mahanoy', 'Mercer', 'Muncy', 'Phoenix',
       'Pine Grove', 'Quehanna BC', 'Rockview', 'Smithfield', 'Somerset',
       'Waymart'], dtype=object)

In [105]:
# Display the parsed single-report frame.
# NOTE(review): the saved output below shows `date` as the index, whereas
# process_dailycount_pdf returns `date` as a regular column - the output
# appears to come from an earlier run; re-execute to refresh it.
tdf

Unnamed: 0_level_0,SCI,incarcerated_person_active_cases,incarcerated_person_active_cases_asymptomatic,incarcerated_person_deaths,staff_active_cases,staff_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-29,Albion,103,73,2,35,0
2021-01-29,Benner Township,25,10,5,7,0
2021-01-29,Cambridge Springs,4,1,1,5,0
2021-01-29,Camp Hill,9,7,3,27,1
2021-01-29,Chester,3,3,10,9,0
2021-01-29,Coal Township,14,8,0,40,0
2021-01-29,Dallas,3,2,9,21,0
2021-01-29,Fayette,6,5,2,9,1
2021-01-29,Forest,97,83,4,18,0
2021-01-29,Frackville,5,4,0,6,0


In [92]:
# NOTE(review): in the visible cells `colnames` is only assigned inside
# process_dailycount_pdf, so this cell presumably depends on leftover kernel
# state and would raise NameError after Restart & Run All - confirm.
colnames

['SCI',
 'Inmate Active Positive Cases',
 'Asymptomatic Inmate Positive',
 'Inmate Deaths to date',
 'Employee Active Positive Cases',
 'Employee Deaths to date']

In [137]:
# Parse every daily-count PDF in the input folder and stack the results.
pdf_dir = '../data/DOC_daily_counts/'
fnames = sorted(
    f for f in os.listdir(pdf_dir)
    if 'COVID' in f and f.lower().endswith('.pdf')
)
if not fnames:
    # pd.concat([]) would fail with a much less helpful message.
    raise ValueError(f'no COVID daily-count PDFs found in {pdf_dir}')

data = []

for fname in fnames:
    print(fname)
    data.append(process_dailycount_pdf(fname))

# The file names embed the weekday and month name ("Fri_Jan_29", "Mon_Feb__1"),
# so the alphabetical file order above is not chronological. Order the combined
# rows by the report date parsed from each PDF; mergesort is stable, so rows
# from one report keep their original facility order.
cdf = pd.concat(data).sort_values('date', kind='mergesort')

PA-DOC-COVID-19-Daily-Count-Fri_Jan_29_15:07:03_EST_2021.pdf
PA-DOC-COVID-19-Daily-Count-Mon_Feb__1_12:59:53_EST_2021.pdf


In [138]:
# Display the combined frame built from all reports.
cdf

Unnamed: 0,SCI,incarcerated_person_active_cases,incarcerated_person_active_cases_asymptomatic,incarcerated_person_deaths,staff_active_cases,staff_deaths,date
2,Albion,88,58,2,40,0,2021-01-28
3,Benner Township,28,21,5,5,0,2021-01-28
4,Cambridge Springs,5,2,1,5,0,2021-01-28
5,Camp Hill,10,9,3,29,1,2021-01-28
6,Chester,2,2,10,10,0,2021-01-28
7,Coal Township,14,8,0,36,0,2021-01-28
8,Dallas,1,0,9,21,0,2021-01-28
9,Fayette,5,4,2,9,1,2021-01-28
10,Forest,750,721,4,17,0,2021-01-28
11,Frackville,6,4,0,6,0,2021-01-28


In [139]:
# Make the report date the index. Guarded so the cell can be re-run: once
# `date` has moved into the index it is no longer a column, and an unguarded
# set_index('date') would raise KeyError.
if 'date' in cdf.columns:
    cdf = cdf.set_index('date', drop=True)

In [140]:
# Display the combined frame again after the index change.
cdf

Unnamed: 0_level_0,SCI,incarcerated_person_active_cases,incarcerated_person_active_cases_asymptomatic,incarcerated_person_deaths,staff_active_cases,staff_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-28,Albion,88,58,2,40,0
2021-01-28,Benner Township,28,21,5,5,0
2021-01-28,Cambridge Springs,5,2,1,5,0
2021-01-28,Camp Hill,10,9,3,29,1
2021-01-28,Chester,2,2,10,10,0
2021-01-28,Coal Township,14,8,0,36,0
2021-01-28,Dallas,1,0,9,21,0
2021-01-28,Fayette,5,4,2,9,1
2021-01-28,Forest,750,721,4,17,0
2021-01-28,Frackville,6,4,0,6,0


In [141]:
# Write the combined table to CSV. The index is written too (to_csv default),
# so the `date` index set in the previous cell becomes the first column.
cdf.to_csv('../data/latest_data/PA_DOC_Daily_Counts.csv')