In [75]:
import re
import datetime as dt
import logging
from urllib import request
from urllib.error import HTTPError, URLError

import pandas as pd
from bs4 import BeautifulSoup as bs

import boto3
from botocore.exceptions import ClientError

def upload_file(file_name, bucket, object_name=None, ExtraArgs={'ACL': 'public-read'}):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :param ExtraArgs: Extra arguments forwarded to the S3 client's upload call.
        The default asks for the ``public-read`` ACL. The default dict is shared
        between calls; it is only read here, never modified.
    :return: True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        # ExtraArgs must be forwarded explicitly; otherwise the ACL named in the
        # signature is silently ignored and the object keeps the bucket default.
        s3_client.upload_file(file_name, bucket, object_name, ExtraArgs=ExtraArgs)
    except (ClientError, boto3.exceptions.S3UploadFailedError) as e:
        # boto3's managed transfer re-raises client errors as S3UploadFailedError,
        # so both types are needed to honour the "return False" contract.
        logging.error(e)
        return False
    print(f'{object_name} added to {bucket}.')
    return True

# Read "today" once so every derived value agrees, even if the cell happens to
# run across midnight.
NOW = dt.date.today()
# Span between the earliest date that gets scraped (2020-03-25) and today.
DAYS = NOW - dt.date(2020,3,25)
# The calendar day before NOW. (Subtracting DAYS.days from today, as this used
# to do, always lands on 2020-03-25 rather than on yesterday.)
YESTERDAY = NOW - dt.timedelta(days=1)
# strftime pattern used for the Date column, log lines and output file names.
MONTHDAYYEAR = "%m-%d-%Y"

# "<NAME> COUNTY ... (<count> cases ...)" -> groups: county name, case count.
county = re.compile(r'^(\w*[\s\w]*)\sCOUNTY.*\((?:state reports?\s?)?(\d+\,\d+|\d+)\s(?:cases?|with).*')
# "<bullet> <Town name>: <count> ..." -> groups: town name, case count.
town = re.compile(r'.[\*\s]?([\w\s]*):\s(\d+\,\d+|\d+).*')
# Number immediately followed by a death/fatality phrase -> group: death count.
deaths = re.compile(
            r'.*\w*:\s?.*(?:with)?\s(\d+\,\d+|\d+)\s(?:death|fatalitie|fatality|who died)s?.*',
            re.IGNORECASE
        )
# Number immediately followed by a recovery phrase -> group: recovered count.
# The (?<![\d,]) guard stops the greedy leading ".*" from swallowing the first
# digits of the number (without it "12 recovered" captures only "2").
recovered = re.compile(
            r'.*(?<![\d,])(\d+\,\d+|\d+)\s(?:cleared from quarantine|who recovered'\
            +r'|have recovered|recovered).*',
            re.IGNORECASE
            )

# Request headers (browser-like User-Agent) passed to urllib when fetching pages.
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36' +\
           '(KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36' }

# Row accumulators filled by scraper(): one list per town / county record.
towns = []
counties = []

def get_url(day):
    """Build the nj.com county-by-county coronavirus article URL for ``day``.

    :param day: a ``datetime.date`` or ``datetime.datetime``
    :return: URL string shaped like
        ``.../coronavirus/<YYYY>/<MM>/where-is-...-cases-<month>-<D>-<YYYY>.html``
        where ``<month>`` is the lower-cased month name and ``<D>`` is the day
        of the month without zero padding.
    """
    # day.day gives the unpadded day number on every platform; the '%-d'
    # strftime directive is a platform extension and is not available everywhere.
    month_name = day.strftime('%B').lower()
    return (
        f"https://www.nj.com/coronavirus/{day.strftime('%Y')}/{day.strftime('%m')}/"
        "where-is-the-coronavirus-in-nj-latest-map-update-on-county-by-county-cases-"
        f"{month_name}-{day.day}-{day.strftime('%Y')}.html"
    )


def open_url(day, hdr, fix_html = False, regex = r'<div class="rawhtml".*\(\);'):
    """Fetch the article for ``day`` and parse it with BeautifulSoup ('lxml').

    :param day: date whose article should be fetched (passed to ``get_url``)
    :param hdr: dict of HTTP request headers sent with the request
    :param fix_html: when True the raw response is handed straight to the
        parser; when False the body is decoded as UTF-8 and every match of
        ``regex`` is removed before parsing
    :param regex: pattern stripped from the page text when ``fix_html`` is False
    :return: the parsed document, or None if the page could not be opened
        (the error is printed, not raised)
    """
    url = get_url(day)
    date_label = day.strftime(MONTHDAYYEAR)
    # Always send the caller's headers: the fix_html path used to build this
    # Request and then open the bare URL, dropping the headers.
    req = request.Request(url, headers=hdr)
    try:
        # The context manager closes the HTTP response once it has been read.
        with request.urlopen(req) as response:
            if fix_html:
                html = bs(response, 'lxml')
            else:
                html = bs(re.sub(
                            regex,
                            '',
                            response.read().decode('utf-8')),
                          'lxml'
                          )
    except (HTTPError, URLError) as e:
        print(f'Date: {date_label}\nError trying to open {url}\n{e}')
        return None

    if fix_html:
        print(f'Date: {date_label} Successfully opened and repaired')
    else:
        print(f'Date: {date_label} Successfully opened')
    return html


def scraper(day, hdr, fix_html):
    """Scrape one day's article into the module-level ``counties``/``towns`` lists.

    The page is fetched with ``open_url``; if that yields nothing the function
    returns immediately. Otherwise every matching ``<p>`` is tested against the
    ``county`` and ``town`` patterns:

    * a county hit appends ``[date, county, cases]`` to ``counties`` and becomes
      the county attached to the town rows that follow it;
    * a town hit appends ``[date, county, town, cases, deaths, recovered]`` to
      ``towns``, with deaths/recovered defaulting to 0 when their patterns
      do not match.

    :param day: date to scrape
    :param hdr: HTTP headers forwarded to ``open_url``
    :param fix_html: forwarded to ``open_url``
    :return: always None; results are delivered through the global lists
    """
    page = open_url(day, hdr, fix_html)
    if not page:
        return None

    date_label = day.strftime(MONTHDAYYEAR)
    # Towns seen before any county heading are recorded with an empty county.
    county_name = ''

    for para in page.find_all('p', ['article__paragraph', 'article__paragraph--left']):
        text = para.text

        county_hit = county.match(text)
        if county_hit:
            county_name = county_hit.group(1).title()
            county_cases = int(county_hit.group(2).replace(',', ''))
            counties.append([date_label, county_name, county_cases])

        town_hit = town.match(text)
        if not town_hit:
            continue

        death_hit = deaths.match(text)
        recovered_hit = recovered.match(text)
        towns.append([
            date_label,
            county_name,
            town_hit.group(1),
            int(town_hit.group(2).replace(',', '')),
            int(death_hit.group(1).replace(',', '')) if death_hit else 0,
            int(recovered_hit.group(1).replace(',', '')) if recovered_hit else 0,
        ])
    
    
# Walk backwards from NOW to the first covered day, scraping each one.
for delta in range(DAYS.days+1):
    current_day = NOW - dt.timedelta(days=delta)
    # These three dates are fetched with fix_html=True; every other day uses
    # the default path.
    fix_html = current_day in (dt.date(2020,3,27), dt.date(2020,3,28), dt.date(2020,4,18))
    scraper(current_day, hdr, fix_html)

# Turn the accumulated row lists into DataFrames (the names are rebound from
# list to DataFrame here).
towns = pd.DataFrame(
    towns,
    columns=['Date', 'County', 'City', 'Cases', 'Deaths', 'Recoveries'],
)
counties = pd.DataFrame(counties, columns=['Date', 'County', 'Cases'])

# Row counts of each table.
print('\nTowns data points: ', towns.shape[0])
print('\nCounties data points: ', counties.shape[0])

Date: 05-08-2020
Error trying to open https://www.nj.com/coronavirus/2020/05/where-is-the-coronavirus-in-nj-latest-map-update-on-county-by-county-cases-may-8-2020.html
HTTP Error 404: Not Found
Date: 05-07-2020 Successfully opened
Date: 05-06-2020 Successfully opened
Date: 05-05-2020 Successfully opened
Date: 05-04-2020 Successfully opened
Date: 05-03-2020 Successfully opened
Date: 05-02-2020 Successfully opened
Date: 05-01-2020
Error trying to open https://www.nj.com/coronavirus/2020/05/where-is-the-coronavirus-in-nj-latest-map-update-on-county-by-county-cases-may-1-2020.html
HTTP Error 404: Not Found
Date: 04-30-2020 Successfully opened
Date: 04-29-2020
Error trying to open https://www.nj.com/coronavirus/2020/04/where-is-the-coronavirus-in-nj-latest-map-update-on-county-by-county-cases-april-29-2020.html
HTTP Error 404: Not Found
Date: 04-28-2020 Successfully opened
Date: 04-27-2020 Successfully opened
Date: 04-26-2020 Successfully opened
Date: 04-25-2020 Successfully opened
Date: 04

In [63]:
# Fetch a single day (2020-04-18) with fix_html enabled for inspection.
html = open_url(dt.datetime(2020, 4, 18), hdr, fix_html=True)

Date: 04-18-2020 Successfully opened and repaired


In [64]:
# Print the captured groups for every paragraph the county pattern matches.
for i in html.find_all('p', ['article__paragraph', 'article__paragraph--left']):
    county_match = county.match(i.text)
    if county_match:
        print(county_match.groups())

('BERGEN', '12,163')
('BURLINGTON', '1,456')
('CAMDEN', '1,918')
('CAPE MAY', '199')
('CUMBERLAND', '272')
('ESSEX', '9,901')
('GLOUCESTER', '683')
('HUDSON', '9,956')
('HUNTERDON', '391')
('MERCER', '2,215')
('MIDDLESEX', '7,624')
('MONMOUTH', '4,528')
('MORRIS', '3,984')
('OCEAN', '4,548')
('PASSAIC', '7,936')
('SALEM', '105')
('SOMERSET', '2,283')
('SUSSEX', '626')
('UNION', '8,959')
('WARREN', '543')


In [77]:
# Display the county-level DataFrame.
counties

Unnamed: 0,Date,County,Cases
0,05-07-2020,Atlantic,1302
1,05-07-2020,Bergen,16609
2,05-07-2020,Burlington,3367
3,05-07-2020,Camden,4479
4,05-07-2020,Cape May,401
...,...,...,...
828,03-25-2020,Salem,1
829,03-25-2020,Somerset,117
830,03-25-2020,Sussex,27
831,03-25-2020,Union,262


In [78]:
# Town rows recorded for 04-18-2020.
towns[towns['Date'] == '04-18-2020']

Unnamed: 0,Date,County,City,Cases,Deaths,Recoveries
9115,04-18-2020,,Absecon,15,0,0
9116,04-18-2020,,Atlantic City,49,3,0
9117,04-18-2020,,Brigantine,7,1,0
9118,04-18-2020,,Buena Borough,14,0,0
9119,04-18-2020,,Buena Vista Township,4,0,0
...,...,...,...,...,...,...
9649,04-18-2020,Warren,Phillipsburg,76,0,5
9650,04-18-2020,Warren,Pohatcong,11,0,1
9651,04-18-2020,Warren,Washington Borough,25,0,2
9652,04-18-2020,Warren,Washington Township,27,0,5


In [79]:
# County rows recorded for 04-18-2020.
counties[counties['Date'] == '04-18-2020']

Unnamed: 0,Date,County,Cases
351,04-18-2020,Bergen,12163
352,04-18-2020,Burlington,1456
353,04-18-2020,Camden,1918
354,04-18-2020,Cape May,199
355,04-18-2020,Cumberland,272
356,04-18-2020,Essex,9901
357,04-18-2020,Gloucester,683
358,04-18-2020,Hudson,9956
359,04-18-2020,Hunterdon,391
360,04-18-2020,Mercer,2215


In [80]:
# Number of county rows captured per date.
counties.groupby('Date')[['County']].count()

Unnamed: 0_level_0,County
Date,Unnamed: 1_level_1
03-25-2020,21
03-26-2020,21
03-27-2020,19
03-28-2020,13
03-29-2020,11
03-30-2020,18
03-31-2020,18
04-01-2020,21
04-02-2020,21
04-03-2020,19


In [81]:
# Number of town rows captured per date.
towns.groupby('Date')[['County', 'City']].count()

Unnamed: 0_level_0,County,City
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
03-25-2020,175,175
03-26-2020,218,218
03-27-2020,235,235
03-28-2020,310,310
03-29-2020,236,236
03-30-2020,374,374
03-31-2020,427,427
04-01-2020,381,381
04-02-2020,459,459
04-03-2020,466,466


In [None]:
# Build the three file names once; each upload reuses the local name as the
# tail of its S3 key.
date_stamp = YESTERDAY.strftime(MONTHDAYYEAR)
complete_csv = f'{date_stamp}-complete.csv'
cases_csv = f'{date_stamp}-cases.csv'
zips_csv = f'{date_stamp}-zips.csv'

cases_w_shared_zips.to_csv(complete_csv, index=False)
output[['Zip Code', 'City', 'Cases']].to_csv(cases_csv, index=False)
# as_index=False keeps 'Zip Code' as a regular column. With the default
# groupby the key becomes the index, and index=False then writes the per-ZIP
# sums without the ZIP codes they belong to.
output.drop_duplicates(['City','County'], keep='first').groupby('Zip Code', as_index=False).\
    sum().to_csv(zips_csv, index=False)

upload_file(complete_csv,
            'athenedyne-covid-19',
            f'Complete/{complete_csv}')

upload_file(cases_csv,
            'athenedyne-covid-19',
            f'Cases/{cases_csv}')

upload_file(zips_csv,
            'athenedyne-covid-19',
            f'ZIPs/{zips_csv}')

In [None]:
# Rows of `output` that have no ZIP code, reduced to the identifying columns.
missing_zips = output[output['Zip Code'].isna()][['Zip Code', 'City', 'County']]
if missing_zips.shape[0] > 0:
    # Only write and upload the report when at least one row is missing a ZIP.
    missing_zips_csv = f'{YESTERDAY.strftime(MONTHDAYYEAR)}-missing-ZIPs.csv'
    missing_zips.to_csv(missing_zips_csv, index=False)
    upload_file(missing_zips_csv,
               'athenedyne-covid-19',
               missing_zips_csv)