In [2]:
import re
import datetime as dt
import logging
from urllib import request
from urllib.error import HTTPError, URLError

import pandas as pd
from bs4 import BeautifulSoup as bs

import boto3
from botocore.exceptions import ClientError

def upload_file(file_name, bucket, object_name=None, ExtraArgs={'ACL': 'public-read'}):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :param ExtraArgs: Extra arguments forwarded to the S3 client's
        ``upload_file`` call. Defaults to ``{'ACL': 'public-read'}``. The
        default dict is shared between calls, so it is only read here and
        never modified.
    :return: True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        # ExtraArgs must be forwarded explicitly; otherwise the requested ACL
        # is silently dropped and the object keeps the bucket's default.
        s3_client.upload_file(file_name, bucket, object_name, ExtraArgs=ExtraArgs)
    except (ClientError, boto3.exceptions.S3UploadFailedError) as e:
        # The managed transfer reports failed uploads as S3UploadFailedError,
        # so both types are treated as "upload failed".
        logging.error(e)
        return False
    print(f'{object_name} added to {bucket}.')
    return True

# NOTE(review): despite the name, this is the timestamp two days before "now".
# The article URL and every output file name below are derived from it.
YESTERDAY = dt.datetime.now() + dt.timedelta(days = -2)
# strftime pattern used for log messages and output file names.
MONTHDAYYEAR = "%m-%d-%Y"

# Paragraph naming a county: group 1 is the text before " COUNTY".
county = re.compile(r'^(\w*[\s\w]*)\sCOUNTY.*')
# Town line: one leading character (the bullet), optional whitespace, then
# group 1 = town name and group 2 = case count (digits, optionally "d,ddd").
town = re.compile(r'.\s?([\w\s]*):\s(\d+\,\d+|\d+).*')
# Group 1 = the number directly preceding a death/fatality phrase.
deaths = re.compile(
            r'.*\w*:\s?.*(?:with)?\s(\d+\,\d+|\d+)\s(?:death|fatalitie|fatality|who died)s?.*',
            re.IGNORECASE
        )
# Group 1 = the number directly preceding a recovery phrase. The leading
# wildcard is non-greedy on purpose: a greedy `.*` swallows all but the last
# digit, so "12 recovered" would be captured as "2".
recovered = re.compile(
            r'.*?(\d+\,\d+|\d+)\s(?:cleared from quarantine|who recovered'
            r'|have recovered|recovered).*',
            re.IGNORECASE
            )

# Article URL for the target date. The day of month is taken from `.day`
# because the `%-d` strftime directive is a platform-specific extension.
url = (
    f"https://www.nj.com/coronavirus/{YESTERDAY.strftime('%Y')}/{YESTERDAY.strftime('%m')}/"
    "where-is-the-coronavirus-in-nj-latest-map-update-on-county-by-county-cases-"
    f"{YESTERDAY.strftime('%B').lower()}-{YESTERDAY.day}-{YESTERDAY.strftime('%Y')}.html"
)

# Browser-like User-Agent header sent with the request. The two literals are
# joined with a space so "AppleWebKit/537.36" and "(KHTML" do not run together.
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                     '(KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}

def open_url(url, hdr, fix_html = False, regex = r'<div class="rawhtml".*\(\);'):
    """Fetch a page and parse it with BeautifulSoup using the lxml parser.

    :param url: Address of the page to download
    :param hdr: Dict of HTTP headers sent with the request
    :param fix_html: When True, the body is decoded as UTF-8 and every match
        of ``regex`` is removed before parsing
    :param regex: Pattern stripped from the raw HTML when ``fix_html`` is True
    :return: The parsed document
    """
    date_label = YESTERDAY.strftime(MONTHDAYYEAR)
    try:
        req = request.Request(url, headers=hdr)
        # Open the Request object (not the bare URL) so the headers are
        # actually sent; the context manager closes the connection.
        with request.urlopen(req) as response:
            if fix_html:
                html = bs(re.sub(
                            regex,
                            '',
                            response.read().decode('utf-8')),
                          'lxml'
                          )
                print(f'Date: {date_label}\nSuccessfully opened {url} and repaired')
            else:
                html = bs(response, 'lxml')
                print(f'Date: {date_label}\nSuccessfully opened {url}')
    except URLError as e:
        # HTTPError is a subclass of URLError, so this one handler covers both
        # failure types, which were reported with the same message.
        print(f'Date: {date_label}\nError trying to open {url}:\n{e}')
        quit()
    return html
    
# Download the article for the target date, stripping the markup matched by
# open_url's default repair pattern before parsing.
html = open_url(url, hdr, fix_html = True)

# County headings and town lines arrive as sibling paragraphs, so the most
# recent county heading is carried forward onto the town lines that follow.
current_county = ''
town_rows = []

for p in html.find_all('p', ['article__paragraph', 'article__paragraph--left']):
    text = p.text

    county_match = county.match(text)
    if county_match:
        current_county = county_match.group(1).title()
        print(current_county)

    town_match = town.match(text)
    if town_match:
        print(text)
        # The name group also accepts whitespace, so a line such as
        # "Estell Manor : 1" yields a trailing space; strip it so the
        # City/County join against the ZIP table can match.
        town_name = town_match.group(1).strip()
        town_ct = int(town_match.group(2).replace(',', ''))

        # Counts default to 0 when the line mentions no deaths/recoveries.
        death_match = deaths.match(text)
        death_ct = int(death_match.group(1).replace(',', '')) if death_match else 0
        recovered_match = recovered.match(text)
        recovered_ct = int(recovered_match.group(1).replace(',', '')) if recovered_match else 0

        town_rows.append(
                    [current_county,
                     town_name,
                     town_ct,
                     death_ct,
                     recovered_ct
                    ])

towns = pd.DataFrame(town_rows,
                     columns=['County', 'City', 'Cases',
                              'Deaths', 'Recoveries']
                    )
print('\nTowns:\n')
print(towns.head())

# ZIP lookup table; 'Zip Code' is read as str so leading zeros are preserved.
zips = pd.read_csv('s3://athenedyne-covid-19/NJzips.csv',
                   converters = {'Zip Code': str})
print('\nZIPs:\n')
print(zips.head())

# Left join: a town with several ZIP codes becomes several rows; a town with
# no match keeps one row with a missing 'Zip Code'.
output = towns.merge(zips, how='left', left_on=['City','County'], right_on=['City','County'])

print('\nMerge:\n')
print(output.head())

# Number of merged rows per (City, County), i.e. how many ZIP rows share the
# same town-level case count.
shared_zips = output[['City','County','Cases']].groupby(
        ['City','County']).count().rename(columns={'Cases':'Shared ZIPs'})
cases_w_shared_zips = output.merge(shared_zips, how='left',
                                   left_on=['City','County'],
                                   right_on=['City','County'])

# Spread each town's case count evenly across its rows, to one decimal place.
cases_w_shared_zips['Adjusted Cases'] = (
    cases_w_shared_zips['Cases'] / cases_w_shared_zips['Shared ZIPs']
).round(1)
print('\nAdjusted by shared ZIP\n')
print(cases_w_shared_zips.head())

Date: 04-15-2020
Successfully opened https://www.nj.com/coronavirus/2020/04/where-is-the-coronavirus-in-nj-latest-map-update-on-county-by-county-cases-april-15-2020.html and repaired
Atlantic
• Absecon: 13
• Atlantic City: 46 and 3 deaths
• Brigantine: 6 and 1 death
• Buena Borough: 11
• Buena Vista Township: 4
• Egg Harbor City: 5
• Egg Harbor Township: 66 and 6 deaths
• Estell Manor : 1
• Folsom: 3
• Galloway: 44
• Hamilton: 31
• Hammonton: 13
• Linwood: 16
• Longport: 2
• Margate: 3
• Mullica: 3
• Northfield: 5
• Pleasantville: 27 with 1 death
• Somers Point: 10
• Ventnor: 14
• Weymouth: 4
Bergen
• Allendale: 36
• Alpine: 15
• Bergenfield: 482
• Bogota: 123
• Carlstadt: 60
• Cliffside Park: 280
• Closter: 45
• Cresskill: 62
• Demarest: 39
• Dumont: 193
• East Rutherford: 88
•Edgewater: 87
• Elmwood Park: 314
• Emerson: 104
• Englewood: 478
• Englewood Cliffs: 33
• Fair Lawn: 389
• Fairview: 201
• Fort Lee: 251
• Franklin Lakes: 107
• Garfield: 419
• Glen Rock: 69
• Hackensack: 743
•


ZIPs:

  Zip Code        City     County
0    07001      Avenel  Middlesex
1    07002     Bayonne     Hudson
2    07003  Bloomfield      Essex
3    07004   Fairfield      Essex
4    07005     Boonton     Morris

Merge:

     County           City  Cases  Deaths  Recoveries Zip Code
0  Atlantic        Absecon     13       0           0    08201
1  Atlantic        Absecon     13       0           0    08205
2  Atlantic  Atlantic City     46       3           0    08401
3  Atlantic  Atlantic City     46       3           0    08404
4  Atlantic  Atlantic City     46       3           0    08405

Adjusted by shared ZIP

     County           City  Cases  Deaths  Recoveries Zip Code  Shared ZIPs  \
0  Atlantic        Absecon     13       0           0    08201            2   
1  Atlantic        Absecon     13       0           0    08205            2   
2  Atlantic  Atlantic City     46       3           0    08401            3   
3  Atlantic  Atlantic City     46       3           0    084

In [None]:
# All three exports share the same date prefix; compute the names once.
date_stamp = YESTERDAY.strftime(MONTHDAYYEAR)
complete_csv = f'{date_stamp}-complete.csv'
cases_csv = f'{date_stamp}-cases.csv'
zips_csv = f'{date_stamp}-zips.csv'

cases_w_shared_zips.to_csv(complete_csv, index=False)
output[['Zip Code', 'City', 'Cases']].to_csv(cases_csv, index=False)

# Per-ZIP totals. Each town is counted once (its first merged row), so a
# town's numbers are attributed to the first ZIP code it was joined with.
# Only the numeric columns are summed, and 'Zip Code' is restored as a column
# before writing: with the group key left in the index, index=False would
# drop the ZIP codes from the file.
zip_totals = (
    output.drop_duplicates(['City','County'], keep='first')
    .groupby('Zip Code')[['Cases', 'Deaths', 'Recoveries']]
    .sum()
    .reset_index()
)
zip_totals.to_csv(zips_csv, index=False)

# Upload each file under its own key prefix in the bucket.
for local_name, prefix in ((complete_csv, 'Complete'),
                           (cases_csv, 'Cases'),
                           (zips_csv, 'ZIPs')):
    upload_file(local_name,
                'athenedyne-covid-19',
                f'{prefix}/{local_name}')

In [None]:
# Rows of the merged table whose town found no ZIP code in the lookup table.
missing_zips = output.loc[output['Zip Code'].isna(), ['Zip Code', 'City', 'County']]

# Only write and upload the report when at least one town is unmatched.
if not missing_zips.empty:
    missing_name = f'{YESTERDAY.strftime(MONTHDAYYEAR)}-missing-ZIPs.csv'
    missing_zips.to_csv(missing_name, index=False)
    upload_file(missing_name, 'athenedyne-covid-19', missing_name)