In [117]:
import re
import datetime as dt
import logging
from urllib import request
from urllib.error import HTTPError, URLError

import pandas as pd
from bs4 import BeautifulSoup as bs

import boto3
from botocore.exceptions import ClientError

def upload_file(file_name, bucket, object_name=None, ExtraArgs={'ACL': 'public-read'}):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :param ExtraArgs: Extra arguments forwarded to the S3 upload call
        (defaults to ``{'ACL': 'public-read'}``). The default dict is shared
        between calls; it is only read here, never mutated.
    :return: True if file was uploaded, else False
    """
    # Imported locally so the function does not rely on an extra top-level import.
    from boto3.exceptions import S3UploadFailedError

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        # ExtraArgs used to be accepted but never passed on, so the requested
        # ACL was silently ignored.
        s3_client.upload_file(file_name, bucket, object_name, ExtraArgs=ExtraArgs)
    except (ClientError, S3UploadFailedError) as e:
        # boto3's transfer layer reports a failed upload as S3UploadFailedError,
        # so catching ClientError alone would let that failure propagate.
        logging.error(e)
        return False
    return True

# Timestamp exactly one day before the moment this cell runs.
YESTERDAY = dt.datetime.now() - dt.timedelta(days=1)
# strftime pattern: zero-padded month-day-year, e.g. 04-09-2020.
MONTHDAYYEAR = "%m-%d-%Y"

# County heading: everything before the word " COUNTY" is captured as group 1.
county = re.compile(r'^(\w*[\s\w]*)\sCOUNTY.*')
# Town line of the form "<bullet> <town>: <cases>". Group 1 is the town name,
# group 2 the case count (plain digits or one comma-separated group).
town = re.compile(r'.\s?([\w\s]*):\s(\d+\,\d+|\d+).*')
# Death count on a town line. \d+ (rather than \d*) guarantees that group 1 is
# never an empty string when the pattern matches.
deaths = re.compile(
    r'.*\w*:\s?.*(?:with)?\s(\d+)\s(?:death|fatalitie|fatality|who died)s?.*',
    re.IGNORECASE,
)
# Recovery count on a town line. The (?<!\d) lookbehind forces the capture to
# start at the first digit of the number; without it the leading greedy ".*"
# consumed all but the last digit (e.g. "15 recovered" captured "5").
recovered = re.compile(
    r'.*(?<!\d)(\d+)\s(?:cleared from quarantine|who recovered'
    r'|have recovered|recovered).*',
    re.IGNORECASE,
)

# Address of the article for YESTERDAY. The slug ends in
# "<month name>-<day>-<year>" with an unpadded day number. YESTERDAY.day is
# used instead of strftime('%-d') because the "-" flag is a platform-specific
# extension that is not supported everywhere.
url = (
    f"https://www.nj.com/coronavirus/{YESTERDAY.strftime('%Y')}/{YESTERDAY.strftime('%m')}/"
    "where-is-the-coronavirus-in-nj-latest-map-update-on-county-by-county-cases-"
    f"{YESTERDAY.strftime('%B').lower()}-{YESTERDAY.day}-{YESTERDAY.strftime('%Y')}.html"
)

# Download and parse the article. A failure is reported and then re-raised:
# without a fresh `html`, the code that reads it next would either hit a
# NameError or silently reuse a page left in the kernel by an earlier run.
try:
    # Close the HTTP response once it has been parsed, and name the parser
    # explicitly so the result does not depend on which optional parsers
    # happen to be installed.
    with request.urlopen(url) as response:
        html = bs(response, 'html.parser')
    print(f'Date: {YESTERDAY.strftime(MONTHDAYYEAR)}\nSuccessfully opened {url}')
except URLError as e:
    # HTTPError is a subclass of URLError, so this handler covers both.
    print(f'Date: {YESTERDAY.strftime(MONTHDAYYEAR)}\nError trying to open {url}:\n{e}')
    raise

# County heading most recently seen; town lines that follow are attributed to it.
current_county = ''
# One [county, town, cases, deaths, recoveries] row per town line.
towns = []

for p in html.find_all('p'):
    text = p.text

    # A county heading switches the county used for the lines that follow.
    county_match = county.match(text)
    if county_match:
        current_county = county_match.group(1).title()
        print(current_county)

    # Only paragraphs shaped like a town line produce a row.
    town_match = town.match(text)
    if not town_match:
        continue
    print(text)

    death_match = deaths.match(text)
    recovered_match = recovered.match(text)
    towns.append([
        current_county,
        town_match.group(1),
        int(town_match.group(2).replace(',', '')),
        # Cast to int so these columns are numeric rather than a mix of the
        # int default 0 and digit strings; "or 0" covers an empty capture.
        int(death_match.group(1) or 0) if death_match else 0,
        int(recovered_match.group(1) or 0) if recovered_match else 0,
    ])

# Turn the scraped rows into a labelled table.
towns = pd.DataFrame(
    towns,
    columns=['County', 'City', 'Cases', 'Deaths', 'Recoveries'],
)
print(towns.head())

# ZIP lookup table; the converter reads 'Zip Code' as text, which keeps
# leading zeros intact.
zips = pd.read_csv(
    's3://athenedyne-covid-19/NJzips.csv',
    converters={'Zip Code': str},
)
print(zips.head())

# Left join on city and county, so towns with no entry in the lookup are kept
# (their 'Zip Code' is left empty).
output = towns.merge(zips, how='left', on=['City', 'County'])

print(output.head())

Date: 04-09-2020
Successfully opened https://www.nj.com/coronavirus/2020/04/where-is-the-coronavirus-in-nj-latest-map-update-on-county-by-county-cases-april-9-2020.html
Atlantic
• Absecon: 8
• Atlantic City: 27 and 1 death
• Brigantine: 2
• Buena: 9
• Buena Vista: 1
• Egg Harbor City: 4
• Egg Harbor Township: 32 and 3 deaths
• Galloway: 23
• Hamilton: 16
• Hammonton: 7
• Linwood: 7
• Longport: 2
• Margate: 1
• Northfield: 3
• Pleasantville: 15
• Somers Point: 5
• Ventnor: 10
• Weymouth : 1
Bergen
• Allendale: 27
• Alpine: 15
• Bergenfield: 357
• Bogota: 85
• Carlstadt: 41
• Cliffside Park: 192
• Closter: 35
• Cresskill: 43
• Demarest: 22
• Dumont: 129
• East Rutherford: 65
• Edgewater: 67
• Elmwood Park: 201
• Emerson: 52
• Englewood: 350
• Englewood Cliffs: 18
• Fair Lawn: 281
• Fairview: 120
• Fort Lee: 188
• Franklin Lakes: 73
• Garfield: 245
• Glen Rock: 55
• Hackensack: 509
• Harrington Park: 12
• Hasbrouck Heights: 90
• Haworth: 15
• Hillsdale: 64
• Leonia: 52
• Little Ferry: 87


  Zip Code        City     County
0    07001      Avenel  Middlesex
1    07002     Bayonne     Hudson
2    07003  Bloomfield      Essex
3    07004   Fairfield      Essex
4    07005     Boonton     Morris
     County           City  Cases Deaths Recoveries Zip Code
0  Atlantic        Absecon      8      0          0    08201
1  Atlantic        Absecon      8      0          0    08205
2  Atlantic  Atlantic City     27      1          0    08401
3  Atlantic  Atlantic City     27      1          0    08404
4  Atlantic  Atlantic City     27      1          0    08405


In [121]:
# Spot check: the scraped row(s) for Old Tappan.
towns.loc[towns['City'] == 'Old Tappan']

Unnamed: 0,County,City,Cases,Deaths,Recoveries
59,Bergen,Old Tappan,42,0,0


In [129]:
# Spot check: the merged row(s) for Old Tappan, including the ZIP lookup result.
output.loc[output['City'] == 'Old Tappan']

Unnamed: 0,County,City,Cases,Deaths,Recoveries,Zip Code
64,Bergen,Old Tappan,42,0,0,


In [None]:
# Date stamp and bucket shared by every file written and uploaded in this cell.
stamp = YESTERDAY.strftime(MONTHDAYYEAR)
bucket = 'athenedyne-covid-19'
count_cols = ['Cases', 'Deaths', 'Recoveries']

output.to_csv(f'{stamp}-complete.csv')
output[['Zip Code', 'City', 'Cases']].to_csv(f'{stamp}-cases.csv')

# Sum only the count columns per ZIP code. A bare groupby(...).sum() would also
# try to aggregate the text columns (City, County). The counts are coerced to
# numbers first, so a column holding digit strings is summed arithmetically
# rather than concatenated.
zip_totals = (
    output.assign(**{col: pd.to_numeric(output[col]) for col in count_cols})
    .groupby('Zip Code')[count_cols]
    .sum()
)
zip_totals.to_csv(f'{stamp}-zips.csv')

# Upload each file in turn and keep the per-file result, so a failed upload
# shows up in the cell output instead of being discarded.
upload_results = {
    name: upload_file(name, bucket)
    for name in (
        f'{stamp}-complete.csv',
        f'{stamp}-cases.csv',
        f'{stamp}-zips.csv',
    )
}
upload_results

In [128]:
# Write the rows that have no ZIP code after the merge to their own file,
# then upload it under the same name.
missing_zip_file = f'{YESTERDAY.strftime(MONTHDAYYEAR)}-missing-ZIPs.csv'
no_zip = output['Zip Code'].isna()
output.loc[no_zip, ['Zip Code', 'City', 'County']].to_csv(missing_zip_file)
upload_file(missing_zip_file, 'athenedyne-covid-19', missing_zip_file)

True