In [None]:
# Based on https://gist.github.com/AO8/63b9a5acb9fb238cbed13a0269d14137
# Collects and parses data posted by OC Health
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import json

In [None]:
# Page to scrape; passed to append_todays_json in the last cell.
url = "https://occovid19.ochealthinfo.com/coronavirus-in-oc"
# JSON file that append_todays_json reads and rewrites.
out_filename = "oc_data.json"

def get_todays_info(the_url):
    """Download a page and return its first three HTML tables.

    Args:
        the_url: Address of the page to fetch.

    Returns:
        A 3-tuple ``(tests_table, city_table, demo_table)`` holding the first,
        second and third ``<table>`` elements found in the page, in that order.

    Raises:
        IndexError: If the page contains fewer than three tables.
    """
    # Fetch the address that was passed in rather than the module-level
    # ``url``, and close the connection as soon as the document is parsed.
    with urlopen(the_url) as html:
        soup = BeautifulSoup(html, "html.parser")
    tables = soup.findAll("table")
    tests_table = tables[0]
    city_table = tables[1]
    demo_table = tables[2]
    return (tests_table, city_table, demo_table)

def html_table_to_list(table):
    """Flatten an HTML table element into a list of rows of cell text.

    Args:
        table: An object exposing ``findAll``; it is queried for ``"tr"``
            elements, and each of those is queried for ``"td"``/``"th"``
            elements.

    Returns:
        A list with one entry per row. Each entry is the list of
        ``get_text()`` results for that row's cells, in document order.
    """
    return [
        [cell.get_text() for cell in tr.findAll(["td", "th"])]
        for tr in table.findAll("tr")
    ]

def get_todays_lists(the_url):
    """Fetch the page's tables and convert each one to nested lists.

    Args:
        the_url: Address handed to ``get_todays_info``.

    Returns:
        A list with one ``html_table_to_list`` result per table returned by
        ``get_todays_info``, in the same order.
    """
    return list(map(html_table_to_list, get_todays_info(the_url)))

def get_todays_json(the_url):
    """Scrape the page at ``the_url`` and summarise its tables as a dict.

    Args:
        the_url: Address handed to ``get_todays_lists``.

    Returns:
        A tuple ``(date_string, data)``. ``date_string`` is the report date
        extracted from the last table (for example ``"March 26, 2020"``).
        ``data`` is a dict with the keys ``"date"``, ``"tests"``,
        ``"populations"``, ``"city_cases"`` and ``"Stats"``.

    Raises:
        ValueError: If the extracted date does not match ``'%B %d, %Y'`` or a
            numeric cell cannot be converted to ``int``.
        IndexError: If a table has fewer rows or cells than the layout this
            parser expects.
    """
    def to_int(text):
        # Numbers on the page may carry thousands separators ("1,234").
        return int(text.replace(',', ''))

    # Get lists from today's tables
    todays_lists = get_todays_lists(the_url)
    tests_list = todays_lists[0]
    city_list = todays_lists[1]
    demo_list = todays_lists[2]

    # Get date table was updated: take the second-to-last line of the first
    # cell of the last table, then the three words that follow
    # "Report ending date".
    header_text = str(todays_lists[-1][0][0])
    date_words = header_text.split('\n')[-2].split("Report ending date")[-1].split(' ')[-3:]
    todays_date_string = " ".join(date_words)
    # Parsed only to validate the format; raises ValueError on a mismatch.
    datetime.datetime.strptime(todays_date_string, '%B %d, %Y')

    todays_json = {
        "date": todays_date_string,
        "tests": {},
        "populations": {},
        "city_cases": {},
        "Stats": {},
    }

    # Row 0 is skipped (presumably the header row - confirm against the page).
    for row in city_list[1:]:
        if row[0] != 'Total Population':
            # Strip separators here too, so counts of 1,000 or more parse
            # the same way the population figures do.
            todays_json["city_cases"][row[0]] = to_int(row[-1])
    # The last five rows are left out of the population map; presumably they
    # are summary rows rather than cities - confirm against the page.
    for row in city_list[1:-5]:
        todays_json["populations"][row[0]] = to_int(row[1])

    todays_json["tests"]["people_tested"] = to_int(tests_list[0][-1])
    todays_json["tests"]["kits_available"] = to_int(tests_list[1][-1])

    stat_names = ["TotalCases", "Deaths", "TravelRelated", "PersonToPerson", "CommunityAcquired", "UnderInvestigation"]
    column_names = ["Total", "Male", "Female", "OtherGender", "Under18", "18to49", "50to64", "65andUp", "UnknownAge"]
    # The first three rows and the final row of the last table are skipped;
    # the remaining rows are matched to stat_names by position.
    for i, row in enumerate(demo_list[3:-1]):
        # Cell 0 of each row is skipped; values start at index 1.
        todays_json["Stats"][stat_names[i]] = {
            col: to_int(row[j + 1].replace('\n', ''))
            for j, col in enumerate(column_names)
        }
    return todays_date_string, todays_json

def append_todays_json(the_out_filename, the_url):
    """Add today's scraped report to the saved JSON file if it is new.

    The saved file maps report-date strings to the data dict produced by
    ``get_todays_json``. The file is rewritten only when the scraped report's
    date is not already one of its keys.

    Args:
        the_out_filename: Path of the JSON file to read and update. If it
            does not exist yet, it is created holding only today's entry.
        the_url: Address of the page to scrape.
    """
    # Scrape the address that was passed in, not the module-level ``url``.
    date_string, todays_json = get_todays_json(the_url)
    try:
        with open(the_out_filename, 'r') as saved_json_file:
            saved_json = json.load(saved_json_file)
    except FileNotFoundError:
        # First run: there is no history yet, so start from an empty mapping.
        saved_json = {}
    if date_string not in saved_json:
        saved_json[date_string] = todays_json
        with open(the_out_filename, 'w') as out_json_file:
            json.dump(saved_json, out_json_file)

In [None]:
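# First time only: uncomment to write a base file.
# NOTE: get_todays_json returns a (date_string, data) tuple; unpack it and key the entry by date_string.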
# todays_json = get_todays_json(url)
# json_base = {}
# json_base["March 26, 2020"] = todays_json
# with open(out_filename, 'w') as file:
#     json.dump(json_base, file)

In [None]:
# Add today's report to the saved file unless an entry for its date already exists
append_todays_json(out_filename, url)