In [58]:
# Based on https://gist.github.com/AO8/63b9a5acb9fb238cbed13a0269d14137
# Collects and parses data posted by OC Health
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import json

In [160]:
# OC Health page that the scraping functions below are pointed at.
url = "https://occovid19.ochealthinfo.com/coronavirus-in-oc"

def get_todays_info(the_url):
    """Download the page at ``the_url`` and return its first three HTML tables.

    Args:
        the_url: Address of the page to fetch.

    Returns:
        A 3-tuple ``(tests_table, city_table, demo_table)`` holding the first,
        second and third ``<table>`` elements of the page, in document order.

    Raises:
        IndexError: If the page contains fewer than three tables.
        urllib.error.URLError: If the page cannot be fetched.
    """
    # Fetch the URL that was passed in rather than a module-level global, so
    # the function works for any caller and on a fresh kernel.
    # The context manager closes the HTTP response once the markup is parsed.
    with urlopen(the_url) as response:
        soup = BeautifulSoup(response, "html.parser")
    tables = soup.findAll("table")
    tests_table = tables[0]
    city_table = tables[1]
    demo_table = tables[2]
    return (tests_table, city_table, demo_table)

def html_table_to_list(table):
    """Flatten a parsed HTML table into a list of rows of cell text.

    Args:
        table: A parsed table element exposing ``findAll``.

    Returns:
        A list with one entry per ``<tr>``; each entry is the list of text
        values of that row's ``<td>`` and ``<th>`` cells, in order.
    """
    return [
        [entry.get_text() for entry in tr.findAll(["td", "th"])]
        for tr in table.findAll("tr")
    ]

def get_todays_lists(the_url):
    """Fetch the page's tables and convert each one to a list of text rows.

    Args:
        the_url: Address handed to ``get_todays_info``.

    Returns:
        A list with one ``html_table_to_list`` result per table returned by
        ``get_todays_info``, in the same order.
    """
    return list(map(html_table_to_list, get_todays_info(the_url)))

def _to_int(text):
    """Parse a table cell's text as an int, ignoring commas and newlines."""
    return int(text.replace('\n', '').replace(',', ''))


def get_todays_json(the_url, todays_lists=None):
    """Build a summary dictionary from the page's three tables.

    Args:
        the_url: Address of the page to scrape. Only used when
            ``todays_lists`` is not supplied.
        todays_lists: Optional pre-fetched tables in the format returned by
            ``get_todays_lists`` (a list of tables, each a list of rows of
            cell text). When given, no network request is made.

    Returns:
        A dict with these keys, in this order:
          * ``"date"``: ``datetime.datetime`` parsed from the text after
            "Report ending date" in the first cell of the last table. Note
            that ``json.dumps`` cannot serialize this value without a custom
            ``default`` handler.
          * ``"tests"``: ``people_tested`` and ``kits_available`` counts taken
            from the last cell of the first two rows of the first table.
          * ``"populations"``: name -> second-column value for rows of the
            second table, excluding the header row and the last five rows.
          * ``"city_cases"``: name -> last-column value for every non-header
            row of the second table except the 'Total Population' row.
          * ``"Stats"``: row label -> column label -> count, read from the
            third table.

    Raises:
        ValueError: If the date or a numeric cell cannot be parsed.
        IndexError: If a table has fewer rows or cells than expected.
    """
    # Get lists from today's tables (unless the caller already has them)
    if todays_lists is None:
        todays_lists = get_todays_lists(the_url)

    # Get date table was updated: the last three space-separated tokens after
    # "Report ending date" on the second-to-last line of the header cell.
    header_text = str(todays_lists[-1][0][0])
    date_line = header_text.split('\n')[-2]
    date_parts = date_line.split("Report ending date")[-1].split(' ')[-3:]
    todays_date_string = date_parts[0] + " " + date_parts[1] + " " + date_parts[2]
    todays_date = datetime.datetime.strptime(todays_date_string, '%B %d, %Y')

    todays_json = {
        "date": todays_date,
        "tests": {},
        "populations": {},
        "city_cases": {},
    }

    city_rows = todays_lists[1]
    for row in city_rows[1:]:
        if row[0] != 'Total Population':
            # Counts may carry thousands separators (e.g. "1,234"), which a
            # bare int() rejects, so parse them like every other numeric cell.
            todays_json["city_cases"][row[0]] = _to_int(row[-1])
    # The last five rows are skipped for populations; presumably they are
    # summary/non-city rows -- TODO confirm against the live table layout.
    for row in city_rows[1:-5]:
        todays_json["populations"][row[0]] = _to_int(row[1])

    tests_rows = todays_lists[0]
    todays_json["tests"]["people_tested"] = _to_int(tests_rows[0][-1])
    todays_json["tests"]["kits_available"] = _to_int(tests_rows[1][-1])

    todays_json["Stats"] = {}
    rows = ["TotalCases", "Deaths", "TravelRelated", "PersonToPerson", "CommunityAcquired", "UnderInvestigation"]
    cols = ["Total", "Male", "Female", "OtherGender", "Under18", "18to49", "50to64", "65andUp", "UnknownAge"]
    # Data rows start at index 3 and the final row is skipped; cell 0 of each
    # row is its label, so the values for `cols` begin at cell 1.
    for i, row in enumerate(todays_lists[2][3:-1]):
        todays_json["Stats"][rows[i]] = {
            col: _to_int(row[j]) for j, col in enumerate(cols, start=1)
        }
    return todays_json


In [161]:
# Scrape the page and display the parsed summary (performs a live network request).
get_todays_json(url)

{'date': datetime.datetime(2020, 3, 26, 0, 0),
 'tests': {'people_tested': 4070, 'kits_available': 1151},
 'populations': {'Aliso Viejo': 51372,
  'Anaheim': 359339,
  'Brea': 45606,
  'Buena Park': 83384,
  'Costa Mesa': 115830,
  'Cypress': 49833,
  'Dana Point': 34249,
  'Fountain Valley': 56652,
  'Fullerton': 142824,
  'Garden Grove': 175155,
  'Huntington Beach': 203761,
  'Irvine': 280202,
  'La Habra': 63542,
  'Laguna Niguel': 66748,
  'Lake Forest': 86346,
  'Mission Viejo': 96434,
  'Newport Beach': 87180,
  'Orange': 141691,
  'Placentia': 52333,
  'Rancho Santa Margarita': 48960,
  'San Clemente': 65405,
  'San Juan Capistrano': 36821,
  'Santa Ana': 337716,
  'Seal Beach': 25073,
  'Stanton': 39307,
  'Tustin': 81369,
  'Westminster': 92610,
  'Yorba Linda': 68706},
 'city_cases': {'Aliso Viejo': 2,
  'Anaheim': 28,
  'Brea': 1,
  'Buena Park': 7,
  'Costa Mesa': 8,
  'Cypress': 6,
  'Dana Point': 7,
  'Fountain Valley': 5,
  'Fullerton': 7,
  'Garden Grove': 4,
  'Huntin

In [68]:
"–" == "–"

True