## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [2]:
import os
import pytz
import glob
import pathlib

# Anchor every path to the directory the notebook is launched from, so the
# data folder is found regardless of where the repository is checked out.
notebook_root = os.path.abspath("")
this_dir = pathlib.Path(notebook_root)
data_dir = this_dir.joinpath("data")

In [3]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import json
from datetime import datetime



## Download

Retrieve the page

In [4]:
# Placer County COVID dashboard page requested in the Download step; the
# Parse section searches its HTML for the embedded case data.
url = "https://itwebservices.placer.ca.gov/coviddashboard/#cases-by-zip-coderegion"

In [5]:
# NOTE(review): verify=False disables TLS certificate verification for this
# request. It is kept as-is because it may be a deliberate workaround for the
# host's certificate — confirm it is still required.
#
# The timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() reports HTTP errors here instead of letting them show
# up later as a confusing parse failure.
page = requests.get(url, verify=False, timeout=30)
page.raise_for_status()



## Parse

In [6]:
# Parse the raw response bytes with Python's built-in HTML parser.
html_bytes = page.content
soup = BeautifulSoup(html_bytes, "html.parser")

Find the script tag with the Infogram data by searching the page text for a known place name (Foresthill)

In [7]:
# Collect every text node in the page that mentions "Foresthill".
# NOTE(review): newer BeautifulSoup releases prefer `string=` over `text=`;
# confirm the installed version before switching the keyword.
matches = soup(text=re.compile(r"Foresthill"))

# Fail here with a clear message instead of leaving `script_content` undefined,
# which would only surface later as a NameError.
if not matches:
    raise ValueError(
        "No text mentioning 'Foresthill' was found in the Placer County page; "
        "the page layout may have changed or the request may have been blocked"
    )

# Keep the last match, which is what the original loop ended up with.
script_content = matches[-1]

In [9]:
# Debugging aid: show only the start of the parsed page. Displaying the whole
# document floods the notebook output and bloats the saved file.
print(str(soup)[:1000])

<!DOCTYPE html>

<html>
<script src="/bni_1896b1697d8ca9f980069c2600d67e25.js" type="text/javascript"></script> <script language="JavaScript" type="text/javascript">var _0x5aae=["cookie","x-bni-fpc=","; expires=Thu, 01 Jan 2037 00:00:00 UTC; path=/;","x-bni-rncf=1669745535919; expires=Thu, 01 Jan 2037 00:00:00 UTC; path=/;","get"];function fiprn(){( new fiprn_v2)[_0x5aae[4]](function(_0x6130x2,_0x6130x3){document[_0x5aae[0]]= _0x5aae[1]+ _0x6130x2+ _0x5aae[2],document[_0x5aae[0]]= _0x5aae[3]})}</script>
<head>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.6.0/jquery.min.js"></script>
<link crossorigin="anonymous" href="https://cdn.jsdelivr.net/npm/bootstrap@4.6.2/dist/css/bootstrap.min.css" integrity="sha384-xOolHFLEh07PJGoPkLv1IbcEPTNtaed2xpHsD9ESMhqIYd0nLMwNLD69Npy4HI+N" rel="stylesheet"/>
<script crossorigin="anonymous" integrity="sha384-+sLIOodYLS7CIrQpBjl+C7nPvqq+FbNUBDunl/OZv93DB7Ln/533i8e/mZXLi/P+" src="https://cdn.jsdelivr.net/npm/bootstrap@4.6.2/dist/js/bootstrap

In [8]:
# Decode the matched text as JSON. This only works if the matched node holds
# nothing but a JSON document; json.loads raises otherwise.
data = json.loads(script_content)

NameError: name 'script_content' is not defined

In [None]:
# Walk the decoded payload down to the list of per-ZIP strings: second entry of
# "calls", third-from-last positional argument.
# NOTE(review): these positions are tied to the current page payload — confirm
# if the dashboard is updated.
widget_calls = data["x"]["calls"]
second_call_args = widget_calls[1]["args"]
zip_cities_list = second_call_args[-3]

In [None]:
# Accumulates one dict per ZIP-code entry; filled by the parsing loop below.
dict_list = []

In [None]:
# Matches any "<...>" tag so markup can be stripped from a string with
# TAG_RE.sub("", text).
TAG_RE = re.compile(r"<[^>]+>")

In [None]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every row (which would also trip the row-count checks).
dict_list.clear()

for popup_html in zip_cities_list:
    # NOTE(review): each entry appears to look like
    # "<tag>ZIP - City</tag><br/>Number of cases: N" — confirm against the
    # live payload.
    split_zips = popup_html.split("<br/>")

    # First segment: "ZIP - City" once the markup is stripped.
    area = TAG_RE.sub("", str(split_zips[0]))

    # Split on the first " - " only, so a city name that itself contains
    # " - " does not raise "too many values to unpack".
    zip_code, city = area.split(" - ", 1)

    # Second segment: drop the literal label and keep whatever follows it.
    # A plain string replacement is used because the label is not a pattern.
    clean_cases = str(split_zips[1]).lower().replace("number of cases: ", "")

    dict_list.append(
        dict(area=area, city=city, zip_code=zip_code, confirmed_cases=clean_cases)
    )

In [None]:
# One row per parsed entry, with the columns area, city, zip_code and
# confirmed_cases taken from the dict keys.
df = pd.DataFrame(dict_list)

Get timestamp

In [None]:
# Locate the element that carries the dashboard's "last updated" value.
time_div = soup.find("div", id="dashboard-data-last-updated")

# soup.find returns None when nothing matches; report that here rather than as
# an AttributeError in the next cell.
if time_div is None:
    raise ValueError(
        "Could not find the 'dashboard-data-last-updated' div in the Placer County page"
    )

In [None]:
# The timestamp text lives in a span inside the "last updated" div.
value_span = time_div.find("span", class_="value-output")

# find returns None when the span is missing; fail with a clear message instead
# of an AttributeError on .get_text().
if value_span is None:
    raise ValueError(
        "Could not find the 'value-output' span inside the last-updated div"
    )

timestamp = value_span.get_text()

In [None]:
# Parse the dashboard's timestamp text and keep only the calendar date.
parsed_timestamp = pd.to_datetime(timestamp)
latest_date = parsed_timestamp.date()

In [None]:
# Stamp every row with the date parsed from the dashboard's last-updated field.
df["county_date"] = latest_date

In [None]:
# Put a constant "county" column first. DataFrame.insert raises ValueError if
# the column already exists, so the guard keeps this cell safe to re-run.
if "county" not in df.columns:
    df.insert(0, "county", "Placer")

Clean up

In [None]:
# Reformat "ZIP - City" as "ZIP: City". regex=False makes this an explicit
# literal replacement, independent of the pandas version's default for `regex`.
df["area"] = df["area"].str.replace(" - ", ": ", regex=False)

In [None]:
# Keep only the export columns, in this order, and publish the ZIP column
# under the name "zip".
export_columns = ["county", "area", "confirmed_cases", "county_date", "zip_code"]
df = df.loc[:, export_columns].rename(columns={"zip_code": "zip"})

## Vet

In [None]:
# Fail loudly if fewer than the expected 32 rows were parsed.
# A plain `if` replaces the assert-inside-try so the check still runs under
# `python -O` and the traceback is not chained through a second AssertionError.
if len(df) < 32:
    raise AssertionError("Placer County's scraper is missing rows")

In [None]:
# Fail loudly if more than the expected 32 rows were parsed.
# A plain `if` replaces the assert-inside-try so the check still runs under
# `python -O` and the traceback is not chained through a second AssertionError.
if len(df) > 32:
    raise AssertionError("Placer County's scraper has more rows than before")

## Export

Set date

In [None]:
# Pacific time zone, used so the export date does not depend on the clock of
# the machine running the notebook.
tz = pytz.timezone("America/Los_Angeles")

In [None]:
# Current calendar date in the configured time zone; used as the file name of
# the daily export.
now_local = datetime.now(tz)
today = now_local.date()

In [None]:
# Name of this scraper's subfolder inside the data directory.
slug = "placer"

In [None]:
# Write today's snapshot to <data_dir>/<slug>/<date>.csv.
# Create the folder first so a fresh checkout does not fail on a missing
# directory; exist_ok keeps this a no-op when it is already there.
output_dir = data_dir / slug
output_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(output_dir / f"{today}.csv", index=False)

## Combine

In [None]:
# Every CSV in this scraper's folder except the combined timeseries file,
# which is the output of this section rather than one of its inputs.
csv_pattern = str(data_dir / slug / "*.csv")
csv_list = []
for found_path in glob.glob(csv_pattern):
    if str(found_path).endswith("timeseries.csv"):
        continue
    csv_list.append(found_path)

In [None]:
# Read every daily file into a frame that carries a "date" column.
df_list = []
for csv_path in csv_list:
    # Work from the file name, not the whole path: splitting on "/" is not
    # portable to Windows, and a parent directory that happens to contain
    # "manual" must not change how a file is read.
    csv_file = pathlib.Path(csv_path)

    if "manual" in csv_file.name:
        # Manually entered files already include their own "date" column.
        frame = pd.read_csv(csv_path, parse_dates=["date"])
    else:
        # Scraped files are named <date>.csv; the file name supplies the date.
        frame = pd.read_csv(csv_path, parse_dates=["county_date"])
        frame["date"] = csv_file.stem
        # NOTE(review): this leaves "date" as a string here but a parsed
        # datetime for manual files — confirm the mixed column sorts and
        # exports as intended when both kinds of file are present.
    df_list.append(frame)

In [None]:
# Stack all daily frames, then order the result by date and area.
combined = pd.concat(df_list)
df = combined.sort_values(by=["date", "area"])

In [None]:
# Write the combined history next to the daily files, without the index.
timeseries_path = data_dir / slug / "timeseries.csv"
df.to_csv(timeseries_path, index=False)