<a href="https://colab.research.google.com/github/imabari/covid19-data/blob/master/saitama/saitama_covid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import datetime
import json
import pathlib
import re
from collections import Counter
from urllib.parse import urljoin

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
from IPython.display import Image, display_png

In [None]:
def fetch_soup(url):
    """Download *url* and return its body parsed with BeautifulSoup.

    The response bytes are parsed with the "html.parser" backend.
    ``raise_for_status()`` is called first, so an HTTP error status raises
    instead of being parsed.
    """
    response = requests.get(url)
    response.raise_for_status()

    return BeautifulSoup(response.content, "html.parser")

In [None]:
def fetch_file(url, dir="."):
    """Download *url* into directory *dir* and return the local path.

    The local file name is the last path component of *url*. The target
    directory is created if needed. If a file with that name already exists
    it is returned as-is and nothing is downloaded.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    # Skip the download when a file with the same name is already present.
    if not p.exists():

        r = requests.get(url)
        # Fail before touching the disk: otherwise an error page would be
        # saved under the target name and then reused by the check above.
        r.raise_for_status()

        with p.open(mode="wb") as fw:
            fw.write(r.content)

    return p

In [None]:
def fetch_csv(url, text):
    """Locate a file via the dataset page at *url* and download it.

    Steps: take the last ``<a>`` on the page whose ``title`` matches the
    regular expression *text*, open the page it links to, read the link
    inside ``<p class="muted ellipsis">`` there, and save that file under
    the "download" directory. Returns the local path from ``fetch_file``.
    """
    listing = fetch_soup(url)
    matching_links = listing.find_all("a", title=re.compile(text))
    resource_href = matching_links[-1].get("href")

    resource_page = fetch_soup(urljoin(url, resource_href))
    link_holder = resource_page.find("p", class_="muted ellipsis")
    download_url = link_holder.find("a").get("href")

    return fetch_file(download_url, "download")

In [None]:
def str2date(s):
    """Return the last (up to) three integers found in *s*, as a list.

    A ``None`` placeholder is put in front of the numbers before the last
    three items are taken, so the result depends on how many numbers *s*
    contains:

    * three or more -> the last three numbers, e.g. ``[2020, 3, 5]``
    * two           -> ``[None, a, b]`` (placeholder in the first slot)
    * one           -> ``[None, a]``
    * none          -> ``[None]``
    """
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    lst = [None, *map(int, re.findall(r"\d+", s))]

    return lst[-3:]

In [None]:
def dumps_json(file_name, json_data, dir="."):
    """Write *json_data* as indented JSON to ``dir/file_name``.

    The target directory is created if needed. Non-ASCII characters are
    written verbatim (``ensure_ascii=False``), so the file is always opened
    as UTF-8 rather than with the platform's default encoding.
    """
    p = pathlib.Path(dir, file_name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="w", encoding="utf-8") as fw:
        json.dump(json_data, fw, ensure_ascii=False, indent=4)

## プログラム

In [None]:
# Page with the status of people who tested positive
MAIN_SUMMARY_URL = "http://www.pref.saitama.lg.jp/a0701/shingatacoronavirus.html"

In [None]:
# Download and parse the summary page
soup = fetch_soup(MAIN_SUMMARY_URL)

In [None]:
# Get the update date: read "<month>月<day>日" from the first <h2>
# directly under #tmp_contents
s = soup.select_one("#tmp_contents > h2").get_text()
m = re.search("([0-9]+)月([0-9]+)日", s)

month = int(m.group(1))
day = int(m.group(2))

In [None]:
# Current time as a naive datetime (no tz argument is passed)
dt_now = datetime.datetime.now()

In [None]:
# Update timestamp: the month/day read from the page, in the current year,
# at a fixed 21:00.
dt_update = dt_now.replace(month=month, day=day, hour=21, minute=0, second=0, microsecond=0)

# The page gives no year. Only a calendar date later than today must belong
# to the previous year. Compare dates rather than datetimes so that a
# same-day update read before 21:00 is not pushed back by a year.
if dt_now.date() < dt_update.date():
    dt_update = dt_update.replace(year=dt_now.year - 1)

str_update = dt_update.strftime("%Y/%m/%d %H:%M")

In [None]:
# Top-level container for data.json; later cells add more keys to it
data = {"lastUpdate": str_update}

In [None]:
# The <ul> inside div.outline whose text is parsed for figures below
tag = soup.select_one("#tmp_contents > div > div.outline > ul")

In [None]:
# Get the head counts: flatten the list to one stripped text string
text = tag.get_text(strip=True)

# Scratch dict filled by the regex cells below (label -> int)
temp = {}

In [None]:
# Show the raw text so the regexes below can be checked against it
print(text)

In [None]:
# Store every "<label>：<number>人" figure found in the text under its label;
# a later match for the same label overwrites an earlier one.
for hit in re.finditer(
    r"(陽性確認者数|新規公表分|指定医療機関|一般医療機関|最重症者|重症者|宿泊療養|自宅療養等|新型コロナウイルス感染症を死因とする死亡|死亡|新規公表分|退院・療養終了)：?([0-9,]+)人?",
    text,
):
    label, digits = hit.groups()
    # Drop thousands separators before converting
    temp[label] = int(digits.replace(",", ""))

In [None]:
# Cumulative test counts ("... まで）：延べ N 人"), keyed by who ran the tests
for hit in re.finditer(r"(自治体による検査|民間検査機関等による検査)（\d{1,2}月\d{1,2}日まで）：延べ([0-9,]+)人", text):
    label, digits = hit.groups()
    temp[label] = int(digits.replace(",", ""))

In [None]:
# Hospitalization breakdown: each (name group, number group) pair is stored
# as "<group 1>_<name>". Raw string so "\s" is not an invalid string escape.
m = re.search(r"(入院)：(指定医療機関)([0-9,]+)人\s*(一般医療機関)([0-9,]+)人\s*(計)([0-9,]+)人", text)
if m:
    for name_group in (2, 4, 6):
        # The number always sits in the group right after its name
        temp[f"{m.group(1)}_{m.group(name_group)}"] = int(m.group(name_group + 1).replace(",", ""))

In [None]:
# Discharge / end-of-care breakdown: each (name group, number group) pair is
# stored as "<group 1>_<name>". Raw string so "\s" is not an invalid escape.
m = re.search(r"(退院・療養終了)：(退院)([0-9,]+)人\s*(療養終了)([0-9,]+)人\s*(計)([0-9,]+)人?", text)
if m:
    for name_group in (2, 4, 6):
        # The number always sits in the group right after its name
        temp[f"{m.group(1)}_{m.group(name_group)}"] = int(m.group(name_group + 1).replace(",", ""))

In [None]:
# Download the image embedded in the outline_type1 box (resolved against the page URL)
img_tag = soup.select_one("div#tmp_contents > div.outline_type1 > div > p > img")
img_url = urljoin(MAIN_SUMMARY_URL, img_tag.get("src"))
p_img = fetch_file(img_url)

In [None]:
# Display the parsed figures
temp

In [None]:
# Show the downloaded image next to the parsed figures
display_png(Image(str(p_img)))

In [None]:
# Current patients = confirmed positives - discharged/finished care (total) - deaths
temp["現在の患者数"] = temp["陽性確認者数"] - temp["退院・療養終了_計"] - temp["死亡"]

In [None]:
# Print the headline figures for a visual check.
# Tests performed is shown here as municipal + private-lab tests.
print("検査実施人数:", temp["自治体による検査"] + temp["民間検査機関等による検査"])
print("陽性患者数 (累計):", temp["陽性確認者数"])
print("現在の患者数:", temp["現在の患者数"])
print("入院中:", temp["入院_計"])
# Severe = severe + most severe
print("重症:", temp["重症者"] + temp["最重症者"])
print("宿泊療養:", temp["宿泊療養"])
print("自宅療養:", temp["自宅療養等"])
print("新規公表分:", temp["新規公表分"])
print("退院・療養終了:", temp["退院・療養終了_計"])
print("死亡:", temp["死亡"])

In [None]:
# Currently hospitalized: three ways of obtaining the same figure
# 1) the total parsed from the hospitalization line
print(temp["入院_計"])
# 2) current patients minus hotel care, home care and newly announced cases
print(temp["現在の患者数"] - temp["宿泊療養"] - temp["自宅療養等"] - temp["新規公表分"])
# 3) designated + general medical institutions
print(temp["指定医療機関"] + temp["一般医療機関"])

In [None]:
# Three candidate values for "currently hospitalized"
hospital = [
    temp.get("入院_計"),
    temp["現在の患者数"] - temp["宿泊療養"] - temp["自宅療養等"] - temp["新規公表分"],
    temp["指定医療機関"] + temp["一般医療機関"],
]

# Values that occur more than once among the candidates
h = [value for value, count in Counter(hospital).items() if count > 1]

# Use a value at least two candidates agree on; otherwise fall back to the first candidate
if h:
    temp["入院中"] = h[0]
else:
    temp["入院中"] = hospital[0]

In [None]:
# Nested summary stored in data["main_summary"], assembled from the inside out
severe_count = temp["最重症者"] + temp["重症者"]
mild_count = temp["陽性確認者数"] - temp["退院・療養終了_計"] - temp["死亡"] - temp["最重症者"] - temp["重症者"]

hospitalized_node = {
    "attr": "入院中",
    "value": temp["入院中"],
    "children": [
        {"attr": "軽症・中等症", "value": mild_count},
        {"attr": "重症", "value": severe_count},
    ],
}

positive_node = {
    "attr": "陽性患者数",
    "value": temp["陽性確認者数"],
    "children": [
        hospitalized_node,
        {"attr": "退院", "value": temp["退院・療養終了_計"]},
        {"attr": "死亡", "value": temp["死亡"]},
    ],
}

data["main_summary"] = {
    "attr": "検査実施人数",
    "value": temp["自治体による検査"],
    "children": [positive_node],
}

In [None]:
# Contents of main_summary.json, assembled from the inside out
summary_inpatients = {
    "attr": "入院中",
    "value": temp["入院中"],
    "children": [
        {"attr": "重症", "value": temp["最重症者"] + temp["重症者"]},
    ],
}

summary_breakdown = [
    summary_inpatients,
    {"attr": "宿泊療養", "value": temp["宿泊療養"]},
    {"attr": "自宅療養", "value": temp["自宅療養等"]},
    {"attr": "新規公表分", "value": temp["新規公表分"]},
    {"attr": "死亡", "value": temp["死亡"]},
    {"attr": "退院・療養終了", "value": temp["退院・療養終了_計"]},
]

main_summary = {
    "attr": "検査実施人数",
    "value": temp["自治体による検査"],
    "children": [
        {
            "attr": "陽性患者数",
            "value": temp["陽性確認者数"],
            "children": summary_breakdown,
        }
    ],
    "lastUpdate": str_update,
}

In [None]:
# Write the summary tree to data/main_summary.json
dumps_json("main_summary.json", main_summary, "data")

# 検査

## 前処理

In [None]:
# Number of tests: dataset page and the regex matched against link titles on it
KENSA_URL = "https://opendata.pref.saitama.lg.jp/data/dataset/covid19-kensa"
KENSA_TITLE = "^埼玉県が実施した新型コロナウイルス疑い例検査数"

In [None]:
# Tests: download the CSV and keep its local path
kensa_path = fetch_csv(KENSA_URL, KENSA_TITLE)

In [None]:
# Load the tests CSV, decoding it as cp932
df_kensa = pd.read_csv(kensa_path, encoding="cp932")

In [None]:
# Split the test-date strings into numeric year / month / day columns.
# NFKC normalization runs first, before str2date extracts the numbers.
kensa_date_text = df_kensa["検査日"].astype("str").str.normalize("NFKC")
kensa_date_parts = kensa_date_text.apply(str2date).apply(pd.Series)
df_date = kensa_date_parts.rename(columns={0: "year", 1: "month", 2: "day"})

In [None]:
# Expand the two-digit years 20 and 21, then fill each missing year from the
# nearest earlier row that has one. `.ffill()` replaces the deprecated
# `fillna(method="ffill")`.
# NOTE(review): only 20 and 21 are mapped; other two-digit years stay as-is.
df_date["year"] = df_date["year"].replace({20: 2020, 21: 2021}).ffill()
# Rows that do not form a valid date become NaT (errors="coerce")
df_kensa["検査日"] = pd.to_datetime(df_date, errors="coerce")

In [None]:
# Index by test date and rename the count column to "小計"
df_kensa = df_kensa.set_index("検査日").rename(columns={"検査数（延べ人数）": "小計"})
# Date string with a fixed 08:00:00.000Z time part
df_kensa["日付"] = df_kensa.index.strftime("%Y-%m-%dT08:00:00.000Z")

## inspections_summary

In [None]:
# Keep only the date string and the count for the inspections summary
df_insp_sum = df_kensa.loc[:, ["日付", "小計"]]

In [None]:
# One record per row, plus the update timestamp
data["inspections_summary"] = {
    "data": df_insp_sum.to_dict(orient="records"),
    "date": str_update,
}

# 陽性患者数

## 前処理

In [None]:
# Number of positive patients: dataset page and the regex matched against link titles on it
JOKYO_URL = "https://opendata.pref.saitama.lg.jp/data/dataset/covid19-jokyo"
JOKYO_TITLE = "^埼玉県内の新型コロナウイルス感染症の発生状況"

In [None]:
# Situation: download the CSV and keep its local path
jokyo_path = fetch_csv(JOKYO_URL, JOKYO_TITLE)

In [None]:
# Load the patients CSV, decoding it as cp932
df_kanja = pd.read_csv(jokyo_path, encoding="cp932")

In [None]:
# Split the confirmation-date strings into numeric year / month / day columns.
# NFKC normalization runs first, before str2date extracts the numbers.
kanja_date_text = df_kanja["判明日"].astype("str").str.normalize("NFKC")
kanja_date_parts = kanja_date_text.apply(str2date).apply(pd.Series)
df_temp = kanja_date_parts.rename(columns={0: "year", 1: "month", 2: "day"})

In [None]:
# Expand the two-digit years 20 and 21, then fill each missing year from the
# nearest earlier row that has one. `.ffill()` replaces the deprecated
# `fillna(method="ffill")`.
# NOTE(review): only 20 and 21 are mapped; other two-digit years stay as-is.
df_temp["year"] = df_temp["year"].replace({20: 2020, 21: 2021}).ffill()
# Rows that do not form a valid date become NaT (errors="coerce")
df_kanja["date"] = pd.to_datetime(df_temp, errors="coerce")

## チェック

In [None]:
# Check: show rows dated before 2020-01-01
df_kanja[df_kanja["date"] < datetime.datetime(2020, 1, 1)]

In [None]:
# Check: show rows dated later than the current time
df_kanja[df_kanja["date"] > dt_now]

In [None]:
# Check: show rows with no parsed date whose original value is not one of the
# known non-date markers (under investigation, notification withdrawn,
# announced by Tokyo) and does not mention a duplicate
date_missing = df_kanja["date"].isna()
known_marker = df_kanja["判明日"].isin(["調査中", "発生届取り下げ", "東京都発表"])
duplicate_note = df_kanja["判明日"].str.contains("重複", na=False)
df_kanja[date_missing & ~(known_marker | duplicate_note)]

## patients_summary

In [None]:
# Number of rows per date, ordered by date
ser_patients_sum = df_kanja["date"].value_counts().sort_index()

In [None]:
# Daily range from the first patient date to the last index entry of the tests table
dt_range = pd.date_range(ser_patients_sum.index[0], df_kensa.index[-1])
# Dates with no patients get 0
ser_patients_sum = ser_patients_sum.reindex(index=dt_range, fill_value=0)

In [None]:
# Wrap the daily counts in a DataFrame with a single "小計" column
df_patients_sum = pd.DataFrame({"小計": ser_patients_sum})

In [None]:
# Date string with a fixed 08:00:00.000Z time part
df_patients_sum["日付"] = df_patients_sum.index.strftime("%Y-%m-%dT08:00:00.000Z")

In [None]:
# One record per day, plus the update timestamp
data["patients_summary"] = {
    "data": df_patients_sum.to_dict(orient="records"),
    "date": str_update,
}

## patients

In [None]:
# Rename the "NO." column to "No" (the name selected for the patients output below)
df_kanja.rename(columns={"NO.": "No"}, inplace=True)

In [None]:
# Treat a missing confirmation date as "調査中" (under investigation)
df_kanja["判明日"] = df_kanja["判明日"].fillna("調査中")

In [None]:
# Release date: timestamp string with a fixed 08:00 time part, replaced by
# "調査中" on rows whose confirmation date is "調査中"
release_stamp = df_kanja["date"].dt.strftime("%Y-%m-%dT08:00:00.000Z")
df_kanja["リリース日"] = release_stamp.mask(df_kanja["判明日"] == "調査中", "調査中")

In [None]:
# Plain YYYY-MM-DD text; rows marked "調査中" and rows without a date both become "調査中"
date_text = df_kanja["date"].dt.strftime("%Y-%m-%d")
df_kanja["date"] = date_text.mask(df_kanja["判明日"] == "調査中", "調査中").fillna("調査中")

In [None]:
# Discharge column, left as an empty string on every row
df_kanja["退院"] = ""

In [None]:
# Output columns only; drop rows without a release date, then blank out remaining gaps
patient_columns = ["No", "リリース日", "年代", "性別", "居住地", "退院", "date"]
df_patients = df_kanja.loc[:, patient_columns].dropna(subset=["リリース日"]).fillna("")

In [None]:
# One record per patient row, plus the update timestamp
data["patients"] = {
    "data": df_patients.to_dict(orient="records"),
    "date": str_update,
}

In [None]:
# Write everything collected in `data` to data/data.json
dumps_json("data.json", data, "data")