<a href="https://colab.research.google.com/github/imabari/covid19-data/blob/master/saitama/saitama_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the third-party packages imported further down:
# pycurl (used by get_file) and retry (provides the @retry decorator).
!pip install pycurl
!pip install retry

Collecting pycurl
[?25l  Downloading https://files.pythonhosted.org/packages/ef/05/4b773f74f830a90a326b06f9b24e65506302ab049e825a3c0b60b1a6e26a/pycurl-7.43.0.5.tar.gz (216kB)
[K     |█▌                              | 10kB 17.7MB/s eta 0:00:01[K     |███                             | 20kB 1.8MB/s eta 0:00:01[K     |████▌                           | 30kB 2.3MB/s eta 0:00:01[K     |██████                          | 40kB 2.6MB/s eta 0:00:01[K     |███████▌                        | 51kB 2.0MB/s eta 0:00:01[K     |█████████                       | 61kB 2.3MB/s eta 0:00:01[K     |██████████▋                     | 71kB 2.5MB/s eta 0:00:01[K     |████████████                    | 81kB 2.8MB/s eta 0:00:01[K     |█████████████▋                  | 92kB 3.0MB/s eta 0:00:01[K     |███████████████                 | 102kB 2.8MB/s eta 0:00:01[K     |████████████████▋               | 112kB 2.8MB/s eta 0:00:01[K     |██████████████████▏             | 122kB 2.8MB/s eta 0:00:01[K

In [0]:
# Dataset pages that get_csv() scrapes for the link to the newest CSV.
# JOKYO_URL is used for the patient list, KENSA_URL for the test counts.
JOKYO_URL = "https://opendata.pref.saitama.lg.jp/data/dataset/covid19-jokyo"
KENSA_URL = "https://opendata.pref.saitama.lg.jp/data/dataset/covid19-kensa"

In [0]:
# Regular expressions (anchored at the start with "^") that get_csv() compiles
# and matches against the title attribute of <a> elements on the dataset page.
JOKYO_TITLE = "^埼玉県内の新型コロナウイルス感染症の発生状況"
KENSA_TITLE = "^埼玉県が実施した新型コロナウイルス疑い例検査数"

In [0]:
# User-Agent string sent by both the pycurl download (get_file) and the
# requests calls that use the `headers` dict.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"

In [0]:
# Directory (relative to the working directory) where get_csv() stores downloads.
DOWNLOAD_DIR = "download"
# NOTE(review): DATA_DIR is not referenced by any cell visible in this notebook.
DATA_DIR = "data"

In [0]:
import pathlib

In [0]:
import pycurl
from retry import retry

In [0]:
@retry(tries=5, delay=5, backoff=3)
def get_file(url, dir="."):
    """Download ``url`` with pycurl and save it inside ``dir``.

    The local file name is the last path component of ``url``; the target
    directory is created when missing. Any exception raised here causes the
    ``retry`` decorator to call the function again, using the
    ``tries``/``delay``/``backoff`` values given in the decorator.

    Args:
        url: Address of the file to download.
        dir: Directory to save into. The name shadows the builtin but is
            kept so existing keyword callers continue to work.

    Returns:
        ``pathlib.Path`` of the file that was written.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)

    p.parent.mkdir(parents=True, exist_ok=True)

    # Opening with "wb" truncates the file, so each retry starts from scratch.
    with p.open(mode="wb") as f:

        c = pycurl.Curl()
        try:
            c.setopt(c.URL, url)
            c.setopt(c.USERAGENT, USER_AGENT)
            c.setopt(c.WRITEDATA, f)
            c.perform()
        finally:
            # Release the handle even when perform() raises; otherwise every
            # failed attempt made by @retry would leak a curl handle.
            c.close()

    return p

In [0]:
import requests
from bs4 import BeautifulSoup

In [0]:
from urllib.parse import urljoin

In [0]:
import re

In [0]:
# Request headers shared by csv_link() and get_csv().
headers = {"User-Agent": USER_AGENT}

In [0]:
def csv_link(url):
    """Fetch ``url`` and return the href of the first ``<a>`` found inside
    the first ``<p class="muted ellipsis">`` element of the page.

    Raises ``requests.HTTPError`` (via ``raise_for_status``) on an HTTP error
    status.
    """
    response = requests.get(url, headers=headers)
    response.raise_for_status()

    page = BeautifulSoup(response.content, "html.parser")
    paragraph = page.find("p", class_="muted ellipsis")
    anchor = paragraph.find("a")

    return anchor.get("href")

In [0]:
def get_csv(url, text):
    """Download the file behind the last link on ``url`` whose title matches ``text``.

    ``text`` is compiled as a regular expression and matched against the
    ``title`` attribute of ``<a>`` elements. The href of the last match is
    resolved against ``url``, passed to ``csv_link`` to obtain the file
    address, and that file is saved into ``DOWNLOAD_DIR`` with ``get_file``.

    Returns the path returned by ``get_file``.
    """
    response = requests.get(url, headers=headers)
    response.raise_for_status()

    page = BeautifulSoup(response.content, "html.parser")

    title_pattern = re.compile(text)
    matching_anchors = page.find_all("a", title=title_pattern)
    last_href = matching_anchors[-1].get("href")

    resource_page_url = urljoin(url, last_href)
    file_url = csv_link(resource_page_url)

    return get_file(file_url, DOWNLOAD_DIR)

In [0]:
import pandas as pd

In [0]:
import datetime

In [0]:
# Current time of the runtime; only its year is used later, when building dt_update.
dt_now = datetime.datetime.now()

In [0]:
# Prefecture web page from which the update date and summary counts are scraped.
url = "http://www.pref.saitama.lg.jp/a0701/shingatacoronavirus.html"

In [0]:
# Fetch and parse the page.
# NOTE(review): unlike csv_link()/get_csv(), this request does not send `headers`.
r = requests.get(url)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")

In [0]:
# main_summary: the first <div class="box_info_ttl"> element; both the update
# date and the head counts below are read from it.
tag = soup.find("div", class_="box_info_ttl")

In [0]:
# Extract the update date.
s_date = tag.find("span", class_="txt_big").get_text(strip=True)
# Raw string: "\d" inside a normal literal is an invalid escape sequence and
# triggers a warning on current Python versions.
# Assumes s_date contains exactly two numbers, month then day -- TODO confirm.
l_date = list(map(int, re.findall(r"(\d{1,2})", s_date)))

# The year comes from dt_now and the time of day is fixed at 21:00.
dt_update = datetime.datetime(dt_now.year, *l_date, 21, 0).strftime("%Y/%m/%d %H:%M")

# Start the output document; later cells add further keys to `data`.
data = {"lastUpdate": dt_update}

In [0]:
# Extract the head counts: every number (digits with optional thousands
# separators) directly followed by "人" in the tag's text, converted to int
# in order of appearance.
main_sum = [int(i.replace(",", "")) for i in re.findall("([0-9,]+)人", tag.get_text())]

In [23]:
# Show the extracted counts; data["main_summary"] refers to them by position.
print(main_sum)

[836, 7, 688, 672, 16, 118, 30, 4258, 2844]


In [0]:
# Build the nested summary tree from positions in main_sum.
# The positions depend on the order in which the numbers appear on the page;
# positions 1 and 8 of main_sum are not used here.
data["main_summary"] = {
    "attr": "検査実施人数",
    "value": main_sum[7],
    "children": [
        {
            "attr": "陽性患者数",
            "value": main_sum[0],
            "children": [
                {
                    "attr": "入院中",
                    "value": main_sum[2],
                    "children": [
                        {"attr": "軽症・中等症", "value": main_sum[3]},
                        {"attr": "重症", "value": main_sum[4]},
                    ],
                },
                {"attr": "退院", "value": main_sum[5]},
                {"attr": "死亡", "value": main_sum[6]},
            ],
        }
    ],
}

## 状況

In [0]:
# Download the CSV linked from the JOKYO_URL dataset page.
jokyo_path = get_csv(JOKYO_URL, JOKYO_TITLE)

In [0]:
# The file is decoded as cp932 (Windows Japanese encoding).
df_kanja = pd.read_csv(jokyo_path, encoding="cp932")

In [0]:
# Parse the "判明日" column; values that are not dates become NaT (errors="coerce").
df_kanja["date"] = pd.to_datetime(df_kanja["判明日"], errors="coerce")

In [0]:
# Count rows per day, fill days without rows with 0, and turn the result into
# a two-column frame.
# The axis and series names are set explicitly because the names produced by
# value_counts() changed in pandas 2.0; without this, reset_index() yields
# "date"/"count" there instead of the "index"/"date" columns the following
# cells rely on.
df_patients_sum = (
    df_kanja["date"]
    .value_counts()
    .sort_index()
    .asfreq("D", fill_value=0)
    .rename_axis("index")
    .rename("date")
    .reset_index()
)

In [0]:
# Format each day as a string with a fixed "T08:00:00.000Z" time component.
df_patients_sum["日付"] = df_patients_sum["index"].dt.strftime("%Y-%m-%dT08:00:00.000Z")

In [0]:
# The per-day count column is called "date" at this point; rename it to "小計".
df_patients_sum = df_patients_sum.rename(columns={"date": "小計"})

In [0]:
# The datetime column is no longer needed once "日付" has been derived from it.
df_patients_sum = df_patients_sum.drop(columns=["index"])

In [0]:
# Daily counts as a list of row dicts, stamped with the page's update time.
data["patients_summary"] = {
    "data": df_patients_sum.to_dict(orient="records"),
    "date": dt_update,
}

In [0]:
# Drop the trailing dot from the "No." column name.
df_kanja = df_kanja.rename(columns={"No.": "No"})

In [0]:
# Derive the two string representations of the date. "リリース日" must be
# computed first because the second line replaces the datetime "date" column
# with strings.
df_kanja["リリース日"] = df_kanja["date"].dt.strftime("%Y-%m-%dT08:00:00.000Z")
df_kanja["date"] = df_kanja["date"].dt.strftime("%Y-%m-%d")

In [0]:
# Where the original "判明日" value is the literal "調査中", carry that text
# over into both derived columns.
df_kanja["リリース日"] = df_kanja["リリース日"].mask(df_kanja["判明日"] == "調査中", "調査中")
df_kanja["date"] = df_kanja["date"].mask(df_kanja["判明日"] == "調査中", "調査中")

In [0]:
# Add a "退院" column filled with empty strings for every row.
df_kanja["退院"] = ""

In [0]:
# Keep only the columns that are exported; .copy() detaches the result from df_kanja.
df_patients = df_kanja.loc[:, ["No", "リリース日", "年代", "性別", "居住地", "退院", "date"]].copy()

In [0]:
# Replace missing values with empty strings before export.
df_patients = df_patients.fillna("")

In [0]:
# Per-patient rows as a list of dicts, stamped with the page's update time.
data["patients"] = {
    "data": df_patients.to_dict(orient="records"),
    "date": dt_update,
}

## 検査

In [0]:
# Download the CSV linked from the KENSA_URL dataset page.
kensa_path = get_csv(KENSA_URL, KENSA_TITLE)

In [0]:
# Read as cp932, using "検査日" as the index and asking pandas to parse it as dates.
df_kensa = pd.read_csv(kensa_path, encoding="cp932", index_col="検査日", parse_dates=True)

In [0]:
# Rename the count column to "小計", the key used in the exported records.
df_kensa = df_kensa.rename(columns={"検査数（延べ人数）": "小計"})

In [0]:
# Format the index dates as strings with a fixed "T08:00:00.000Z" time component.
df_kensa["日付"] = df_kensa.index.strftime("%Y-%m-%dT08:00:00.000Z")

In [0]:
# Keep only the two columns that are exported.
df_insp_sum = df_kensa.loc[:, ["日付", "小計"]]

In [0]:
# Test counts as a list of row dicts, stamped with the page's update time.
data["inspections_summary"] = {
    "data": df_insp_sum.to_dict(orient="records"),
    "date": dt_update,
}

In [0]:
import json

In [0]:
# Write the collected document as UTF-8 JSON; ensure_ascii=False keeps the
# Japanese text readable instead of \u-escaped.
with open("data.json", "w", encoding="utf-8") as fw:
    json.dump(data, fw, ensure_ascii=False, indent=4)

In [0]:
from google.colab import files

In [0]:
# Hand the generated file to google.colab's files.download helper.
files.download("data.json")