<a href="https://colab.research.google.com/github/imabari/covid19-data/blob/master/kobe/kobe_covid19.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install pycurl
!pip install -U pandas

In [0]:
!pip install retry

In [0]:
import datetime
import json
import pathlib
import re
from urllib.parse import urljoin

In [0]:
from retry import retry

In [0]:
import pandas as pd
import pycurl
import requests
from bs4 import BeautifulSoup

In [0]:
# Directory that receives the generated data.json
OUT_DIR = "data"
# Directory where the downloaded Excel workbooks are stored
DOWNLOAD_DIR = "download"

In [0]:
@retry(tries=4, delay=5, backoff=2)
def get_file(url, dir="."):
    """Download ``url`` with pycurl and save it under ``dir``.

    The local file name is the last path component of ``url``. The target
    directory is created when it does not exist yet. Any exception raised
    here is retried by the ``@retry`` decorator (4 tries, 5 s initial delay,
    doubling each time).

    Args:
        url: Address of the file to download.
        dir: Directory the file is written to (default: current directory).

    Returns:
        ``pathlib.Path`` of the saved file.

    Raises:
        pycurl.error: When the transfer fails, including HTTP status >= 400.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)

    p.parent.mkdir(parents=True, exist_ok=True)

    c = pycurl.Curl()
    try:
        c.setopt(c.URL, url)
        # Treat HTTP error statuses as failures so that @retry kicks in and an
        # error page is not silently saved in place of the workbook.
        c.setopt(c.FAILONERROR, True)

        with p.open(mode="wb") as f:
            c.setopt(c.WRITEDATA, f)
            c.perform()
    finally:
        # Release the curl handle even when perform() raises.
        c.close()

    return p

In [0]:
def my_parser(s, today=None):
    """Turn a year-less "month / day" text into a ``pd.Timestamp``.

    The text carries no year, so the year of ``today`` is assumed. If that
    would place the date after ``today``, the previous year is used instead,
    which keeps dates from late last year correct after New Year.

    Args:
        s: Text containing exactly two 1-2 digit numbers, month then day.
        today: ``datetime.date`` used as the reference day; defaults to the
            current local date.

    Returns:
        ``pd.Timestamp`` at midnight of the resolved date.

    Raises:
        ValueError: If ``s`` does not contain exactly two numbers, or the
            month/day pair is not a valid date in the resolved year.
    """
    if today is None:
        today = datetime.date.today()

    numbers = re.findall("[0-9]{1,2}", s)

    if len(numbers) != 2:
        raise ValueError(f"expected a month and a day, got {s!r}")

    m, d = map(int, numbers)

    ts = pd.Timestamp(year=today.year, month=m, day=d)

    # A date later than the reference day must belong to the previous year.
    if ts.date() > today:
        ts = pd.Timestamp(year=today.year - 1, month=m, day=d)

    return ts

In [0]:
# Scraping: fetch the Kobe city COVID-19 page and parse its HTML
url = "https://www.city.kobe.lg.jp/a73576/kenko/health/infection/protection/covid_19.html"

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# Without a timeout requests waits forever on a stalled server, which would
# hang the whole notebook run.
r = requests.get(url, headers=headers, timeout=30)

# Stop here on an HTTP error instead of parsing an error page
r.raise_for_status()

soup = BeautifulSoup(r.content, "html5lib")

In [0]:
# Download every linked Excel file and remember the local paths in page order
excel_anchors = soup.find_all("a", class_="icon_excel openDataFile")

files = [
    get_file(urljoin(url, anchor.get("href")), DOWNLOAD_DIR)
    for anchor in excel_anchors
]

In [0]:
# Last-updated timestamp.
# date.today() carries no time of day, so "%H:%M" always rendered as "00:00".
# Use the current datetime instead, pinned to Japan Standard Time so the
# value is the same on hosts whose clock runs in UTC (e.g. Colab).
JST = datetime.timezone(datetime.timedelta(hours=9), "JST")

last_update = datetime.datetime.now(JST).strftime("%Y/%m/%d %H:%M")

In [0]:
# Output dictionary; starts with the last-updated stamp and is filled in below
data = {"lastUpdate": last_update}

In [0]:
# Status of infected persons in the city / number of tests
df_pcr = pd.read_excel(files[0], sheet_name="kobe", index_col=0, skipfooter=3)

# Strip all whitespace from the column labels. regex=True is required:
# recent pandas treats the pattern as a literal string by default, which
# would leave the whitespace in place and break the column lookups below.
df_pcr.columns = df_pcr.columns.str.replace(r"\s", "", regex=True)

# Render the date index as ISO-like strings with a fixed 08:00:00Z time part
df_pcr.index = df_pcr.index.strftime("%Y-%m-%dT08:00:00Z")

In [0]:
# patients_summary: daily positive counts as one record per row
df_pats_sum = df_pcr["陽性者数（日別）"].reset_index().rename(columns={
    "index": "日付",
    "陽性者数（日別）": "小計"
})

data["patients_summary"] = {
    "date": last_update,
    # The orient name is "records"; the former spelling "recodes" is not a
    # valid value and is rejected by current pandas.
    "data": df_pats_sum.to_dict(orient="records")
}

In [0]:
# inspections_summary: daily number of people tested as one record per row
df_insp_sum = df_pcr["検査実施人数（日別）"].reset_index().rename(columns={
    "index": "日付",
    "検査実施人数（日別）": "小計"
})

data["inspections_summary"] = {
    "date": last_update,
    # The orient name is "records"; the former spelling "recodes" is not a
    # valid value and is rejected by current pandas.
    "data": df_insp_sum.to_dict(orient="records")
}

In [0]:
# Number of consultations
# The call used to pass headers=None, which is not a read_excel parameter:
# older pandas silently ignored it and current pandas raises TypeError.
# It is removed and header=0 (the value that was effectively in use) is
# spelled out: the first sheet row is skipped, the next row is consumed as
# the header and replaced by `names`.
# NOTE(review): this keeps the previously working behaviour; confirm against
# the workbook layout that the sheet really has two heading rows.
df_sodan = pd.read_excel(files[1],
                         sheet_name="相談件数",
                         index_col=0,
                         header=0,
                         skiprows=1,
                         names=["日付", "窓口日別", "窓口累計", "保健所日別", "保健所累計"])

# Render the date index as ISO-like strings with a fixed 08:00:00Z time part
df_sodan.index = df_sodan.index.strftime("%Y-%m-%dT08:00:00Z")

In [0]:
# Keep only the rows whose index label is present
has_label = df_sodan.index.notna()
df_sodan = df_sodan.loc[has_label]

In [0]:
# contacts_summary: daily consultation-desk counts as one record per row
df_conts_sum = df_sodan["窓口日別"].reset_index(
).rename(columns={
    "index": "日付",
    "窓口日別": "小計"
})

data["contacts_summary"] = {
    "date": last_update,
    # The orient name is "records"; the former spelling "recodes" is not a
    # valid value and is rejected by current pandas.
    "data": df_conts_sum.to_dict(orient="records")
}

In [0]:
# health_center_summary: daily health-center counts as one record per row
df_hecen_sum = df_sodan["保健所日別"].reset_index().rename(
    columns={
        "index": "日付",
        "保健所日別": "小計"
    })

data["health_center_summary"] = {
    "date": last_update,
    # The orient name is "records"; the former spelling "recodes" is not a
    # valid value and is rejected by current pandas.
    "data": df_hecen_sum.to_dict(orient="records")
}

In [0]:
# patients: read every HTML table on the patient-attributes page,
# using the first row as header and the "番号" column as index
dfs = pd.read_html(
    "https://www.city.kobe.lg.jp/a57337/kenko/health/corona_zokusei.html",
    header=0,
    index_col="番号",
)

# Give the first table's date column the name "発表日"
dfs[0] = dfs[0].rename(columns={"陽性判明日": "発表日"})

In [0]:
# Stack all patient tables (each sorted by its index) into one frame
df_pats = pd.concat([df.sort_index() for df in dfs]).reset_index(drop=True)

# Convert the year-less date text into real timestamps
df_pats["発表日"] = df_pats["発表日"].apply(my_parser)

df_pats["判明日"] = df_pats["発表日"].dt.strftime("%Y-%m-%dT08:00:00Z")

df_pats["date"] = df_pats["発表日"].dt.strftime("%Y-%m-%d")

df_pats["年代"] = df_pats["年代"].astype(str) + "代"

df_pats["退院"] = None

# Drop rows whose remark contains "市外在住", then order by date and by the
# original row position ("index" column created by reset_index)
df_patsm = (
    df_pats[~df_pats["備考"].astype(str).str.contains("市外在住")]
    .copy()
    .reset_index()
    .sort_values(["判明日", "index"])
)

df_patsm.loc[df_patsm["備考"].isnull(), "備考"] = None

data["patients"] = {
    "date": last_update,
    # The orient name is "records"; the former spelling "recodes" is not a
    # valid value and is rejected by current pandas.
    "data": df_patsm.loc[:, ["判明日", "date", "年代", "性別", "備考", "退院"]].to_dict(
        orient="records"
    ),
}

In [0]:
# main_summary: load the "all" sheet of the first workbook
df_all = pd.read_excel(files[0], sheet_name="all", index_col=0)

# Strip all whitespace from the column labels. regex=True is required:
# recent pandas treats the pattern as a literal string by default, which
# would leave the whitespace in place and break the column lookups later on.
df_all.columns = df_all.columns.str.replace(r"\s", "", regex=True)

In [0]:
# Take the last row whose index label is present and display it
labelled_rows = df_all.index.notna()
sr_all = df_all.loc[labelled_rows].iloc[-1]
sr_all

In [0]:
# Build the nested summary tree from the innermost level outwards.

# Breakdown of the hospitalised patients
hospitalised_children = [
    {"attr": "軽症・中等症", "value": int(sr_all["軽症・中等症"])},
    {"attr": "重症", "value": int(sr_all["重症"])},
]

# Breakdown of the surveyed patients
surveyed_children = [
    {
        "attr": "入院中",
        "value": int(sr_all["入院・入居中"]),
        "children": hospitalised_children,
    },
    {"attr": "死亡", "value": int(sr_all["死亡（累計）"])},
    {"attr": "治癒確認", "value": int(sr_all["治癒確認（退院など）（累計）"])},
]

surveyed_node = {
    "attr": "調査済患者総数",
    "value": int(sr_all["調査済患者総数"]),
    "children": surveyed_children,
}

data["main_summary"] = {
    "attr": "患者発生総数",
    "value": int(sr_all["患者発生総数【速報含む】"]),
    "children": [surveyed_node],
}

In [0]:
# Show the assembled dictionary as the cell output
data

In [0]:
# Write the collected dictionary to <OUT_DIR>/data.json as UTF-8 JSON
out_path = pathlib.Path(OUT_DIR) / "data.json"
out_path.parent.mkdir(parents=True, exist_ok=True)

# Non-ASCII text is kept as-is and the output is indented by 4 spaces
payload = json.dumps(data, ensure_ascii=False, indent=4)

with open(out_path, "w", encoding="utf-8") as fw:
    fw.write(payload)