<a href="https://colab.research.google.com/github/imabari/covid19-data/blob/master/tochigi/tochigi_xlsx_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# インストール

In [0]:
!pip install -U pandas

In [0]:
!pip install simplejson

In [0]:
import datetime
import re
from urllib.parse import urljoin

In [0]:
import pandas as pd
import requests
import simplejson as json
from bs4 import BeautifulSoup

In [0]:
from google.colab import files

In [0]:
# Fixed UTC+09:00 offset labelled "JST".
JST = datetime.timezone(datetime.timedelta(hours=9), name="JST")

# Moment this cell was run (timezone-aware), plus its "YYYY/MM/DD HH:MM"
# text form that is reused as the update stamp in the JSON output.
dt_now = datetime.datetime.now(tz=JST)
dt_update = f"{dt_now:%Y/%m/%d %H:%M}"

In [0]:
# Root of the JSON document; later cells attach further keys to this dict.
data = dict(lastUpdate=dt_update)

# データラングリング

In [0]:
# Page that is scraped for the links to the Excel workbooks.
url = (
    "http://www.pref.tochigi.lg.jp"
    "/e04/welfare/hoken-eisei/kansen/hp/coronakensahasseijyoukyou.html"
)

# Browser-style User-Agent sent with the page request.
headers = dict(
    [
        (
            "User-Agent",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
        ),
    ]
)

In [0]:
# Fetch the landing page. requests applies no timeout by default, so an
# unresponsive server would block this cell forever; bound the wait instead.
r = requests.get(url, headers=headers, timeout=30)

# Abort on a 4xx/5xx answer rather than parsing an error page.
r.raise_for_status()

# Parse the raw response bytes with the html5lib parser.
soup = BeautifulSoup(r.content, "html5lib")

# 新型コロナウイルス感染症検査件数

## inspections_summary

In [0]:
# Find the <a> element whose text starts with the test-count title and
# mentions "エクセル" (Excel). `string=` is the current name of the filter
# that Beautiful Soup used to call `text=`.
tag_kensa = soup.find("a", string=re.compile("^新型コロナウイルス感染症検査件数.+エクセル"))

# soup.find returns None when nothing matches; stop here with a clear
# message instead of failing later with an AttributeError on None.
if tag_kensa is None:
    raise LookupError("Excel link for the test-count workbook was not found on the page")

In [0]:
# Resolve the anchor's href against the page URL to get the workbook URL.
href_kensa = tag_kensa.get("href")
link_kensa = urljoin(url, href_kensa)

In [0]:
# Load the test-count workbook directly from its URL; sheet rows 2 and 3
# (0-based) are combined into a two-level column header.
df_kensa = pd.read_excel(
    io=link_kensa,
    header=[2, 3],
)

In [0]:
# Replace the two-level column header with a flat index of tuples.
flat_columns = df_kensa.columns.to_flat_index()
df_kensa.columns = flat_columns

In [0]:
# Give the test-date column a plain string label instead of its
# ("検査日", "Unnamed: ...") tuple label.
df_kensa = df_kensa.rename(
    columns={("検査日", "Unnamed: 0_level_1"): "検査日"},
)

In [0]:
# Use the test date as the row index.
df_kensa = df_kensa.set_index("検査日")

In [0]:
# ISO-formatted text copy of the index for the JSON output
# (assumes the index parsed as datetimes -- .strftime needs that).
date_labels = df_kensa.index.strftime("%Y-%m-%d")
df_kensa["日付"] = date_labels

In [0]:
# Keep the date text plus the two per-jurisdiction test-count columns.
insp_columns = [
    "日付",
    ("検査件数", "栃木県"),
    ("検査件数", "宇都宮市"),
]
df_insp_sum = df_kensa.loc[:, insp_columns]

In [0]:
# One list per row: [date text, count, count].
insp_rows = df_insp_sum.values.tolist()

# Attach the rows and the update stamp to the output document.
data["inspections_summary"] = dict(data=insp_rows, date=dt_update)

# 栃木県における新型コロナウイルス感染症の発生状況一覧

In [0]:
# Find the <a> element whose text starts with the case-list title and
# mentions "エクセル" (Excel). `string=` is the current name of the filter
# that Beautiful Soup used to call `text=`.
tag_kanja = soup.find("a", string=re.compile("^栃木県における新型コロナウイルス感染症の発生状況一覧.+エクセル"))

# soup.find returns None when nothing matches; stop here with a clear
# message instead of failing later with an AttributeError on None.
if tag_kanja is None:
    raise LookupError("Excel link for the case-list workbook was not found on the page")

In [0]:
# Resolve the anchor's href against the page URL to get the workbook URL.
href_kanja = tag_kanja.get("href")
link_kanja = urljoin(url, href_kanja)

In [0]:
# Load the case-list workbook from its URL: the header sits on sheet row 1
# (0-based), the "番号" column becomes the index, and the last two rows of
# the sheet are skipped.
df_kanja = pd.read_excel(
    io=link_kanja,
    header=1,
    index_col="番号",
    skipfooter=2,
)

In [0]:
# Show the loaded case table as this cell's output.
df_kanja

In [0]:
# Convert day counts measured from 1899-12-30 (presumably Excel serial
# dates -- confirm against the workbook) into timestamps. pd.to_datetime
# handles the whole column at once, so no per-element apply is needed;
# missing values become NaT.
df_kanja["陽性判明日"] = pd.to_datetime(
    df_kanja["陽性判明日"],
    unit="D",
    origin=pd.Timestamp("1899/12/30"),
)

In [0]:
# Convert day counts measured from 1899-12-30 (presumably Excel serial
# dates -- confirm against the workbook) into timestamps. pd.to_datetime
# handles the whole column at once, so no per-element apply is needed;
# rows without a discharge date become NaT.
df_kanja["退院日"] = pd.to_datetime(
    df_kanja["退院日"],
    unit="D",
    origin=pd.Timestamp("1899/12/30"),
)

In [0]:
# ISO-formatted text form of the discharge date for the JSON output.
discharge_text = df_kanja["退院日"].dt.strftime("%Y-%m-%d")
df_kanja["退院"] = discharge_text

In [0]:
# Start every row with the status "入院中"; a later cell overrides it
# for rows that have a discharge date.
df_kanja = df_kanja.assign(**{"状態": "入院中"})

In [0]:
# Rows that carry a discharge date get the status "退院"; all other rows
# keep whatever status they already have.
has_discharge_date = df_kanja["退院日"].notnull()
df_kanja["状態"] = df_kanja["状態"].mask(has_discharge_date, "退院")

## main_summary

In [0]:
# Number of rows per status value.
sr_situ = df_kanja.loc[:, "状態"].value_counts()

In [0]:
# Force the three expected status labels to be present, in this order,
# counting 0 for any label that does not occur in the data.
status_labels = ["入院中", "退院", "死亡"]
sr_situ = sr_situ.reindex(index=status_labels, fill_value=0)

# Show the resulting counts as the cell output.
sr_situ

In [0]:
# Cumulative test total taken from the last row of the test-count table.
total_tests = int(df_kensa.iloc[-1][("累積検査件数", "合計")])

# Leaf nodes: one entry per status, in display order.
status_nodes = [
    {"attr": label, "value": int(sr_situ[label])}
    for label in ("入院中", "退院", "死亡")
]

# Middle node: every row of the case table counts as one confirmed case.
positive_node = {
    "attr": "陽性患者数",
    "value": len(df_kanja),
    "children": status_nodes,
}

# Root node of the summary tree.
data["main_summary"] = {
    "attr": "検査実施人数",
    "value": total_tests,
    "children": [positive_node],
}

## patients

In [0]:
# ISO-formatted text form of the positive-result date, published as the
# release date of each case.
release_text = df_kanja["陽性判明日"].dt.strftime("%Y-%m-%d")
df_kanja["リリース日"] = release_text

In [0]:
# Columns exported for each case, in output order.
patient_columns = ["リリース日", "居住地", "年代", "性別", "退院"]
df_patients = df_kanja.loc[:, patient_columns]

In [0]:
# One dict per case row, keyed by column name.
patient_records = df_patients.to_dict("records")

# Attach the records and the update stamp to the output document.
data["patients"] = dict(data=patient_records, date=dt_update)

## patients_summary

In [0]:
# Daily count of positive results, with days that have no cases filled
# in as 0.
#
# The index and the counts are named explicitly before reset_index():
# pandas >= 2.0 calls the value_counts() result "count" and names its index
# after the source column, whereas older versions produced the columns
# "index" and "陽性判明日". The following cells read exactly those two
# column names, so pin them here to work on either version.
df_patients_sum = (
    df_kanja["陽性判明日"]
    .value_counts()
    .sort_index()
    .asfreq("D", fill_value=0)
    .rename_axis("index")
    .rename("陽性判明日")
    .reset_index()
)

In [0]:
# ISO-formatted text form of each day for the JSON output.
day_text = df_patients_sum["index"].dt.strftime("%Y-%m-%d")
df_patients_sum["日付"] = day_text

In [0]:
# Label the count column "小計" and drop the raw datetime column, which
# has already been copied into "日付" as text.
df_patients_sum = (
    df_patients_sum
    .rename(columns={"陽性判明日": "小計"})
    .drop(columns=["index"])
)

In [0]:
# One [date text, daily count] pair per day.
daily_rows = df_patients_sum[["日付", "小計"]].values.tolist()

# Attach the rows and the update stamp to the output document.
data["patients_summary"] = dict(data=daily_rows, date=dt_update)

In [0]:
# Write the assembled document as UTF-8 JSON. Non-ASCII text is written
# as-is (ensure_ascii=False); ignore_nan=True is passed through to
# simplejson's dump.
with open("data.json", mode="w", encoding="utf-8") as fw:
    json.dump(
        data,
        fw,
        indent=4,
        ensure_ascii=False,
        ignore_nan=True,
    )

# ダウンロード

In [0]:
from google.colab import files

In [0]:
# Hand the generated data.json to google.colab's download helper.
files.download("data.json")