<a href="https://colab.research.google.com/github/imabari/covid19-data/blob/master/toyama/toyama_patients_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Path of the data.json file that this notebook downloads, loads and finally
# overwrites (despite the name, this is a file path, not a directory).
DATA_DIR = "../data/data.json"

In [0]:
# Download the current data.json from the covid19-toyama repository
# (development branch) and save it to the path stored in DATA_DIR.
# NOTE(review): wget -O does not create parent directories; this assumes the
# directory part of DATA_DIR already exists -- confirm for fresh environments.
!wget https://raw.githubusercontent.com/Terachan0117/covid19-toyama/development/data/data.json -O $DATA_DIR

In [0]:
!pip install jaconv

In [0]:
import datetime
import json
import re
from urllib.parse import urljoin

import jaconv
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [0]:
# Load the data as it currently stands.
# The context manager closes the file handle as soon as parsing is done
# instead of leaving it open for the rest of the kernel session.
with open(DATA_DIR, "r", encoding="utf-8") as current_file:
    data = json.load(current_file)

In [0]:
# Current time, formatted as "YYYY-MM-DD HH:MM".
# An explicit JST (UTC+9) clock is used so the recorded time does not depend on
# the timezone of the machine running the notebook (hosted kernels commonly
# run in UTC, which would shift every timestamp by nine hours).
JST = datetime.timezone(datetime.timedelta(hours=9), "JST")
dt_now = datetime.datetime.now(JST).strftime("%Y-%m-%d %H:%M")
data["lastUpdate"] = dt_now

In [0]:
# Toyama Prefecture web page from which the latest information is fetched
url = "http://www.pref.toyama.jp/cms_sec/1205/kj00021798.html"

# Request headers: present a browser-style User-Agent string
headers = {}
headers["User-Agent"] = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
)

In [0]:
# Scrape the page.
# A timeout is passed so that a stalled connection raises an exception instead
# of blocking the notebook indefinitely (requests applies no timeout by default).
r = requests.get(url, headers=headers, timeout=30)
# Raise on HTTP 4xx/5xx so an error page is never parsed as if it were data
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")

In [0]:
# Take the text of the first <p> directly under div#main and convert its
# digits to half-width characters
summary_paragraph = soup.select_one("div#main > p")
summary_text = summary_paragraph.get_text(strip=True)
text = jaconv.z2h(summary_text, digit=True)

In [0]:
# Extract only the numbers from the summary text.
# Each match must start with a digit, and thousands separators are removed
# before conversion: int() rejects strings such as "1,234" or a lone ",".
main_sum = [
    int(number.replace(",", ""))
    for number in re.findall("[0-9][0-9,]*", text)
]

In [0]:
# Fetch the Excel sheet that lists the patients.
# The download link is the <a> inside div#file whose text is exactly the
# title of the list.
link = (
    soup.find("div", id="file")
    .find("a", text="富山県内における新型コロナウイルス感染症の発生状況一覧")
    .get("href")
)

# The href may be relative to the page; resolve it against the page URL so that
# read_excel always receives an absolute URL (absolute hrefs pass through unchanged).
link = urljoin(url, link)

# skiprows=2: the first two rows of the sheet are not read as data
# (presumably title rows above the header -- confirm against the file).
df_kanjya = pd.read_excel(link, skiprows=2)

In [0]:
# Convert the Excel data to the format required by the data definition document
df_kanjya = df_kanjya.rename(columns={"県番号": "No", "検査結果判明日": "判明日"})

# The 判明日 column is interpreted as day counts from 1899-12-30 and rendered
# as "YYYY-MM-DD" strings. The whole column is converted at once: blank cells
# become NaT and stay missing in the output, whereas calling strftime on each
# value separately raises as soon as one cell is empty.
df_kanjya["判明日"] = pd.to_datetime(
    pd.to_numeric(df_kanjya["判明日"]),
    unit="D",
    origin=pd.Timestamp("1899/12/30"),
).dt.strftime("%Y-%m-%d")

In [0]:
# Expand the short gender labels to their long form
gender_labels = {"男": "男性", "女": "女性"}
df_kanjya["性別"] = df_kanjya["性別"].replace(gender_labels)

# Rewrite the "90代" age bracket as "90歳以上"
age_labels = {"90代": "90歳以上"}
df_kanjya["年代"] = df_kanjya["年代"].replace(age_labels)

In [0]:
# Status of people who tested positive.
# main_sum[0] is stored as the number of positive patients; main_sum[1..3]
# are stored, in this order, under the labels below.
breakdown_labels = ["入院", "退院", "死亡"]

data["main_summary"] = {
    "date": dt_now,
    "children": [
        {
            "attr": "陽性患者数",
            "value": main_sum[0],
            "children": [
                {"attr": label, "value": main_sum[position]}
                for position, label in enumerate(breakdown_labels, start=1)
            ],
        }
    ],
}

In [0]:
# Attributes of the positive patients: keep only the published columns and
# replace missing values with "-"
patient_columns = ["No", "判明日", "居住地", "年代", "性別"]
df_patients = df_kanjya[patient_columns].fillna("-")

patient_records = df_patients.to_dict(orient="records")
data["patients"] = {"date": dt_now, "data": patient_records}

In [0]:
# Places of residence, in the order they appear in the output.
# Any residence not in this list is counted under the final "県外" entry,
# so every municipality of the prefecture must be listed here; 砺波市 was
# missing and its residents would have been reported as 県外.
city_names = [
    "富山市",
    "高岡市",
    "射水市",
    "南砺市",
    "氷見市",
    "魚津市",
    "黒部市",
    "滑川市",
    "小矢部市",
    "砺波市",
    "立山町",
    "入善町",
    "上市町",
    "朝日町",
    "舟橋村",
    "県外",
]

In [0]:
# Count patients per place of residence.
# Residences that are not in city_names are replaced with "県外" before
# counting; the counts are then laid out in city_names order, with 0 for
# places that have no patients.
residence = df_kanjya["居住地"]
is_listed = residence.isin(city_names)
residence_counts = residence.where(is_listed, "県外").value_counts()
df_residence = residence_counts.reindex(index=city_names, fill_value=0).reset_index()

In [0]:
# Label the two columns by position: first the place of residence, then its count.
# The names produced by value_counts().reset_index() differ between pandas
# versions, so a name-based rename can mislabel the columns; positional
# assignment also gives the same result when this cell is re-run.
df_residence.columns = ["居住地", "小計"]

In [0]:
# Store the per-residence counts as a list of row records, stamped with the run time
residence_records = df_residence.to_dict(orient="records")
data["patients_by_residence"] = {"date": dt_now, "data": residence_records}

In [0]:
# Age brackets, in output order: under 10, then "10代" through "80代",
# then 90 and over
age_lists = ["10歳未満"]
age_lists += [f"{decade}0代" for decade in range(1, 9)]
age_lists += ["90歳以上"]

In [0]:
# Count patients per age bracket, laid out in age_lists order with 0 for
# brackets that have no patients
age_counts = df_kanjya["年代"].value_counts()
df_age = age_counts.reindex(index=age_lists, fill_value=0).reset_index()

In [0]:
# Label the two columns by position: first the age bracket, then its count.
# The names produced by value_counts().reset_index() differ between pandas
# versions, so a name-based rename can mislabel the columns; positional
# assignment also gives the same result when this cell is re-run.
df_age.columns = ["年代", "小計"]

In [0]:
# Store the per-age-bracket counts as a list of row records, stamped with the run time
age_records = df_age.to_dict(orient="records")
data["patients_by_age"] = {"date": dt_now, "data": age_records}

In [0]:
# Gender categories, in output order
gender_lists = [
    "男性",
    "女性",
    "その他",
]

In [0]:
# Count patients per gender, laid out in gender_lists order with 0 for
# categories that have no patients
gender_counts = (
    df_kanjya["性別"].value_counts().reindex(index=gender_lists, fill_value=0)
)
df_gender = gender_counts.reset_index()

# Label the two columns by position: first the gender, then its count.
# The names produced by value_counts().reset_index() differ between pandas
# versions, so a name-based rename can mislabel the columns.
df_gender.columns = ["性別", "小計"]

In [0]:
# Store the per-gender counts as a list of row records, stamped with the run time
gender_records = df_gender.to_dict(orient="records")
data["patients_by_gender"] = {"date": dt_now, "data": gender_records}

In [0]:
# Overwrite data.json with the updated contents.
# The context manager flushes and closes the file when the block ends, so the
# JSON is completely written to disk; ensure_ascii=False keeps the Japanese
# text readable instead of emitting \u escapes.
with open(DATA_DIR, "w", encoding="utf-8") as data_json:
    json.dump(data, data_json, ensure_ascii=False, indent=4)