<a href="https://colab.research.google.com/github/imabari/covid19-data/blob/master/aichi/aichi_patients_summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import requests
from bs4 import BeautifulSoup

In [0]:
from urllib.parse import urljoin

In [0]:
import re

In [0]:
import datetime

In [0]:
# HTTP headers sent with the page request: a browser-style User-Agent string.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}

# Page that is scraped for the link to the Excel file.
url = "https://www.pref.aichi.jp/site/covid19-aichi/kansensya-kensa.html"

In [0]:
# Download the page. Without a timeout requests waits indefinitely on a stalled
# server, which would hang the whole notebook run; 30 seconds is a generous limit.
r = requests.get(url, headers=headers, timeout=30)

# Stop here on an HTTP error status instead of parsing an error page.
r.raise_for_status()

# Parse the raw response bytes with the html5lib parser.
soup = BeautifulSoup(r.content, "html5lib")

In [0]:
# Find the <a> element whose text announces the Excel file with the list of cases.
# `string=` is the current name of this filter; `text=` is its deprecated alias.
tag = soup.find("a", string=re.compile("県内発生事例一覧.+Excelファイル"))

# Fail right here with a clear message when the link is missing, instead of an
# AttributeError on None in a later cell.
if tag is None:
    raise ValueError("Excel link not found on the page; the page layout may have changed")

In [0]:
# Resolve the link's href against the page URL to get the download URL.
href = tag.get("href")
link = urljoin(url, href)

In [0]:
# Extract month and day ("<M>月<D>日") from the link text.
# The pattern is a raw string so that \d reaches the regex engine unchanged and
# is not treated as an (invalid) Python string escape.
m = re.search(r"(\d{1,2})月(\d{1,2})日", tag.get_text(strip=True))

In [0]:
# Current time in Japan Standard Time (UTC+9). The scraped page shows Japanese
# calendar dates, while the kernel's local clock may be in any timezone.
# tzinfo is stripped afterwards so dt_now remains a naive datetime.
dt_now = datetime.datetime.now(datetime.timezone(datetime.timedelta(hours=9))).replace(tzinfo=None)

In [0]:
# Convert the two captured groups (month, day) to integers.
month, day = (int(part) for part in m.groups())

In [12]:
# Show the link text with surrounding whitespace stripped; the update date is parsed from it.
tag.get_text(strip=True)

'県内発生事例一覧(4月22日現在) [Excelファイル／34KB]'

※更新日は4月22日

In [13]:
# The link text carries only month and day, so start from the current year.
dt_update = datetime.datetime(dt_now.year, month, day)

# Right after New Year the page can still show a December date; stamped with the
# new year it would lie almost a year in the future, so move it back one year.
# One day of slack tolerates a kernel clock that is behind Japanese time.
if dt_update - dt_now > datetime.timedelta(days=1):
    dt_update = dt_update.replace(year=dt_update.year - 1)

dt_update

datetime.datetime(2020, 4, 22, 0, 0)

In [0]:
import pandas as pd

In [0]:
import json

In [0]:
# Container for everything that is written to data.json at the end.
data = dict()

In [0]:
# Load the Excel file directly from the download link.
df_kanja = pd.read_excel(
    link,
    header=2,  # column names are taken from the third row (0-based row 2)
    index_col="No",  # the "No" column becomes the row index
    na_values=0,  # a value of 0 is read as missing (NaN)
)

In [0]:
# Day counts in 発表日 are measured from 1899-12-30
# (presumably Excel serial dates -- confirm against the workbook).
excel_epoch = pd.Timestamp("1899/12/30")


def _serial_to_timestamp(serial):
    """Convert a single day count into a timestamp relative to excel_epoch."""
    return pd.to_datetime(serial, unit="D", origin=excel_epoch)


# Convert the column element by element.
df_kanja["発表日"] = df_kanja["発表日"].apply(_serial_to_timestamp)

In [19]:
# Preview the last 10 rows of the patient table.
df_kanja.tail(10)

Unnamed: 0_level_0,発表日,年代・性別,国籍,住居地,接触状況,備考
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
440,2020-04-22,90代男性,日本,知多市,No.390と接触,本県発表172
441,2020-04-22,50代女性,,名古屋市,No.366と接触,名古屋市発表248
442,2020-04-22,40代男性,,名古屋市,"No.379,417と接触",名古屋市発表249
443,2020-04-22,40代女性,,名古屋市,No.442と接触,名古屋市発表250
444,2020-04-22,10歳未満女性,,名古屋市,"No.442,443と接触",名古屋市発表251
445,2020-04-22,90代男性,,名古屋市,No.410と接触,名古屋市発表252
446,2020-04-22,60代女性,,名古屋市,No.445と接触,名古屋市発表253
447,2020-04-22,20代女性,,名古屋市,No.417と接触,名古屋市発表254
448,2020-04-22,50代男性,,名古屋市,,名古屋市発表255
449,2020-04-22,20代男性,,名古屋市,,名古屋市発表256


※末尾の日付も4月22日

※同じ日付のため更新日を3日後の4月25日に変更（あとで削除）

In [20]:
# Temporary demonstration step (to be deleted later): move the update date 3 days forward.
# NOTE: each re-run of this cell adds another 3 days.
shift = datetime.timedelta(days=3)
dt_update = dt_update + shift
dt_update

datetime.datetime(2020, 4, 25, 0, 0)

In [0]:
# Number of cases per announcement date.
sr = df_kanja["発表日"].value_counts()

# Pin the labels so the result is the same on every pandas version:
# pandas >= 2.0 names the counts "count" and puts the column name on the index,
# which would change the column names produced by reset_index() further down
# (the later cells expect "index" for the dates and "発表日" for the counts).
sr.name = "発表日"
sr.index.name = None

In [22]:
# Show the counts for the 10 latest dates, sorted by date.
sr.sort_index().tail(10)

2020-04-13     9
2020-04-14    10
2020-04-15    12
2020-04-16    14
2020-04-17    10
2020-04-18    17
2020-04-19    10
2020-04-20     5
2020-04-21    16
2020-04-22    19
Name: 発表日, dtype: int64

※集計しても末尾は4月22日

In [0]:
# Check whether the update date is already present in the index;
# if it is not, add it with a count of 0.
update_day_missing = dt_update not in sr.index
if update_day_missing:
    sr.loc[dt_update] = 0

In [0]:
# Put the counts in date order, expand them to one entry per calendar day
# (days without any entry get 0), then turn the date index into a regular column.
daily_counts = sr.sort_index()
daily_counts = daily_counts.asfreq("D", fill_value=0)
df_patients_sum = daily_counts.reset_index()

In [0]:
# Format the dates as "YYYY-MM-DD" strings and store them in the 日付 column.
iso_dates = df_patients_sum["index"].dt.strftime("%Y-%m-%d")
df_patients_sum["日付"] = iso_dates

In [0]:
# Rename the count column to 小計 and remove the datetime column,
# which is no longer needed once 日付 holds the formatted date.
df_patients_sum = df_patients_sum.rename(columns={"発表日": "小計"}).drop(columns=["index"])

In [27]:
# Preview the last 10 rows of the daily summary.
df_patients_sum.tail(10)

Unnamed: 0,小計,日付
81,14,2020-04-16
82,10,2020-04-17
83,17,2020-04-18
84,10,2020-04-19
85,5,2020-04-20
86,16,2020-04-21
87,19,2020-04-22
88,0,2020-04-23
89,0,2020-04-24
90,0,2020-04-25


※4月23日～4月25日は0で補完されます

In [0]:
# One dict per day (row) of the daily summary.
records = df_patients_sum.to_dict(orient="records")

# Update date formatted as "YYYY/MM/DD".
updated_on = dt_update.strftime("%Y/%m/%d")

data["patients_summary"] = dict(data=records, date=updated_on)

In [0]:
# Write the collected data to data.json as UTF-8, keeping non-ASCII characters
# as they are and indenting by 4 spaces.
encoder = json.JSONEncoder(ensure_ascii=False, indent=4)
with open("data.json", "w", encoding="utf-8") as fw:
    for chunk in encoder.iterencode(data):
        fw.write(chunk)