<a href="https://colab.research.google.com/github/imabari/covid19-data/blob/master/aichi/aichi_pdf_patients.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install the system packages python3-tk and ghostscript
# (presumably native dependencies of camelot -- confirm against its install docs).
!apt install python3-tk ghostscript
# Install camelot with its "cv" extra; camelot is imported further down.
!pip install camelot-py[cv]

Reading package lists... Done
Building dependency tree       
Reading state information... Done
ghostscript is already the newest version (9.26~dfsg+0-0ubuntu0.18.04.12).
python3-tk is already the newest version (3.6.9-1~18.04).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.


In [0]:
import datetime
import re
from urllib.parse import urljoin

In [0]:
import requests
from bs4 import BeautifulSoup

In [0]:
# Page that is scraped for the link to the PDF list.
url = "https://www.pref.aichi.jp/site/covid19-aichi/kansensya-kensa.html"

# Headers sent with the page request (a browser-style User-Agent string).
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}

In [0]:
# Download the page. The explicit timeout (seconds) keeps a stalled server
# from blocking the notebook indefinitely; requests has no default timeout.
r = requests.get(url, headers=headers, timeout=30)

# Abort on any HTTP error status instead of parsing an error page.
r.raise_for_status()

# Parse the raw bytes with the html5lib parser.
soup = BeautifulSoup(r.content, "html5lib")

In [0]:
# Locate the anchor whose text names the PDF list of cases.
# `string=` is the current name of the argument formerly called `text=`.
tag = soup.find("a", string=re.compile("県内発生事例一覧.+PDFファイル"))

# Fail here with a clear message rather than with an AttributeError on None
# in the cells that use `tag`.
if tag is None:
    raise RuntimeError("link to the PDF case list was not found on the page")

In [0]:
# Resolve the anchor's href against the page URL to get the PDF address.
href = tag.get("href")
link = urljoin(url, href)

## 更新日

In [0]:
# Fixed-offset timezone: UTC+09:00, labelled "JST".
nine_hours = datetime.timedelta(hours=9)
JST = datetime.timezone(nine_hours, "JST")

In [0]:
# Current time as a timezone-aware datetime in JST.
dt_now = datetime.datetime.now(tz=JST)

In [0]:
# Pull "<month>月<day>日" out of the link text. The pattern is a raw string so
# that `\d` reaches the regex engine without an invalid-escape warning.
m = re.search(r"(\d{1,2})月(\d{1,2})日", tag.get_text(strip=True))

# Fail with a clear message if the link text carries no date.
if m is None:
    raise ValueError("no month/day found in the link text")

In [0]:
# Convert the two captured groups (month, day) to integers.
month, day = (int(group) for group in m.groups())

In [0]:
# Update date at midnight (naive datetime); the year is taken from the current JST year.
dt_update = datetime.datetime(year=dt_now.year, month=month, day=day)
dt_update

datetime.datetime(2020, 5, 12, 0, 0)

In [0]:
# Display the link text that the update date was parsed from.
tag.get_text(strip=True)

'県内発生事例一覧(5月12日現在) [PDFファイル／172KB]'

In [0]:
# The day after the update date; used as the "date" field of the JSON sections.
one_day = datetime.timedelta(days=1)
dt_tomorrow = dt_update + one_day

In [0]:
import pandas as pd
import json
import camelot

In [0]:
# Extract the tables from every page of the PDF at `link`.
tables = camelot.read_pdf(
    link,
    pages="all",
    split_text=True,
    strip_text="\n",
    line_scale=40,
)

In [0]:
# Stack the per-table DataFrames into one frame.
page_frames = [table.df for table in tables]
df_csv = pd.concat(page_frames)

# Dump the rows without index or header line; the same file name is read
# back with pd.read_csv further down, so the spelling must stay in sync.
df_csv.to_csv("partients.csv", index=None, header=None)

In [0]:
# Root of the JSON document; starts with the time this notebook ran (JST).
data = dict(lastUpdate=dt_now.strftime("%Y/%m/%d %H:%M"))

In [0]:
# Re-read the CSV written above; read_csv treats its first line as the header row.
df_kanja = pd.read_csv(filepath_or_buffer="partients.csv")

In [0]:
def my_parser(s, year=None):
    """Convert a month/day label such as "5月12日" into a ``pd.Timestamp``.

    Args:
        s: Text containing exactly two groups of one or two digits,
            read as month then day.
        year: Year to attach to the date. Defaults to the year of the
            notebook-level ``dt_now`` when omitted.

    Returns:
        ``pd.Timestamp`` at midnight of the given year, month and day.

    Raises:
        ValueError: If ``s`` does not contain exactly two numeric groups,
            or if they do not form a valid calendar date.
    """
    if year is None:
        # Fall back to the current year captured earlier in the notebook.
        year = dt_now.year

    parts = re.findall("[0-9]{1,2}", s)
    if len(parts) != 2:
        raise ValueError(f"expected a month and a day in {s!r}, found {parts}")

    m, d = map(int, parts)

    return pd.Timestamp(year=year, month=m, day=d)

In [0]:
# Replace the textual 発表日 column with parsed timestamps.
announced_raw = df_kanja["発表日"]
df_kanja["発表日"] = announced_raw.apply(my_parser)

In [0]:
# Number of rows per 発表日 (defaults spelled out: sorted by count, descending).
sr = df_kanja["発表日"].value_counts(sort=True, ascending=False)

In [0]:
# Day-of-week code per row as a string: pandas' dayofweek is Monday=0..Sunday=6,
# so (dayofweek + 1) % 7 gives Sunday=0..Saturday=6. The parentheses are needed
# because % binds tighter than +, and astype(str) converts element-wise.
df_kanja["w"] = ((df_kanja["発表日"].dt.dayofweek + 1) % 7).astype(str)

In [0]:
# ISO-style date string (YYYY-MM-DD) for each row.
iso_dates = df_kanja["発表日"].dt.strftime("%Y-%m-%d")
df_kanja["date"] = iso_dates

In [0]:
# Month/day label. The format contains a literal backslash before the slash,
# so the stored values look like 05\/12.
short_labels = df_kanja["発表日"].dt.strftime("%m\\/%d")
df_kanja["short_date"] = short_labels

In [0]:
# Shift pandas' Monday=0 numbering by one (mod 7) and store it as text.
weekday_code = (df_kanja["発表日"].dt.dayofweek + 1) % 7
df_kanja["w"] = weekday_code.astype(str)

In [0]:
# Store the case number as a string.
df_kanja = df_kanja.astype({"No": str})

In [0]:
# Turn the timestamps into "YYYY/MM/DD HH:MM" text. After this cell the column
# is no longer datetime, so the .dt cells above cannot be re-run without
# rebuilding df_kanja.
release_stamp = df_kanja["発表日"].dt.strftime("%Y/%m/%d %H:%M")
df_kanja["発表日"] = release_stamp

In [0]:
# Replace every missing value with an empty string.
df_kanja = df_kanja.fillna("")

In [0]:
# Patient section of the JSON: one dict per row plus the section date.
# orient must be spelled "records"; abbreviated or misspelled orient values
# are rejected by current pandas.
data["patients"] = {
    "data": df_kanja.to_dict(orient="records"),
    "date": dt_tomorrow.strftime("%Y/%m/%d %H:%M"),
}

In [0]:
# Show the per-date counts for the 20 latest dates present in the data.
sr.sort_index().tail(20)

2020-04-17    10
2020-04-18    17
2020-04-19    10
2020-04-20     5
2020-04-21    16
2020-04-22    19
2020-04-23    12
2020-04-24    14
2020-04-25     1
2020-04-26     1
2020-04-28     5
2020-04-29     4
2020-04-30     1
2020-05-01     3
2020-05-03     2
2020-05-04     3
2020-05-05     3
2020-05-09     3
2020-05-11     1
2020-05-12     1
Name: 発表日, dtype: int64

In [0]:
# Make sure the update date has an entry (zero rows) so the daily series
# built from `sr` extends to that day.
missing_update_day = dt_update not in sr.index
if missing_update_day:
    sr[dt_update] = 0

In [0]:
# Daily series with gaps filled by 0, turned into a two-column frame.
# The value name and index name are set explicitly so the columns are always
# "index" (date) and "発表日" (count): pandas >= 2.0 names the value_counts
# result "count" and its index after the source column, which would otherwise
# change the column names produced by reset_index.
df_patients_sum = (
    sr.sort_index()
    .asfreq("D", fill_value=0)
    .rename("発表日")
    .rename_axis("index")
    .reset_index()
)

In [0]:
# Date column as YYYY-MM-DD text.
day_labels = df_patients_sum["index"].dt.strftime("%Y-%m-%d")
df_patients_sum["日付"] = day_labels

In [0]:
# Rename the count column to 小計 and discard the datetime column.
df_patients_sum = df_patients_sum.rename(columns={"発表日": "小計"}).drop(columns=["index"])

In [0]:
# Daily-summary section of the JSON: one dict per day plus the section date.
summary_records = df_patients_sum.to_dict(orient="records")
data["patients_summary"] = dict(
    data=summary_records,
    date=dt_tomorrow.strftime("%Y/%m/%d %H:%M"),
)

In [0]:
# Serialise `data` as UTF-8 JSON (non-ASCII kept as-is, 4-space indent).
with open("data.json", mode="w", encoding="utf-8") as fw:
    fw.write(json.dumps(data, ensure_ascii=False, indent=4))