<a href="https://colab.research.google.com/github/imabari/covid19-data/blob/master/tochigi/tochigi_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# インストール

In [1]:
# System packages: the Tk bindings and Ghostscript, installed before camelot-py.
!apt install python3-tk ghostscript
# camelot-py (with its "cv" extra) is used later in this notebook to read tables from a PDF.
!pip install camelot-py[cv]

# jaconv is used later for full-width -> half-width conversion; pandas is upgraded to the newest release.
!pip install jaconv
!pip install -U pandas

Reading package lists... Done
Building dependency tree       
Reading state information... Done
python3-tk is already the newest version (3.6.9-1~18.04).
The following additional packages will be installed:
  fonts-droid-fallback fonts-noto-mono gsfonts libcupsfilters1 libcupsimage2
  libgs9 libgs9-common libijs-0.35 libjbig2dec0 poppler-data
Suggested packages:
  fonts-noto ghostscript-x poppler-utils fonts-japanese-mincho
  | fonts-ipafont-mincho fonts-japanese-gothic | fonts-ipafont-gothic
  fonts-arphic-ukai fonts-arphic-uming fonts-nanum
The following NEW packages will be installed:
  fonts-droid-fallback fonts-noto-mono ghostscript gsfonts libcupsfilters1
  libcupsimage2 libgs9 libgs9-common libijs-0.35 libjbig2dec0 poppler-data
0 upgraded, 11 newly installed, 0 to remove and 25 not upgraded.
Need to get 14.1 MB of archives.
After this operation, 49.9 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 fonts-droid-fallback all 1:6.0.

In [0]:
import datetime
import json
import re
from urllib.parse import urljoin

In [0]:
import jaconv
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [0]:
import camelot

In [0]:
from google.colab import files

# スクレイピング

In [0]:
# Request headers: send a desktop-browser User-Agent string with the request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}

# Tochigi prefecture page that the cells below scrape.
url = "http://www.pref.tochigi.lg.jp/e04/welfare/hoken-eisei/kansen/hp/coronakensahasseijyoukyou.html"

In [0]:
# Fetch the status page. requests applies no timeout unless one is passed, so
# give one explicitly; otherwise an unresponsive server would hang this cell forever.
r = requests.get(url, headers=headers, timeout=30)

# Stop on an HTTP error status (4xx/5xx) rather than parsing an error page.
r.raise_for_status()

# Parse the raw response bytes with the html5lib parser.
soup = BeautifulSoup(r.content, "html5lib")

In [0]:
# Find the <h2> heading of the testing-status section. `string=` is the current
# name of the Beautiful Soup argument formerly spelled `text=`.
heading = soup.find("h2", string="栃木県における新型コロナウイルス感染症の検査実施状況について")

# Fail with a readable message if the page layout changed, instead of an
# AttributeError on None.
if heading is None:
    raise ValueError("testing-status <h2> heading not found; the page layout may have changed")

# The sentence holding the date and the cumulative counts is the next <p> sibling.
text = heading.find_next_sibling("p").get_text(strip=True)

In [9]:
# Display the scraped sentence so it can be checked by eye.
text

'令和2(2020)年4月22日までに、栃木県（宇都宮市保健所実施分を含む）が実施した新型コロナウイルス感染症の検査件数は、1,617件です。（うち陽性52件）'

In [10]:
# Extract only the half-width (ASCII) numbers, allowing "," as a thousands separator.
# Every match must start with a digit so that a stray comma can never reach
# int() as an empty string.
kensa = [int(i.replace(",", "")) for i in re.findall("[0-9][0-9,]*", text)]

# The following cells read positions 1-5 of this list
# (year, month, day, cumulative tests, cumulative positives).
# Fail here with a clear message if the sentence did not yield enough numbers.
if len(kensa) < 6:
    raise ValueError(f"expected at least 6 numbers in the scraped sentence, got {kensa!r}")

kensa

[2, 2020, 4, 22, 1617, 52]

In [11]:
# Convert to a date: positions 1-3 of `kensa` hold year, month and day.
year, month, day = kensa[1], kensa[2], kensa[3]
dt_kensa = datetime.datetime(year, month, day)
dt_kensa

datetime.datetime(2020, 4, 22, 0, 0)

In [0]:
# Enable once the job is automated: reload the CSV that this notebook writes.
# df_insp = pd.read_csv("inspections.csv", index_col="年月日", parse_dates=True)

# Until then, read the counts from the published Google Sheets CSV export,
# using the "年月日" column as a parsed date index.
df_insp = pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vS8KB87AREAsj5NIkdyubw0t8VcOrRBPaG2igOjwl-GZ8kOrJmgmskgFKTCUFdiWEQRyB5L1idP00Av/pub?gid=0&single=true&output=csv", index_col="年月日", parse_dates=True)

In [13]:
# Display the cumulative inspection table that was just loaded.
df_insp

Unnamed: 0_level_0,検査累計,陽性累計
年月日,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-14,0,0
2020-01-15,0,0
2020-01-16,0,0
2020-01-17,0,0
2020-01-18,0,0
...,...,...
2020-04-18,1424,44
2020-04-19,1472,46
2020-04-20,1520,49
2020-04-21,1568,51


In [0]:
# Add the cumulative totals for the latest inspection date.
# Positions 4 and 5 of `kensa` go into the row labelled `dt_kensa`
# (the row is created if absent, overwritten if present).
latest_totals = kensa[4:6]
df_insp.loc[dt_kensa] = latest_totals

In [0]:
# Save to CSV; this file is offered for download at the end of the notebook.
df_insp.to_csv("inspections.csv")

In [0]:
# Find the <a> element whose text starts with the title of the case-list document.
# `string=` is the current name of the Beautiful Soup argument formerly spelled `text=`.
tag = soup.find("a", string=re.compile("^栃木県における新型コロナウイルス感染症の発生状況一覧"))

# Fail here with a readable message instead of an AttributeError on None
# when the link is later dereferenced.
if tag is None:
    raise ValueError("link to the case-list document not found; the page layout may have changed")

In [0]:
# Resolve the anchor's href against the page URL to get an absolute link.
href = tag.get("href")
link = urljoin(url, href)

# データラングリング

In [0]:
# Fixed UTC+9 offset labelled "JST".
JST = datetime.timezone(datetime.timedelta(hours=9), name="JST")

# Current time in that zone, and its "YYYY/MM/DD HH:MM" rendering used as the
# update stamp throughout the output JSON.
dt_now = datetime.datetime.now(tz=JST)
dt_update = format(dt_now, "%Y/%m/%d %H:%M")

In [0]:
# Root of the JSON document; later cells add one key per section.
data = dict(lastUpdate=dt_update)

## 検査件数

In [0]:
# Daily number of tests, derived from the cumulative column.
# Sort chronologically BEFORE differencing: diff() subtracts the previous row in
# stored order, so an out-of-order row would give wrong daily figures.
daily_tests = df_insp["検査累計"].sort_index().diff().fillna(0).astype(int)

# Insert any calendar days missing from the index with a count of 0, then turn
# the date index back into an ordinary column.
df_insp_sum = daily_tests.asfreq("D", fill_value=0).reset_index()

In [0]:
# Add an ISO "YYYY-MM-DD" string version of the date column.
insp_dates = df_insp_sum["年月日"]
df_insp_sum["日付"] = insp_dates.dt.strftime("%Y-%m-%d")

In [0]:
# One [date string, value] pair per day, stored together with the update stamp.
insp_rows = df_insp_sum[["日付", "検査累計"]].values.tolist()
data["inspections_summary"] = dict(data=insp_rows, date=dt_update)

## PDFから発生状況一覧をテキスト抽出

In [0]:
import camelot

In [0]:
# Extract the tables from the linked PDF with camelot, over all pages.
# NOTE(review): per the camelot documentation (not shown in this notebook),
# split_text splits strings that span several cells, strip_text="\n" removes
# newlines from cell text, and line_scale tunes table-line detection — confirm.
tables = camelot.read_pdf(link, pages="all", split_text=True, strip_text="\n", line_scale=40)

In [0]:
# Collect the DataFrame (`.df`) of every extracted table.
dfs = list(map(lambda extracted: extracted.df, tables))

In [0]:
# Stack the per-table frames into one frame with a fresh 0..n-1 row index.
df_tmp = pd.concat(dfs, ignore_index=True)

In [0]:
# Promote row 0 to the column labels (transpose, index by the former row 0,
# transpose back), then use the "番号" column as the row index.
df_kanja = df_tmp.T.set_index(0).T.set_index("番号")

In [0]:
def _to_halfwidth(value):
    """Convert full-width digits and ASCII characters to half-width; kana is left as is."""
    return jaconv.z2h(value, kana=False, digit=True, ascii=True)


# Normalise the date column so the ASCII-based regex in the next step can match it.
df_kanja["陽性判明日"] = df_kanja["陽性判明日"].apply(_to_halfwidth)

In [0]:
# Split each cell into four columns (NaN where the optional part is absent):
#   0: the leading month/day,
#   1: the optional parenthesised note as a whole, e.g. "(3/27 退院)",
#   2: the month/day inside the parentheses,
#   3: the text after that date inside the parentheses.
df_date = df_kanja["陽性判明日"].str.extract(r"(\d{1,2}/\d{1,2})\s*(\((\d{1,2}/\d{1,2}) +(.+)\))?", expand=True)

## main_summary

In [0]:
# Rows with no status note are counted as "入院中"; tally the rows per status.
status = df_date[3].fillna("入院中")
sr_situ = status.value_counts()

In [31]:
# Force a fixed set and order of statuses; any status that did not occur gets 0.
status_order = ["入院中", "退院", "死亡"]
sr_situ = sr_situ.reindex(status_order, fill_value=0)
sr_situ

入院中    39
退院     13
死亡      0
Name: 3, dtype: int64

In [0]:
# Leaf nodes: one entry per patient status, in a fixed order.
status_children = [
    {"attr": label, "value": int(sr_situ[label])}
    for label in ("入院中", "退院", "死亡")
]

# Middle node: positives (position 5 of `kensa`) with the status breakdown below it.
positive_node = {"attr": "陽性患者数", "value": kensa[5], "children": status_children}

# Root node: tests carried out (position 4 of `kensa`).
data["main_summary"] = {
    "attr": "検査実施人数",
    "value": kensa[4],
    "children": [positive_node],
}

## patients

In [0]:
# Replace missing dates with "" so that the `if s:` checks in my_parser/date_str
# treat them as "no date" (NaN would be truthy).
# Assign the result back instead of calling fillna(inplace=True) on a selected
# column: under pandas Copy-on-Write the chained in-place form no longer updates
# df_date.
df_date[0] = df_date[0].fillna("")
df_date[2] = df_date[2].fillna("")

In [34]:
# Display the extracted date/status columns for a visual check.
df_date

Unnamed: 0_level_0,0,1,2,3
番号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2/22,(3/27 退院),3/27,退院
2,3/5,(3/12 退院),3/12,退院
3,3/18,(4/1 退院),4/1,退院
4,3/20,(4/3 退院),4/3,退院
5,3/24,(4/10 退院),4/10,退院
6,3/24,(4/20 退院),4/20,退院
7,3/25,(4/3 退院),4/3,退院
8,3/25,(4/11 退院),4/11,退院
9,3/25,,,
10,3/25,(4/18 退院),4/18,退院


In [0]:
def my_parser(s, today=None):
    """Convert a "month/day" string into a ``pd.Timestamp``.

    The source text carries no year, so the year is inferred from a reference
    date: a month/day that falls after the reference date is assigned to the
    previous year, because these dates describe events that have already
    happened. Always using the current year would push a December date into the
    future once the notebook runs in January.

    Parameters
    ----------
    s : str
        Text containing a month and a day as 1-2 digit numbers (e.g. "3/27"),
        or an empty string when there is no date.
    today : datetime.date or datetime.datetime, optional
        Reference date. Defaults to the notebook-level ``dt_now``.

    Returns
    -------
    pd.Timestamp or pd.NaT
        The parsed date, or ``pd.NaT`` when ``s`` is empty.

    Raises
    ------
    ValueError
        If ``s`` does not contain exactly two numbers, or they do not form a
        valid date.
    """
    if not s:
        return pd.NaT

    ref = dt_now if today is None else today
    m, d = map(int, re.findall("[0-9]{1,2}", s))

    y = ref.year
    # Comparing (month, day) tuples avoids building a possibly invalid date
    # just to test whether it lies in the future.
    if (m, d) > (ref.month, ref.day):
        y -= 1

    return pd.Timestamp(year=y, month=m, day=d)

In [0]:
def date_str(s, today=None):
    """Convert a "month/day" string into a "YYYY-MM-DD" string.

    The source text carries no year, so the year is inferred from a reference
    date: a month/day that falls after the reference date is assigned to the
    previous year, because these dates describe events that have already
    happened. Always using the current year would push a December date into the
    future once the notebook runs in January.

    Parameters
    ----------
    s : str
        Text containing a month and a day as 1-2 digit numbers (e.g. "3/27"),
        or an empty string when there is no date.
    today : datetime.date or datetime.datetime, optional
        Reference date. Defaults to the notebook-level ``dt_now``.

    Returns
    -------
    str or None
        The date formatted as "YYYY-MM-DD", or ``None`` when ``s`` is empty.

    Raises
    ------
    ValueError
        If ``s`` does not contain exactly two numbers, or they do not form a
        valid date.
    """
    if not s:
        return None

    ref = dt_now if today is None else today
    m, d = map(int, re.findall("[0-9]{1,2}", s))

    y = ref.year
    # Comparing (month, day) tuples avoids building a possibly invalid date
    # just to test whether it lies in the future.
    if (m, d) > (ref.month, ref.day):
        y -= 1

    return datetime.datetime(y, m, d).strftime("%Y-%m-%d")

In [0]:
# Overwrite the raw text column with parsed dates taken from extracted column 0.
df_kanja["陽性判明日"] = df_date[0].apply(my_parser)

In [0]:
# Parsed date from extracted column 2 (the date inside the parentheses); NaT where absent.
df_kanja["退院日"] = df_date[2].apply(my_parser)

In [0]:
# The same extracted column 2 as a "YYYY-MM-DD" string, or None where absent.
df_kanja["退院"] = df_date[2].apply(date_str)

In [0]:
# "YYYY-MM-DD" string form of the parsed positive-result date.
df_kanja["リリース日"] = df_kanja["陽性判明日"].dt.strftime("%Y-%m-%d")

In [0]:
# Keep only the columns that go into the per-patient JSON records.
patient_columns = ["リリース日", "居住地", "年代", "性別", "退院"]
df_patients = df_kanja[patient_columns]

In [0]:
# One dict per patient row, stored together with the update stamp.
patient_records = df_patients.to_dict(orient="records")
data["patients"] = dict(data=patient_records, date=dt_update)

## patients_summary

In [0]:
# Cases per calendar day: count rows per date, sort by date, and insert missing
# days with a count of 0.
daily_cases = df_kanja["陽性判明日"].value_counts().sort_index().asfreq("D", fill_value=0)

# Name the axis and the values explicitly before reset_index(). From pandas 2.0
# value_counts() names its result "count" and carries the original name on the
# index, so the default column names would no longer be the "index" and
# "陽性判明日" that the following cells rely on.
df_patients_sum = daily_cases.rename_axis("index").rename("陽性判明日").reset_index()

In [0]:
# Add a "YYYY-MM-DD" string version of the date held in the "index" column.
case_dates = df_patients_sum["index"]
df_patients_sum["日付"] = case_dates.dt.strftime("%Y-%m-%d")

In [0]:
# Rename the count column to "小計" and drop the raw date column, which is now
# redundant with "日付".
df_patients_sum = (
    df_patients_sum
    .rename(columns={"陽性判明日": "小計"})
    .drop(columns=["index"])
)

In [0]:
# One [date string, daily count] pair per day, stored together with the update stamp.
daily_rows = df_patients_sum[["日付", "小計"]].values.tolist()
data["patients_summary"] = dict(data=daily_rows, date=dt_update)

In [0]:
# Write the assembled document as UTF-8 JSON, keeping non-ASCII characters
# unescaped and indenting by 4 spaces.
json_options = {"ensure_ascii": False, "indent": 4}
with open("data.json", mode="w", encoding="utf-8") as fw:
    json.dump(data, fw, **json_options)

# ダウンロード

In [0]:
from google.colab import files

In [0]:
# Offer the CSV written earlier in this notebook as a browser download (Colab helper).
files.download("inspections.csv")

In [0]:
# Offer the JSON written earlier in this notebook as a browser download (Colab helper).
files.download("data.json")