<a href="https://colab.research.google.com/github/imabari/covid19-data/blob/master/tochigi/tochigi_xlsx_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# インストール

In [1]:
# Install jaconv and upgrade pandas to the newest release pip can find.
# NOTE(review): neither package is version-pinned, so a re-run may pull different versions.
!pip install jaconv
!pip install -U pandas

Collecting jaconv
  Downloading https://files.pythonhosted.org/packages/b0/9e/cf1353fb3e81a177bb52ca59a0ebee425f084b7298039a7965c5414d2d62/jaconv-0.2.4.tar.gz
Building wheels for collected packages: jaconv
  Building wheel for jaconv (setup.py) ... [?25l[?25hdone
  Created wheel for jaconv: filename=jaconv-0.2.4-cp36-none-any.whl size=12285 sha256=f90a557b64860e131192a6e0cccafad71e937444e07677b330955f458aeb0c83
  Stored in directory: /root/.cache/pip/wheels/e1/46/f7/85a7f89bd3263423c8530dfed16083f9a142cc0fc78c81ff32
Successfully built jaconv
Installing collected packages: jaconv
Successfully installed jaconv-0.2.4
Requirement already up-to-date: pandas in /usr/local/lib/python3.6/dist-packages (1.0.3)


In [2]:
# Install simplejson, which the import cell below binds to the name `json`.
!pip install simplejson

Collecting simplejson
[?25l  Downloading https://files.pythonhosted.org/packages/98/87/a7b98aa9256c8843f92878966dc3d8d914c14aad97e2c5ce4798d5743e07/simplejson-3.17.0.tar.gz (83kB)
[K     |████                            | 10kB 15.8MB/s eta 0:00:01[K     |███████▉                        | 20kB 1.5MB/s eta 0:00:01[K     |███████████▉                    | 30kB 2.2MB/s eta 0:00:01[K     |███████████████▊                | 40kB 2.9MB/s eta 0:00:01[K     |███████████████████▊            | 51kB 2.1MB/s eta 0:00:01[K     |███████████████████████▋        | 61kB 2.3MB/s eta 0:00:01[K     |███████████████████████████▋    | 71kB 2.7MB/s eta 0:00:01[K     |███████████████████████████████▌| 81kB 3.1MB/s eta 0:00:01[K     |████████████████████████████████| 92kB 2.7MB/s 
[?25hBuilding wheels for collected packages: simplejson
  Building wheel for simplejson (setup.py) ... [?25l[?25hdone
  Created wheel for simplejson: filename=simplejson-3.17.0-cp36-cp36m-linux_x86_64.whl size=114

In [0]:
import datetime
import re
from urllib.parse import urljoin

In [0]:
import jaconv
import pandas as pd
import requests
import simplejson as json
from bs4 import BeautifulSoup

In [0]:
from google.colab import files

# スクレイピング

In [0]:
# Prefecture page that carries the testing-status section and the case-list link scraped below.
url = "http://www.pref.tochigi.lg.jp/e04/welfare/hoken-eisei/kansen/hp/coronakensahasseijyoukyou.html"

# Headers sent with the page request; only a browser-style User-Agent is set.
headers = dict(
    [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko")]
)

In [0]:
# Fetch the prefecture page. requests applies no timeout by default, so an
# unresponsive server would hang this cell forever; bound the wait explicitly.
r = requests.get(url, headers=headers, timeout=30)

# Stop on HTTP error statuses instead of parsing an error page.
r.raise_for_status()

# Parse the raw response bytes with the html5lib parser.
soup = BeautifulSoup(r.content, "html5lib")

In [0]:
# Locate the <h2> heading of the testing-status section, then read the first
# <p> sibling that follows it. Each lookup is checked so that a page-layout
# change produces a clear message instead of an AttributeError on None.
heading = soup.find("h2", string="栃木県における新型コロナウイルス感染症の検査実施状況について")
if heading is None:
    raise ValueError("testing-status <h2> heading not found; the page layout may have changed")

paragraph = heading.find_next_sibling("p")
if paragraph is None:
    raise ValueError("no <p> follows the testing-status heading; the page layout may have changed")

text = paragraph.get_text(strip=True)

In [9]:
# Display the extracted sentence for a visual check.
text

'令和2(2020)年4月24日までに、栃木県（宇都宮市保健所実施分を含む）が実施した新型コロナウイルス感染症の検査件数は、1,724件です。（うち陽性52件）'

In [10]:
# Extract only the half-width (ASCII) numbers, dropping thousands separators.
# Every match must start with a digit, so a stray "," in the sentence is never
# captured on its own (int("") would raise ValueError).
kensa = [int(number.replace(",", "")) for number in re.findall(r"[0-9][0-9,]*", text)]

# Later cells read fixed positions of this list: 1-3 as year/month/day and
# 4-5 as the test and positive totals. Fail early if the wording changed.
if len(kensa) < 6:
    raise ValueError(
        f"expected at least 6 numbers in the summary sentence, found {len(kensa)}: {text!r}"
    )

kensa

[2, 2020, 4, 24, 1724, 52]

In [16]:
# Build the report date from positions 1, 2 and 3 of the parsed numbers
# (year, month, day); the time part defaults to midnight.
dt_kensa = datetime.datetime(year=kensa[1], month=kensa[2], day=kensa[3])
dt_kensa

datetime.datetime(2020, 4, 24, 0, 0)

In [0]:
# Enable this once the pipeline is automated (reads the CSV written further down):
# df_insp = pd.read_csv("inspections.csv", index_col="年月日", parse_dates=True)

# Until then, load the table from the published spreadsheet's CSV export,
# using the "年月日" column as a parsed date index.
df_insp = pd.read_csv(
    "https://docs.google.com/spreadsheets/d/e/2PACX-1vS8KB87AREAsj5NIkdyubw0t8VcOrRBPaG2igOjwl-GZ8kOrJmgmskgFKTCUFdiWEQRyB5L1idP00Av/pub?gid=0&single=true&output=csv",
    parse_dates=True,
    index_col="年月日",
)

In [18]:
# Display the loaded table for a visual check.
df_insp

Unnamed: 0_level_0,検査累計,陽性累計
年月日,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-14,0,0
2020-01-15,0,0
2020-01-16,0,0
2020-01-17,0,0
2020-01-18,0,0
...,...,...
2020-04-19,1472,46
2020-04-20,1520,49
2020-04-21,1568,51
2020-04-22,1617,52


In [0]:
# Add the cumulative totals for the latest report date: positions 4 and 5 of
# `kensa` are written to the row labelled `dt_kensa` (created if absent,
# overwritten if that date is already present).
df_insp.loc[dt_kensa] = kensa[4:6]

In [0]:
# Save the updated table to CSV in the working directory.
df_insp.to_csv("inspections.csv")

In [0]:
# Find the anchor whose label starts with the case-list title and later
# mentions "エクセル" (Excel).
tag = soup.find("a", string=re.compile("^栃木県における新型コロナウイルス感染症の発生状況一覧.+エクセル"))

# Without this check a missing link only shows up later as an AttributeError on None.
if tag is None:
    raise ValueError("link to the case-list Excel file not found; the page layout may have changed")

In [0]:
# Resolve the anchor's href against the page URL to get the download link.
link = urljoin(url, tag.get("href"))

# データラングリング

In [0]:
# Fixed UTC+9 offset labelled "JST".
JST = datetime.timezone(datetime.timedelta(hours=9), name="JST")

# Current time in that zone, and its "YYYY/MM/DD HH:MM" rendering used as the update stamp.
dt_now = datetime.datetime.now(tz=JST)
dt_update = f"{dt_now:%Y/%m/%d %H:%M}"

In [0]:
# Root of the JSON payload; it starts with only the update timestamp.
data = dict(lastUpdate=dt_update)

## 検査件数

In [0]:
# Turn the cumulative test totals into per-day counts.
# Sort chronologically *before* differencing: diff() subtracts positionally
# adjacent rows, and the row written for the latest report date may sit out of
# date order, which would otherwise produce wrong daily values.
df_insp_sum = (
    df_insp["検査累計"]
    .sort_index()
    .diff()
    .fillna(0)  # the first row has no predecessor to subtract
    .astype(int)
    .asfreq("D", fill_value=0)  # dates absent from the table count as zero
    .reset_index()
)

In [0]:
# Add a "日付" column holding each date formatted as "YYYY-MM-DD" text.
df_insp_sum = df_insp_sum.assign(
    **{"日付": df_insp_sum["年月日"].dt.strftime("%Y-%m-%d")}
)

In [0]:
# Store the [date text, count] pairs together with the update timestamp.
data["inspections_summary"] = dict(
    data=df_insp_sum[["日付", "検査累計"]].values.tolist(),
    date=dt_update,
)

## xlsxから発生状況一覧を抽出

In [0]:
# Load the case list from the linked workbook: row 1 (0-based) supplies the
# column names, "番号" becomes the index, and the last two rows are skipped.
df_kanja = pd.read_excel(
    link,
    header=1,
    index_col="番号",
    skipfooter=2,
)

In [0]:
# Convert each value in "陽性判明日" from a day count to a timestamp, counting
# days from 1899-12-30 (presumably Excel serial dates, given the xlsx source; confirm).
df_kanja["陽性判明日"] = df_kanja["陽性判明日"].apply(
    pd.to_datetime, unit="D", origin=pd.Timestamp("1899/12/30")
)

In [0]:
# Convert each value in "退院日" from a day count to a timestamp, counting
# days from 1899-12-30 (presumably Excel serial dates, given the xlsx source; confirm).
df_kanja["退院日"] = df_kanja["退院日"].apply(
    pd.to_datetime, unit="D", origin=pd.Timestamp("1899/12/30")
)

In [31]:
# Display the case list after the date conversions for a visual check.
df_kanja

Unnamed: 0_level_0,年代,性別,居住地,陽性判明日,退院日,備考
番号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,60代,女性,県南,2020-02-22,2020-03-27,クルーズ船下船
2,30代,女性,県南,2020-03-05,2020-03-12,大阪ライブハウス、ショッピングセンター勤務
3,40代,女性,宇都宮,2020-03-18,2020-04-01,タイ旅行　※宇都宮市1例目
4,50代,男性,県南,2020-03-20,2020-04-03,ポルトガル旅行
5,40代,男性,フィリピン,2020-03-24,2020-04-10,フィリピンから帰国
6,70代,男性,安足,2020-03-24,2020-04-20,親族との接触あり
7,60代,女性,安足,2020-03-25,2020-04-03,No.6の妻
8,50代,男性,県西,2020-03-25,2020-04-11,No.6の同僚
9,50代,女性,県西,2020-03-25,NaT,No.8の妻
10,40代,男性,県南,2020-03-25,2020-04-18,別の新型コロナ感染者と濃厚接触


In [0]:
# Add a "退院" column with the discharge date formatted as "YYYY-MM-DD" text.
df_kanja = df_kanja.assign(
    **{"退院": df_kanja["退院日"].dt.strftime("%Y-%m-%d")}
)

In [0]:
# Give every row the same starting status, "入院中"; the next cell relabels
# rows that have a discharge date.
df_kanja["状態"]  = "入院中"

In [0]:
# Rows with a non-null "退院日" are relabelled "退院"; all others keep their current status.
df_kanja["状態"] = df_kanja["状態"].mask(df_kanja["退院日"].notnull(), "退院")

## main_summary

In [0]:
# Count how many rows carry each status value.
sr_situ = df_kanja.loc[:, "状態"].value_counts()

In [36]:
# Put the counts in a fixed order and supply 0 for any status with no rows.
# NOTE(review): no visible cell ever assigns "死亡", so that entry always comes from fill_value.
sr_situ = sr_situ.reindex(index=["入院中", "退院", "死亡"], fill_value=0)
sr_situ

入院中    39
退院     13
死亡      0
Name: 状態, dtype: int64

In [0]:
# Nested summary tree: the scraped test total at the root, the scraped positive
# total beneath it, and the per-status counts from `sr_situ` as leaves.
data["main_summary"] = {
    "attr": "検査実施人数",
    "value": kensa[4],
    "children": [
        {
            "attr": "陽性患者数",
            "value": kensa[5],
            "children": [
                # int() turns the pandas/numpy integer into a plain Python int.
                {"attr": status, "value": int(sr_situ[status])}
                for status in ("入院中", "退院", "死亡")
            ],
        }
    ],
}

## patients

In [0]:
# Add a "リリース日" column with the positive-result date formatted as "YYYY-MM-DD" text.
df_kanja = df_kanja.assign(
    **{"リリース日": df_kanja["陽性判明日"].dt.strftime("%Y-%m-%d")}
)

In [0]:
# Keep only the columns that go into the patients payload, in output order.
df_patients = df_kanja[["リリース日", "居住地", "年代", "性別", "退院"]]

In [0]:
# One dict per case row, stored together with the update timestamp.
data["patients"] = dict(
    data=df_patients.to_dict(orient="records"),
    date=dt_update,
)

## patients_summary

In [0]:
# Count cases per positive-result date and fill the missing calendar days with 0.
# The index and the Series are named explicitly before reset_index(): the default
# names produced by value_counts() differ between pandas versions, and the cells
# below rely on the columns being called "index" and "陽性判明日".
df_patients_sum = (
    df_kanja["陽性判明日"]
    .value_counts()
    .sort_index()
    .asfreq("D", fill_value=0)
    .rename_axis("index")
    .rename("陽性判明日")
    .reset_index()
)

In [0]:
# Add a "日付" column holding each date formatted as "YYYY-MM-DD" text.
df_patients_sum = df_patients_sum.assign(
    **{"日付": df_patients_sum["index"].dt.strftime("%Y-%m-%d")}
)

In [0]:
# Rename the count column to "小計" and discard the raw timestamp column,
# which the "日付" text column has replaced.
df_patients_sum = (
    df_patients_sum
    .rename(columns={"陽性判明日": "小計"})
    .drop(columns=["index"])
)

In [0]:
# Store the [date text, daily count] pairs together with the update timestamp.
data["patients_summary"] = dict(
    data=df_patients_sum[["日付", "小計"]].values.tolist(),
    date=dt_update,
)

In [0]:
# Write the payload as UTF-8 JSON. ignore_nan is passed so NaN values are not
# written as bare NaN tokens, and ensure_ascii=False keeps the Japanese text unescaped.
with open("data.json", mode="w", encoding="utf-8") as fw:
    json.dump(
        data,
        fw,
        indent=4,
        ensure_ascii=False,
        ignore_nan=True,
    )

# ダウンロード

In [0]:
from google.colab import files

In [0]:
# Hand the saved CSV to Colab's files.download helper.
files.download("inspections.csv")

In [0]:
# Hand the generated JSON to Colab's files.download helper.
files.download("data.json")