<a href="https://colab.research.google.com/github/imabari/toyama/blob/master/toyama_counter_update.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download the published counts CSV (URL pinned to the 20200403 release) and save it as counts.csv
!wget http://opendata.pref.toyama.jp/files/covid19/20200403/toyama_counts.csv -O counts.csv

--2020-04-06 02:25:45--  http://opendata.pref.toyama.jp/files/covid19/20200403/toyama_counts.csv
Resolving opendata.pref.toyama.jp (opendata.pref.toyama.jp)... 202.213.106.6
Connecting to opendata.pref.toyama.jp (opendata.pref.toyama.jp)|202.213.106.6|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1050 (1.0K) [text/csv]
Saving to: ‘counts.csv’


2020-04-06 02:25:47 (173 MB/s) - ‘counts.csv’ saved [1050/1050]



# スクレイピング

In [0]:
import requests
from bs4 import BeautifulSoup

In [0]:
import re
import datetime

In [0]:
# Convert a Japanese-era (wareki) date string to a Western-calendar datetime
def wareki2date(s):
    """Parse a date written with a Japanese era name into a ``datetime.datetime``.

    The string must *start* with an era date such as ``令和2年4月2日``; any
    trailing text is ignored. The first year of an era may be written either
    as ``1年`` or with the conventional ``元年``.

    Args:
        s: Text beginning with ``<era><year>年<month>月<day>日`` where the era
            is one of 昭和 (Showa), 平成 (Heisei) or 令和 (Reiwa).

    Returns:
        A ``datetime.datetime`` at midnight of the corresponding Gregorian date.

    Raises:
        ValueError: If ``s`` does not start with a supported era date, or the
            resulting year/month/day is not a valid calendar date.
    """
    # Gregorian year immediately before each era's first year,
    # so that western_year = offset + era_year.
    era_offsets = {"昭和": 1925, "平成": 1988, "令和": 2018}

    m = re.match(r"(昭和|平成|令和)(元|\d{1,2})年(\d{1,2})月(\d{1,2})日", s)
    if m is None:
        raise ValueError("not a supported wareki date: {!r}".format(s))

    era, era_year = m.group(1), m.group(2)

    # "元年" is the customary way to write year 1 of an era.
    year = era_offsets[era] + (1 if era_year == "元" else int(era_year))
    month = int(m.group(3))
    day = int(m.group(4))

    return datetime.datetime(year, month, day)

In [0]:
# Request settings: a browser-style User-Agent header and the address of the page to fetch.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
url = "http://www.pref.toyama.jp/cms_sec/1205/kj00021629.html"

In [0]:
# Fetch the page. requests has no default timeout, so without one a stalled
# server would leave this cell hanging indefinitely.
r = requests.get(url, headers=headers, timeout=30)

# Stop here on an HTTP error status instead of parsing an error page.
r.raise_for_status()

# Hand the response body bytes to BeautifulSoup using the html5lib parser.
soup = BeautifulSoup(r.content, "html5lib")

In [0]:
# Accumulator for the scraped cumulative figures, keyed by CSV column name.
result = dict()

## 検査

In [0]:
# Text of the <p> that follows the PCR-test-count <h4>, with text nodes joined by newlines.
# `string=` is the current name of the bs4 filter formerly spelled `text=`.
s_kensa = (
    soup.find("h4", string=re.compile("新型コロナウイルスPCR検査件数$"))
    .find_next_sibling("p")
    .get_text("\n", strip=True)
)

# Era-format "as of" date inside full-width parentheses. The raw string keeps
# `\d` from being treated as an (invalid) string escape.
m_kensa = re.search(r"（(令和\d{1,2}年\d{1,2}月\d{1,2}日)まで）", s_kensa)

In [0]:
# Collect every "<number>人" figure from the test-count text, dropping thousands separators.
kensa = []
for people_text in re.findall("([0-9,]+)人", s_kensa):
    kensa.append(int(people_text.replace(",", "")))

In [0]:
# Convert the captured era-format "as of" date of the test counts to a datetime.
kensa_date_text = m_kensa.group(1)
dt_kensa = wareki2date(kensa_date_text)

In [0]:
# Store the first three figures; assumes the page lists tested, positive, negative
# in that order — TODO confirm against the page.
tested, positive, negative = kensa[0], kensa[1], kensa[2]
result.update({"検査実施人数": tested, "陽性人数": positive, "陰性人数": negative})

## 一般相談

In [0]:
# First child node of the <p> that follows the general-consultation <h4>.
# `string=` is the current name of the bs4 filter formerly spelled `text=`.
s_ippan = (
    soup.find("h4", string=re.compile("新型コロナウイルス感染症に関する一般相談件数$"))
    .find_next_sibling("p")
    .contents[0]
)

# Group 1: count followed by 件; group 2: era-format "as of" date.
# The raw string keeps `\d` from being treated as an (invalid) string escape.
m_ippan = re.search(r"([0-9,]+)件（(令和\d{1,2}年\d{1,2}月\d{1,2}日)まで）", s_ippan)

In [0]:
# Split the match into its count and date parts, then record both.
ippan_count_text, ippan_date_text = m_ippan.group(1), m_ippan.group(2)
result["一般相談件数"] = int(ippan_count_text.replace(",", ""))
dt_ippan = wareki2date(ippan_date_text)

## 帰国者相談

In [0]:
# First child node of the <p> that follows the returnee/contact consultation-centre <h4>.
# `string=` is the current name of the bs4 filter formerly spelled `text=`.
s_kikoku = (
    soup.find("h4", string=re.compile("帰国者・接触者相談センターへの相談件数$"))
    .find_next_sibling("p")
    .contents[0]
)

# Group 1: count followed by 件; group 2: era-format "as of" date.
# Uses re.search, like the general-consultation cell, so leading whitespace in
# the node does not prevent a match; a match at position 0 is found identically.
m_kikoku = re.search(r"([0-9,]+)件（(令和\d{1,2}年\d{1,2}月\d{1,2}日)まで）", s_kikoku)

In [0]:
# Split the match into its count and date parts, then record both.
kikoku_count_text, kikoku_date_text = m_kikoku.group(1), m_kikoku.group(2)
result["帰国者相談件数"] = int(kikoku_count_text.replace(",", ""))
dt_kikoku = wareki2date(kikoku_date_text)

# 集計

In [0]:
import pandas as pd

In [18]:
# Load the downloaded counts, indexed by the parsed 年月日 column, keeping 備考 as text.
df = pd.read_csv(
    "counts.csv",
    index_col="年月日",
    parse_dates=True,
    dtype={"備考": "object"},
)

# Missing remarks become empty strings rather than NaN.
df = df.assign(**{"備考": df["備考"].fillna("")})

df.tail(10)

Unnamed: 0_level_0,検査実施人数,陰性人数,陽性人数,一般相談件数,帰国者相談件数,備考
年月日,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-24,4,4,0,72,7,
2020-03-25,2,2,0,87,8,
2020-03-26,1,1,0,73,9,
2020-03-27,3,3,0,131,12,
2020-03-28,6,6,0,58,2,
2020-03-29,0,0,0,40,5,
2020-03-30,4,3,1,217,16,
2020-03-31,14,11,3,446,12,
2020-04-01,25,22,3,781,37,
2020-04-02,39,38,1,601,25,


In [19]:
# Latest scraped totals as a Series, named with the test-count "as of" date
s_cum = pd.Series(data=result, name=dt_kensa)

s_cum

検査実施人数      161
陽性人数          8
陰性人数        153
一般相談件数     6158
帰国者相談件数     386
Name: 2020-04-02 00:00:00, dtype: int64

In [0]:
# Running totals of every numeric column (the free-text 備考 column is left out)
numeric_counts = df.drop(columns="備考")
df_cum = numeric_counts.cumsum().copy()

In [21]:
# Add the latest scraped totals as a new row.
# DataFrame.append was removed in pandas 2.0, so build a one-row frame and concatenate;
# infer_objects/rename_axis keep the column dtypes and the index name as append did.
latest_row = s_cum.to_frame().T.infer_objects().rename_axis(df_cum.index.names)
df_cum = pd.concat([df_cum, latest_row])

# Remove duplicates by date (the index), keeping the row already recorded.
# Comparing row *values* instead would drop a day whose totals did not change
# and would keep two rows for the same date when their values differ.
df_cum = df_cum[~df_cum.index.duplicated(keep="first")]

df_cum.tail(10)

Unnamed: 0_level_0,検査実施人数,陰性人数,陽性人数,一般相談件数,帰国者相談件数
年月日,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-03-24,67,67,0,3724,260
2020-03-25,69,69,0,3811,268
2020-03-26,70,70,0,3884,277
2020-03-27,73,73,0,4015,289
2020-03-28,79,79,0,4073,291
2020-03-29,79,79,0,4113,296
2020-03-30,83,82,1,4330,312
2020-03-31,97,93,4,4776,324
2020-04-01,122,115,7,5557,361
2020-04-02,161,153,8,6158,386


In [22]:
# Turn running totals back into per-day counts.
cum_sorted = df_cum.sort_index()
df_diff = cum_sorted.diff()

# The first row has no predecessor, so its daily count is its own total
# (filling it with 0 would lose that day's figures).
if not df_diff.empty:
    df_diff.iloc[0] = cum_sorted.iloc[0]

# Any remaining gaps are treated as "no change".
df_diff = df_diff.fillna(0).astype(int)
df_diff["備考"] = ""

df_diff.tail(10)

Unnamed: 0_level_0,検査実施人数,陰性人数,陽性人数,一般相談件数,帰国者相談件数,備考
年月日,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-24,4,4,0,72,7,
2020-03-25,2,2,0,87,8,
2020-03-26,1,1,0,73,9,
2020-03-27,3,3,0,131,12,
2020-03-28,6,6,0,58,2,
2020-03-29,0,0,0,40,5,
2020-03-30,4,3,1,217,16,
2020-03-31,14,11,3,446,12,
2020-04-01,25,22,3,781,37,
2020-04-02,39,38,1,601,25,


In [23]:
# Per-day counts for the most recent date (last row of the diff table)
s_diff = df_diff.iloc[-1]

s_diff

検査実施人数      39
陰性人数        38
陽性人数         1
一般相談件数     601
帰国者相談件数     25
備考            
Name: 2020-04-02 00:00:00, dtype: object

In [0]:
# Add the latest day's counts as a new row.
# DataFrame.append was removed in pandas 2.0, so build a one-row frame and concatenate;
# infer_objects restores integer dtypes from the object-typed row, as append did.
today_row = s_diff.to_frame().T.infer_objects().rename_axis(df.index.names)
df = pd.concat([df, today_row])

# Remove duplicates by date (the index), keeping the row already recorded.
# Comparing row *values* instead would delete legitimate days whose daily
# counts happen to equal those of an earlier day.
df = df[~df.index.duplicated(keep="first")]

In [0]:
# df = df.astype({"検査実施人数": int, "陰性人数": int, "陽性人数": int, "一般相談件数": int, "帰国者相談件数": int, "備考": str})

In [26]:
# Show the last ten rows as a visual check before export
df.tail(10)

Unnamed: 0_level_0,検査実施人数,陰性人数,陽性人数,一般相談件数,帰国者相談件数,備考
年月日,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-24,4,4,0,72,7,
2020-03-25,2,2,0,87,8,
2020-03-26,1,1,0,73,9,
2020-03-27,3,3,0,131,12,
2020-03-28,6,6,0,58,2,
2020-03-29,0,0,0,40,5,
2020-03-30,4,3,1,217,16,
2020-03-31,14,11,3,446,12,
2020-04-01,25,22,3,781,37,
2020-04-02,39,38,1,601,25,


In [0]:
from google.colab import files

In [0]:
# Export only when all three scraped figures share the same "as of" date.
dates_agree = dt_kensa == dt_ippan == dt_kikoku

if not dates_agree:
    print("日付が違います")
else:
    # utf_8_sig writes a BOM at the start of the file.
    df.to_csv("toyama_counts.csv", encoding="utf_8_sig")
    files.download("toyama_counts.csv")