<a href="https://colab.research.google.com/github/imabari/covid19-data/blob/master/toyama/toyama_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!apt install python3-tk ghostscript
!pip install camelot-py[cv]
!pip install jaconv

# スクレイピング

In [0]:
import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

import camelot
import jaconv

In [0]:
def date_conv(sr, year=None):
    """Convert month/day strings such as "4/3(金)" or "4月3日" to timestamps.

    The source strings carry no year, so one year is applied to every row.
    Data that spans a year boundary therefore needs a separate call per year.

    Args:
        sr: pandas Series of strings containing "<month>/<day>" or
            "<month>月<day>" (anything after the day number is ignored).
        year: Year to assign to every date. Defaults to ``dt_now.year``,
            i.e. the notebook-level ``dt_now`` must exist when it is omitted.

    Returns:
        A datetime64 Series named "date" that shares the index of ``sr``.

    Raises:
        ValueError: If a value contains no month/day pattern, or the
            month/day combination is not a valid calendar date.
    """
    if year is None:
        year = dt_now.year

    # The original pattern also demanded one extra character after the day.
    # Without it, "4/13" backtracked to day 1 and "4/3" failed to match at all.
    parts = sr.str.extract(r"(\d{1,2})[月/](\d{1,2})", expand=True)
    parts.columns = ["month", "day"]

    unmatched = parts.isna().any(axis=1)
    if unmatched.any():
        raise ValueError(
            "date_conv: no month/day found in {!r}".format(sr[unmatched].tolist())
        )

    parts = parts.astype(int).assign(year=year)

    # Vectorised assembly from year/month/day columns instead of a row-wise apply.
    return pd.to_datetime(parts).rename("date")

In [0]:
# Page whose <div id="file"> section is searched below for the PDF report links.
url = "http://www.pref.toyama.jp/cms_sec/1205/kj00021629.html"

# Browser-like User-Agent sent with the page request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}

In [0]:
# Reference "now"; date_conv reads its year to complete month/day-only dates.
# Pinned to Japan Standard Time (UTC+9) so the year is not one behind on a
# UTC-configured runtime during the first hours of January 1st in Japan.
dt_now = datetime.datetime.now(datetime.timezone(datetime.timedelta(hours=9)))

In [0]:
# Fetch the index page. Without a timeout a stalled connection would block
# the cell indefinitely.
r = requests.get(url, headers=headers, timeout=30)

# Stop here on an HTTP error status instead of parsing an error page.
r.raise_for_status()

# Hand the raw bytes to BeautifulSoup so it determines the encoding itself.
soup = BeautifulSoup(r.content, "html.parser")

In [0]:
# Section of the page that the cells below search for the PDF report links.
file_list = soup.find("div", id="file")

# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError in a later cell.
if file_list is None:
    raise ValueError('<div id="file"> was not found; the page layout may have changed')

# データラングリング

## 新型コロナウイルス感染症にかかる相談状況

In [0]:
# Locate the report link by its exact link text.
# `string=` is the current name of the deprecated `text=` keyword.
link_ippan = file_list.find("a", string="新型コロナウイルス感染症にかかる相談状況")

if link_ippan is None:
    raise ValueError("link to the general consultation PDF was not found on the page")

# NOTE(review): assumes the href is an absolute URL (the previous wget call needed that too).
pdf_ippan = link_ippan.get("href")

# Download with requests rather than passing a scraped URL through the shell.
# A failed download now raises instead of leaving an empty file behind.
res_ippan = requests.get(pdf_ippan, headers=headers, timeout=30)
res_ippan.raise_for_status()

with open("ippan.pdf", "wb") as fw:
    fw.write(res_ippan.content)

In [0]:
# Extract the tables from every page of the downloaded PDF.
tb_ippan = camelot.read_pdf(
    "ippan.pdf",
    pages="all",
    split_text=True,
    strip_text="\n",
    line_scale=40,
)

# From the second extracted table: keep the first two rows, drop the first
# three columns and the last one, then transpose so each remaining source
# column becomes one row.
df_ippan = (
    tb_ippan[1]
    .df.iloc[:2, 3:-1]
    .transpose()
    .copy()
    .reset_index(drop=True)
)

In [0]:
# Name the two columns, turn the date strings into timestamps, index by date
# and cast the counts to int.
df_ippan = df_ippan.rename(columns={0: "日付", 1: "一般相談件数"})
df_ippan["日付"] = date_conv(df_ippan["日付"])
df_ippan = df_ippan.set_index("日付").astype(int)

df_ippan

## 帰国者・接触者相談センター相談件数

In [0]:
# Locate the report link by its exact link text.
# `string=` is the current name of the deprecated `text=` keyword.
link_kikoku = file_list.find("a", string="帰国者・接触者相談センター相談件数")

if link_kikoku is None:
    raise ValueError("link to the returnee/contact consultation PDF was not found on the page")

# NOTE(review): assumes the href is an absolute URL (the previous wget call needed that too).
pdf_kikoku = link_kikoku.get("href")

# Download with requests rather than passing a scraped URL through the shell.
# A failed download now raises instead of leaving an empty file behind.
res_kikoku = requests.get(pdf_kikoku, headers=headers, timeout=30)
res_kikoku.raise_for_status()

with open("kikoku.pdf", "wb") as fw:
    fw.write(res_kikoku.content)

In [0]:
# Extract the tables from every page of the downloaded PDF.
tb_kikoku = camelot.read_pdf(
    "kikoku.pdf",
    pages="all",
    split_text=True,
    strip_text="\n",
    line_scale=40,
)

# First extracted table without its first row.
df_kikoku = tb_kikoku[0].df.iloc[1:].copy()

In [0]:
# Name the three columns, turn the date strings into timestamps, index by
# date and cast the counts to int.
df_kikoku = df_kikoku.rename(columns={0: "日付", 1: "帰国者相談件数", 2: "帰国者相談累計"})
df_kikoku["日付"] = date_conv(df_kikoku["日付"])
df_kikoku = df_kikoku.set_index("日付").astype(int)

df_kikoku

## 新型コロナウイルスPCR検査件数

In [0]:
# Locate the report link by its exact link text.
# `string=` is the current name of the deprecated `text=` keyword.
link_kensa = file_list.find("a", string="新型コロナウイルスPCR検査件数")

if link_kensa is None:
    raise ValueError("link to the PCR test count PDF was not found on the page")

# NOTE(review): assumes the href is an absolute URL (the previous wget call needed that too).
pdf_kensa = link_kensa.get("href")

# Download with requests rather than passing a scraped URL through the shell.
# A failed download now raises instead of leaving an empty file behind.
res_kensa = requests.get(pdf_kensa, headers=headers, timeout=30)
res_kensa.raise_for_status()

with open("kensa.pdf", "wb") as fw:
    fw.write(res_kensa.content)

In [0]:
# Extract the tables from every page of the downloaded PDF.
tb_kensa = camelot.read_pdf(
    "kensa.pdf",
    pages="all",
    split_text=True,
    strip_text="\n",
    line_scale=40,
)

# First extracted table without its first row.
df_kensa = tb_kensa[0].df.iloc[1:].copy()

In [0]:
# Name the three columns, turn the date strings into timestamps, index by
# date and cast the counts to int.
df_kensa = df_kensa.rename(columns={0: "日付", 1: "検査実施人数", 2: "検査実施累計"})
df_kensa["日付"] = date_conv(df_kensa["日付"])
df_kensa = df_kensa.set_index("日付").astype(int)

df_kensa

## 陽性患者

In [0]:
# Read the first HTML table of the patient page. The first column becomes the
# index and "〃" cells are treated as missing values.
kanja_url = "http://www.pref.toyama.jp/cms_sec/1205/kj00021798.html"
df_kanja = pd.read_html(kanja_url, index_col=0, na_values="〃")[0]

In [0]:
# Display the index of the patient table for inspection.
df_kanja.index

In [0]:
# Replace missing announcement dates with "" and convert full-width digits and
# ASCII characters to half-width (kana are left untouched).
df_kanja["発表日"] = df_kanja["発表日"].fillna("").map(
    lambda text: jaconv.z2h(text, kana=False, digit=True, ascii=True)
)

In [0]:
# Added to a Reiwa era year to obtain the Gregorian year (Reiwa 1 -> 2019).
REIWA_OFFSET = 2018

# Groups: 0 = optional "令和N年" wrapper, 1 = era year, 2 = month, 3 = day.
df_date = df_kanja["発表日"].str.extract(r"(令和(\d{1,2})年)?(\d{1,2})月(\d{1,2})日$", expand=True)

# Drop the era wrapper column. Rows with no match (or no year) inherit the
# values of the row above; .ffill() replaces the deprecated
# fillna(method="ffill").
df_date = df_date.drop(columns=0).ffill().astype(int)

df_date.rename(columns={1: "year", 2: "month", 3: "day"}, inplace=True)

df_date["year"] += REIWA_OFFSET

In [0]:
# Assemble one timestamp per patient row from the year/month/day columns.
df_kanja["陽性人数"] = pd.to_datetime(df_date)

# Count patients per day and fill the days without any patient with 0.
# The name is set explicitly: since pandas 2.0 value_counts() names its result
# "count" and moves the original name onto the index, which would break the
# later lookup of the "陽性人数" column.
df_pats = (
    df_kanja["陽性人数"]
    .value_counts()
    .rename("陽性人数")
    .rename_axis(None)
    .sort_index()
    .asfreq("D", fill_value=0)
)

## 結合

In [0]:
# Place the four date-indexed results side by side, aligned on their index.
frames = [df_kensa, df_ippan, df_kikoku, df_pats]
df = pd.concat(frames, axis="columns")
df

In [0]:
# Drop every row that contains at least one missing value.
df = df.dropna(how="any")

In [0]:
# Cast every column to int (rows with missing values were dropped beforehand).
df = df.astype(int)

In [0]:
# Negative count = number of people tested minus number of positives.
df["陰性人数"] = df["検査実施人数"].sub(df["陽性人数"])

In [0]:
# Columns taken over into the output, in output order.
output_columns = ["検査実施人数", "陰性人数", "陽性人数", "一般相談件数", "帰国者相談件数"]
df_data = df[output_columns].copy()

In [0]:
# Previously published counts, indexed by the parsed "年月日" date column.
# "備考" is forced to object dtype.
df_ori = pd.read_csv(
    "http://opendata.pref.toyama.jp/files/covid19/20200403/toyama_counts.csv",
    index_col="年月日",
    parse_dates=True,
    dtype={"備考": object},
)

In [0]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows.
# For a date present in both frames, groupby(...).last() keeps the last
# non-null value per column. Freshly scraped values therefore win, while
# columns the scrape does not provide keep the published value.
# The index name is restored because stacking two differently named indexes
# drops it, which would leave the first CSV header cell empty.
df_csv = (
    pd.concat([df_ori, df_data])
    .groupby(level=0)
    .last()
    .rename_axis(df_ori.index.name)
)

In [0]:
# Nullable integer dtype so missing discharge counts do not force floats.
df_csv = df_csv.astype({"退院者数": "Int64"})

In [0]:
# Write the merged table as UTF-8 with a byte-order mark (utf_8_sig).
df_csv.to_csv("toyama_counts.csv", encoding="utf_8_sig")

In [0]:
# Display the merged table that was written to toyama_counts.csv.
df_csv

# ダウンロード

In [0]:
from google.colab import files

In [0]:
# Hand the generated CSV to google.colab's download helper.
files.download("toyama_counts.csv")