<a href="https://colab.research.google.com/github/imabari/covid19-data/blob/master/ibaraki/ibaraki_covid19.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pdfplumber

In [None]:
import datetime
import pathlib
import re
from urllib.parse import urljoin

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
import pdfplumber
import pandas as pd

In [None]:
# HTTP request headers passed to requests.get() by fetch_soup(); the
# User-Agent value is a desktop-browser style identification string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

In [None]:
def fetch_soup(url, parser="html.parser", timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        parser: Name of the BeautifulSoup parser to use.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The parsed ``BeautifulSoup`` document.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # The module-level ``headers`` dict supplies the User-Agent.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Hand over the raw bytes so the parser works out the encoding itself.
    return BeautifulSoup(response.content, parser)

In [None]:
def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` into directory ``dir`` and return the saved path.

    The file name is the last path component of ``url``. Missing parent
    directories are created.

    Args:
        url: Address of the file to download.
        dir: Target directory (default: current directory).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        ``pathlib.Path`` of the file that was written.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    destination = pathlib.Path(dir, pathlib.PurePath(url).name)
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Send the same headers as fetch_soup() so both requests present the
    # same User-Agent to the server.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    destination.write_bytes(response.content)
    return destination

In [None]:
def pdf2df(p):
    """Extract the patient tables from a PDF into a single DataFrame.

    The first page is skipped. One table is extracted from each remaining
    page and kept only when "判明日" is its first or second header cell.
    Placeholder cells ("―", "－", "") become ``pd.NA`` and rows that are
    entirely empty are dropped.

    Args:
        p: PDF location, passed straight to ``pdfplumber.open``.

    Returns:
        DataFrame with all matching tables stacked in page order and a fresh
        0..n-1 index.

    Raises:
        ValueError: If no page yields a matching table.
    """
    frames = []

    with pdfplumber.open(p) as pdf:
        for page in pdf.pages[1:]:
            table = page.extract_table()

            # extract_table() gives None for a page without any table;
            # skip such pages (and empty tables) instead of crashing.
            if not table:
                continue

            header, *rows = table

            # Slicing keeps the check safe for single-column tables.
            if "判明日" in header[:2]:
                frames.append(pd.DataFrame(rows, columns=header))

    if not frames:
        raise ValueError(f"no table with a '判明日' header found in {p}")

    df = pd.concat(frames)
    df = df.replace(["―", "－", ""], pd.NA).dropna(how="all")

    return df.reset_index(drop=True)

In [None]:
def str2date(s, now=None):
    """Convert Japanese "M月D日" strings to timestamps, inferring the year.

    The strings carry no year, so the year of ``now`` is assumed first; any
    resulting date that lies after ``now`` is moved to the previous year.

    Args:
        s: Series of strings; values without an "M月D日" pattern give ``NaT``.
        now: Naive ``datetime`` used as the reference point. Defaults to the
            notebook-level ``dt_now``.

    Returns:
        Series of datetimes with the same index as ``s``.
    """
    if now is None:
        # Fall back to the reference time defined at notebook level.
        now = dt_now

    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    # Going through float first lets unmatched rows (NaN) be filled with 0
    # regardless of the string dtype that extract() returns.
    parts = (
        s.str.extract(r"(?P<month>\d{1,2})月(?P<day>\d{1,2})日")
        .astype("float64")
        .fillna(0)
        .astype(int)
    )
    parts["year"] = now.year

    # month=0/day=0 rows are not valid dates and are coerced to NaT.
    assumed = pd.to_datetime(parts, errors="coerce")

    # Dates after ``now`` are treated as belonging to the previous year.
    parts["year"] = parts["year"].mask(assumed > now, parts["year"] - 1)

    return pd.to_datetime(parts, errors="coerce")

In [None]:
# Fixed UTC+09:00 offset used as the local time zone.
JST = datetime.timezone(datetime.timedelta(hours=9))

# Current wall-clock time in JST, stored as a naive datetime (tzinfo removed).
dt_now = datetime.datetime.now(tz=JST).replace(tzinfo=None)

In [None]:
# Page that is scraped for the PDF links; also the base for resolving their hrefs.
url = "https://www.pref.ibaraki.jp/1saigai/2019-ncov/hassei.html"

In [None]:
# Download and parse the listing page once; both PDF links are looked up in it.
soup = fetch_soup(url)

# 茨城県

In [None]:
# Find the PDF link whose text starts with the prefecture's announcement title.
tag_pref = soup.find(
    "a",
    class_="icon_pdf",
    # ``string=`` is the current name of the deprecated ``text=`` filter.
    string=re.compile("^新型コロナウイルス感染症患者の発生及び退院・退所等について"),
)

# Fail with a clear message instead of an AttributeError on None below.
if tag_pref is None:
    raise ValueError("prefecture PDF link not found; the page layout may have changed")

# The href may be relative, so resolve it against the page URL.
link_pref = urljoin(url, tag_pref.get("href"))

In [None]:
# Save the prefecture PDF into the current directory.
path_pref = fetch_file(link_pref)

In [None]:
# Extract the patient table(s) from the prefecture PDF.
df_pref = pdf2df(path_pref)

In [None]:
# Publication date label: text of the nearest <h4> preceding the link's parent
# element, with every "発表" occurrence removed.
pub_pref = tag_pref.parent.find_previous_sibling("h4").get_text(strip=True).replace("発表", "")

In [None]:
# Tag every prefecture row with its jurisdiction and publication date label.
df_pref["管轄"] = "茨城県"
df_pref["公表日"] = pub_pref

In [None]:
# Display the prefecture table for a visual check.
df_pref

# 水戸市

In [None]:
# Find the PDF link whose text starts with Mito city's announcement title.
tag_city = soup.find(
    "a",
    class_="icon_pdf",
    # ``string=`` is the current name of the deprecated ``text=`` filter.
    string=re.compile("^【水戸市発表】新型コロナウイルス感染症患者の発生について"),
)

# Fail with a clear message instead of an AttributeError on None below.
if tag_city is None:
    raise ValueError("Mito city PDF link not found; the page layout may have changed")

# The href may be relative, so resolve it against the page URL.
link_city = urljoin(url, tag_city.get("href"))

In [None]:
# Save the Mito city PDF into the current directory.
path_city = fetch_file(link_city)

In [None]:
# Extract the patient table(s) from the Mito city PDF.
df_city = pdf2df(path_city)

In [None]:
# Publication date label: text of the nearest <h4> preceding the link's parent
# element, with every "発表" occurrence removed.
pub_city = tag_city.parent.find_previous_sibling("h4").get_text(strip=True).replace("発表", "")

In [None]:
# Tag every city row with its jurisdiction and publication date label.
df_city["管轄"] = "水戸市"
df_city["公表日"] = pub_city

In [None]:
# Display the city table for a visual check.
df_city

# 結合

In [None]:
# Stack both sources. The per-source row index is kept on purpose (no
# ignore_index): it is used later as the last sort key.
df = pd.concat([df_pref, df_city])

In [None]:
# Display the combined table for a visual check.
df

# 前処理

In [None]:
# Asymptomatic cases: rows whose onset-date cell reads "症状なし" get the
# status "無症状"; every other row is left missing.
df["状態"] = (
    df["発症日"]
    .where(df["発症日"].eq("症状なし"))
    .replace({"症状なし": "無症状"})
)

In [None]:
# Shorten the age-group suffix "歳代" to "代" (plain substring replacement).
df["年代"] = df["年代"].str.replace("歳代", "代", regex=False)

In [None]:
# Normalise the sex labels; values not listed here are kept unchanged.
df["性別"] = df["性別"].replace(
    {
        "男子": "男性",
        "女子": "女性",
    }
)

In [None]:
# Normalise occupations: "生徒" becomes "学生" and "非公表" becomes an empty string.
df["職業"] = df["職業"].replace(
    {
        "生徒": "学生",
        "非公表": "",
    }
)

In [None]:
# Close-contact flag derived from the "新規\n濃厚" column (its header contains
# a line break): "新規" -> 0, "濃厚" -> 1; other values are kept as they are.
df["患者_濃厚接触者フラグ"] = df["新規\n濃厚"].replace(
    {
        "新規": 0,
        "濃厚": 1,
    }
)

In [None]:
# Parse the three "M月D日" text columns into datetimes, in place of the text.
for _date_col in ("判明日", "発症日", "公表日"):
    df[_date_col] = str2date(df[_date_col])

In [None]:
# Add an ISO-8601 text version of each date column; missing dates (which
# format as "NaT") become empty strings.
for _src, _dst in (
    ("判明日", "判明日ISO"),
    ("発症日", "発症日ISO"),
    ("公表日", "公表日ISO"),
):
    _iso = df[_src].apply(lambda d: pd.Timestamp(d, tz=None).isoformat())
    df[_dst] = _iso.replace("NaT", "")

In [None]:
# Defaults applied to every row: local government code and prefecture name.
df["全国地方公共団体コード"] = "080004"
df["都道府県名"] = "茨城県"

In [None]:
# Rows reported by Mito city get the municipality name filled in (missing for
# all other rows) and their code overridden with "082015".
df["市区町村名"] = df["管轄"].where(df["管轄"].eq("水戸市"))
df["全国地方公共団体コード"] = df["全国地方公共団体コード"].mask(
    df["管轄"].eq("水戸市"),
    "082015",
)

In [None]:
# Map the working column names onto the names used in the output files.
df = df.rename(
    columns={
        "公表日ISO": "公表_年月日",
        "発症日ISO": "発症_年月日",
        "居住地": "患者_居住地",
        "年代": "患者_年代",
        "性別": "患者_性別",
        "職業": "患者_職業",
        "状態": "患者_状態",
        "備考（疑われる感染経路）": "備考",
    }
)

In [None]:
# Rewrite "A、B" as "A感染;B感染": each "、" becomes "感染;" and a final
# "感染" is appended.
# NOTE(review): this cell reads and overwrites the same column, so running it
# twice appends "感染" again.
df["備考"] = df["備考"].str.replace("、", "感染;", regex=False) + "感染"

In [None]:
# Order rows by publication date, then jurisdiction, then the row position
# inside the source table ("index" is the old index exposed by reset_index()),
# and renumber from 0.
df1 = (
    df.reset_index()
    .sort_values(by=["公表日", "管轄", "index"])
    .reset_index(drop=True)
)

In [None]:
# Keep only the output columns, in this order; columns that do not exist in
# the frame are created and filled with missing values.
df1 = df1.reindex(
    columns=[
        "全国地方公共団体コード",
        "都道府県名",
        "市区町村名",
        "公表_年月日",
        "発症_年月日",
        "患者_居住地",
        "患者_年代",
        "患者_性別",
        "患者_職業",
        "患者_状態",
        "患者_症状",
        "患者_渡航歴の有無フラグ",
        "患者_濃厚接触者フラグ",
        "検査方法",
        "備考",
    ]
)

In [None]:
# Display the final table for a visual check before export.
df1

In [None]:
# Export as CSV using the "utf_8_sig" encoding (UTF-8 with a byte order mark).
# NOTE(review): index=False is not passed, so the row index is written as an
# unnamed first column — confirm this is intended.
df1.to_csv("080004_ibaraki_covid19_patients.csv", encoding="utf_8_sig")

In [None]:
# Export the same table tab-separated, also using the "utf_8_sig" encoding.
# NOTE(review): index=False is not passed, so the row index is written as an
# unnamed first column — confirm this is intended.
df1.to_csv("080004_ibaraki_covid19_patients.tsv", sep="\t", encoding="utf_8_sig")

In [None]:
from google.colab import files

In [None]:
# Hand the TSV to the google.colab ``files`` helper for download.
files.download("080004_ibaraki_covid19_patients.tsv")