<a href="https://colab.research.google.com/github/imabari/covid19-data/blob/master/ehime/ehime_covid_tsv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pdfplumber

Collecting pdfplumber
[?25l  Downloading https://files.pythonhosted.org/packages/7e/57/4d9768e9ed204c68bd5813a2a112d3d6af4912f0785d47080b5067cdce64/pdfplumber-0.5.27.tar.gz (44kB)
[K     |████████████████████████████████| 51kB 2.7MB/s 
[?25hCollecting pdfminer.six==20200517
[?25l  Downloading https://files.pythonhosted.org/packages/b0/c0/ef1c8758bbd86edb10b5443700aac97d0ba27a9ca2e7696db8cd1fdbd5a8/pdfminer.six-20200517-py3-none-any.whl (5.6MB)
[K     |████████████████████████████████| 5.6MB 5.2MB/s 
Collecting Wand
[?25l  Downloading https://files.pythonhosted.org/packages/d7/f6/05f043c099639b9017b7244791048a4d146dfea45b41a199aed373246d50/Wand-0.6.6-py2.py3-none-any.whl (138kB)
[K     |████████████████████████████████| 143kB 49.2MB/s 
Collecting pycryptodome
[?25l  Downloading https://files.pythonhosted.org/packages/ad/16/9627ab0493894a11c68e46000dbcc82f578c8ff06bc2980dcd016aea9bd3/pycryptodome-3.10.1-cp35-abi3-manylinux2010_x86_64.whl (1.9MB)
[K     |█████████████████████████

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
import pathlib
from urllib.parse import urljoin, urlsplit

In [4]:
import pdfplumber
import pandas as pd

# スクレイピング

In [5]:
# Page that is requested and searched for links to the PDF reports.
# Alternative page, kept for reference and currently disabled:
# url = "https://www.pref.ehime.jp/h25500/kansen/covid19/kansensya-kako.html"
url = "https://www.pref.ehime.jp/h25500/kansen/covid19.html"

# Headers sent with the page request: a browser-style User-Agent string.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}

In [6]:
# Fetch the page. Without a timeout requests waits indefinitely for a server
# that never answers, which would hang the notebook; 30 seconds is generous
# for a single HTML page.
r = requests.get(url, headers=headers, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page.
r.raise_for_status()

# Hand the raw bytes (not r.text) to the parser.
soup = BeautifulSoup(r.content, "html.parser")

In [7]:
# Anchors directly inside the list items of the main content area whose link
# text contains the report title; each one is downloaded and parsed below.
tags = [
    anchor
    for anchor in soup.select("div#tmp_contents > ul > li > a")
    if "新型コロナウイルスの感染の確認等について" in anchor.get_text(strip=True)
]

# ダウンロード

In [8]:
def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` into ``dir`` unless the file is already present.

    The local file name is the last segment of the URL *path*. A query string
    or fragment (``report.pdf?v=2``) is ignored, so it neither ends up in the
    file name nor defeats the "already downloaded" check.

    Args:
        url: URL of the file to download.
        dir: Destination directory; created (including parents) if missing.
        timeout: Seconds handed to ``requests.get`` so a stalled server
            cannot block the notebook forever.

    Returns:
        ``pathlib.Path`` of the local file, whether it already existed or
        was just written.

    Raises:
        ValueError: If ``url`` has no file name in its path.
        requests.HTTPError: If the server answers with an error status.
    """
    filename = pathlib.PurePosixPath(urlsplit(url).path).name
    if not filename:
        raise ValueError(f"cannot derive a file name from URL: {url!r}")

    p = pathlib.Path(dir, filename)
    p.parent.mkdir(parents=True, exist_ok=True)

    if p.exists():
        # Already downloaded: report it and reuse the local copy.
        print(f"{p}\t同一のファイルが存在するためダウンロードを中止します")

    else:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()

        # The file is only created after the whole body arrived successfully.
        p.write_bytes(r.content)

    return p

In [9]:
def find_cluster(data):
    """Return the ``"top"`` value of the first word that opens the cluster section.

    ``data`` is an iterable of dicts. The first dict whose ``"text"`` entry
    starts with "○クラスターの状況" decides the result: its ``"top"`` entry is
    returned (``None`` if it has none). ``None`` is also returned when no
    dict matches.
    """
    marker = "○クラスターの状況"
    tops = (word.get("top") for word in data if word.get("text", "").startswith(marker))
    return next(tops, None)

In [10]:
def data_conv(ser: pd.Series, col: str) -> pd.Series:
    """Turn a multi-line "label：N名" cell into a Series of integer counts.

    Only the cell at index label 0 of ``ser`` is read. It is split on line
    breaks, and each line is split on the full-width colon "：" into a label
    and a count.

    Args:
        ser: String Series holding the cell text at index label 0.
        col: Name for the label column; becomes the name of the result index.

    Returns:
        Series named "人数" of ints, indexed by the NFKC-normalised label with
        all whitespace removed.

    Raises:
        KeyError: If no line contains "：", so no count column exists.
        ValueError: If a count is not an integer after stripping "名".
    """
    # One entry per line of the first cell.
    lines = ser.str.split("\n+", expand=True).T[0]

    df = lines.str.split("：", expand=True).rename(columns={0: col, 1: "人数"})

    # Blank lines (e.g. after a trailing newline) and the padding that
    # expand=True adds have no count. Drop them, otherwise the int conversion
    # below fails on the missing value. copy() keeps the assignments below
    # from acting on a view of the unfiltered frame.
    df = df.dropna(subset=[col, "人数"]).copy()

    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    df[col] = df[col].str.strip().str.normalize("NFKC").str.replace(r"\s", "", regex=True)

    # NFKC also turns full-width digits into ASCII digits before the cast.
    df["人数"] = df["人数"].str.strip().str.rstrip("名").str.normalize("NFKC").astype(int)

    return df.set_index(col)["人数"]

In [11]:
# One count Series per report and breakdown; each Series is named after the
# PDF file stem so the reports can be lined up as rows afterwards.
dfs_ages = []
dfs_area = []
dfs_sex = []

for tag in tags:

    # The href may be relative; resolve it against the page URL.
    link = urljoin(url, tag.get("href"))
    p = fetch_file(link, "download")

    name = p.stem

    # The context manager closes the PDF file handle after each report
    # instead of leaving one open handle per processed file.
    with pdfplumber.open(p) as pdf:
        page = pdf.pages[0]

        # Crop the first page down to the cluster-section heading, or to the
        # bottom of the page when that heading is not found.
        top = find_cluster(page.extract_words())
        height = top or page.height
        # NOTE(review): 80 looks like a fixed offset that skips the page
        # header above the table — confirm against the PDF layout.
        crop = page.within_bbox((0, 80, page.width, height))

        # Sort the detected tables by their bbox tuple and use the first one.
        tables = sorted(crop.find_tables(), key=lambda t: t.bbox)
        table = tables[0].extract()

    # First extracted row is the header, the rest is data.
    tmp = pd.DataFrame(table[1:], columns=table[0])

    # Age group
    tmp_ages = data_conv(tmp["年代"], "年代")

    # Place of residence
    tmp_area = data_conv(tmp["居住地"], "居住地")

    # Sex. The character class is [男女]: the former "[男|女]" also accepted a
    # literal "|". The count is matched lazily so that two counts on one line
    # ("男性：3名 女性：2名") give two matches instead of one unparsable capture.
    tmp_sex = (
        tmp["性別"]
        .str.extractall("([男女]性)：(.+?)名")
        .rename(columns={0: "性別", 1: "人数"})
        .set_index("性別")
        .astype(int)["人数"]
    )

    tmp_ages.name = name
    tmp_area.name = name
    tmp_sex.name = name

    dfs_ages.append(tmp_ages)
    dfs_area.append(tmp_area)
    dfs_sex.append(tmp_sex)

In [12]:
# One row per report (the Series name), one column per category. Categories
# missing from a report become 0.
#
# For the age table, reindex() fixes the column order and adds any age group
# that never occurred. fill_value=0 keeps such a column as integer zeros;
# without it the column was NaN, because reindex runs after fillna/astype.
# NOTE(review): reindex also drops any label that is not in this list.
df_ages = (
    pd.concat(dfs_ages, axis=1)
    .T.fillna(0)
    .astype(int)
    .sort_index()
    .reindex(
        columns=["10歳未満", "10代", "20代", "30代", "40代", "50代", "60代", "70代", "80代", "90代"],
        fill_value=0,
    )
)
df_area = pd.concat(dfs_area, axis=1).T.fillna(0).astype(int).sort_index()
df_sex = pd.concat(dfs_sex, axis=1).T.fillna(0).astype(int).sort_index()

In [13]:
# Write each table as a tab-separated file in the working directory.
for out_name, frame in (
    ("ages.tsv", df_ages),
    ("area.tsv", df_area),
    ("sex.tsv", df_sex),
):
    frame.to_csv(out_name, sep="\t")