<a href="https://colab.research.google.com/github/imabari/ImabariScraping/blob/master/kumamoto_gomi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pdfplumber

# スクレイピング

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
import urllib.parse

In [None]:
import pathlib

In [None]:
import pandas as pd

In [None]:
# Request headers sent by fetch_soup. The User-Agent string presents the
# client as a desktop browser (Internet Explorer 11 on Windows 10).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

In [None]:
def fetch_soup(url, parser="html.parser", timeout=30):
    """Download ``url`` and return it as a parsed document.

    The request is sent with the module-level ``headers`` dictionary.

    Args:
        url: Address of the page to fetch.
        parser: Name of the BeautifulSoup parser to use.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole notebook.

    Returns:
        A ``BeautifulSoup`` object built from the response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    # Parse the raw bytes (r.content) rather than the decoded r.text.
    return BeautifulSoup(r.content, parser)

In [None]:
def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` into directory ``dir`` and return the saved path.

    The file name is the last component of ``url`` as split by
    ``pathlib.PurePath``. The URL is not parsed, so any query string stays
    part of the name.

    Args:
        url: Address of the file to download.
        dir: Target directory; created (with parents) when missing.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely.

    Returns:
        ``pathlib.Path`` of the written file.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    # Send the same module-level headers as fetch_soup so both requests to
    # the site identify themselves identically.
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    p.write_bytes(r.content)
    return p

In [None]:
# Index page on city.kumamoto.jp; the link-gathering loop below reads one
# link per collection district from a table on this page.
url = "https://www.city.kumamoto.jp/kankyo/hpKiji/pub/detail.aspx?c_id=5&id=4637&class_set_id=20&class_id=2682"

In [None]:
# Fetch and parse the index page.
soup = fetch_soup(url)

In [None]:
# One record per district: its number, name, page link, PDF link and the
# id taken from the PDF link's query string.
links = []

for tag in soup.select("table.__wys_table > tbody > tr > td > a"):

    d = {}

    # The district number sits in the <td> preceding the one holding the link.
    d["no"] = int(tag.parent.find_previous_sibling("td").get_text(strip=True))
    d["area"] = tag.get_text(strip=True)
    # urljoin leaves absolute hrefs as they are and resolves relative ones
    # against the page they were found on.
    d["link"] = urllib.parse.urljoin(url, tag.get("href"))

    # `string=` is the current name of BeautifulSoup's deprecated `text=`
    # argument. The anchor text means "download all pages at once".
    pdf_tag = fetch_soup(d["link"]).find("a", string="全ページ一括ダウンロード")
    if pdf_tag is None:
        raise ValueError("no bulk-download link found on " + d["link"])
    d["pdf"] = urllib.parse.urljoin(d["link"], pdf_tag.get("href"))

    qs = urllib.parse.urlparse(d["pdf"]).query
    d["id"] = urllib.parse.parse_qs(qs)["id"][0]

    links.append(d)

In [None]:
# Preview the collected district records as a table.
pd.DataFrame(links)

# データラングリング

In [None]:
import io
import re

In [None]:
import pdfplumber

In [None]:
from tqdm.notebook import tqdm

In [None]:
def make_cal(se0, year, n):
    """Turn one month's calendar cells into a date-indexed series of categories.

    Every cell text is split on whitespace. A token found in the module-level
    ``days`` list opens a new day; a token found in the module-level ``kind``
    list is attached to the day opened most recently in the same cell. All
    other tokens are ignored. One cell may therefore carry more than one day,
    e.g. ``"24 燃やすごみ 31 紙"``.

    Unlike a fixed two-column layout, this works for months in which no day
    has two categories and keeps every category when a day has more than two.

    Args:
        se0: Series of cell texts for one month; non-string entries are skipped.
        year: Calendar year that month index 0 belongs to.
        n: Zero-based month index. 0 is April of ``year``; 9 is January of
            ``year + 1``.

    Returns:
        Series named ``"kind"`` indexed by a DatetimeIndex named ``"date"``,
        sorted by day. Categories falling on the same day are joined with
        ``"・"``. Days without any category are omitted.

    Raises:
        ValueError: If no day with a category is recognised, or if a day
            number does not exist in the computed month.
    """
    # Month index 0 is April, so shift by three before splitting the count
    # into a year carry and a zero-based month.
    y, m = divmod(n + 3, 12)
    year += y
    month = m + 1

    day_tokens = set(days)
    kind_tokens = set(kind)

    # (day number, [categories]) pairs in reading order.
    entries = []
    for text in se0:
        if not isinstance(text, str):
            continue
        current = None
        for token in text.split():
            if token in day_tokens:
                current = []
                entries.append((int(token), current))
            elif current is not None and token in kind_tokens:
                current.append(token)

    rows = sorted(
        ((day, "・".join(found)) for day, found in entries if found),
        key=lambda row: row[0],
    )
    if not rows:
        # Fail loudly: an empty month would otherwise be reported downstream
        # as a month without any collection.
        raise ValueError(f"no collection days recognised for {year}-{month:02d}")

    df = pd.DataFrame(rows, columns=["day", "kind"])
    df["year"] = year
    df["month"] = month
    df["date"] = pd.to_datetime(df[["year", "month", "day"]])

    return df.set_index("date")["kind"]

In [None]:
def fetch_pdf(link):
    """Download one calendar PDF and return its day-by-day schedule.

    The second to fourth pages (``pdf.pages[1]`` .. ``pdf.pages[3]``) are
    read. On each page every box in the module-level ``bboxs`` is cropped and
    parsed as a table whose grid lines come from the module-level ``tate`` and
    ``yoko`` offsets. Each box is passed to ``make_cal`` with a running month
    index starting at 0 and the year 2021.

    Args:
        link: URL of the PDF; it is saved to the current directory by
            ``fetch_file``.

    Returns:
        DataFrame with one row per date of the module-level ``dt_range`` and
        the columns ``収集日`` (collection date) and ``収集区分`` (collection
        category). Dates absent from the PDF get ``収集なし`` (no collection).
    """
    dfs = []
    n = 0

    p = fetch_file(link)

    # A category name followed directly by another one, built from the
    # module-level `kind` list so the two cannot drift apart. The lookahead
    # leaves the second name unconsumed, so longer runs are separated too.
    kinds_alt = "|".join(map(re.escape, kind))
    glued_kinds = "(" + kinds_alt + ")(?=" + kinds_alt + ")"

    # The context manager closes the PDF once every page has been read.
    with pdfplumber.open(p) as pdf:

        for i in range(1, 4):

            page = pdf.pages[i]

            for bbox in bboxs:

                crop = page.within_bbox(bbox)

                # Grid-line offsets are relative to the box, so shift them by
                # the box origin.
                vertical = [x + bbox[0] for x in tate]
                horizontal = [y + bbox[1] for y in yoko]

                table_settings = {
                    "vertical_strategy": "explicit",
                    "explicit_vertical_lines": vertical,
                    "horizontal_strategy": "explicit",
                    "explicit_horizontal_lines": horizontal,
                }

                se_tmp = (
                    pd.DataFrame(crop.extract_table(table_settings))
                    .stack()
                    # Put a space after every "日" so that text ending in it
                    # is split off from what follows.
                    .str.replace("日", "日 ", regex=False)
                    # regex=True must be explicit: pandas 2.0 changed the
                    # default to a literal match, which would leave glued
                    # category names unseparated.
                    .str.replace(glued_kinds, r"\1 ", regex=True)
                )

                se = make_cal(se_tmp, 2021, n)

                dfs.append(se)
                n += 1

    df0 = pd.concat(dfs)

    df1 = (
        df0.reindex(dt_range, fill_value="収集なし")
        .reset_index()
        .rename({"index": "収集日", "kind": "収集区分"}, axis=1)
    )

    return df1

In [None]:
# Grid-line offsets inside one calendar box. fetch_pdf adds bbox[0] to every
# `tate` value (explicit vertical lines) and bbox[1] to every `yoko` value
# (explicit horizontal lines): 8 vertical lines bound 7 columns, 6 horizontal
# lines bound 5 rows. Units are presumably PDF points — TODO confirm.
tate = [0, 32, 94, 155, 216, 277, 339, 400]
yoko = [23, 90, 158, 226, 292, 360]

In [None]:
# Regions that fetch_pdf crops from every page with page.within_bbox; each
# box is parsed as one month, so a page yields four months. pdfplumber
# bounding boxes are ordered (x0, top, x1, bottom).
bboxs = [
    [18, 154, 419, 516],
    [424, 154, 825, 516],
    [18, 693, 419, 1056],
    [424, 693, 825, 1056],
]

In [None]:
# Day-of-month tokens "1" .. "31" that make_cal accepts.
days = [str(day) for day in range(1, 32)]

# Garbage categories that make_cal accepts.
kind = [
    "燃やすごみ",
    "紙",
    "プラ容器包装",
    "資源物",
    "ペットボトル",
    "特定品目",
    "埋立ごみ",
]

In [None]:
# Date range: every day from 2021-04-01 through 2022-03-31. fetch_pdf
# reindexes each parsed schedule to it.
dt_range = pd.date_range(start="2021-04-01", end="2022-03-31")

In [None]:
# Build one schedule frame per district. Each frame gets the district's id
# (taken from the PDF link's query string) as its first column.
dfs = []

for link in tqdm(links):

    # Downloads and parses this district's PDF.
    df_tmp = fetch_pdf(link["pdf"])

    # The column name means "collection district ID".
    df_tmp.insert(0, "収集地区ID", link["id"])

    dfs.append(df_tmp)

In [None]:
# Stack the per-district frames. Each frame keeps its own 0-based index, so
# index labels repeat in the result; the CSV below is written without the index.
df = pd.concat(dfs)

In [None]:
# Display the combined table.
df

In [None]:
# Write the combined schedule to kumamoto.csv without the index column.
df.to_csv("kumamoto.csv", index=False)