<a href="https://colab.research.google.com/github/imabari/ImabariScraping/blob/master/%E7%AD%89%E3%80%85%E5%8A%9B%E9%99%B8%E4%B8%8A%E7%AB%B6%E6%8A%80%E5%A0%B4%E5%88%A9%E7%94%A8%E4%BA%88%E5%AE%9A%E8%A1%A8CSV%E5%A4%89%E6%8F%9B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# インストール

In [26]:
!pip install pdfplumber
!pip install japanera

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# スクレイピング

In [27]:
import requests
from bs4 import BeautifulSoup

In [28]:
from urllib.parse import urljoin

In [29]:
import pathlib

In [30]:
def fetch_soup(url, parser="html.parser", timeout=30):
    """Download ``url`` and return the page parsed with BeautifulSoup.

    The request is sent with the notebook-level ``headers`` dict, which must
    be defined before this function is called.

    Args:
        url: Address of the page to download.
        parser: Parser name passed to ``BeautifulSoup``.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The ``BeautifulSoup`` object built from the response body.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    # Hand over the raw bytes (not r.text) so the parser decides the encoding.
    return BeautifulSoup(r.content, parser)

In [31]:
def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` into ``dir`` unless a file of that name already exists.

    The local file name is the last segment of the URL *path*; a query string
    or fragment is not part of the name.

    Args:
        url: Address of the file to download.
        dir: Target directory; created (with parents) when missing.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        ``pathlib.Path`` of the local file (freshly downloaded or pre-existing).

    Raises:
        ValueError: If the URL path has no final segment to use as a file name.
        requests.HTTPError: If the server answers with an error status.
    """
    from urllib.parse import urlsplit

    # Use only the path component so "?v=1" / "#frag" never leak into the name.
    name = pathlib.PurePosixPath(urlsplit(url).path).name
    if not name:
        raise ValueError(f"cannot derive a file name from URL: {url!r}")

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    # An existing file is treated as a cache hit and is not downloaded again.
    if not p.exists():

        r = requests.get(url, timeout=timeout)
        r.raise_for_status()

        with p.open(mode="wb") as fw:
            fw.write(r.content)

    return p

In [32]:
# HTTP request headers that fetch_soup passes to requests.get;
# only a browser-style User-Agent string is set.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

In [33]:
# Base URL; the page path and the PDF hrefs are resolved against it with urljoin.
url = "https://www.city.kawasaki.jp/nakahara/"

In [34]:
# Absolute URL of the page that is scraped for the PDF links.
link = urljoin(url, "./page/0000088519.html")

In [35]:
# Download and parse the page (default "html.parser").
soup = fetch_soup(link)

In [36]:
# Find the <h2> whose text is the schedule title. `string=` is the current
# name of the argument formerly called `text=`.
heading = soup.find("h2", string="等々力陸上競技場利用予定表")
if heading is None:
    raise ValueError("schedule heading <h2> not found; the page layout may have changed")

# The attachment list is the next sibling <div class="mol_attachfileblock">.
attach_block = heading.find_next_sibling("div", class_="mol_attachfileblock")
if attach_block is None:
    raise ValueError("attachment block after the schedule heading not found")

# One dict per attached file: absolute URL ("link") and link caption ("text").
pdf_links = [
    {
        "link": urljoin(url, a.get("href")),
        "text": a.get_text(strip=True),
    }
    for a in attach_block.select("ul > li > a")
]

In [37]:
# Show the collected links.
pdf_links

[{'link': 'https://www.city.kawasaki.jp/nakahara/cmsfiles/contents/0000088/88519/202210.pdf',
  'text': '令和4年10月分（9月20日更新）(PDF形式, 95.58KB)'},
 {'link': 'https://www.city.kawasaki.jp/nakahara/cmsfiles/contents/0000088/88519/202211.pdf',
  'text': '令和4年11月分（10月21日更新）(PDF形式, 93.21KB)'}]

In [38]:
# Latest: download the last entry of pdf_links (fetch_file saves into "." by default).
# NOTE(review): assumes the page lists the files oldest-first, as in the output shown above.
p = fetch_file(pdf_links[-1]["link"])

# 日付

In [39]:
import re

In [40]:
from datetime import date
from japanera import (Japanera, EraDate, EraDateTime)

In [41]:
# Caption of the last PDF link, e.g. "令和4年11月分（10月21日更新）(PDF形式, 93.21KB)".
s = pdf_links[-1]["text"]

# Raw string literal: "\d" inside a plain string is an invalid escape sequence
# and triggers a warning on current Python versions.
m = re.match(r"令和(\d{1,2})年(\d{1,2})月", s)

# Fail here with a clear message instead of an AttributeError on m.group later.
if m is None:
    raise ValueError(f"link text does not start with a Reiwa year/month: {s!r}")

In [42]:
# Japanera instance whose strptime is used to parse the era-formatted text.
janera = Japanera()

In [43]:
# Parse the matched text (whole match, e.g. "令和4年11月") and keep the first
# element of strptime's result.
# NOTE(review): "%-E" / "%-O" are japanera-specific directives — presumably era
# name and era year; confirm against the japanera documentation.
dt_date = janera.strptime(m.group(0), "%-E%-O年%m月")[0]

In [44]:
# Show the parsed date.
dt_date

datetime.datetime(2022, 11, 1, 0, 0)

# PDF

In [45]:
import pdfplumber
import pandas as pd

In [46]:
# One DataFrame per table found in the PDF, in page order.
dfs = []

with pdfplumber.open(p) as pdf:

    for page in pdf.pages:

        for table in page.extract_tables():

            # Skip the first two rows of each table.
            # NOTE(review): presumably title/header rows — confirm against the PDF layout.
            df = pd.DataFrame(table[2:], columns=["day", "曜日", "大会名", "午前", "午後"])

            # Build a full date from the year/month parsed from the link text
            # and the day-of-month column of the table.
            df["year"] = dt_date.year
            df["month"] = dt_date.month
            df["day"] = df["day"].astype(int)

            df["日付"] = pd.to_datetime(df[["year", "month", "day"]])

            # Assign the results back explicitly: `df[col].mask(..., inplace=True)`
            # mutates a column selection, which does not update `df` under
            # pandas Copy-on-Write.
            # Empty event names become missing values.
            df["大会名"] = df["大会名"].mask(df["大会名"] == "")
            # Where the afternoon cell is missing, reuse the morning value.
            df["午後"] = df["午後"].mask(df["午後"].isna(), df["午前"])

            # Keep only the output columns, in their final order.
            dfs.append(df.reindex(["日付", "曜日", "大会名", "午前", "午後"], axis=1))

In [47]:
# Athletics stadium (main)
dfs[0]

Unnamed: 0,日付,曜日,大会名,午前,午後
0,2022-11-01,火,中学校駅伝大会（予備日）,×,×
1,2022-11-02,水,,○,○
2,2022-11-03,木,,○,○
3,2022-11-04,金,,○,○
4,2022-11-05,土,高校サッカー県予選　準決勝,×,×
5,2022-11-06,日,ラグビースクール,×,×
6,2022-11-07,月,休場日,,
7,2022-11-08,火,,○,○
8,2022-11-09,水,,○,○
9,2022-11-10,木,,○,○


In [48]:
# Write the first table to "<YYYYMMDD>_main.csv" (date taken from dt_date),
# without the index column; "utf_8_sig" prepends a UTF-8 BOM.
dfs[0].to_csv(f"{dt_date:%Y%m%d}_main.csv", encoding="utf_8_sig", index=False)

In [49]:
# Athletics stadium (auxiliary)
dfs[1]

Unnamed: 0,日付,曜日,大会名,午前,午後
0,2022-11-01,火,,○,○
1,2022-11-02,水,,○,○
2,2022-11-03,木,AMラグビースクール,×,〇
3,2022-11-04,金,,○,○
4,2022-11-05,土,,○,○
5,2022-11-06,日,,○,○
6,2022-11-07,月,休場日,,
7,2022-11-08,火,,○,○
8,2022-11-09,水,,○,○
9,2022-11-10,木,,○,○


In [50]:
# Write the second table to "<YYYYMMDD>_sub.csv" (date taken from dt_date),
# without the index column; "utf_8_sig" prepends a UTF-8 BOM.
dfs[1].to_csv(f"{dt_date:%Y%m%d}_sub.csv", encoding="utf_8_sig", index=False)