<a href="https://colab.research.google.com/github/imabari/ImabariScraping/blob/master/%E7%AD%89%E3%80%85%E5%8A%9B%E9%99%B8%E4%B8%8A%E7%AB%B6%E6%8A%80%E5%A0%B4%E5%88%A9%E7%94%A8%E4%BA%88%E5%AE%9A%E8%A1%A8CSV%E5%A4%89%E6%8F%9B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# インストール

In [1]:
!pip install pdfplumber
!pip install japanera

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pdfplumber
  Downloading pdfplumber-0.7.5-py3-none-any.whl (40 kB)
[K     |████████████████████████████████| 40 kB 3.8 MB/s 
[?25hCollecting Wand>=0.6.10
  Downloading Wand-0.6.10-py2.py3-none-any.whl (142 kB)
[K     |████████████████████████████████| 142 kB 7.3 MB/s 
[?25hCollecting pdfminer.six==20220524
  Downloading pdfminer.six-20220524-py3-none-any.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 18.3 MB/s 
[?25hCollecting Pillow>=9.1
  Downloading Pillow-9.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[K     |████████████████████████████████| 3.2 MB 34.2 MB/s 
Collecting cryptography>=36.0.0
  Downloading cryptography-38.0.3-cp36-abi3-manylinux_2_24_x86_64.whl (4.1 MB)
[K     |████████████████████████████████| 4.1 MB 38.4 MB/s 
Installing collected packages: cryptography, Wand, Pillow, pdfminer.six, pdfplumber
  Attempting un

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting japanera
  Downloading Japanera-0.1.1.tar.gz (19 kB)
Collecting kanjize
  Downloading kanjize-1.1.0-py3-none-any.whl (5.2 kB)
Building wheels for collected packages: japanera
  Building wheel for japanera (setup.py) ... [?25l[?25hdone
  Created wheel for japanera: filename=Japanera-0.1.1-py3-none-any.whl size=15897 sha256=0b46929652748d8f6658c3f2761f9079f55954fc763b4b5a2c77888d772ab99b
  Stored in directory: /root/.cache/pip/wheels/12/84/d2/b99f78f4a3fd1e8a4538d3fbe0cb47ece44c07e65f7a7f761e
Successfully built japanera
Installing collected packages: kanjize, japanera
Successfully installed japanera-0.1.1 kanjize-1.1.0


# スクレイピング

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
from urllib.parse import urljoin

In [4]:
import pathlib

In [5]:
def fetch_soup(url, parser="html.parser", timeout=30):
    """Download a page and return it parsed as a BeautifulSoup document.

    The request is sent with the notebook-level ``headers`` dict, which must
    be defined before this function is first called.

    Args:
        url: Address of the page to download.
        parser: Parser name handed to ``BeautifulSoup``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive host.

    Returns:
        The parsed ``BeautifulSoup`` document.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    # Hand over the raw bytes so BeautifulSoup works out the encoding itself.
    return BeautifulSoup(r.content, parser)

In [6]:
def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` into ``dir`` unless the file is already there.

    Args:
        url: Address of the file to download.
        dir: Target directory; created (with parents) when missing.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``pathlib.Path`` of the local file (freshly downloaded or cached).

    Raises:
        ValueError: If ``url`` has no filename component in its path.
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    from urllib.parse import urlparse

    # Take the name from the URL *path* only, so a query string or fragment
    # ("file.pdf?ver=2#page=1") neither ends up in the local filename nor
    # defeats the "already downloaded" check below.
    name = pathlib.PurePosixPath(urlparse(url).path).name
    if not name:
        raise ValueError(f"cannot derive a filename from URL: {url!r}")

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    # An existing local copy is reused as-is; nothing is re-downloaded.
    if not p.exists():

        r = requests.get(url, timeout=timeout)
        r.raise_for_status()

        with p.open(mode="wb") as fw:
            fw.write(r.content)

    return p

In [7]:
# Request headers sent by fetch_soup: a browser-like User-Agent
# (an Internet Explorer 11 on Windows 10 identification string).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

In [8]:
# Base URL; the listing page address and the PDF hrefs are resolved against it.
url = "https://www.city.kawasaki.jp/nakahara/"

In [9]:
# Absolute address of the page that is scraped for the PDF attachment links.
link = urljoin(url, "./page/0000088519.html")

In [10]:
# Download and parse the listing page.
soup = fetch_soup(link)

In [11]:
# Find the "等々力陸上競技場利用予定表" heading and the attachment block that follows it.
# `string=` is the current name of the argument formerly called `text=`.
heading = soup.find("h2", string="等々力陸上競技場利用予定表")
if heading is None:
    raise ValueError("h2 '等々力陸上競技場利用予定表' not found; the page layout may have changed")

attach_block = heading.find_next_sibling("div", class_="mol_attachfileblock")
if attach_block is None:
    raise ValueError("attachment block (div.mol_attachfileblock) not found after the heading")

# One dict per attachment: absolute URL of the file plus its link caption.
pdf_links = [
    {"link": urljoin(url, a.get("href")), "text": a.get_text(strip=True)}
    for a in attach_block.select("ul > li > a")
]

In [12]:
pdf_links

[{'link': 'https://www.city.kawasaki.jp/nakahara/cmsfiles/contents/0000088/88519/202210.pdf',
  'text': '令和4年10月分（9月20日更新）(PDF形式, 95.58KB)'},
 {'link': 'https://www.city.kawasaki.jp/nakahara/cmsfiles/contents/0000088/88519/202211.pdf',
  'text': '令和4年11月分（10月21日更新）(PDF形式, 93.21KB)'}]

In [13]:
# Latest: download the last PDF in the list.
# NOTE(review): assumes the page lists months in ascending order, as in the
# output shown above - confirm if the page layout changes.
p = fetch_file(pdf_links[-1]["link"])

# 日付

In [14]:
import re

In [15]:
from datetime import date
from japanera import (Japanera, EraDate, EraDateTime)

In [16]:
# Link caption of the latest PDF, e.g. "令和4年11月分（10月21日更新）(PDF形式, 93.21KB)".
s = pdf_links[-1]["text"]

# Raw string so that "\d" reaches the regex engine untouched; in a normal
# string literal it is an invalid escape sequence that newer Python versions
# warn about.
m = re.match(r"令和(\d{1,2})年(\d{1,2})月", s)

# Fail here with a readable message rather than later on `m.group(0)`.
if m is None:
    raise ValueError(f"link text does not start with '令和N年M月': {s!r}")

In [17]:
# japanera helper object; its strptime is used below to parse the era-style month text.
janera = Japanera()

In [18]:
# Parse the matched "令和N年M月" text into a datetime. The strptime result is
# indexed, so it is a sequence of candidates; the first one is used.
# NOTE(review): "%-E" / "%-O" are presumably japanera's era-name and era-year
# directives - confirm against the documentation of the installed version.
dt_date = janera.strptime(m.group(0), "%-E%-O年%m月")[0]

In [19]:
dt_date

datetime.datetime(2022, 11, 1, 0, 0)

# PDF

In [20]:
import pdfplumber
import pandas as pd

In [21]:
# One DataFrame per table found in the PDF, in page/table order.
dfs = []

with pdfplumber.open(p) as pdf:

    for page in pdf.pages:

        for table in page.extract_tables():

            # The first two extracted rows are dropped.
            # NOTE(review): presumably title/header rows - confirm against the PDF.
            df = pd.DataFrame(table[2:], columns=["day", "曜日", "大会名", "午前", "午後"])

            # Build a full date from the year/month parsed from the link
            # caption plus the day-of-month column of the table.
            df["year"] = dt_date.year
            df["month"] = dt_date.month
            df["day"] = df["day"].astype(int)

            df["日付"] = pd.to_datetime(df[["year", "month", "day"]])

            # Empty event names become missing values. The result is assigned
            # back: an inplace mask on `df["大会名"]` acts on a temporary
            # Series and does not reach `df` under pandas Copy-on-Write.
            df["大会名"] = df["大会名"].mask(df["大会名"] == "")

            # Where the afternoon cell is missing, carry the morning value
            # across. `.ffill` replaces the deprecated `fillna(method="ffill")`.
            df[["午前", "午後"]] = df[["午前", "午後"]].ffill(axis=1)

            # Keep only the output columns, date first.
            dfs.append(df.reindex(["日付", "曜日", "大会名", "午前", "午後"], axis=1))

In [22]:
# Athletics stadium (main): first table extracted from the PDF
dfs[0]

Unnamed: 0,日付,曜日,大会名,午前,午後
0,2022-11-01,火,中学校駅伝大会（予備日）,×,×
1,2022-11-02,水,,○,○
2,2022-11-03,木,,○,○
3,2022-11-04,金,,○,○
4,2022-11-05,土,高校サッカー県予選　準決勝,×,×
5,2022-11-06,日,ラグビースクール,×,×
6,2022-11-07,月,休場日,,
7,2022-11-08,火,,○,○
8,2022-11-09,水,,○,○
9,2022-11-10,木,,○,○


In [23]:
# Write the first table to "<YYYYMMDD>_main.csv" (date taken from dt_date),
# without the index column; the utf_8_sig codec prepends a UTF-8 BOM.
dfs[0].to_csv(f"{dt_date:%Y%m%d}_main.csv", encoding="utf_8_sig", index=False)

In [24]:
# Athletics stadium (auxiliary): second table extracted from the PDF
dfs[1]

Unnamed: 0,日付,曜日,大会名,午前,午後
0,2022-11-01,火,,○,○
1,2022-11-02,水,,○,○
2,2022-11-03,木,AMラグビースクール,×,〇
3,2022-11-04,金,,○,○
4,2022-11-05,土,,○,○
5,2022-11-06,日,,○,○
6,2022-11-07,月,休場日,,
7,2022-11-08,火,,○,○
8,2022-11-09,水,,○,○
9,2022-11-10,木,,○,○


In [25]:
# Write the second table to "<YYYYMMDD>_sub.csv" (date taken from dt_date),
# without the index column; the utf_8_sig codec prepends a UTF-8 BOM.
dfs[1].to_csv(f"{dt_date:%Y%m%d}_sub.csv", encoding="utf_8_sig", index=False)