<a href="https://colab.research.google.com/github/imabari/ImabariScraping/blob/master/%E6%B2%96%E7%B8%84%E5%B8%82%E5%A0%B4%E6%9C%88%E5%A0%B1%E4%B8%80%E6%8B%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pdfplumber



In [2]:
import pathlib

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup

In [3]:
def fetch_file(url, dir=".", timeout=30):
    """Download `url` into `dir` unless a file with that name already exists.

    The local file name is the last path component of `url`. Missing parent
    directories are created. When the target file already exists it is
    returned untouched and no HTTP request is made.

    Parameters
    ----------
    url : str
        Address of the file to download.
    dir : str or path-like, default "."
        Directory the file is stored in.
    timeout : float or tuple, default 30
        Passed to `requests.get`; without it a stalled server would block
        the call indefinitely.

    Returns
    -------
    pathlib.Path
        Path of the local file.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (`raise_for_status`).
    requests.Timeout
        If the server does not respond within `timeout`.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    if not p.exists():

        r = requests.get(url, timeout=timeout)
        r.raise_for_status()

        # Write to a temporary name first and rename afterwards, so that an
        # interrupted write never leaves a truncated file that later calls
        # would mistake for a finished download.
        tmp = p.with_name(p.name + ".part")
        tmp.write_bytes(r.content)
        tmp.replace(p)

    return p

In [4]:
def snap_adjustment(s, limit=5):
    """Snap values that lie close together onto one shared value.

    The distinct values of `s` are visited in ascending order together with
    their occurrence counts. A value less than `limit` above the current
    anchor is merged with it: whichever of the two occurs more often becomes
    (or stays) the anchor, and the other is replaced by it; on a tie the
    existing anchor is kept. A value `limit` or more above the anchor starts
    a new cluster. The walk starts from an anchor of 0 with a count of 0.

    Parameters
    ----------
    s : pandas.Series
        Numeric values to adjust.
    limit : number, default 5
        Distance below which a value is merged into the current anchor.

    Returns
    -------
    pandas.Series
        Series with the replacements applied; `s` itself is not modified.
    """
    frequencies = s.value_counts().sort_index()

    anchor, anchor_count = 0, 0

    for current, current_count in frequencies.items():

        # Far enough from the anchor: this value opens a new cluster.
        if (current - anchor) >= limit:
            anchor, anchor_count = current, current_count
            continue

        if current_count > anchor_count:
            # The newcomer is more frequent, so the cluster moves onto it.
            s = s.replace(anchor, current)
            anchor, anchor_count = current, current_count
        else:
            # The anchor wins; fold the newcomer into it.
            s = s.replace(current, anchor)

    return s

In [5]:
# Location shared by all report PDFs.
_REPORT_BASE_URL = "https://www.pref.okinawa.jp/site/norin/oroshiuri/documents/documents"

# Identifiers that follow "geppou" in each report's file name.
# NOTE(review): these look like era-year + month (3103 ... 0112) - confirm.
_REPORT_IDS = [
    "3103",
    "3104",
    "0105",
    "0106",
    "0107",
    "0108",
    "0109",
    "0110",
    "0111",
    "0112",
]

# Disabled entries; they use a different file-name pattern:
#   https://www.pref.okinawa.jp/site/norin/oroshiuri/documents/documents/201901.pdf
#   https://www.pref.okinawa.jp/site/norin/oroshiuri/documents/documents/201902.pdf

# URLs of the PDFs that the extraction loop downloads and converts.
links = [f"{_REPORT_BASE_URL}/geppou{report_id}.pdf" for report_id in _REPORT_IDS]

# ディレクトリ名とページ範囲を指定

抽出したい品目に合わせて、下のセルのディレクトリ名（`DATA_DIR`）と、抽出処理で読み込むPDFのページ範囲（0始まりのスライス）を指定する。

+ yasai: `[6:10]`

+ kudamono: `[10:13]`

+ kiribana: `[29:32]`

+ hachimono: `[32:35]`

In [6]:
# Directory name: the PDFs are downloaded into this directory and the CSV
# generated from each PDF is written next to it.
DATA_DIR = "kiribana"

In [7]:
# Page slices (0-based, end-exclusive) to read from each report, keyed by the
# directory name of the category they belong to.
PAGE_SLICES = {
    "yasai": slice(6, 10),
    "kudamono": slice(10, 13),
    "kiribana": slice(29, 32),
    "hachimono": slice(32, 35),
}

# Column groups of the report table, left to right
# (within prefecture, outside prefecture, foreign, total).
REGIONS = ["県内", "県外", "外国", "総計"]


def _row_columns(measure):
    """Return the column names of one table row: item, then `measure` and unit price per region."""
    names = ["品目"]
    for region in REGIONS:
        names += [f"{region}_{measure}", f"{region}_単価"]
    return names


def _extract_words(pdf, page_slice):
    """Return one DataFrame of word boxes for each page of `page_slice` that has lines and words."""
    frames = []

    for page in pdf.pages[page_slice]:

        if not page.lines:
            continue

        # Keep only the band between the first and the last entry of
        # page.lines, with a 2pt margin on either side.
        # NOTE(review): presumably these are the table's top and bottom
        # rules; this relies on the order of page.lines - confirm.
        crop = page.within_bbox(
            (0, page.lines[0]["top"] - 2, page.width, page.lines[-1]["top"] + 2)
        )

        words = crop.extract_words(keep_blank_chars=True)

        # An empty word list would produce a frame without the columns
        # that astype() below expects.
        if not words:
            continue

        df_words = (
            pd.DataFrame(words)
            .astype({"x0": float, "x1": float, "top": float, "bottom": float})
            .sort_values(["top", "x0"])
        )

        # Words whose "top" differs only slightly are put on the same row.
        df_words["top"] = snap_adjustment(df_words["top"])
        df_words["page"] = page.page_number

        frames.append(df_words)

    return frames


def _words_to_grid(words):
    """Arrange word boxes into a grid of cell texts (rows by page/top, columns by x centre)."""
    # Text clean-up: drop full-width spaces and thousands separators.
    text = words["text"].str.replace("　", "").str.replace(",", "")

    # Columns are keyed on the horizontal centre of each word, snapped with
    # a limit of 25.
    center = snap_adjustment(words.loc[:, ["x0", "x1"]].median(axis=1), 25)

    table = (
        words.assign(text=text, center=center)
        .pivot_table(
            index=["page", "top"],
            columns="center",  # reference position; "x0" or "x1" are alternatives
            values="text",
            aggfunc=lambda x: "".join(str(v) for v in x),
        )
        .values
    )

    # Drop rows with fewer than two filled cells and columns that are empty.
    return pd.DataFrame(table).dropna(thresh=2).dropna(how="all", axis=1)


def _pair_rows(grid):
    """Merge the alternating quantity / amount rows of `grid` into one row per item."""
    # Remove header rows and carry the item name down to rows lacking it.
    body = grid[grid[0] != "品目"].copy()
    body[0] = body[0].ffill()

    # Even rows are labelled as quantity rows, odd rows as amount rows.
    # Columns that are entirely empty within a row set are dropped.
    quantity = body[::2].set_axis(_row_columns("数量"), axis=1).dropna(how="all", axis=1)
    amount = body[1::2].set_axis(_row_columns("金額"), axis=1).dropna(how="all", axis=1)

    ordered = [
        f"{region}_{measure}"
        for region in REGIONS
        for measure in ("数量", "単価", "金額")
    ]

    return (
        pd.merge(quantity, amount, on="品目")
        .set_index("品目")
        .reindex(columns=ordered)
    )


# Page range that matches the selected directory; an unknown directory name
# falls back to the range that was originally hard-coded here.
page_slice = PAGE_SLICES.get(DATA_DIR, PAGE_SLICES["kiribana"])

for link in links:

    print(link)

    path_pdf = fetch_file(link, DATA_DIR)

    with pdfplumber.open(path_pdf) as pdf:
        dfs = _extract_words(pdf, page_slice)

    if dfs:

        df3 = _pair_rows(_words_to_grid(pd.concat(dfs)))

        # The CSV is written next to the PDF, with a BOM.
        path_csv = path_pdf.with_suffix(".csv")

        df3.to_csv(path_csv, encoding="utf_8_sig")
    else:
        print("error!")

https://www.pref.okinawa.jp/site/norin/oroshiuri/documents/documents/geppou3103.pdf
https://www.pref.okinawa.jp/site/norin/oroshiuri/documents/documents/geppou3104.pdf
https://www.pref.okinawa.jp/site/norin/oroshiuri/documents/documents/geppou0105.pdf
https://www.pref.okinawa.jp/site/norin/oroshiuri/documents/documents/geppou0106.pdf
https://www.pref.okinawa.jp/site/norin/oroshiuri/documents/documents/geppou0107.pdf
https://www.pref.okinawa.jp/site/norin/oroshiuri/documents/documents/geppou0108.pdf
https://www.pref.okinawa.jp/site/norin/oroshiuri/documents/documents/geppou0109.pdf
https://www.pref.okinawa.jp/site/norin/oroshiuri/documents/documents/geppou0110.pdf
https://www.pref.okinawa.jp/site/norin/oroshiuri/documents/documents/geppou0111.pdf
https://www.pref.okinawa.jp/site/norin/oroshiuri/documents/documents/geppou0112.pdf


In [8]:
# Delete the downloaded PDFs from DATA_DIR; the generated CSV files are kept.
!rm $DATA_DIR/*.pdf

In [9]:
# Pack DATA_DIR and everything in it into data.zip.
!zip -r data.zip $DATA_DIR

  adding: kiribana/ (stored 0%)
  adding: kiribana/geppou0110.csv (deflated 52%)
  adding: kiribana/geppou3104.csv (deflated 50%)
  adding: kiribana/geppou0111.csv (deflated 52%)
  adding: kiribana/geppou0106.csv (deflated 51%)
  adding: kiribana/geppou0109.csv (deflated 52%)
  adding: kiribana/geppou0105.csv (deflated 51%)
  adding: kiribana/geppou0112.csv (deflated 51%)
  adding: kiribana/geppou0108.csv (deflated 52%)
  adding: kiribana/geppou3103.csv (deflated 50%)
  adding: kiribana/geppou0107.csv (deflated 51%)
