<a href="https://colab.research.google.com/github/imabari/ImabariScraping/blob/master/mynumbar_pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt install python3-tk ghostscript
!pip install camelot-py[cv]

!pip install japanera
!pip install japanmap

Reading package lists... Done
Building dependency tree       
Reading state information... Done
ghostscript is already the newest version (9.26~dfsg+0-0ubuntu0.18.04.12).
python3-tk is already the newest version (3.6.9-1~18.04).
The following package was automatically installed and is no longer required:
  libnvidia-common-440
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [None]:
import csv
import datetime
import pathlib
import re
import shutil
import time
from urllib.parse import urljoin

In [None]:
import camelot
import pandas as pd
import requests
from bs4 import BeautifulSoup
from japanera import EraDate, EraDateTime, Japanera
from japanmap import pref_code
from tqdm.notebook import tqdm

In [None]:
# Directory where get_pdf() stores the downloaded PDF files.
OUT_DIR = "download"
# Root directory for the generated CSV files; the main script writes into a
# sub-directory named after the ISO date parsed from each PDF link.
CSV_DIR = "csv"

In [None]:
def mynumber_pdf(tag):
    """Return True when ``tag`` is a link to an issuance-status PDF.

    Written to be passed as a filter function to ``soup.find_all``.

    Args:
        tag: Element exposing ``name``, ``get_text(strip=True)`` and
            ``get("href")`` (e.g. a BeautifulSoup tag).

    Returns:
        bool: True only for an ``<a>`` element whose stripped text starts with
        "マイナンバーカード交付状況" and whose ``href`` ends with ".pdf".
    """
    if tag.name != "a":
        return False

    text = tag.get_text(strip=True)
    if not text.startswith("マイナンバーカード交付状況"):
        return False

    # An anchor without an href attribute yields None; treat it as a
    # non-match instead of failing on None.endswith().
    href = tag.get("href")
    return bool(href) and href.endswith(".pdf")

In [None]:
def wareki2date(s):
    """Convert the first Heisei/Reiwa date found in ``s`` to ``datetime.date``.

    Accepted forms are an era marker (``H``, ``R``, ``平成`` or ``令和``), an era
    year of one or two digits (or ``元`` for year 1), then month and day, with
    either ``.`` or ``年``/``月`` as separators and an optional trailing ``日``.
    Examples: ``H29.3.8``, ``令和元年7月1日``.

    Args:
        s: Text containing a Japanese-era date.

    Returns:
        datetime.date: The corresponding Gregorian date.

    Raises:
        ValueError: If no supported date is found in ``s``, or the matched
            numbers do not form a valid calendar date.
    """
    m = re.search(
        r"(H|R|平成|令和)([0-9元]{1,2})[.年]([0-9]{1,2})[.月]([0-9]{1,2})日?", s
    )
    if m is None:
        raise ValueError(f"no Heisei/Reiwa date found in {s!r}")

    # "元" (gannen) denotes the first year of an era.
    year, month, day = [1 if i == "元" else int(i) for i in m.group(2, 3, 4)]

    # The pattern only admits these two eras, so fixed offsets are sufficient:
    # Heisei 1 = 1989, Reiwa 1 = 2019.
    gregorian_offset = {"H": 1988, "平成": 1988, "R": 2018, "令和": 2018}

    return datetime.date(gregorian_offset[m.group(1)] + year, month, day)

In [None]:
def get_pdf(url, file_name):
    """Download ``url`` and save the response body as ``OUT_DIR/<file_name>.pdf``.

    Args:
        url: Address of the PDF to download.
        file_name: File name without extension; ".pdf" is appended.

    Returns:
        pathlib.Path: Path of the saved file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    r = requests.get(url, timeout=60)
    # Fail loudly rather than storing an HTTP error page under a .pdf name.
    r.raise_for_status()

    p = pathlib.Path(OUT_DIR, file_name + ".pdf")
    p.parent.mkdir(parents=True, exist_ok=True)
    p.write_bytes(r.content)

    return p

In [None]:
if __name__ == "__main__":

    url = "https://www.soumu.go.jp/kojinbango_card/"

    # Published CSV that provides 団体コード for each 都道府県名 / 郡名 / 市区町村名 row.
    code_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSseDxB5f3nS-YQ1NOkuFKZ7rTNfPLHqTKaSag-qaK25EWLcSL0klbFBZm1b6JDKGtHTk6iMUxsXpxt/pub?gid=0&single=true&output=csv"

    # Replace radical / variant code points found in the extracted text with
    # the ordinary kanji so that names can be matched later.
    cjk = str.maketrans("⻲⻑黑戶⻯⻄⻘⻤", "亀長黒戸竜西青鬼")
    # cjk = str.maketrans("⺟⺠⻁⻄⻑⻘⻝⻤⻨⻩⻫⻭⻯⻲戶黑", "母民虎西長青食鬼麦黄斉歯竜亀戸黒")

    # Options shared by every CSV written below.
    csv_opts = dict(index=False, quoting=csv.QUOTE_NONNUMERIC, encoding="utf_8_sig")

    # The code table does not depend on the PDF being processed, so it is
    # fetched and prepared once instead of once per loop iteration.
    df_code = pd.read_csv(code_url)
    df_code["市区町村名"] = df_code["郡名"].fillna("") + df_code["市区町村名"]
    df_code = df_code.drop("郡名", axis=1)

    # Download the PDF files
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")
    tags = soup.find_all(mynumber_pdf)

    for tag in tqdm(tags):

        link = urljoin(url, tag.get("href"))
        # The date in the link text is used as the publication date.
        dt_now = wareki2date(tag.get_text(strip=True))

        print(dt_now)

        pdf = get_pdf(link, dt_now.isoformat())

        tables = camelot.read_pdf(
            str(pdf), pages="all", split_text=True, strip_text="\n", line_scale=40
        )

        published = dt_now.strftime("%Y/%m/%d")

        # One output directory per publication date, shared by all CSVs below.
        out_dir = pathlib.Path(CSV_DIR, dt_now.isoformat())
        out_dir.mkdir(parents=True, exist_ok=True)

        # --- Summary by type of local government (団体区分別) ---
        # NOTE(review): the table positions used below (0, 3-4, 5, n onwards)
        # are hard-coded and assume the PDF layout never changes — confirm.

        dt_jinkou = wareki2date(tables[0].df.iat[0, 2])
        dt_koufu = wareki2date(tables[0].df.iat[0, 3])

        df_summary = tables[0].df.iloc[1:].copy()

        df_summary.columns = ["区分", "", "人口", "交付枚数", "人口に対する交付枚数率"]

        # The category label is split over the first two columns; join them.
        df_summary["区分"] = df_summary["区分"] + df_summary[""]

        df_summary["人口"] = df_summary["人口"].str.replace(",", "").astype(int)
        df_summary["交付枚数"] = df_summary["交付枚数"].str.replace(",", "").astype(int)
        df_summary["人口に対する交付枚数率"] = (
            df_summary["人口に対する交付枚数率"].str.rstrip("%％").astype(float)
        )

        df_summary["人口算出基準日"] = dt_jinkou.strftime("%Y/%m/%d")
        df_summary["交付件数基準日"] = dt_koufu.strftime("%Y/%m/%d")
        df_summary["公開日"] = published

        df_summary = df_summary.reindex(
            columns=["公開日", "区分", "人口", "交付枚数", "人口に対する交付枚数率", "人口算出基準日", "交付件数基準日"]
        )

        print("団体区分別", df_summary.isnull().values.sum())

        p1 = out_dir / "summary_by_types.csv"
        df_summary.to_csv(str(p1), **csv_opts)

        # --- Prefecture list (都道府県一覧) ---

        dt_jinkou = wareki2date(tables[3].df.iat[0, 1])
        dt_koufu = wareki2date(tables[3].df.iat[0, 2])

        df_pref = pd.concat([table.df.loc[1:] for table in tables[3:5]])

        df_pref.columns = ["都道府県名", "総数（人口）", "交付枚数", "交付率"]

        df_pref["都道府県名"] = df_pref["都道府県名"].str.normalize("NFKC")
        df_pref["都道府県名"] = df_pref["都道府県名"].apply(lambda s: s.translate(cjk))

        df_pref["総数（人口）"] = df_pref["総数（人口）"].str.replace(",", "").astype(int)
        df_pref["交付枚数"] = df_pref["交付枚数"].str.replace(",", "").astype(int)
        df_pref["交付率"] = df_pref["交付率"].str.rstrip("%％").astype(float)

        df_pref["人口算出基準日"] = dt_jinkou.strftime("%Y/%m/%d")
        df_pref["交付件数基準日"] = dt_koufu.strftime("%Y/%m/%d")

        df_pref["公開日"] = published

        df_pref = df_pref.reindex(
            columns=["公開日", "都道府県名", "総数（人口）", "交付枚数", "交付率", "人口算出基準日", "交付件数基準日"]
        )

        # The prefecture code only determines the row order: it becomes the
        # index and the CSV is written with index=False, so it is not saved.
        df_pref["コード"] = df_pref["都道府県名"].apply(pref_code)
        df_pref = df_pref.set_index("コード").sort_index()

        print("都道府県一覧", df_pref.isnull().values.sum())

        p2 = out_dir / "all_prefectures.csv"
        df_pref.to_csv(str(p2), **csv_opts)

        # --- By sex and age group (男女・年齢別) ---

        # Index of the first municipality table; it moves down by one when the
        # sex/age table is present.
        n = 5

        if dt_now > datetime.date(2017, 3, 8):

            n = 6

            dt_jinkou = wareki2date(tables[5].df.iat[0, 1])
            dt_koufu = wareki2date(tables[5].df.iat[0, 4])

            sexes = ("男", "女", "計")
            int_cols = [f"{k}({s})" for k in ("人口", "交付件数") for s in sexes]
            float_cols = [
                f"{k}({s})" for k in ("交付率", "全体に対する交付件数割合") for s in sexes
            ]

            # The first two rows are skipped as header rows.
            df_ages = tables[5].df.iloc[2:].copy()
            df_ages.columns = ["年齢", *int_cols, *float_cols]

            # Strip percent signs and thousands separators from every cell
            # before the date columns are added and the numbers are converted.
            df_ages = df_ages.applymap(lambda s: s.rstrip("%％").replace(",", ""))

            df_ages["人口算出基準日"] = dt_jinkou.strftime("%Y/%m/%d")
            df_ages["交付件数基準日"] = dt_koufu.strftime("%Y/%m/%d")

            df_ages["公開日"] = published

            df_ages = df_ages.astype(
                {**dict.fromkeys(int_cols, int), **dict.fromkeys(float_cols, float)}
            )

            df_ages = df_ages.reindex(
                columns=[
                    "公開日",
                    "年齢",
                    *int_cols,
                    *float_cols,
                    "人口算出基準日",
                    "交付件数基準日",
                ]
            )

            print("男女・年齢別", df_ages.isnull().values.sum())

            p3 = out_dir / "demographics.csv"
            df_ages.to_csv(str(p3), **csv_opts)

        # --- Municipality list (市区町村別一覧) ---

        dt_jinkou = wareki2date(tables[n].df.iat[0, 2])
        dt_koufu = wareki2date(tables[n].df.iat[0, 3])

        df_local = pd.concat([table.df.iloc[1:] for table in tables[n:]])

        df_local.columns = ["都道府県名", "市区町村名", "総数（人口）", "交付枚数", "交付率"]

        # Drop the nationwide (全国) rows
        # df_local.drop_duplicates(keep=False, inplace=True)
        # .copy() makes the filtered frame independent, so the column
        # assignments below do not trigger SettingWithCopyWarning.
        df_local = df_local[df_local["都道府県名"] != "全国"].copy()

        df_local["都道府県名"] = df_local["都道府県名"].str.normalize("NFKC")
        df_local["市区町村名"] = df_local["市区町村名"].str.normalize("NFKC")

        df_local["都道府県名"] = df_local["都道府県名"].apply(lambda s: s.translate(cjk))
        df_local["市区町村名"] = df_local["市区町村名"].apply(lambda s: s.translate(cjk))

        # Remove all whitespace inside the names. regex=True is explicit
        # because pandas 2.0 changed the default to a literal match.
        df_local["市区町村名"] = df_local["市区町村名"].str.replace(r"\s", "", regex=True)

        # Lookup key for the renames below; it is built once and deliberately
        # not refreshed after each rename.
        df_local["地名"] = df_local["都道府県名"] + df_local["市区町村名"]

        # Rewrite specific municipality names.
        # NOTE(review): presumably to match the spelling used in the code
        # table merged below — confirm.
        renames = {
            "兵庫県篠山市": "丹波篠山市",
            "高知県高岡郡梼原町": "高岡郡檮原町",
            "福岡県糟屋郡須惠町": "糟屋郡須恵町",
        }
        for place, new_name in renames.items():
            df_local["市区町村名"] = df_local["市区町村名"].mask(
                df_local["地名"] == place, new_name
            )

        # 那珂川 is named as a town for publications before 2018-10-01 and as
        # a city from that date on.
        if dt_now < datetime.date(2018, 10, 1):
            df_local["市区町村名"] = df_local["市区町村名"].mask(
                df_local["地名"] == "福岡県那珂川市", "筑紫郡那珂川町"
            )
        else:
            df_local["市区町村名"] = df_local["市区町村名"].mask(
                df_local["地名"] == "福岡県筑紫郡那珂川町", "那珂川市"
            )

        df_local["総数（人口）"] = df_local["総数（人口）"].str.replace(",", "").astype(int)
        df_local["交付枚数"] = df_local["交付枚数"].str.replace(",", "").astype(int)
        df_local["交付率"] = df_local["交付率"].str.rstrip("%％").astype(float)

        df_local["人口算出基準日"] = dt_jinkou.strftime("%Y/%m/%d")
        df_local["交付件数基準日"] = dt_koufu.strftime("%Y/%m/%d")

        df_local["公開日"] = published

        df_local = df_local.reindex(
            columns=[
                "公開日",
                "都道府県名",
                "市区町村名",
                "総数（人口）",
                "交付枚数",
                "交付率",
                "人口算出基準日",
                "交付件数基準日",
            ]
        )

        # Attach local government codes (団体コード). A left join keeps rows
        # without a match; the nullable Int64 dtype lets them hold <NA>.
        df_local = pd.merge(df_local, df_code, on=["都道府県名", "市区町村名"], how="left")
        df_local["団体コード"] = df_local["団体コード"].astype("Int64")

        print("市区町村別一覧", df_local.isnull().values.sum())

        p4 = out_dir / "all_localgovs.csv"
        df_local.to_csv(str(p4), **csv_opts)

        # Also save just the name pairs as TSV, one file per publication date.
        df_check = df_local.loc[:, ["都道府県名", "市区町村名"]].copy()

        p5 = pathlib.Path("name", f"{dt_now.isoformat()}.tsv")
        p5.parent.mkdir(parents=True, exist_ok=True)

        df_check.to_csv(str(p5), sep="\t", index=False)

        # time.sleep(1)

    # Bundle the per-date name lists into city.zip.
    shutil.make_archive("city", "zip", root_dir="./name")

HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))

2017-03-08


  warn("There was error running cmp(key(Era), value) but skipped because ignore_error=True")


団体区分別 0
都道府県一覧 0
市区町村別一覧 1
2017-05-15


  warn("There was error running cmp(key(Era), value) but skipped because ignore_error=True")


団体区分別 0
都道府県一覧 0
男女・年齢別 0
市区町村別一覧 1
2017-08-31


  warn("There was error running cmp(key(Era), value) but skipped because ignore_error=True")


団体区分別 0
都道府県一覧 0
男女・年齢別 0
市区町村別一覧 1
2017-12-01


  warn("There was error running cmp(key(Era), value) but skipped because ignore_error=True")


団体区分別 0
都道府県一覧 0
男女・年齢別 0
市区町村別一覧 1
2018-03-01


  warn("There was error running cmp(key(Era), value) but skipped because ignore_error=True")


団体区分別 0
都道府県一覧 0
男女・年齢別 0
市区町村別一覧 1
2018-07-01


  warn("There was error running cmp(key(Era), value) but skipped because ignore_error=True")


団体区分別 0
都道府県一覧 0
男女・年齢別 0
市区町村別一覧 1
2018-12-01


  warn("There was error running cmp(key(Era), value) but skipped because ignore_error=True")


団体区分別 0
都道府県一覧 0
男女・年齢別 0
市区町村別一覧 0
2019-04-01


  warn("There was error running cmp(key(Era), value) but skipped because ignore_error=True")


団体区分別 0
都道府県一覧 0
男女・年齢別 0
市区町村別一覧 0
2019-07-01


  warn("There was error running cmp(key(Era), value) but skipped because ignore_error=True")


団体区分別 0
都道府県一覧 0
男女・年齢別 0
市区町村別一覧 0
2019-09-16


  warn("There was error running cmp(key(Era), value) but skipped because ignore_error=True")


団体区分別 0
都道府県一覧 0
男女・年齢別 0
市区町村別一覧 0
2019-11-01


  warn("There was error running cmp(key(Era), value) but skipped because ignore_error=True")


団体区分別 0
都道府県一覧 0
男女・年齢別 0
市区町村別一覧 0
2020-01-20


  warn("There was error running cmp(key(Era), value) but skipped because ignore_error=True")


団体区分別 0
都道府県一覧 0
男女・年齢別 0
市区町村別一覧 0
2020-03-01


  warn("There was error running cmp(key(Era), value) but skipped because ignore_error=True")


団体区分別 0
都道府県一覧 0
男女・年齢別 0
市区町村別一覧 0
2020-04-01


  warn("There was error running cmp(key(Era), value) but skipped because ignore_error=True")


団体区分別 0
都道府県一覧 0
男女・年齢別 0
市区町村別一覧 0
2020-05-01


  warn("There was error running cmp(key(Era), value) but skipped because ignore_error=True")


団体区分別 0
都道府県一覧 0
男女・年齢別 0
市区町村別一覧 0
2020-06-01


  warn("There was error running cmp(key(Era), value) but skipped because ignore_error=True")


団体区分別 0
都道府県一覧 0
男女・年齢別 0
市区町村別一覧 0
2020-07-01


  warn("There was error running cmp(key(Era), value) but skipped because ignore_error=True")


団体区分別 0
都道府県一覧 0
男女・年齢別 0
市区町村別一覧 0

