<a href="https://colab.research.google.com/github/imabari/covid19-data/blob/master/kanagawa/fujisawa_covid19.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.6.0.tar.gz (46 kB)
[K     |████████████████████████████████| 46 kB 1.4 MB/s 
[?25hCollecting pdfminer.six==20211012
  Downloading pdfminer.six-20211012-py3-none-any.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 9.1 MB/s 
[?25hCollecting Pillow>=8.4
  Downloading Pillow-9.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.3 MB)
[K     |████████████████████████████████| 4.3 MB 32.5 MB/s 
[?25hCollecting Wand>=0.6.7
  Downloading Wand-0.6.7-py2.py3-none-any.whl (139 kB)
[K     |████████████████████████████████| 139 kB 39.1 MB/s 
Collecting cryptography
  Downloading cryptography-36.0.1-cp36-abi3-manylinux_2_24_x86_64.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 37.0 MB/s 
Building wheels for collected packages: pdfplumber
  Building wheel for pdfplumber (setup.py) ... [?25l[?25hdone
  Created wheel for pdfplumber: filename=pdfplumber-0.6.0-py3-none-any.whl size=33688 sha256=950f91e144

# スクレイピング

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
from urllib.parse import urljoin

In [4]:
import re

In [5]:
import pathlib

In [6]:
# Browser-like User-Agent string sent along with the page request.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"

headers = {"User-Agent": USER_AGENT}

In [7]:
def fetch_soup(url, parser="html.parser", timeout=30):
    """Download a page and parse it with Beautiful Soup.

    Args:
        url: Address of the page to fetch.
        parser: Parser name handed to BeautifulSoup.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on an unresponsive host.

    Returns:
        The parsed BeautifulSoup document.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    # Pass the raw bytes so that BeautifulSoup detects the encoding itself.
    return BeautifulSoup(r.content, parser)

In [8]:
def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` and store it under ``dir`` using the URL's last path part.

    Args:
        url: Address of the file to download.
        dir: Directory the file is written to; created if it does not exist.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on an unresponsive host.

    Returns:
        pathlib.Path of the file that was written.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    # Send the same headers as fetch_soup so both requests look alike to the server.
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    p.write_bytes(r.content)
    return p

In [9]:
# Page that is scraped below for the link to the PDF.
url = "https://www.city.fujisawa.kanagawa.jp/hokenyobo/corona_doukou_data.html"

In [10]:
# Fetch the page and parse it into a BeautifulSoup document.
soup = fetch_soup(url)

In [11]:
# Find the anchor whose href ends in ".pdf" and whose text contains the marker
# 【令和３年１０月～】. The dot is escaped so that only a literal ".pdf" suffix
# matches, and `string=` replaces `text=`, the deprecated name of the same filter.
tag = soup.find("a", href=re.compile(r"\.pdf$"), string=re.compile("【令和３年１０月～】"))

# Fail here with a clear message instead of an AttributeError further down.
if tag is None:
    raise ValueError("PDF link not found; the page layout or the link text may have changed")

In [12]:
# Resolve the (possibly relative) href of the anchor against the page URL.
pdf_href = tag.get("href")
link = urljoin(url, pdf_href)

In [13]:
# Download the PDF into the current directory; p is the path of the saved file.
p = fetch_file(link)

# PDFからCSV変換

In [14]:
import datetime

In [15]:
import pdfplumber
import pandas as pd

In [16]:
# Fixed UTC+9 offset (Japan Standard Time).
JST = datetime.timezone(datetime.timedelta(hours=9))

# Current time in JST with the tzinfo stripped, i.e. a naive datetime
# (presumably so it can be compared with the naive dates built later on).
now_jst = datetime.datetime.now(tz=JST)
dt_now = now_jst.replace(tzinfo=None)

In [17]:
# One DataFrame per PDF page; the first extracted row of each page becomes
# that page's column header.
dfs = []

with pdfplumber.open(p) as pdf:

    for page in pdf.pages:

        table = page.extract_table()

        # extract_table() returns None when no table is found on a page;
        # skip such pages instead of failing on table[1:].
        if not table:
            continue

        df_tmp = pd.DataFrame(table[1:], columns=table[0])

        dfs.append(df_tmp)

In [18]:
# Stack the per-page tables into one frame with a fresh 0..n-1 index.
df0 = pd.concat(dfs, ignore_index=True)

In [19]:
# Work on a copy so that the concatenated table in df0 stays untouched.
df1 = df0.copy()

In [20]:
# The column whose header came out as None is given the name "No".
df1 = df1.rename(columns={None: "No"})

In [21]:
# Rows with a missing 発表日 inherit the value of the previous row.
# The result is assigned back: an in-place fillna on a column selection does
# not reliably update df1 (it is a no-op under Copy-on-Write), and
# fillna(method=...) is deprecated in recent pandas.
df1["発表日"] = df1["発表日"].ffill()

In [22]:
# Split strings such as "2021年10月1日" into integer year / month / day columns.
# The pattern is a raw string so that \d reaches the regex engine instead of
# being read as an (invalid) string escape sequence.
df_date = (
    df1["発表日"]
    .str.extract(r"(\d{4})年(\d{1,2})月(\d{1,2})日")
    .astype(int)
    .rename(columns={0: "year", 1: "month", 2: "day"})
)

In [23]:
# Assemble the year / month / day columns into a datetime column named "date";
# combinations that do not form a valid date become NaT.
df1 = df1.assign(date=pd.to_datetime(df_date, errors="coerce"))

In [24]:
# Entry 7024 is listed as 2022-10-15 in the source, so correct it:
# every date later than "now" gets its year moved back by one.
# The result is assigned back because mask(..., inplace=True) on a column
# selection does not reliably update df_date (no-op under Copy-on-Write).
df_date["year"] = df_date["year"].mask(df1["date"] > dt_now, df_date["year"] - 1)

In [25]:
# Rebuild the "date" column from df_date after the year correction above.
df1["date"] = pd.to_datetime(df_date, errors="coerce")

In [26]:
# Show the resulting frame for a visual check.
df1

Unnamed: 0,番号,発表日,No,年代,性別,居住地,職業,備考,date
0,6972,2021年10月1日,1,80代,女性,藤沢市,無職,,2021-10-01
1,6973,2021年10月1日,2,40代,男性,藤沢市,会社員,,2021-10-01
2,6974,2021年10月1日,3,20代,女性,藤沢市,学生,,2021-10-01
3,6975,2021年10月1日,4,20代,男性,藤沢市,介護職員,,2021-10-01
4,6976,2021年10月1日,5,20代,女性,藤沢市,会社員,,2021-10-01
...,...,...,...,...,...,...,...,...,...
15,21615,2022年3月6日,334,10代,男性,藤沢市,---,,2022-03-06
16,21616,2022年3月6日,335,40代,女性,藤沢市,自営業,,2022-03-06
17,21617,2022年3月6日,336,20代,女性,市外,学生,,2022-03-06
18,21618,2022年3月6日,337,90代,女性,藤沢市,無職,,2022-03-06


In [27]:
# Write the result as CSV. "utf_8_sig" prepends a UTF-8 byte order mark
# (presumably so that spreadsheet software detects the encoding - confirm).
df1.to_csv("fujisawa.csv", encoding="utf_8_sig")

# ダウンロード

In [28]:
# Colab only: hand the generated CSV to the browser as a download.
from google.colab import files

files.download("fujisawa.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>