<a href="https://colab.research.google.com/github/imabari/covid19-data/blob/master/tokyo/tokyo_partients2csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pdfplumber

Collecting pdfplumber
[?25l  Downloading https://files.pythonhosted.org/packages/8c/27/7e2723bfe422fa6af61c1bf05cc8d2bb4128e169ab8ffffc01f3cb1e8ace/pdfplumber-0.5.25.tar.gz (42kB)
[K     |████████████████████████████████| 51kB 2.7MB/s 
[?25hCollecting pdfminer.six==20200517
[?25l  Downloading https://files.pythonhosted.org/packages/b0/c0/ef1c8758bbd86edb10b5443700aac97d0ba27a9ca2e7696db8cd1fdbd5a8/pdfminer.six-20200517-py3-none-any.whl (5.6MB)
[K     |████████████████████████████████| 5.6MB 6.1MB/s 
Collecting Wand
[?25l  Downloading https://files.pythonhosted.org/packages/98/08/096b76e9211ca5ef338791100b76375555cb4082a53496b1c1d5897ee13c/Wand-0.6.5-py2.py3-none-any.whl (138kB)
[K     |████████████████████████████████| 143kB 52.9MB/s 
Collecting pycryptodome
[?25l  Downloading https://files.pythonhosted.org/packages/2b/6f/7e38d7c97fbbc3987539c804282c33f56b6b07381bf2390deead696440c5/pycryptodome-3.9.9-cp36-cp36m-manylinux1_x86_64.whl (13.7MB)
[K     |███████████████████████████

In [2]:
import pathlib
from urllib.parse import urljoin

In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import pdfplumber
import requests
from tqdm.notebook import tqdm

In [4]:
def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` and save it under ``dir``, returning the saved path.

    The file name is the last segment of the URL's *path* component, so a
    query string or fragment never ends up in the name on disk.

    Args:
        url: Address of the file to download.
        dir: Directory to save into; created (with parents) if missing.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            server cannot block the notebook forever.

    Returns:
        ``pathlib.Path`` of the written file.

    Raises:
        ValueError: If the URL path has no final segment usable as a name.
        requests.HTTPError: If the server answers with an error status.
    """
    from urllib.parse import urlparse

    # URL paths always use "/" separators, so parse them as POSIX paths
    # regardless of the platform the notebook runs on.
    name = pathlib.PurePosixPath(urlparse(url).path).name
    if not name:
        # Fail before any network traffic when there is nothing to name the file.
        raise ValueError(f"cannot derive a file name from URL: {url!r}")

    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    p.write_bytes(r.content)
    return p

In [5]:
# Page that is scraped below for the link to the PDF.
url = "https://www.fukushihoken.metro.tokyo.lg.jp/iryo/kansen/todokedehcyouseisya.html"

# Browser-like User-Agent sent with the request for the page above.
# NOTE(review): presumably the site rejects the default python-requests agent — confirm.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

In [6]:
# Fetch the listing page; the timeout keeps a stalled server from hanging the run.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

# Hand BeautifulSoup the raw bytes so it detects the page encoding itself.
soup = BeautifulSoup(r.content, "html.parser")

In [7]:
# First PDF link inside a "filelink" paragraph of the main content area.
tag = soup.select_one("div#main p.filelink > a.pdf")

# select_one returns None when nothing matches; stop here with a clear
# message instead of an AttributeError when the href is read later.
if tag is None:
    raise RuntimeError("PDF link not found: the page layout may have changed")

In [8]:
# Resolve the link's href against the page URL to get an absolute address.
link = urljoin(url, tag.get("href"))

In [9]:
# Download the PDF into the current directory (fetch_file's default) and keep its path.
path_pdf = fetch_file(link)

In [10]:
# Extract one table per PDF page; the first row of each page's table is
# used as that page's column header.
dfs = []

with pdfplumber.open(path_pdf) as pdf:

    for page in tqdm(pdf.pages):

        table = page.extract_table()

        # extract_table() returns None when the page has no table; skip such
        # pages instead of failing on table[1:].
        if not table:
            continue

        df_tmp = pd.DataFrame(table[1:], columns=table[0])

        dfs.append(df_tmp)

HBox(children=(FloatProgress(value=0.0, max=512.0), HTML(value='')))




In [11]:
# Stack the per-page tables. ignore_index gives every row a unique label
# instead of repeating each page's own 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

In [12]:
# Quick check of (rows, columns) after concatenation.
df.shape

(36858, 11)

In [13]:
# Clean every object-dtype column in place: trim surrounding whitespace,
# then apply Unicode NFKC normalization.
for col in df.select_dtypes(include=object).columns:
    df[col] = df[col].str.strip().str.normalize("NFKC")

In [14]:
# Same location and base name as the downloaded PDF, with a .csv extension.
path_csv = path_pdf.with_suffix(".csv")

In [15]:
# Save the normalized table without the index column. The utf_8_sig codec
# writes a BOM first. NOTE(review): presumably so Excel detects UTF-8 — confirm.
df.to_csv(path_csv, encoding="utf_8_sig", index=False)

In [16]:
# Work on a copy so the date columns added below do not modify df.
df1 = df.copy()

# データラングリング

In [17]:
import datetime

In [18]:
# The source dates are Japanese local dates, while a hosted runtime's clock
# is typically UTC. Take "now" in JST (UTC+9) and drop the tzinfo so it can
# still be compared with the naive timestamps pandas produces.
JST = datetime.timezone(datetime.timedelta(hours=9))
dt_now = datetime.datetime.now(JST).replace(tzinfo=None)

In [19]:
def str2date(s: pd.Series, now=None) -> pd.Series:
    """Convert "M月D日" strings (which carry no year) into datetimes.

    The year is inferred relative to a reference moment: each date is first
    placed in the reference year, and moved to the previous year when that
    would put it in the future or when it does not exist in the reference
    year (February 29 outside a leap year).

    Args:
        s: Series of strings containing a "<month>月<day>日" pattern.
        now: Naive reference datetime. Defaults to the notebook-level
            ``dt_now`` so existing calls behave as before.

    Returns:
        Series of datetime64 values aligned with ``s``; entries without a
        parsable or valid date are NaT.
    """
    reference = pd.Timestamp(dt_now if now is None else now)

    # Raw string: "\d" is an invalid escape sequence in a normal literal.
    # Missing matches are filled with "0" so the columns can be cast to int;
    # month/day 0 is then coerced to NaT by to_datetime below.
    parts = (
        s.str.extract(r"(\d{1,2})月(\d{1,2})日")
        .rename(columns={0: "month", 1: "day"})
        .fillna("0")
        .astype(int)
    )
    parts["year"] = reference.year

    in_reference_year = pd.to_datetime(parts, errors="coerce")

    # Future dates belong to the previous year. Invalid dates are retried in
    # the previous year as well, which recovers Feb 29 of a preceding leap
    # year; rows that are invalid in both years stay NaT.
    roll_back = in_reference_year.gt(reference) | in_reference_year.isna()
    parts["year"] = parts["year"].mask(roll_back, parts["year"] - 1)

    return pd.to_datetime(parts, errors="coerce")

In [20]:
# Parse each "M月D日" text column into a datetime column stored next to it.
for src, dst in (
    ("リリース日", "リリース日YMD"),
    ("発症日", "発症日YMD"),
    ("確定日", "確定日YMD"),
):
    df1[dst] = str2date(df1[src])

In [21]:
# Output path for the cleaned CSV: "<stem>_c.csv" next to path_csv. Building
# the name from the stem changes only the final suffix, whereas
# str.replace(".csv", ...) would rewrite every ".csv" occurring in the name.
p = path_csv.with_name(path_csv.stem + "_c.csv")

In [22]:
# Save the table including the parsed date columns, without the index column
# and with a BOM (utf_8_sig), matching the first CSV export.
df1.to_csv(p, index=False, encoding="utf_8_sig")

# ダウンロード

In [23]:
from google.colab import files

In [24]:
# Hand the cleaned CSV to google.colab's download helper (Colab-only).
files.download(str(p))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>