In [1]:
import re
from urllib.parse import urljoin, urlsplit

import requests
import tabula



In [27]:
from bs4 import BeautifulSoup

# One entry per PDF to locate on the MHLW page.
#   type          -- label used later to choose which extractor handles the PDF
#   regex_pattern -- compiled with re.compile() and matched against anchor text
patterns = [
    dict(type='confirmed', regex_pattern='国内事例における都道府県別の患者報告数'),
    dict(type='tested', regex_pattern='新型コロナウイルス陽性者数とPCR検査実施人数（都道府県別）'),
]

def get_pdf_links(patterns,
                  url='https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/0000164708_00001.html',
                  timeout=30):
    """Look up the PDF link for every entry in ``patterns`` on an MHLW page.

    Args:
        patterns: list of dicts, each with a 'regex_pattern' key. The value is
            compiled as a regular expression and matched against the text of
            the ``<a>`` elements on the page.
        url: page to scrape. Defaults to the MHLW press-release index the
            notebook was written against.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        The same ``patterns`` list; every dict is updated in place with a
        'pdf_link' key holding the absolute URL of the first matching anchor.

    Raises:
        requests.HTTPError: if the page request returns an error status.
        ValueError: if no anchor with an href matches one of the patterns.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    soup = BeautifulSoup(r.content, 'html.parser')

    for pattern in patterns:
        # `string=` is the current name of the deprecated `text=` argument.
        anchor = soup.find('a', string=re.compile(pattern['regex_pattern']))
        if anchor is None or not anchor.get('href'):
            raise ValueError(
                'No link matching {!r} found on {}'.format(pattern['regex_pattern'], url))
        # urljoin leaves absolute hrefs untouched and resolves relative ones
        # against the page URL.
        pattern['pdf_link'] = urljoin(url, anchor['href'])
    return patterns

# get_pdf_links fetches the page over HTTP, writes 'pdf_link' into each dict of
# `patterns` in place and returns that same list, so `pdf_links` and `patterns`
# refer to one object.
pdf_links = get_pdf_links(patterns)

In [28]:
import requests
import os

def download_pdf(url: str, data_dir: str, timeout: float = 30) -> str:
    """Download ``url`` into ``data_dir`` and return the path of the saved file.

    Args:
        url: address of the file to download.
        data_dir: destination directory; created if it does not exist.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        ``data_dir`` joined with the last path segment of ``url``.

    Raises:
        ValueError: if the URL path has no final segment to use as a file name.
        requests.HTTPError: if the server answers with an error status.
    """
    # Take the name from the URL *path* only, so a query string or fragment
    # (e.g. "report.pdf?rev=2") does not end up in the file name.
    filename = os.path.basename(urlsplit(url).path)
    if not filename:
        # Without this check open() would be called on the directory itself.
        raise ValueError('Cannot derive a file name from URL: {!r}'.format(url))

    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    filepath = os.path.join(data_dir, filename)

    os.makedirs(data_dir, exist_ok=True)
    with open(filepath, 'wb') as f:
        f.write(r.content)
    return filepath

# Download every linked PDF into ./data and record where it was saved on the
# same dict under 'filepath'.
for link in pdf_links:
    filepath = download_pdf(link['pdf_link'], './data')
    link['filepath'] = filepath
# Bare last expression so the notebook displays the updated list.
pdf_links

[{'type': 'confirmed',
  'regex_pattern': '国内事例における都道府県別の患者報告数',
  'pdf_link': 'https://www.mhlw.go.jp/content/10906000/000625313.pdf',
  'filepath': './data/000625313.pdf'},
 {'type': 'tested',
  'regex_pattern': '新型コロナウイルス陽性者数とPCR検査実施人数（都道府県別）',
  'pdf_link': 'https://www.mhlw.go.jp/content/10906000/000625317.pdf',
  'filepath': './data/000625317.pdf'}]

In [149]:
# Run the extractor that matches each entry's 'type' and attach the resulting
# DataFrame to the entry under 'df'.
# NOTE: JapanMHLWConfirmedExtractor and JapanMHLWTestedExtractor are defined in
# cells that appear further down in this notebook; those cells must have been
# executed before this one.
for link in pdf_links:
    print(link['pdf_link'])
    # Compare strings by value. `is` tests object identity, which only worked
    # here through string interning and raises a SyntaxWarning on Python 3.8+.
    if link['type'] == 'confirmed':
        df = JapanMHLWConfirmedExtractor().transform_from_file(link['filepath'])
        link['df'] = df
    elif link['type'] == 'tested':
        df = JapanMHLWTestedExtractor('jp-tested-date-v2.json', 'jp-tested-tables-v2.json').transform_from_file(link['filepath'])
        link['df'] = df
pdf_links

https://www.mhlw.go.jp/content/10906000/000625313.pdf


Got stderr: Apr 27, 2020 6:59:02 PM org.apache.fontbox.ttf.CmapSubtable processSubtype14
Apr 27, 2020 6:59:02 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font ABCDEE+MS-Gothic are not implemented in PDFBox and will be ignored
Apr 27, 2020 6:59:03 PM org.apache.fontbox.ttf.CmapSubtable processSubtype14
Apr 27, 2020 6:59:03 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font ABCDEE+MS-PGothic are not implemented in PDFBox and will be ignored
Apr 27, 2020 6:59:03 PM org.apache.fontbox.ttf.CmapSubtable processSubtype14



https://www.mhlw.go.jp/content/10906000/000625317.pdf


Got stderr: Apr 27, 2020 6:59:06 PM org.apache.fontbox.ttf.CmapSubtable processSubtype14
Apr 27, 2020 6:59:06 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font ABCDEE+YuGothic-Regular are not implemented in PDFBox and will be ignored
Apr 27, 2020 6:59:06 PM org.apache.fontbox.ttf.CmapSubtable processSubtype14
Apr 27, 2020 6:59:06 PM org.apache.fontbox.ttf.CmapSubtable processSubtype14
Apr 27, 2020 6:59:06 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font ABCDEE+YuGothic-Bold are not implemented in PDFBox and will be ignored
Apr 27, 2020 6:59:06 PM org.apache.fontbox.ttf.CmapSubtable processSubtype14

Got stderr: Apr 27, 2020 6:59:08 PM org.apache.fontbox.ttf.CmapSubtable processSubtype14
Apr 27, 2020 6:59:08 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font ABCDEE+YuGothic-Regular are not implemented in PDFBox and will be ignored
Apr 27, 2020 6

  新型コロナウイルス陽性者数(チャーター便帰国者を除く)とPCR検査
0            実施人数(都道府県別)【1/15~4/25】
2020-04-25 00:00:00


[{'type': 'confirmed',
  'regex_pattern': '国内事例における都道府県別の患者報告数',
  'pdf_link': 'https://www.mhlw.go.jp/content/10906000/000625313.pdf',
  'filepath': './data/000625313.pdf',
  'df':          date pref_jp    pref_en  confirmed  hospitalized  discharged  \
  0  2020-04-25     埼玉県    Saitama      786.0         737.0        34.0   
  1  2020-04-25     千葉県      Chiba      778.0         662.0        97.0   
  2  2020-04-25     東京都      Tokyo     3850.0        3772.0        59.0   
  3  2020-04-25    神奈川県   Kanagawa      943.0         798.0       120.0   
  4  2020-04-25     大阪府      Osaka     1477.0        1204.0       246.0   
  5  2020-04-25     兵庫県      Hyogo      619.0         530.0        72.0   
  6  2020-04-25     福岡県    Fukuoka      595.0         442.0       138.0   
  8  2020-04-25     北海道   Hokkaido      601.0         375.0       201.0   
  9  2020-04-25     茨城県    Ibaraki      158.0         128.0        24.0   
  10 2020-04-25     石川県   Ishikawa      224.0         203.0        16.

In [151]:
import re
from datetime import datetime
import pandas as pd

# Prefecture lookup table loaded from pref.txt: a list with one dict per row.
# covnert_to_pref_en reads the 'pref_jp' and 'pref_en' keys of each record.
PREFECTURE_DICT = pd.read_csv('pref.txt').to_dict(orient='records')

def covnert_to_pref_en(row):
    """Translate a Japanese prefecture name into its English name.

    Every record of PREFECTURE_DICT whose 'pref_jp' value occurs inside ``row``
    counts as a match. The 'pref_en' of the match is returned only when exactly
    one record matches; with no match or several matches the result is ''.
    """
    matches = []
    for record in PREFECTURE_DICT:
        if record['pref_jp'] in row:
            matches.append(record['pref_en'])

    if len(matches) != 1:
        return ''
    return matches[0]


class JapanMHLWConfirmedExtractor:
    """Turn the per-prefecture patient-count table of an MHLW PDF into a tidy frame.

    ``transform_from_file`` reads page 1 of the PDF with tabula and passes the
    first table to ``transform``, which returns a DataFrame with the columns
    listed in ``COLUMNS``.
    """

    # Names assigned when the extracted table has 11 columns; the column
    # labelled 'nan' is deleted again in transform().
    COLUMNS_ORIG_11 = ['pref_jp', 'nan', 'confirmed', 'day-over-day', 'per-10000',
                    'hospitalized', 'hospitalized_pct',
                    'discharged', 'discharged_pct',
                    'deceased', 'deceased_pct']
    # Names assigned when the extracted table has 10 columns.
    COLUMNS_ORIG_10 = ['pref_jp', 'confirmed', 'day-over-day', 'per-10000',
                    'hospitalized', 'hospitalized_pct',
                    'discharged', 'discharged_pct',
                    'deceased', 'deceased_pct']

    # Columns (and their order) of the frame returned by transform().
    COLUMNS = ['date',
               'pref_jp',
               'pref_en',
               'confirmed',
               'hospitalized',
               'discharged',
               'deceased']

    # The table header carries only month and day; this year is prepended
    # unless extract_date() is given an explicit one.
    DEFAULT_YEAR = 2020

    @staticmethod
    def _to_numeric_or_keep(column):
        """Return ``column`` converted with pd.to_numeric, or unchanged if that fails.

        Replaces the deprecated ``pd.to_numeric(..., errors='ignore')``.
        """
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    def extract_date(self, input_str, year=None):
        """Parse the report date out of a table header string.

        Args:
            input_str: header text that contains the date as ``<m>月<d>日``. The
                characters 都/道/府/県/関/数, spaces and anything in ASCII
                parentheses are stripped before parsing.
            year: year to prepend; defaults to ``DEFAULT_YEAR``.

        Returns:
            A ``datetime`` for the parsed date.

        Raises:
            ValueError: from ``datetime.strptime`` if the cleaned text is not
                in ``<m>月<d>日`` form.
        """
        if year is None:
            year = self.DEFAULT_YEAR
        date_jp = '{}年'.format(year) + re.sub(r'都|道|府|県|関|数| |\(.+?\)', '', input_str)
        return datetime.strptime(date_jp, '%Y年%m月%d日')


    def transform_from_file(self, pdf_file):
        """Read page 1 of ``pdf_file`` with tabula and transform its first table."""
        df = tabula.read_pdf(pdf_file, pages=1)
        # The result is indexed, so read_pdf is expected to return a list of tables here.
        return self.transform(df[0])

    def transform(self, input_df):
        """Clean a raw extracted table.

        Args:
            input_df: DataFrame as extracted from the PDF. The label of its
                second column is parsed as the report date. The input is not
                modified.

        Returns:
            A DataFrame with the columns in ``COLUMNS``, without the '総計' and
            'その他' rows, or ``None`` when the table has neither 10 nor 11 columns.
        """
        df = input_df.copy()
        len_columns = len(df.columns)
        date = self.extract_date(df.columns[1])

        if len_columns == 11:
            df.columns = self.COLUMNS_ORIG_11
        elif len_columns == 10:
            df.columns = self.COLUMNS_ORIG_10
        else:
            return None

        # Remove N/A data
        if 'nan' in df.columns:
            del df['nan']
        df.dropna(inplace=True)

        # Transform
        ## For numeric conversion: strip everything except digits and '.'
        ## (e.g. thousands separators) before converting.
        if df['confirmed'].dtypes != 'int64':
            df['confirmed'] = df['confirmed'].apply(lambda x: re.sub(r'[^0-9.]', '', str(x)))
        df = df.apply(self._to_numeric_or_keep)

        ## For prefecture name
        # .copy() so the assignments below write to an independent frame and
        # not to a slice (avoids SettingWithCopyWarning).
        df = df[~df['pref_jp'].isin(['総計', 'その他'])].copy()
        df['pref_en'] = df['pref_jp'].apply(covnert_to_pref_en)
        df['date'] = date
        return df[self.COLUMNS]

# result = JapanMHLWConfirmedExtractor().transform_from_file('./data/000622728.pdf')
# result

In [152]:
import re
import numpy as np

class JapanMHLWTestedExtractor:
    """Turn the per-prefecture positives / PCR-tested tables of an MHLW PDF into a tidy frame.

    The PDF is read twice with tabula templates: once for the area holding the
    reporting period and once (lattice mode) for the data tables.
    """

    # Names assigned to every table read from the PDF.
    COLUMNS_ORIG = ['pref_jp', 'positve', 'tested', 'pct']
    # Columns (and their order) of the frame returned by transform().
    COLUMNS = ['date', 'pref_jp', 'pref_en', 'positve', 'tested']
    # Japanese prefecture names, read from pref.txt when the class is defined.
    PREFECTURE_DICT = pd.read_csv('pref.txt')['pref_jp'].values

    # The period string carries only month and day; this year is prepended
    # unless extract_date() is given an explicit one.
    DEFAULT_YEAR = 2020

    def __init__(self, tabula_template_file_for_date, tabula_template_file_for_tables):
        """Store the paths of the two tabula template files.

        Args:
            tabula_template_file_for_date: template used to read the date area.
            tabula_template_file_for_tables: template used to read the tables.
        """
        self.tabula_template_file_for_date = tabula_template_file_for_date
        self.tabula_template_file_for_tables = tabula_template_file_for_tables

    @staticmethod
    def _to_numeric_or_keep(column):
        """Return ``column`` converted with pd.to_numeric, or unchanged if that fails.

        Replaces the deprecated ``pd.to_numeric(..., errors='ignore')``.
        """
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    def extract_date(self, df_date, year=None):
        """Parse the end date of the ``【start~end】`` period found in ``df_date``.

        Args:
            df_date: DataFrame whose string rendering contains the period.
            year: year to prepend; defaults to ``DEFAULT_YEAR``.

        Returns:
            A ``datetime`` for the end of the period, or ``None`` when no
            ``【...~...】`` span is present.
        """
        m = re.search(r'【(.+?)~(.+?)】', df_date.to_string())
        # re.search returns None on no match; a successful match always has
        # exactly two groups, so testing the match object is the real guard.
        if m is None:
            return None
        if year is None:
            year = self.DEFAULT_YEAR
        date = '{}/'.format(year) + m.group(2)
        return datetime.strptime(date, '%Y/%m/%d')

    def transform_from_file(self, pdf_file):
        """Read the date area and the tables from ``pdf_file`` and transform them."""
        dfs = tabula.read_pdf_with_template(pdf_file, self.tabula_template_file_for_date)
        df_date = dfs[0]
        dfs = tabula.read_pdf_with_template(pdf_file, self.tabula_template_file_for_tables, lattice=True)
        for df in dfs:
            df.columns = self.COLUMNS_ORIG
        # Stack the per-area tables into one frame (row labels are kept as read).
        df = pd.concat(dfs, axis=0)
        return self.transform(df, self.extract_date(df_date))

    def clean_pref_name(self, row):
        """Map a raw cell to the prefecture name from ``PREFECTURE_DICT`` it contains.

        A full name contained in the cell wins. Otherwise the name with its
        trailing 都/府/県 removed is tried. Returns ``np.nan`` when nothing matches.
        """
        text = str(row)
        # Pass 1: full names. This keeps '東京都' from being claimed by the
        # shortened form of '京都府', whatever the order of the lookup table.
        for pref_jp in self.PREFECTURE_DICT:
            if pref_jp in text:
                return pref_jp
        # Pass 2: names with only the *trailing* suffix removed. Stripping every
        # occurrence would reduce '京都府' to '京', which also occurs in '東京'.
        for pref_jp in self.PREFECTURE_DICT:
            if re.sub(r'[都府県]$', '', pref_jp) in text:
                return pref_jp
        return np.nan

    def fill_tested(self, row):
        """Return ``row['tested']``, or ``row['tested_shift']`` when it is missing."""
        # pd.isna detects every NaN/None; `is np.nan` only recognises the one
        # np.nan singleton object.
        if pd.isna(row['tested']):
            return row['tested_shift']
        return row['tested']

    def clean_numbers(self, row):
        """Reduce a cell to digits and '.', as a string.

        Everything from the first carriage return to the end of that line is
        discarded first.
        """
        # str() so numeric cells are accepted as well as text.
        string = re.sub(r'\r.*', '', str(row))
        string = re.sub(r'[^0-9.]', '', string)
        return string

    def transform(self, input_df, date):
        """Clean the concatenated raw tables.

        Args:
            input_df: DataFrame with the columns in ``COLUMNS_ORIG``. The input
                is not modified.
            date: value stored in the 'date' column of every row.

        Returns:
            A DataFrame with the columns in ``COLUMNS``.
        """
        print(date)
        df = input_df.copy()

        # Fill missing tested data from the following row
        df['tested_shift'] = df['tested'].shift(-1)
        df['tested'] = df.apply(self.fill_tested, axis=1)
        del df['tested_shift']

        # Deal with prefecture name
        df['pref_jp'] = df['pref_jp'].apply(self.clean_pref_name)

        # Drop columns/NA data (rows without a recognised prefecture go here too)
        del df['pct']
        df.dropna(inplace=True)

        # Clean numbers
        df['positve'] = df['positve'].apply(self.clean_numbers)
        df['tested'] = df['tested'].apply(self.clean_numbers)
        df = df.apply(self._to_numeric_or_keep)

        df['pref_en'] = df['pref_jp'].apply(covnert_to_pref_en)
        df['date'] = date
        return df[self.COLUMNS]

# result = JapanMHLWTestedExtractor('jp-tested-date-v2.json', 'jp-tested-tables-v2.json').transform_from_file('./data/000625317.pdf')
# result.head(3)

In [315]:
# Inner-join the confirmed-case frame with the tested frame on the columns both
# share.
# NOTE(review): result_c / result_t are not assigned in any cell shown here --
# presumably the outputs of the two extractors; confirm before a fresh run.
join_keys = ['date', 'pref_jp', 'pref_en']
combined = pd.merge(result_c, result_t, on=join_keys)
combined

Unnamed: 0,date,pref_jp,pref_en,confirmed,hospitalized,discharged,deceased,positve,tested
0,2020-04-09,東京都,Tokyo,1528,1453,57,18,1528,4992
1,2020-04-09,大阪府,Osaka,589,465,118,6,589,1792
2,2020-04-09,千葉県,Chiba,342,306,35,1,342,1500
3,2020-04-09,愛知県,Aichi,299,225,53,21,299,3467
4,2020-04-09,兵庫県,Hyogo,273,203,58,12,273,3496
5,2020-04-09,埼玉県,Saitama,271,241,25,5,271,1934
6,2020-04-09,福岡県,Fukuoka,187,180,7,0,187,3873
7,2020-04-09,京都府,Kyoto,159,129,29,1,159,1867
8,2020-04-09,茨城県,Ibaraki,81,75,4,2,81,2140
9,2020-04-09,岐阜県,Gifu,76,70,5,1,76,1110
