In [1]:
import sys
sys.path.append('../../myenv/lib/python3.7/site-packages')
sys.path.append('~/Library/Caches/Homebrew/downloads/bc3fcd9890493f143dbada17d9627acd14efd68dfb72b195a7abca9df3f93361--wkhtmltox-0.12.6-2.macos-cocoa.pkg')

In [2]:
from dateutil.relativedelta import relativedelta
from urllib.request import urlopen
from bs4 import BeautifulSoup
from japanera import Japanera

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import datetime
import camelot
%matplotlib inline

In [3]:
class Time: # change class name to Time
    
    """Time stamp for code execution"""
        
    def get_current_time(self):
        now = datetime.datetime.now()
        return(now)
    
    def get_start_time(self):
        # Start time
        start_time = self.get_current_time()
        return(start_time)
       
    def get_end_time(self):
        # End time
        end_time = self.get_current_time()
        elapsed_time = end_time - self.get_start_time()
        return(end_time, elapsed_time)
        
    def print_start(self):
        print('--------Start Script--------')
        print('--------Start Time: ' + self.get_start_time().strftime('%Y-%m-%d %H:%M:%S') + '-------\n')

    def print_end(self):
        print('Total ' + str(self.get_end_time()[1].seconds) + ' [sec]')
        print('-----End Time : ' + self.get_end_time()[0].strftime('%Y-%m-%d %H:%M:%S') + ' ---------')
        print('-----END SCRIPT------') 

In [4]:
class Contents():
    
    """A class to find appropriate link contents"""
    
    from urllib.request import urlopen
    from bs4 import BeautifulSoup
        
    def __init__(self, url):
        """Initialize url attribute.
        To access the attributes of an instance, 
        use dot notation"""
        self.url = url
        
    def open_url(self):
        html = urlopen(self.url)
        soup = BeautifulSoup(html, 'lxml')
        return(soup)
    
    def get_urls(self):
        soup = self.open_url()
        all_links = soup.find_all("a")
        return(all_links)

In [5]:
class Data():
    
    """Manipulate data with scraped content"""
    
    def find_data(self, test_tables):
        for table in test_tables:
            table_content = table.find('tr')
            print(table_content.find('th'))
#             print(table_content)
            if (table_content.find('td') and table_content.find('td').text == '国・地域'):
                found = table
                break
            elif (table_content.find('th') and table_content.find('th').text == '国・地域'):
                found = table
                break
            elif (table_content.find('td') and table_content.find('td').text == '　'):
                found = table
                break
            elif (table_content.find('td') and '国・地位' in table_content.find('td').text):
                found = table
                break
            else:
                print("Skip table: Not labeled '国・地域'") 
        try:
            found
        except NameError:
            print("Error: Did not find table labeled '国・地域'")
        else:
            print("Found table labeled '国・地域'")
            return(found)

    def clean_data(self, data):
        cleantext = [text for text in data.stripped_strings]
        print("Changed data to str type.")
        print("Cleanted text.")
        return(cleantext)
    
    def clean_int(self, date_str):
        cleaned = ''
        for i in date_str:
            try: 
                clean = str(int(i))
            except ValueError:
                clean = i
            cleaned += clean
        return(cleaned)
             
    def extract_pdf_table(self, url):
        tables = camelot.read_pdf(url)
        return(tables)
    
    def data_dict_df(self, list_data_values, list_col_names):
        data_dict = {key: [] for key in list_col_names}
        for col, data in enumerate(list_data_values):
            for value in range(len(data)):
                data_dict[list_col_names[col]].append(data[value].text.strip())
        df = pd.DataFrame(data_dict)    
        return(df)

    def export_table(self, table, file_name, file_type):
        # export individually
        path = '../../data/raw/'
#         path = '~/Desktop/missing_csv/'
        
        if file_type == "csv":
            file_type = ".csv"
            table.to_csv(path + file_name + file_type)
        elif file_type == "txt":
            file_type = ".txt"
            with open(path + file_name + file_type, "w") as text_file:
                text_file.write(table)
        else:
            print('Error: Indicate data, filename, filetype - txt or csv')

In [6]:
def get_target_links(url, month):
    data = Data()
    
    contents_00 = Contents(url)
    all_links = contents_00.get_urls()
    data_links = []

    ###Step 1: Find link to data###
    print("Step 1: Find link to data")

    for link in all_links:
        if ['m-listLinkMonth__link'] == link.get("class"):
            if 'https://www.mhlw.go.jp/stf/houdou/houdou_list_2020' in link.get("href"):
                data_links.append(link.get("href"))

    # number of url to monthly data
    print("Number of links: " + str(len(data_links)))

    # choose monthly data here [-1] is the most recent 
    url = sorted(data_links)[month - 1] #############change month#################
    print("Url for a specific month: " + url)

    contents_01 = Contents(url)
    all_links = contents_01.get_urls()
    target_links = []

    for link in all_links:
        for child in link.descendants:
            if '新型コロナウイルス感染症の現在の状況と厚生労働省の対応について（令和２年' in child:
                target = child.parent.previous_element.previous_element.previous_element
                target_link = 'https://www.mhlw.go.jp' + target.get('href')
                target_links.append(target_link)
    #     url = target_links[4] #change day in month
    print("Number of links: " + str(len(target_links)))
    return(target_links)

def corona_country(target_links):

    data = Data()
    """国別コロナ数"""
    
    for url in target_links:
        print("Chosen url to data: " + url)
        ###Step 2: Get content of data###
        print("Step 2: Get content of data")

        content3 = Contents(url)
        soup = content3.open_url()

        # Get title
        title = str(soup.title)
        print("This is the title of url: " + title)

        ###Step 3: Format Date of data###
        print("Step 3: Format Date of data")

        # Fetch and format date 令和02年5月30日版
        date = title[title.find("令和"):title.find("日")+1]
        date = data.clean_int(date)
        date_formatted = date[:2] + '0' + date[2:]
        print("Check date format: " + date_formatted)
        janera = Japanera()
        date_reformat = janera.strptime(date_formatted, '%-E%-O年%m月%d日')
        year = str(date_reformat[0].year)
        month = str(date_reformat[0].month)
        day = str(date_reformat[0].day)
        print("Formatted date: " + date_formatted)

        ###Step 4: Find table and clean table###
        print("Step 4: Find table and clean table")

        # Get table
        print("Numbers of tables found: " + str(len(soup.find_all('table'))))
        find = soup.find_all('table')
        find_data = data.find_data(find)
        clean_data = data.clean_data(find_data)
        print(clean_data)
        clean_data = str(clean_data)

        ###Step 5: Choose file name and export data###
        print("Step 5: Choose file name and export data")

        date_formatted = '{}_{}_{}'.format(year,month,day)
        file_name_00 = date_formatted + '_corona_country'
        data.export_table(clean_data, file_name_00, "txt")
        print("Data exported as: " + file_name_00)

def pcr_japan(target_links):
    data = Data()

    """国内における都道府県別のPCR検査陽性者数"""

    ###Step 2: Get content of data###
    print("Step 2: Get content of data")

    for url in target_links:
        contents_02 = Contents(url)
        soup = contents_02.open_url()

        links = []
        for link in soup.find_all("a"):
            for child in link.descendants:
                if '国内における都道府県別のPCR検査陽性者数（2020年' in child:
#                 if '新型コロナウイルス陽性者数とPCR検査実施人数（都道府県別）（2020年' in child: #april, march
#                 if '国内における都道府県別のPCR検査陽性者数（2020年' in child: #missing data
                    target_link = 'https://www.mhlw.go.jp' + link.get("href")
                    links.append(child)
                    links.append(target_link)

        info = sorted(list(set(links)))
        print(sorted(info))
         
        print("length of list: " + str(len(info)))
        
        if len(info) > 2:
            info.pop(0)
        elif len(info) != 0:
            info 
        else:
            print("Error: Cannot find link to data.")
            continue
            
        date = info[1][info[1].find("年")-4:info[1].find("日")+1]
        date = data.clean_int(date)
        print("Cleaned date: " + date)
        date_format = datetime.datetime.strptime(date, '%Y年%m月%d日')
        date_formatted = '{}_{}_{}'.format(date_format.year,date_format.month,date_format.day)
        print("date formatted: " + date_formatted)

        file = info[0]
        print("PDF file extracted: " + file)
        
        tables = data.extract_pdf_table(file)
        # number of tables extracted
        print("Total tables extracted:", tables.n)
        
        if tables.n == 0:
            print("Error : No tables Found")
        else:
            tables
        for i, table in enumerate(tables):
            extracted = table.df
            file_name = date_formatted + '_corona_jp_{}'.format(i+1)
            data.export_table(extracted, file_name, "csv") 

In [11]:
class Main():
    
    """Get data for corona deaths by country"""
    time = Time()
    time.print_start()  
    
    ###########################################
    #monthly 報道発表資料 (main link)
    url = "https://www.mhlw.go.jp/stf/houdou/index.html"
    #set month of data
    month = 12 #For some months, data doesn't exist OR posted in different structure.
    ###########################################
#     url = target_links[0] #most recent date

    target_links = get_target_links(url, month)
    target_links = [target_links[0]]
    
    #国内における都道府県別のPCR検査陽性者数#
    pcr_japan(target_links)

    #国別コロナ数#
#     corona_country(target_links) #Only half of Feb works. 

    time.print_end()
    
if __name__ == '__main__':
    Main()

--------Start Script--------
--------Start Time: 2021-01-02 10:10:48-------

Step 1: Find link to data
Number of links: 12
Url for a specific month: https://www.mhlw.go.jp/stf/houdou/houdou_list_202012.html
Number of links: 31
Step 2: Get content of data
['https://www.mhlw.go.jp/content/10906000/000713248.pdf', '国内における都道府県別のPCR検査陽性者数（2020年12月31日掲載分）']
length of list: 2
Cleaned date: 2020年12月31日
date formatted: 2020_12_31
PDF file extracted: https://www.mhlw.go.jp/content/10906000/000713248.pdf
Total tables extracted: 2
Total 86399 [sec]
-----End Time : 2021-01-02 10:10:52 ---------
-----END SCRIPT------


In [43]:
##########################
#国内における都道府県別の人口#
##########################

data = Data()

def get_file_n(date_formatted, col_names):
    file_names = ['{}_{}'.format(date_formatted, name) for name in col_names]
    return(file_names)

# def population_prefecture_japan():
# most recent population data
# url = 'https://uub.jp/rnk/p_j.html'
# Relavent dates to use for corona data 
# url = 'https://uub.jp/rnk/rnk.cgi?T=p&S=j&B=20191001'
url = 'https://uub.jp/rnk/rnk.cgi?T=p&S=j&B=20201001'


content5 = Contents(url)
soup = content5.open_url()
description = soup.find('meta', attrs={'name': 'description'})
print(description)
desc_cleaned = description.prettify()
date = desc_cleaned[desc_cleaned.find("年")-4:desc_cleaned.find("日")+1]
print(date)
date_format = datetime.datetime.strptime(date, '%Y年%m月%d日')
date_formatted = '{}_{}_{}'.format(date_format.year,date_format.month,date_format.day)


file_name1, file_name2, file_name3 = get_file_n(date_formatted, ["population", "area", "popDensity"])

# Find tables
find = soup.find_all('td') 
table1 = find[3:147] # 都道府県別　人口データ
table2 = find[147:147 + 144] # 都道府県別　面積データ
table3 = find[147 + 144:147 + 144 + 144] # 都道府県別　人口密度データ

# Clean data
# Get prefectures
test1 = table1[4::3]
test2 = table2[4::3]
test3 = table3[4::3]

# Get data
pop = table1[5::3]
area = table2[5::3]
den = table3[5::3]

# Construct dataframe
export_df1 = data.data_dict_df([test1, pop], ["pref", "population"])
export_df2 = data.data_dict_df([test2, area], ["pref", "area"])
export_df3 = data.data_dict_df([test3, den], ["pref", "popDensity"])

# Export dataframe
data.export_table(export_df1, file_name1, "csv")
data.export_table(export_df2, file_name2, "csv")
data.export_table(export_df3, file_name3, "csv")

<meta content="都道府県の人口ランキング・面積ランキング・人口密度ランキングです。人口は、2020年10月1日の推計人口によります。推計人口とは、直近の国勢調査確定人口を基に、その後の人口動向を他の人口関連資料から得て算出するもので、住民基本台帳人口とは違い、より実際の人口に近い数が算出されます。" name="description"/>
2020年10月1日


In [None]:
     ##############
#     #国内の自殺者数#
      ##############  
#     # file2 = 'https://www.npa.go.jp/safetylife/seianki/jisatsu/R02/R01_jisatuno_joukyou.pdf'
#     # tables = camelot.read_pdf(file2, pages='33')
#     # # number of tables extracted
#     # print("Total tables extracted:", tables.n)
#     # tables[0].df