In [1]:
import sys

In [2]:
sys.path.append('../../myenv/lib/python3.7/site-packages')

In [3]:
from dateutil.relativedelta import relativedelta
from urllib.request import urlopen
from bs4 import BeautifulSoup
from japanera import Japanera

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import datetime
import camelot
%matplotlib inline

In [4]:
class Date:
    
    """Time stamp for code execution"""
        
    def get_current_time(self):
        now = datetime.datetime.now()
        return(now)
    
    def get_start_time(self):
        # Start time
        start_time = self.get_current_time()
        return(start_time)
       
    def get_end_time(self):
        # End time
        end_time = self.get_current_time()
        elapsed_time = end_time - self.get_start_time()
        return(end_time, elapsed_time)
        
    def print_start(self):
        print('--------Start Script--------')
        print('--------Start Time: ' + self.get_start_time().strftime('%Y-%m-%d %H:%M:%S') + '-------\n')

    def print_end(self):
        print('Total ' + str(self.get_end_time()[1].seconds) + ' [sec]')
        print('-----End Time : ' + self.get_end_time()[0].strftime('%Y-%m-%d %H:%M:%S') + ' ---------')
        print('-----END SCRIPT------') 

In [5]:
class Contents():
    
    """A class to find appropriate link contents"""
    
    from urllib.request import urlopen
    from bs4 import BeautifulSoup
        
    def __init__(self, url):
        """Initialize url attribute.
        To access the attributes of an instance, 
        use dot notation"""
        self.url = url
        
    def open_url(self):
        html = urlopen(self.url)
        soup = BeautifulSoup(html, 'lxml')
        return(soup)
    
    def get_urls(self):
        soup = self.open_url()
        all_links = soup.find_all("a")
        return(all_links)

#     def get_title(self):
#         # Get the title of content
#         title = self.open_url().title
#         print(title)
        
#     def recent_link(self, data_links):
#         #last link in list is most recent
# #         url = sorted(data_links)[-1]
#         url = sorted(data_links)[-2]
#         return(url)
        
#     def fetch_date(self):

In [6]:
class Data():
    
    """Manipulate data with scraped content"""
    
#     def __init__(self, list_df):
#         self.list_df = list_df
        
    def find_data(self, find):
#         find = soup.find_all('td')
        for row in range(10):
            if find[row].find('strong') and find[row].find('strong').text == '死亡者' and find[row-1].find('strong').text == '感染者':
                start_row = row + 1
                break
            else:
                print("No row (td) starting with '死亡者' and '感染者'")

        data = find[start_row:start_row+len(find[start_row:])]
        return(data)

    def clean_data(self, data):
#         str_cells = str(table) #rows = find[start_row:]
        str_cells = str(data)
        cleantext = BeautifulSoup(str_cells, "lxml").get_text()
        print("Changed data to str type.")
        print("Cleanted text.")
        return(cleantext)
        
    def format_data(self, cleantext):
        # Converting string to list 
        res = cleantext.strip('][').split(', ') 

        def list_slice(S, step):
            return [S[i::step] for i in range(step)]
        print(list_slice(res[0:],3))

        list_df = list_slice(res[0:],3)
        return(list_df)
       
    def check_row_num(self, list_df):
        #check if length of rows match
        num_rows = []
        check = [len(list_df[row]) for row in range(len(list_df))]
        if len(set(check)) == 1:
            print("Number of rows in lists are equal, ready to construct dataframe.")
        else:
            print("Error: Number of rows in list are unequal.")
            
    def create_df(self, list_df):
        df = pd.DataFrame()
        df['Country'] = list_df[0]
        df['Infected'] = list_df[1]
        df['Deaths'] = list_df[2]
        print(df.head())
        return(df)
            
    def change_type(self, df):
        df['Infected'] = df['Infected'].str.replace(",","").astype(int)
        df['Deaths'] = df['Deaths'].str.replace(",","").astype(int)
        print(df.dtypes)
        
    def extract_pdf_table(self, url):
        tables = camelot.read_pdf(url)
        # number of tables extracted
        print("Total tables extracted:", tables.n)
        # print the first table as Pandas DataFrame
        return(tables[0].df)

    def export_table(self, table, file_name):
        # export individually
        path = '../../data/raw/'
        table.to_csv(path + file_name)  

In [8]:
class Main():
    
    """Get data for corona deaths by country"""
    time = Date()
    time.print_start()
   
    url = "https://www.mhlw.go.jp/stf/houdou/index.html" #monthly 報道発表資料 (main link)
    contents = Contents(url)
    all_links = contents.get_urls()
    data_links = []

    for link in all_links:
        if ['m-listLinkMonth__link'] == link.get("class"):
            if 'https://www.mhlw.go.jp/stf/houdou/houdou_list_2020' in link.get("href"):
                print(link.get("href"))
                data_links.append(link.get("href"))

    url = sorted(data_links)[-1]
    contents = Contents(url)
    all_links = contents.get_urls()
    target_links = []

    for link in all_links:
        for child in link.descendants:
            if '新型コロナウイルス感染症の現在の状況と厚生労働省の対応について（令和２年' in child:
                print(child.parent.previous_element.previous_element.previous_element)
                target = child.parent.previous_element.previous_element.previous_element
                target_link = 'https://www.mhlw.go.jp' + target.get('href')
                print(target_link)
                target_links.append(target_link)
    
    url = target_links[0]
    content3 = Contents(url)
    soup = content3.open_url()

    # Get title
    title = str(soup.title)
    print(title)

    # Fetch date
    date = title[title.find("（")+1:title.find("）")-1]
    date_formatted = date[:2] + '０' + date[2:]

    janera = Japanera()
    janera.strptime(date_formatted, '%-E%-O年%m月%d日')    

    # Get table
    #国別コロナ数
    find = soup.find_all('td')
    data = Data()
    find_data = data.find_data(find)
    clean_data = data.clean_data(find_data)
    list_df = data.format_data(clean_data)
    data.check_row_num(list_df)
    df = data.create_df(list_df)
    df1 = data.change_type(df)

    # df_index = df.set_index('Country')
    
    
    # 国内における都道府県別のPCR検査陽性者数
    links = []
    dict_link = {}
    for link in soup.find_all("a"):
        for child in link.descendants:
            if '国内における都道府県別のPCR検査陽性者数（2020年' in child:
                target_link = 'https://www.mhlw.go.jp' + link.get("href")
                links.append(child)
                links.append(target_link)

    test = list(set(links))
    print(test[0])
    
    date = test[1][test[1].find("（")+1:test[1].find("）")-3]
    date_formatted = '{}{}{}'.format(date[:2],date[5:7],date[8:10])
    print(date_formatted)

    file_name = date_formatted + '_corona_jp.csv'
    file = test[0]
    contents4 = Contents(file)
    print(file)
    table = data.extract_pdf_table(file) #corona by 都道府県
    data.export_table(table, file_name)
    
    # 国内の自殺者数
    # file2 = 'https://www.npa.go.jp/safetylife/seianki/jisatsu/R02/R01_jisatuno_joukyou.pdf'
    # tables = camelot.read_pdf(file2, pages='33')
    # # number of tables extracted
    # print("Total tables extracted:", tables.n)
    # tables[0].df

    
    time.print_end()
    
if __name__ == '__main__':
    Main()

--------Start Script--------
--------Start Time: 2020-11-19 15:34:39-------

https://www.mhlw.go.jp/stf/houdou/houdou_list_202001.html
https://www.mhlw.go.jp/stf/houdou/houdou_list_202002.html
https://www.mhlw.go.jp/stf/houdou/houdou_list_202003.html
https://www.mhlw.go.jp/stf/houdou/houdou_list_202004.html
https://www.mhlw.go.jp/stf/houdou/houdou_list_202005.html
https://www.mhlw.go.jp/stf/houdou/houdou_list_202006.html
https://www.mhlw.go.jp/stf/houdou/houdou_list_202007.html
https://www.mhlw.go.jp/stf/houdou/houdou_list_202008.html
https://www.mhlw.go.jp/stf/houdou/houdou_list_202009.html
https://www.mhlw.go.jp/stf/houdou/houdou_list_202010.html
https://www.mhlw.go.jp/stf/houdou/houdou_list_202011.html
<a href="/stf/newpage_14945.html">
<div class="m-listNews__txt"><span>新型コロナウイルス感染症の現在の状況と厚生労働省の対応について（令和２年11月18日版）</span>
<em class="m-icnNew">NEW</em></div></a>
https://www.mhlw.go.jp/stf/newpage_14945.html
<a href="/stf/newpage_14911.html">
<div class="m-listNews__txt"><span>新型コロナウイ

Total tables extracted: 2
Total 86399 [sec]
-----End Time : 2020-11-19 15:34:42 ---------
-----END SCRIPT------
