<a href="https://colab.research.google.com/github/imabari/ImabariScraping/blob/master/covid2019_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 最新レポートのPDFファイルを調べる

In [0]:
import requests
from bs4 import BeautifulSoup

import re
import urllib.parse

import datetime

In [0]:
# Page that is fetched and searched below for the newest "Situation report" link.
url = "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports/"

# HTTP headers sent with the page request: a browser-style User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

In [0]:
# Remove the query string from a URL
def remove_all_query(url):
    """Return ``url`` with its query component removed.

    The URL is parsed into its components, the query component is cleared,
    and the remaining components are joined back into a single string.
    All other components (scheme, host, path, params, fragment) are kept.
    """
    parts = urllib.parse.urlparse(url)
    without_query = parts._replace(query=None)
    return urllib.parse.urlunparse(without_query)

In [0]:
# Fetch the report index page.
# requests waits indefinitely by default; the timeout (seconds) keeps the cell
# from hanging forever when the server does not answer.
r = requests.get(url, headers=headers, timeout=30)

# Stop here on an HTTP error status instead of parsing an error page.
r.raise_for_status()

# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(r.content, "html.parser")

In [0]:
# Get the report's tag.
# soup.find() returns the first text node matching the pattern, or None when
# nothing matches. Check explicitly so that a change in the page layout gives
# a readable error here instead of an AttributeError on None further down.
report_label = soup.find(string=re.compile(r"^Situation report - \d{1,2}"))

if report_label is None:
    raise ValueError("No 'Situation report - N' text was found on the page")

# Note: despite its name, `href` holds the enclosing <a> element, not a URL string.
href = report_label.find_parent("a")

if href is None:
    raise ValueError("The 'Situation report - N' text is not inside an <a> element")

In [6]:
# Resolve the anchor's href against the page URL (it may be relative),
# then strip the query string from the resulting PDF link.
link = urllib.parse.urljoin(url, href.get("href"))
link = remove_all_query(link)

print(link)

https://www.who.int/docs/default-source/coronaviruse/situation-reports/20200220-sitrep-31-covid-19.pdf


In [7]:
# Get the report date.
# The date text is the last child of the <p> that encloses the report link.
# It is converted to a plain str and stripped first: strptime rejects input
# with leading/trailing whitespace (spaces, newlines, non-breaking spaces).
# NOTE(review): "%B" matches month names in the current locale — this assumes
# an English locale; confirm for non-Colab environments.
date_text = str(href.find_parent("p").contents[-1]).strip()

dt = datetime.datetime.strptime(date_text, "%d %B %Y")

print(dt)

2020-02-20 00:00:00


## レポートをダウンロード

In [0]:
import os

# Build the local file name from the last path segment of the PDF link.
filename = os.path.basename(link)

# Skip the download when the file already exists, so re-running the cell is cheap.
if not os.path.exists(filename):

    # Download.
    # Send the same headers as the index-page request for consistency, and
    # bound the wait with a timeout (requests waits indefinitely by default).
    r = requests.get(link, headers=headers, timeout=60)

    r.raise_for_status()

    # Save the response body unchanged (binary mode).
    with open(filename, "wb") as fw:
        fw.write(r.content)

## PDFを分析

In [9]:
!pip install pdfminer.six

Collecting pdfminer.six
[?25l  Downloading https://files.pythonhosted.org/packages/60/0a/5806bd37362bceebb88cff526177c308276b3e0696611564ed01d67b8c6b/pdfminer.six-20200124-py3-none-any.whl (5.6MB)
[K     |████████████████████████████████| 5.6MB 2.8MB/s 
Collecting pycryptodome
[?25l  Downloading https://files.pythonhosted.org/packages/54/e4/72132c31a4cedc58848615502c06cedcce1e1ff703b4c506a7171f005a75/pycryptodome-3.9.6-cp36-cp36m-manylinux1_x86_64.whl (13.7MB)
[K     |████████████████████████████████| 13.7MB 27.2MB/s 
[?25hInstalling collected packages: pycryptodome, pdfminer.six
Successfully installed pdfminer.six-20200124 pycryptodome-3.9.6


In [0]:
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBoxHorizontal, LTTextBox
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage

In [0]:
# pdfminer objects used by the page scan below: a resource manager and a
# page-aggregator device created with default layout parameters (LAParams()).
resourceManager = PDFResourceManager()
device = PDFPageAggregator(resourceManager, laparams=LAParams())

In [0]:
# Threshold for the page scan below: on pages after the first, only text boxes
# whose bottom edge (y0) is greater than this value are searched for "Table <digit>".
# NOTE(review): presumably in PDF points measured from the bottom of the page — confirm.
y_head = 720

In [0]:
# Text range: on page 1, only text boxes lying strictly inside this rectangle
# (x_min < x0 < x1 < x_max and y_min < y0 < y1 < y_max) are collected.
x_min, x_max = 420, 575
y_min, y_max = 400, 700

In [0]:
with open(filename, "rb") as fp:

    interpreter = PDFPageInterpreter(resourceManager, device)

    # pages: page numbers (as strings) on which a "Table <digit>" text was
    #        found above y_head; later joined with "," for camelot.
    # tmp:   stripped text of the page-1 boxes inside the text rectangle.
    pages = []
    tmp = []

    # Only the first five pages are scanned; numbering starts at 1.
    for num, page in enumerate(PDFPage.get_pages(fp, maxpages=5), 1):

        interpreter.process_page(page)

        layout = device.get_result()

        for box in layout:

            if num == 1:

                # Page 1: extract the text of horizontal text boxes that lie
                # strictly inside the rectangle.
                if (
                    isinstance(box, LTTextBoxHorizontal)
                    and x_min < box.x0 < box.x1 < x_max
                    and y_min < box.y0 < box.y1 < y_max
                ):
                    tmp.append(box.get_text().strip())

            elif (
                isinstance(box, LTTextBox)
                and y_head < box.y0
                and re.search(r"Table \d", box.get_text())
            ):
                # Later pages: record the page once and stop scanning it, so a
                # page with several matching boxes is not listed repeatedly.
                pages.append(str(num))
                break

    device.close()

## 感染者・死亡者数

In [0]:
# Merge the collected page-1 snippets into one newline-separated string
# (joining an empty list already gives "").
text = "\n".join(tmp)

In [0]:
# Get the confirmed-case counts.
# Each count is captured as a run of digits and spaces (the figures appear to
# be written with spaces between digit groups), and the spaces are removed
# before converting to int.
# The capture must start with a digit: a whitespace-only capture (e.g. two
# spaces before "confirmed") would otherwise reach int("") and raise ValueError.
conf = [
    int(count.replace(" ", ""))
    for count in re.findall(r"([0-9][0-9 ]*) (?:laboratory-)?confirmed", text)
]

In [0]:
# Get the death counts.
# Each count is a run of digits and spaces followed by " death"; the spaces
# are removed before converting to int. The capture must start with a digit
# so a whitespace-only capture can never reach int("") and raise ValueError.
deaths = [
    int(count.replace(" ", ""))
    for count in re.findall(r"([0-9][0-9 ]*) death", text)
]
# Pad with a trailing 0 so that deaths[1] exists even when the text yields
# only one death figure.
deaths.append(0)

In [18]:
# The row below indexes conf[0], conf[1], deaths[0] and deaths[1]. Check that
# the parsed lists are long enough and fail with a readable message (showing
# what was parsed) instead of a bare IndexError.
if len(conf) < 2 or len(deaths) < 2:
    raise ValueError(
        "Unexpected figures parsed from the report: conf={}, deaths={}".format(conf, deaths)
    )

# Summary row: report date, first confirmed count, sum of the first two death
# counts, second confirmed count, first death count.
# NOTE(review): presumably [date, global confirmed, global deaths,
# China confirmed, China deaths] — confirm against the report text.
world = [
    dt.strftime("%Y-%m-%d"),
    conf[0],
    deaths[0] + deaths[1],
    conf[1],
    deaths[0],
]

print(world)

['2020-02-20', 75748, 2129, 74675, 2121]


## エリア別

In [19]:
!apt install ghostscript
!pip install camelot-py[cv]

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-430
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  fonts-droid-fallback fonts-noto-mono gsfonts libcupsfilters1 libcupsimage2
  libgs9 libgs9-common libijs-0.35 libjbig2dec0 poppler-data
Suggested packages:
  fonts-noto ghostscript-x poppler-utils fonts-japanese-mincho
  | fonts-ipafont-mincho fonts-japanese-gothic | fonts-ipafont-gothic
  fonts-arphic-ukai fonts-arphic-uming fonts-nanum
The following NEW packages will be installed:
  fonts-droid-fallback fonts-noto-mono ghostscript gsfonts libcupsfilters1
  libcupsimage2 libgs9 libgs9-common libijs-0.35 libjbig2dec0 poppler-data
0 upgraded, 11 newly installed, 0 to remove and 25 not upgraded.
Need to get 14.1 MB of archives.
After this operation, 49.9 MB of additional disk space will be used.
Get:1 http:

In [0]:
import pandas as pd
import camelot

In [0]:
# pages = page numbers (joined into a comma-separated string for camelot)
tables = camelot.read_pdf(
    filename,
    pages=",".join(pages),
    split_text=True,
    strip_text="\n",
    line_scale=40,
)

In [22]:
# China: the first extracted table.
# Keep its DataFrame for display and also write the table to china.csv.
df0 = tables[0].df

tables[0].to_csv("china.csv")

df0

Unnamed: 0,0,1,2,3,4,5,6
0,Province/ Region/ City,"Population (10,000s)",Daily,,,Cumulative,
1,,,Confirmed cases,Suspected cases,Deaths,Confirmed cases,Deaths
2,Hubei,5917,349,880,108,62031,2029
3,Guangdong,11346,1,1,0,1332,5
4,Henan,9605,4,66,0,1265,19
5,Zhejiang,5737,2,6,0,1175,0
6,Hunan,6899,2,5,0,1010,4
7,Anhui,6324,1,0,0,987,6
8,Jiangxi,4648,1,0,0,934,1
9,Jiangsu,8051,0,1,0,631,0


In [23]:
# World: the second extracted table (countries / territories / areas).
# Keep its DataFrame for display and also write the table to country.csv.
df1 = tables[1].df

tables[1].to_csv("country.csv")

df1

Unnamed: 0,0,1,2,3,4,5,6
0,Country/Territory/Area,Confirmed* cases (new),Likely place of exposure†,,,Total cases with site of transmission under in...,Total deaths (new)
1,,,China (new),Outside reporting country and outside China (new),In reporting country (new),,
2,Western Pacific Region,,,,,,
3,Republic of Korea,104 (53),13 (0),4 (0),72 (43),15 (10),1 (1)
4,Japan,85 (12),26 (0),5 (2),46 (7),8 (3),1 (0)
5,Singapore,84 (3),23 (0),0 (0),54 (3),7 (0),0 (0)
6,Malaysia,22 (0),17 (0),1 (0),2 (0),2 (0),0 (0)
7,Viet Nam,16 (0),8 (0),0 (0),8 (0),0 (0),0 (0)
8,Australia,15 (0),12 (0),0 (0),3 (0),0 (0),0 (0)
9,Philippines,3 (0),3 (0),0 (0),0 (0),0 (0),1 (0)


In [24]:
# Japan
# Index the country table by its first column (the country/area name).
# The indexed frame is built from tables[1].df rather than by mutating df1 in
# place, so the cell can be re-run: a second in-place set_index(0) would raise
# KeyError because column 0 has already been moved into the index.
df1 = tables[1].df.set_index(0)

s1 = df1.loc["Japan", :]

# Split each "N (M)" cell into its two numbers. The pattern is a raw string
# because \d and \( are regex escapes, not valid Python string escapes.
s1.str.extract(r"(\d+) \((\d+)\)")

Unnamed: 0,0,1
1,85,12
2,26,0
3,5,2
4,46,7
5,8,3
6,1,0
