<a href="https://colab.research.google.com/github/imabari/covid19-data/blob/master/kyoto/kyoto_ocr_cell.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!add-apt-repository ppa:alex-p/tesseract-ocr -y
!apt update
!apt install tesseract-ocr
!apt install libtesseract-dev
!tesseract -v
!apt install tesseract-ocr-jpn  tesseract-ocr-jpn-vert
!apt install tesseract-ocr-script-jpan tesseract-ocr-script-jpan-vert
!tesseract --list-langs
!pip install pytesseract

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:4 http://ppa.launchpad.net/alex-p/tesseract-ocr/ubuntu bionic InRelease [15.4 kB]
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Get:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release [564 B]
Get:9 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release.gpg [833 B]
Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:11 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease [21.3 kB]
Get:12 https://cloud.r-p

# スクレイピング

In [2]:
import pathlib
import re

In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [4]:
url = "https://www.pref.kyoto.jp/kentai/corona/pcrkensa.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

In [5]:
r = requests.get(url, headers=headers)

r.raise_for_status()

soup = BeautifulSoup(r.content, "html5lib")

In [6]:
src = soup.find("img", alt=re.compile("pcr$")).get("src")

In [7]:
link = urljoin(url, src)
link

'https://www.pref.kyoto.jp/kentai/corona/images/20200903_pcr.png'

# ダウンロード

In [8]:
def get_file(url, dir="."):

    r = requests.get(url)

    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode='wb') as fw:
        fw.write(r.content)

    return p

# pytesseract

In [9]:
import cv2
import numpy as np

In [10]:
import pytesseract

In [11]:
from google.colab.patches import cv2_imshow

In [12]:
jpg_path = get_file(link)

In [13]:
# https://teratail.com/questions/151317

import cv2
import numpy as np

img = cv2.imread(str(jpg_path))

# BGR -> グレースケール
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

gray[gray > 220] = 255
cv2.imwrite("gray.png", gray)

# エッジ抽出 (Canny)
edges = cv2.Canny(gray, 1, 100, apertureSize=3)
cv2.imwrite("edges.png", edges)

# 膨張処理
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
edges = cv2.dilate(edges, kernel)

# 輪郭抽出
contours, hierarchy = cv2.findContours(edges, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

In [14]:
# 面積でフィルタリング
rects = []

for cnt, hrchy in zip(contours, hierarchy[0]):
    if cv2.contourArea(cnt) < 3000:
        continue  # 面積が小さいものは除く
    if hrchy[3] == -1:
        continue  # ルートノードは除く

    # 輪郭を囲む長方形を計算する。
    rect = cv2.minAreaRect(cnt)
    rect_points = cv2.boxPoints(rect).astype(int)

    # 座標
    x_max, y_max = np.amax(rect_points.T, axis=1)
    x_min, y_min = np.amin(rect_points.T, axis=1)

    dst = gray[y_min: y_max, x_min : x_max]

    dst2 = cv2.resize(dst, None, fx=1.5, fy=1.5)
    # cv2.imwrite(f"gray{i:02}.png", dst)

    txt = pytesseract.image_to_string(dst2, lang="jpn", config="--psm 6").strip()

    if txt:
        rects.append([y_min, y_max , x_min, x_max, txt])

In [15]:
import pandas as pd

In [16]:
df = pd.DataFrame(rects, columns=["y1", "y2", "x1", "x2", "text"])

In [17]:
df

Unnamed: 0,y1,y2,x1,x2,text
0,654,703,610,678,7
1,654,703,537,606,25
2,654,703,465,534,28
3,654,703,392,461,34
4,654,703,319,387,3
5,654,703,247,316,78
6,654,703,167,242,1.340
7,654,703,13,163,9月3日(最新日)
8,581,650,320,388,重\n症
9,537,650,611,679,調\n束\n中


In [18]:
t1 = set(df["x1"].unique()) & set(df["x1"].unique() + 1)

for i in t1:
    df["x1"] = df["x1"].replace(i, i - 1)

In [19]:
t2 = set(df["y2"].unique()) & set(df["y2"].unique() + 1)

for i in t2:
    df["y2"] = df["y2"].replace(i, i - 1)

In [20]:
pv = df.pivot_table(index="y2", columns="x1", values="text", aggfunc=lambda x: " ".join(str(v) for v in x))

In [21]:
pv

x1,11,12,167,247,319,335,392,465,505,537,610
y2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
146,,,陽性率,,,PCR検査実施人数\n| PCR検査陽性者数,,,PCR検査陽人者数,,
227,9月3日(最新日),,3.7%\n※直近一週間の平均,,,368,,,19,,
308,6月16日からの里計\n(最新日含む),,4.9%,,,23.652\n(A),,,1.152\n(D),,
385,6月15日までの里計,,3.9%,,,9.201\n(B),,,360\n),,
465,果計,,4.6%\n(P)/(C),,,32.853\n(C)ニ(A+ (B),,,1.512\n(F) = (D) + (E),,
650,,,勧退\n告院\n解又\n除は,入\n院 | 生\n症,重\n症,,施\n設\n療\n養,自\n宅\n療\n状,,死,調\n束\n中
703,,9月3日(最新日),1.340,78,3,,34,28,,25,7


In [22]:
pv1 = pv.iloc[:-2].copy()

In [23]:
pv1.dropna(thresh=4, inplace=True)
pv1.dropna(how="all", axis=1, inplace=True)

pv1.fillna("", inplace=True)

In [24]:
def data_split(s):
    x = str(s).split()
    if len(x) == 0:
        return ""
    else:
        if x[0].endswith("%"):
            return float(x[0].rstrip("%"))

        t = x[0].replace(".", "")

        if t.isdigit():
            return int(t)

        return t

In [25]:
pv1 = pv1.applymap(data_split)

In [26]:
pv1.set_axis(["日付", "陽性率", "PCR検査実施人数", "PCR検査陽性者数"], axis=1, inplace=True)

In [27]:
pv1["日付"] = pv1["日付"].str.replace("\(最新日\)", "").str.replace("里計", "累計").str.replace("果計", "累計")

In [28]:
pv1.set_index("日付", inplace=True)

In [29]:
pv1

Unnamed: 0_level_0,陽性率,PCR検査実施人数,PCR検査陽性者数
日付,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9月3日,3.7,368,19
6月16日からの累計,4.9,23652,1152
6月15日までの累計,3.9,9201,360
累計,4.6,32853,1512


In [39]:
pv1.to_csv("kensa.csv", encoding="utf_8_sig")

In [30]:
pv2 = pv.iloc[-2:].copy()

In [31]:
pv2.dropna(thresh=8, inplace=True)

In [32]:
pv2.dropna(how="all", axis=1, inplace=True)

In [33]:
pv2

x1,12,167,247,319,392,465,537,610
y2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
703,9月3日(最新日),1.34,78,3,34,28,25,7


In [34]:
s1 = pv2.iloc[0]

In [35]:
s1.index = ["日付", "退院又は勧告解除", "入院", "重症", "施設療養", "自宅療養", "死亡", "調整中"]

In [36]:
s1.name = "陽性者の状況"

In [37]:
s1 = s1.str.replace(".", "").str.replace("\(最新日\)", "")

In [38]:
s1

日付          9月3日
退院又は勧告解除    1340
入院            78
重症             3
施設療養          34
自宅療養          28
死亡            25
調整中            7
Name: 陽性者の状況, dtype: object

In [40]:
s1.to_csv("main.csv", encoding="utf_8_sig")