<a href="https://colab.research.google.com/github/imabari/covid19-data/blob/master/aichi/aichi_ocr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Register the alex-p/tesseract-ocr PPA as an apt package source
# (-y answers the confirmation prompt automatically).
!add-apt-repository ppa:alex-p/tesseract-ocr -y

In [0]:
# Refresh the apt package index.
!apt update

In [0]:
!apt install tesseract-ocr
!apt install libtesseract-dev

In [0]:
# Print the installed Tesseract version to confirm the install worked.
!tesseract -v

In [0]:
!apt install tesseract-ocr-jpn  tesseract-ocr-jpn-vert
!apt install tesseract-ocr-script-jpan tesseract-ocr-script-jpan-vert

In [0]:
# List the languages Tesseract can use, to confirm the Japanese data is available.
!tesseract --list-langs

# スクレイピング

In [0]:
import pathlib
import re

In [0]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [0]:
# Page that is fetched and scraped by the following cells.
url = "https://www.pref.aichi.jp/site/covid19-aichi/"

# Request headers: a browser-like User-Agent string.
headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"

In [0]:
# Fetch the page with the custom headers. requests applies no timeout by
# default, so set one to keep a stalled connection from hanging the notebook.
r = requests.get(url, headers=headers, timeout=30)

# Fail fast on a 4xx/5xx response instead of parsing an error page.
r.raise_for_status()

# Parse the raw response bytes with the html5lib parser.
soup = BeautifulSoup(r.content, "html5lib")

In [0]:
# Locate the <img> whose alt text ends with "検査陽性者".
# soup.find returns None when nothing matches, so check explicitly and fail
# with a clear message instead of an AttributeError on None.
img_tag = soup.find("img", alt=re.compile("検査陽性者$"))

if img_tag is None:
    raise ValueError("no <img> whose alt text ends with '検査陽性者' was found on the page")

src = img_tag.get("src")

# A tag without a src would otherwise pass None on to the URL join.
if not src:
    raise ValueError("the matched <img> has no src attribute")

In [0]:
# Resolve the (possibly relative) src against the page URL to get an absolute URL.
link = urljoin(url, src)
# Bare expression so the notebook displays the resolved URL.
link

# ダウンロード

In [0]:
def get_file(url, dir=".", headers=None, timeout=30):
    """Download ``url`` and save it under ``dir``, named after the URL's last path segment.

    Args:
        url: Absolute URL of the file to download.
        dir: Directory to save into; created (including parents) if missing.
        headers: Optional dict of HTTP headers to send with the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        pathlib.Path of the saved file.

    Raises:
        ValueError: If no file name can be derived from the URL path.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Local import so this cell does not depend on editing the import cell.
    from urllib.parse import urlsplit

    # Take the name from the URL *path* only, so a query string or fragment
    # (e.g. "image.jpg?v=2") does not end up in the file name.
    name = pathlib.PurePosixPath(urlsplit(url).path).name
    if not name:
        raise ValueError(f"cannot derive a file name from URL: {url!r}")

    r = requests.get(url, headers=headers, timeout=timeout)

    # Without this check an error page body would be saved as if it were the file.
    r.raise_for_status()

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode='wb') as fw:
        fw.write(r.content)

    return p

In [0]:
# Download the image (into get_file's default directory) and keep its local path.
jpg_path = get_file(link)

# pytesseract

In [0]:
# Install pytesseract, the Python wrapper used below to call Tesseract.
!pip install pytesseract

In [0]:
try:
    from PIL import Image
except ImportError:
    import Image
    
import pytesseract

In [0]:
import cv2
import numpy as np

In [0]:
from google.colab.patches import cv2_imshow

In [0]:
# img = cv2.imread(str(jpg_path))
# img, _ = cv2.decolor(img)

In [0]:
# Load the downloaded image (OpenCV returns it in BGR channel order).
src = cv2.imread(str(jpg_path))

# cv2.imread reports failure by returning None rather than raising, so check
# here instead of letting the next call fail with an opaque OpenCV error.
if src is None:
    raise OSError(f"cv2.imread could not load {jpg_path} (missing or not a decodable image)")

# Build a binary mask: pixels whose channels fall inside the given bounds
# become 255, all others 0. Presumably this separates the text from the
# background for OCR (TODO confirm against the image).
img = cv2.inRange(src, (150, 150, 100), (255, 255, 255))

In [0]:
# img = cv2.imread(str(jpg_path), 0)

In [0]:
# Select the region of interest: keep only the first 250 pixel rows of the image.
img_crop = img[0:250]

In [0]:
# Run OCR on the cropped image with the Japanese model; "--psm 6" tells
# Tesseract to treat the image as a single uniform block of text.
# Periods are stripped from the result, presumably so that digit groups OCR
# separates with "." are read as one number by the later digit extraction
# (TODO confirm).
txt = pytesseract.image_to_string(img_crop, lang="jpn", config="--psm 6").replace(".", "")

In [0]:
# Show the raw OCR text for visual inspection.
print(txt)

In [0]:
# Pull every run of ASCII digits out of the OCR text and convert each to int.
data = list(map(int, re.findall("[0-9]+", txt)))

In [0]:
# Scan the OCR'd numbers for the first run of nine consecutive values that is
# internally consistent:
#   row[2] == row[3] + row[4]
#   row[1] == row[2] + row[5] + row[6] + row[7] + row[8]
# These sums presumably mirror the total/sub-total layout of the source table
# (TODO confirm against the image).
# The scan slides a window over `data` instead of popping from it, so the
# parsed numbers stay intact for inspection after this cell has run.
ROW_WIDTH = 9

result = []

for start in range(len(data) - ROW_WIDTH + 1):
    row = data[start:start + ROW_WIDTH]

    if row[2] == row[3] + row[4] and row[1] == row[2] + sum(row[5:9]):
        result = row
        break

# CSV

In [0]:
import datetime

In [0]:
import csv

In [0]:
# Look for a "YYYY年M月D日H時" timestamp in the OCR text.
# The pattern is a raw string so that \d reaches the regex engine as written;
# in a normal string literal "\d" is an invalid escape sequence, which newer
# Python versions warn about.
dt_match = re.search(r"(\d{4})年(\d{1,2})月(\d{1,2})日(\d{1,2})時", txt)

In [0]:
# Write the extracted row to "<timestamp>.csv" and archive the image under the
# same timestamp; report when no consistent row was found.
if result:

    # The source is a Japanese prefecture page, so the OCR'd timestamp is
    # presumably JST (TODO confirm). Use JST for the fallback too, rather than
    # the runtime's local clock, which is often UTC on hosted runtimes.
    jst = datetime.timezone(datetime.timedelta(hours=9))
    dt_update = None

    if dt_match:
        y, m, d, h = map(int, dt_match.groups())
        try:
            dt_update = datetime.datetime(y, m, d, h).strftime("%Y%m%d%H00")
        except ValueError:
            # OCR can produce impossible values (e.g. month 13 or hour 24);
            # fall back to the current time instead of aborting the cell.
            print(f"invalid OCR timestamp {dt_match.group(0)!r}; using current time")

    if dt_update is None:
        dt_update = datetime.datetime.now(jst).strftime("%Y%m%d%H00")

    p = pathlib.Path(f'{dt_update}.csv')

    # The csv module expects files opened with newline='' so that it controls
    # the line endings itself.
    with p.open(mode='w', newline='') as fw:
        writer = csv.writer(fw)
        writer.writerow([dt_update] + result)

    print(result)

    # Archive the image under the same timestamp. Guarded so that re-running
    # this cell after the file was already renamed does not raise.
    if jpg_path.exists():
        jpg_path.rename(f"{dt_update}.jpg")

else:
    print("見つかりません")

In [0]:
# Check: display the cropped image that was passed to OCR.
cv2_imshow(img_crop)