This notebook does mostly the same as `07-pipeline`, but uses some tricks to speed things up, such as multiprocessing, and makes sure memory does not overflow by processing the data in chunks.

In [1]:
%load_ext autoreload
%autoreload 2

In [20]:
import pandas as pd
import numpy as np
import cv2 as cv
import sqlalchemy
from sqlalchemy import create_engine, text

import os
from enum import Enum
from glob import glob
import logging
from typing import List
from concurrent.futures import ThreadPoolExecutor
from itertools import repeat
import time

In [3]:
# Route all log records of level INFO and above to a log file in the working
# directory, prefixed with a timestamp and a fixed-width level name.
logging.basicConfig(
    filename="08-pipeline-multiprocessing.log",
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
)

In [22]:
# SQLAlchemy connection URL of the database that receives the extracted
# features. Set the BUKAN_ENGINE_STRING environment variable to point the
# notebook at another host/database without editing this cell; the default is
# the local development database.
engine_string = os.environ.get("BUKAN_ENGINE_STRING", "mysql://bukanuser@localhost/bukan")

In [4]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """Iterate over ``sequence`` while showing a progress bar widget.

    From <https://github.com/kuk/log-progress>

    This is a generator: it yields every record of ``sequence`` unchanged and
    updates an ipywidgets progress bar plus a text label as it goes.

    Parameters
    ----------
    sequence
        Iterable to wrap.
    every
        Refresh the widgets on the first item and then on every ``every``-th
        item. If omitted and the size is known, it defaults to 1 for sizes up
        to 200 and to ``int(size / 200)`` otherwise. It must be given when the
        size is unknown.
    size
        Total number of items. If omitted, ``len(sequence)`` is used; when
        ``sequence`` has no length, the size is treated as unknown.
    name
        Prefix used in the text label.

    Raises
    ------
    AssertionError
        If the size is unknown and ``every`` was not given.
    """
    # Imported lazily so the notebook-only dependencies are needed only when
    # the function is actually called.
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Work out the total size; objects without len() are treated as iterators
    # of unknown length.
    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        # Unknown length: show a full bar in "info" style instead of a
        # meaningful fill level.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    # Stays 0 if the sequence is empty; used for the final label below.
    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            # Only touch the widgets on the first item and on every
            # ``every``-th item.
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        # Anything raised while iterating (including exceptions raised at the
        # ``yield``) turns the bar red and is re-raised unchanged.
        progress.bar_style = 'danger'
        raise
    else:
        # Normal exhaustion: mark success and show the final count ('?' when
        # no item was seen).
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

In [46]:
# Load the book overview table; the first CSV column becomes the DataFrame index.
overview = pd.read_csv("bukan-overview-final.csv", index_col=0)

## Processing Images

What I need to do now per image is:

1. Read, greyscale and crop all images
2. Split right/left page if necessary
3. Extract features

In [6]:
class Page(Enum):
    """Identifies which part of a scanned image a record belongs to.

    Japanese reading order is from right to left.
    """

    # The image is used as a single, unsplit page.
    whole = 0
    # Right half of a split image.
    right = 1
    # Left half of a split image.
    left = 2

In [7]:
def crop_image(img, target_height=660, target_width=990):
    """Return the centred ``target_height`` x ``target_width`` crop of ``img``.

    Parameters
    ----------
    img
        Array whose first two axes are (height, width); further axes (for
        example colour channels) are kept untouched.
    target_height, target_width
        Size of the crop in pixels. The defaults are the values this pipeline
        has always used.

    Returns
    -------
    A view on ``img`` with shape ``(target_height, target_width, ...)``.

    Raises
    ------
    ValueError
        If ``img`` is smaller than the target in either dimension. Without
        this check the crop offsets become negative and the slice silently
        returns a wrong (much smaller) region.
    """
    # Only look at the first two axes so multi-channel images work as well.
    height, width = img.shape[:2]
    if height < target_height or width < target_width:
        raise ValueError(
            f"Image of size {width}x{height} is smaller than the crop size "
            f"{target_width}x{target_height}"
        )
    top = (height - target_height) // 2
    left = (width - target_width) // 2
    return img[top:top + target_height, left:left + target_width]

In [8]:
def read_image(path):
    """Load the image at ``path`` and return its centre crop.

    The file is decoded with ``cv.IMREAD_REDUCED_GRAYSCALE_4`` and then passed
    through ``crop_image``.

    Raises
    ------
    OSError
        If OpenCV cannot read or decode the file. ``cv.imread`` signals this
        by returning ``None`` instead of raising, which would otherwise show
        up as an unrelated ``AttributeError`` during cropping.
    """
    img = cv.imread(path, flags=cv.IMREAD_REDUCED_GRAYSCALE_4)
    if img is None:
        raise OSError(f"Could not read image: {path}")
    return crop_image(img)

In [9]:
def split_image(img):
    """Cut a 990 pixel wide image into its left and right halves.

    Returns a ``(left_half, right_half)`` tuple of views on ``img``. The image
    must be two-dimensional and exactly 990 pixels wide.
    """
    _, total_width = img.shape
    assert total_width == 990
    middle = total_width // 2
    left_half = img[:, :middle]
    right_half = img[:, middle:]
    return left_half, right_half

In [10]:
def extract_page_nr_from_path(path):
    """Parse the page number out of an image path.

    The five characters directly in front of the last four characters of
    ``path`` (presumably a ``.jpg`` style extension) are read as an integer.
    """
    page_digits = path[-9:-4]
    return int(page_digits)

In [11]:
def write_image(image: np.ndarray, book_id: int, page_nr: int, page_enum: Page):
    """Save ``image`` as a JPEG below ``output/images/<book_id>/``.

    The file name is ``<book_id>_<page_nr padded to 5 digits>_<page value>.jpg``.
    The image is written with JPEG quality 80 and the optimize flag enabled.
    The target directory is not created here.

    Raises
    ------
    OSError
        If OpenCV reports that the file could not be written.
    """
    path = f"output/images/{book_id}/{book_id}_{page_nr:0>5}_{page_enum.value}.jpg"
    # The write must not live inside an ``assert``: assertions are removed
    # when Python runs with -O, which would silently skip writing the image.
    written = cv.imwrite(path, image, [cv.IMWRITE_JPEG_QUALITY, 80, cv.IMWRITE_JPEG_OPTIMIZE, True])
    if not written:
        raise OSError(f"Could not write image: {path}")
    logging.info(f"Image written: {path}")

In [41]:
def descriptors_to_dataframe(descriptors: np.ndarray, book_id: int, page_nr: int, page_enum: Page):
    """Wrap a descriptor array in a DataFrame keyed by book, page and side.

    The returned frame has a four-level index ``(book, page, lr, feature)``:
    the first three levels are constant for the whole frame and ``feature`` is
    the row position within ``descriptors``.
    """
    frame = pd.DataFrame(descriptors)
    index_levels = [[book_id], [page_nr], [page_enum.value], frame.index]
    index_names = ["book", "page", "lr", "feature"]
    frame.index = pd.MultiIndex.from_product(index_levels, names=index_names)
    return frame

In [42]:
def keypoints_to_dataframe(keypoints: List[cv.KeyPoint], book_id: int, page_nr: int, page_enum: Page):
    """Turn a list of OpenCV keypoints into a DataFrame keyed by book, page and side.

    Each keypoint becomes one row with the columns ``x``, ``y``, ``size``,
    ``angle``, ``response``, ``octave`` and ``class_id``. The index has the
    four levels ``(book, page, lr, feature)``, where ``feature`` is the
    position of the keypoint in ``keypoints``.
    """
    column_names = ["x", "y", "size", "angle", "response", "octave", "class_id"]
    rows = []
    for kp in keypoints:
        rows.append((kp.pt[0], kp.pt[1], kp.size, kp.angle, kp.response, kp.octave, kp.class_id))
    frame = pd.DataFrame(rows, columns=column_names)
    index_levels = [[book_id], [page_nr], [page_enum.value], frame.index]
    frame.index = pd.MultiIndex.from_product(index_levels,
                                             names=["book", "page", "lr", "feature"])
    return frame

In [40]:
def detect_features(image: np.ndarray, book_id: int, page_nr: int, page_enum: Page,
                    engine: sqlalchemy.engine.Engine, detector: cv.Feature2D):
    """Detect features on ``image`` and append them to the database.

    ``detector.detectAndCompute`` is run without a mask. When it yields no
    descriptors a warning is logged and nothing is stored. Otherwise the
    descriptors are appended to the ``descriptor`` table and the keypoints to
    the ``keypoint`` table, in that order, as two separate writes.
    """
    found_keypoints, found_descriptors = detector.detectAndCompute(image, None)
    if found_descriptors is None:
        logging.warning(f"No features detected for: {book_id}/{page_nr}/{page_enum.name}")
        return

    # Descriptors first ...
    descriptor_frame = descriptors_to_dataframe(found_descriptors, book_id, page_nr, page_enum)
    descriptor_frame.to_sql("descriptor", engine, if_exists="append")
    logging.info(f"Descriptors written to database for: {book_id}/{page_nr}/{page_enum.name}")

    # ... then the keypoints they belong to.
    keypoint_frame = keypoints_to_dataframe(found_keypoints, book_id, page_nr, page_enum)
    keypoint_frame.to_sql("keypoint", engine, if_exists="append")
    logging.info(f"Keypoints written to database for: {book_id}/{page_nr}/{page_enum.name}")

In [39]:
def process_path(path, book_id: int, nr_pages_per_image: int, engine: sqlalchemy.engine.Engine,
                 detector: cv.Feature2D):
    """Preprocess one scanned image and store its pages and features.

    The image is read and cropped first. With one page per image it is handled
    as ``Page.whole``; with two pages per image it is split and the right half
    is handled before the left half. For each resulting page the image is
    written to disk and its features are stored in the database. Any other
    page count only produces a warning in the log.
    """
    page_nr = extract_page_nr_from_path(path)
    image = read_image(path)

    if nr_pages_per_image == 1:
        parts = [(image, Page.whole)]
    elif nr_pages_per_image == 2:
        left_half, right_half = split_image(image)
        # Right page first, following the reading order.
        parts = [(right_half, Page.right), (left_half, Page.left)]
    else:
        logging.warning(f"Strange number of pages per image for {path}: {nr_pages_per_image} (Skipping)")
        return

    for part_image, part_enum in parts:
        write_image(part_image, book_id, page_nr, part_enum)
        detect_features(part_image, book_id, page_nr, part_enum, engine, detector)

In [38]:
def save_preprocessed_images_and_features(overview_df: pd.DataFrame, engine: sqlalchemy.engine.Engine,
                                          detector: cv.Feature2D):
    """Run the image preprocessing and feature extraction for every book.

    For each row of ``overview_df`` (index value = book ID) the output
    directory ``output/images/<book_id>`` is created, all
    ``data/<book_id>/image/*.jpg`` files are collected in sorted order and
    each of them is passed to ``process_path``.

    Parameters
    ----------
    overview_df
        Table with the columns ``NrImages`` (expected number of image files)
        and ``NrPages`` (pages per image), indexed by book ID.
    engine
        Database engine handed through to ``process_path``.
    detector
        Feature detector handed through to ``process_path``.

    Raises
    ------
    AssertionError
        If the number of image files found for a book differs from
        ``NrImages``.
    Exception
        Any other error from the processing steps. Every error is logged at
        CRITICAL level with its traceback and then re-raised.
    """
    try:
        for book_id, book_metadata in log_progress(overview_df.iterrows(), every=1, size=len(overview_df), name="Rows"):
            os.makedirs(f"output/images/{book_id}", exist_ok=True)
            nr_images = book_metadata["NrImages"]
            nr_pages_per_image = book_metadata["NrPages"]
            image_paths = sorted(glob(f"data/{book_id}/image/*.jpg"))
            # The message matters: without it the log entry written below
            # would not say which book was inconsistent.
            assert len(image_paths) == nr_images, (
                f"Expected {nr_images} images for book {book_id}, found {len(image_paths)}"
            )
            for path in image_paths:
                process_path(path, book_id, nr_pages_per_image, engine, detector)
    except Exception:
        # exc_info keeps the exception type and traceback in the log file;
        # logging only str(e) is empty for exceptions without a message.
        logging.critical("Image and feature processing aborted", exc_info=True)
        raise

In [49]:
engine = create_engine(engine_string)
akaze = cv.AKAZE_create(cv.AKAZE_DESCRIPTOR_MLDB_UPRIGHT, descriptor_size=0, threshold=0.005)
start_time = time.monotonic()
try:
    # NOTE(review): this call used to receive `remaining`, a name that is not
    # defined in any visible cell, so a fresh kernel failed with NameError.
    # It presumably was a leftover from an interrupted session; the full
    # overview table is processed instead. Confirm this is the intended input.
    save_preprocessed_images_and_features(overview, engine, akaze)
    stop_time = time.monotonic()
finally:
    # Release the connection pool even when processing fails.
    engine.dispose()
print("All of this took:", stop_time - start_time, "seconds.")

VBox(children=(HTML(value=''), IntProgress(value=0, max=336)))

All of this took: 10390.628847228014 seconds.


## Processing Feature Pairs

First, I need to get all book combinations as well as a fixed page offset. For each combination I need to run the full pipeline:

1. Find matching features
2. Filter features by their position
3. Compute the homography
4. Select features using the homography mask
5. **Don't threshold the features by their count**
6. Save them to disk

In [50]:
# Show the overview table loaded earlier (one row per book).
overview

Unnamed: 0_level_0,公開時期,オープンデータ分類,書名（統一書名）,TitleHiragana,TitleRomanji,刊・写,原本請求記号,刊年・書写年,（西暦）,冊数等,(単位),NrPages,Aspect,NrImages
国文研書誌ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
200018466,H29.12,政治・法制,応仁武鑑,おうにんぶかん,ōninbukan,刊,ＭＹ－１４９０－７,天保１５,1844,2.0,冊,2,Portrait,78
200018476,H29.12,政治・法制,応仁武鑑,おうにんぶかん,ōninbukan,刊,ＭＹ－１４９０－８,弘化３,1846,3.0,冊,2,Portrait,100
200018713,H29.12,政治・法制,本朝武鑑,ほんちょうぶかん,honchōbukan,刊,ＭＹ－１２０１－５３,［貞享３］,[1686],1.0,冊,2,Landscape,75
200018714,H29.12,政治・法制,本朝武鑑,ほんちょうぶかん,honchōbukan,刊,ＭＹ－１２０１－５４,貞享３,1686,1.0,冊,2,Landscape,94
200018718,H29.12,政治・法制,太平武鑑,たいへいぶかん,taihenbukan,刊,ＭＹ－１２０１－５８,［元禄２］,[1689],1.0,冊,2,Landscape,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200019651,H29.12,政治・法制,御国分武鑑,おくにわけぶかん,okuniwakebukan,刊,ＭＹ－１２０１－３１５,慶応４,1868,1.0,冊,2,Landscape,28
200019652,H29.12,政治・法制,御国分武鑑,おくにわけぶかん,okuniwakebukan,刊,ＭＹ－１２０１－３１６,慶応４,1868,1.0,冊,2,Landscape,29
200019661,H29.12,政治・法制,昇栄武鑑,しょうえいぶかん,shōeibukan,刊,ＭＹ－１２０１－３２４,嘉永６,1853,1.0,冊,2,Landscape,124
200019662,H29.12,政治・法制,昇栄武鑑,しょうえいぶかん,shōeibukan,刊,ＭＹ－１２０１－３２５,安政３,1856,1.0,冊,2,Landscape,125


In [54]:
def run_sql_query(query: str):
    """Execute ``query`` against the database and return all result rows.

    A fresh engine is created from ``engine_string`` for every call and is
    disposed again afterwards, also when the query fails.

    ``query`` is executed verbatim as SQL text, so it must come from trusted
    code only and must not be built from external input.

    Returns
    -------
    The list of rows produced by ``fetchall()``.
    """
    engine = create_engine(engine_string)
    try:
        with engine.connect() as conn:
            return conn.execute(text(query)).fetchall()
    finally:
        # Previously skipped when the query raised, leaking the engine's
        # connection pool.
        engine.dispose()

In [74]:
# For every book, count how many distinct pages have descriptors stored in the database.
db_books = run_sql_query("select book, count(distinct page) from descriptor group by book")

In [75]:
# Sanity check: no book may have more pages in the database than it has images.
# The counts are matched to the overview by book ID; comparing the two value
# arrays by position is only correct if the query result and the overview
# happen to be in the same order, which the query (no ORDER BY) does not
# guarantee. Books missing from the overview make the check fail.
db_page_counts = pd.Series({row[0]: row[1] for row in db_books})
(db_page_counts <= overview["NrImages"].reindex(db_page_counts.index)).all()

True

In [76]:
# Every book of the overview must have descriptors in the database. Compare
# against the table itself rather than a hard-coded book count.
len(db_books) == len(overview)

True