In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import cv2 as cv
import os
from glob import glob
from enum import Enum
from itertools import chain, combinations
import time
from collections.abc import Iterable
from operator import itemgetter
import math

In [3]:
from helpers.features import show_image
from helpers.pipeline import applymapi, find_matches, select_keypoints, compute_homography_and_mask, sum_homography_mask, filter_out_bad_homographies

In [4]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while showing a notebook progress bar.

    Adapted from <https://github.com/kuk/log-progress>.

    Parameters
    ----------
    sequence : iterable
        Items to pass through unchanged.
    every : int, optional
        Refresh the widgets on the first item and then on every ``every``-th
        item. Defaults to 1 for up to 200 items and to ``int(size / 200)``
        otherwise. Required when the length cannot be determined.
    size : int, optional
        Number of items; ``len(sequence)`` is used when omitted.
    name : str
        Caption shown in front of the counter.

    Raises
    ------
    AssertionError
        If the length is unknown and ``every`` was not given.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Try to learn the length; a sized sequence gets a real bar,
    # anything else gets an indeterminate one.
    unknown_length = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_length = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        # At most ~200 refreshes over the whole run (every 0.5%).
        every = 1 if size <= 200 else int(size / 200)

    if unknown_length:
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    display(VBox(children=[caption, bar]))

    count = 0
    try:
        for count, item in enumerate(sequence, 1):
            if count == 1 or count % every == 0:
                if unknown_length:
                    caption.value = '{name}: {index} / ?'.format(name=name, index=count)
                else:
                    bar.value = count
                    caption.value = u'{name}: {index} / {size}'.format(
                        name=name, index=count, size=size)
            yield item
    except:
        # Mark the bar red for any interruption, then let it propagate.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = count
        caption.value = "{name}: {index}".format(name=name, index=str(count or '?'))

## Read in and filter metadata

In [5]:
# Load the book overview; the second CSV column becomes the index and the two
# listed columns are kept as strings.
overview = pd.read_csv("bukan-overview.csv", index_col=1, dtype={
    "国文研書誌ID": str, "冊数等": str
})
# Keep only the rows marked "x" in the "K?" column, then drop the two helper columns.
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
overview = (
    overview[overview["K?"] == "x"]
    .drop(["K?", "Comment"], axis=1)
    .astype({"Pages per Scan": int})
)

In [6]:
def get_lists_of_images(overview_df):
    """Collect the scan paths of every book listed in ``overview_df``.

    For each book id in ``overview_df.index`` the JPEG files below
    ``data/<book_id>/image`` are listed.

    The paths are sorted: ``glob`` returns files in arbitrary order, but the
    position in the list is later used as the page number, so the order has
    to be deterministic.

    Parameters
    ----------
    overview_df : object with an ``index`` of book ids (e.g. a DataFrame)

    Returns
    -------
    list of list of str
        One sorted list of image paths per book, in index order.

    Raises
    ------
    AssertionError
        If the image directory of a book does not exist.
    """
    images_per_book = []
    for book_id in overview_df.index:
        book_id_image_path = os.path.join("data", str(book_id), "image")
        # isdir() is False for missing paths too, so this also covers the
        # parent directory "data/<book_id>".
        assert os.path.isdir(book_id_image_path), \
            f"Missing image directory: {book_id_image_path}"
        images_per_book.append(sorted(glob(f"{book_id_image_path}/*.jpg")))
    return images_per_book
# Attach each book's list of scan paths as the "Images" column.
overview = overview.assign(Images=get_lists_of_images(overview))

In [7]:
# Number of scans per book (length of each book's image list).
overview = overview.assign(NrImages=overview["Images"].apply(len))

In [8]:
# Slimmer copy without these bibliographic columns; `overview` itself keeps them.
overview_filtered = overview.drop(["公開時期", "オープンデータ分類", "刊・写", "原本請求記号", "刊年・書写年", "（西暦）", "冊数等", "(単位)"], axis=1)

In [9]:
class Page(Enum):
    """Side of a scan that a page image comes from.

    Japanese reading order is from right to left.
    The integer values end up in file names and in the "lr" index level.
    """
    # Used for scans that hold a single page ("Pages per Scan" == 1).
    unknown = 0
    # The two halves of a double-page scan.
    right = 1
    left = 2

In [10]:
# Count the editions per (title, pages-per-scan) pair ...
count_by_bukan = overview_filtered.groupby(["書名（統一書名）", "Pages per Scan"]).count()
# ... and keep only the pairs that occur more than once.
count_by_bukan = count_by_bukan[count_by_bukan["Images"] > 1]

In [11]:
# Dropping two strange rows where the page format doesn't fit the rest of the Bukan editions.
# mask() turns the matching rows into NaN, dropna() removes them, and astype() restores
# the integer columns that mask() converted to float.
# NOTE(review): dropna() also removes any other row that contains a NaN in one of the
# remaining columns — confirm that this is intended.
overview_filtered = overview_filtered.mask(
    (overview_filtered["書名（統一書名）"] == "有司武鑑") &
    (overview_filtered["Pages per Scan"] == 2)).dropna().astype({"NrImages":int, "Pages per Scan":int})

In [12]:
# Number of editions per title ...
overview_bukan_count = overview_filtered.groupby(["書名（統一書名）"]).count()["Images"]
# ... restricted to titles that have more than one edition.
overview_bukan_count = overview_bukan_count[overview_bukan_count > 1]

In [13]:
# These are the Bukan titles with more than one edition
# (only those can be compared against each other):
useful_bukan_titles = set(overview_bukan_count.index)

In [14]:
# All editions whose title has more than one edition; this is the input of the matching pipeline.
final_overview = overview_filtered[overview_filtered["書名（統一書名）"].isin(useful_bukan_titles)]

In [15]:
# Summary statistics of the number of scans per edition, grouped by title.
# The index of this frame is also the list of titles that run_pipeline iterates over.
final_overview_by_title = final_overview.groupby(["書名（統一書名）"]).describe()["NrImages"]
final_overview_by_title

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
書名（統一書名）,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
万世武鑑,3.0,149.333333,49.692387,92.0,134.0,176.0,178.0,180.0
享保武鑑,6.0,251.666667,76.912071,96.0,271.25,279.5,285.5,299.0
享和武鑑,3.0,463.666667,9.0185,455.0,459.0,463.0,468.0,473.0
元文武鑑,2.0,289.0,4.242641,286.0,287.5,289.0,290.5,292.0
元禄武鑑大全,4.0,57.25,11.056672,44.0,53.0,57.0,61.25,71.0
大成武鑑,24.0,568.25,151.016627,81.0,574.75,643.5,648.25,666.0
天保武鑑,15.0,501.533333,93.670293,228.0,524.0,527.0,547.0,553.0
天明武鑑,7.0,379.571429,146.632031,51.0,405.0,435.0,449.0,463.0
太平武鑑,8.0,64.25,21.552262,48.0,50.5,52.0,73.25,103.0
安政武鑑,3.0,555.666667,6.429101,551.0,552.0,553.0,558.0,563.0


## Find features (descriptors and keypoints)

In [16]:
def crop_image(img, target_height=660, target_width=990):
    """Return the centred ``target_height`` x ``target_width`` window of ``img``.

    Parameters
    ----------
    img : numpy.ndarray
        Image with rows on axis 0 and columns on axis 1; further axes
        (e.g. colour channels) are passed through unchanged.
    target_height, target_width : int
        Size of the crop window in pixels.

    Returns
    -------
    numpy.ndarray
        A view on ``img``. If the image is smaller than the window along an
        axis, the full extent of that axis is returned.
    """
    height, width = img.shape[:2]
    # Clamp the offsets at 0: a negative start index would wrap around and
    # silently select an off-centre strip when the image is smaller than the window.
    x1 = max((width - target_width) // 2, 0)
    y1 = max((height - target_height) // 2, 0)
    return img[y1:y1 + target_height, x1:x1 + target_width]

In [17]:
def read_image(path):
    """Load the scan at ``path`` with ``cv.IMREAD_REDUCED_GRAYSCALE_4`` and centre-crop it.

    Parameters
    ----------
    path : str
        Location of the image file.

    Returns
    -------
    numpy.ndarray
        The cropped image as returned by ``crop_image``.

    Raises
    ------
    OSError
        If OpenCV cannot read the file.
    """
    img = cv.imread(path, flags=cv.IMREAD_REDUCED_GRAYSCALE_4)
    # cv.imread reports failure by returning None instead of raising.
    if img is None:
        raise OSError(f"Could not read image: {path}")
    return crop_image(img)

In [18]:
def split_image(img):
    """Cut a double-page scan down the middle.

    ``img`` must be a 2-D array that is exactly 990 pixels wide.
    Returns the pair ``(left_half, right_half)``.
    """
    _, width = img.shape
    assert width == 990
    middle = width // 2
    left_half = img[:, :middle]
    right_half = img[:, middle:]
    return left_half, right_half

In [19]:
def _write_grey_image(path, image):
    """Write ``image`` to ``path`` as a JPEG (quality 80, optimised) or raise ``OSError``."""
    # cv.imwrite reports failure through its return value. The call must not sit
    # inside an ``assert``: asserts, including their side effects, are removed
    # when Python runs with -O.
    if not cv.imwrite(path, image, [cv.IMWRITE_JPEG_QUALITY, 80, cv.IMWRITE_JPEG_OPTIMIZE, True]):
        raise OSError(f"Could not write image: {path}")


def _detect_page_features(detector, image):
    """Return ``(keypoints, descriptors)`` of ``image``, or ``None`` if nothing was detected."""
    keypoints, descriptors = detector.detectAndCompute(image, None)
    if descriptors is None:
        return None
    assert len(keypoints) == descriptors.shape[0]
    return keypoints, descriptors


def save_preprocessed_images_and_features(overview_df):
    """Preprocess all scans and store images and AKAZE features on disk.

    In theory, this function only needs to run once. For every book in
    ``overview_df`` each scan is read with ``read_image`` (greyscale, cropped)
    and, for books with two pages per scan, split into a right and a left page.
    Each page image is written to
    ``data/grey/<book_id>/image/<book_id>_<page:05>_<Page value>.jpg``.

    Keypoints and descriptors of all pages of a book are stored in Apache
    Parquet format as ``data/grey/<book_id>/keypoints.parquet`` and
    ``data/grey/<book_id>/descriptors.parquet``. Both are indexed by
    ``(page, lr, feature)``. Pages without any feature do not appear in them.
    A book without any feature at all is reported and gets no Parquet files.

    Parameters
    ----------
    overview_df : pandas.DataFrame
        Indexed by book id, with the columns "Pages per Scan" (1 or 2) and
        "Images" (list of scan paths, in page order).

    Raises
    ------
    ValueError
        If "Pages per Scan" is neither 1 nor 2.
    OSError
        If an image cannot be written.
    """
    detector = cv.AKAZE_create(cv.AKAZE_DESCRIPTOR_MLDB_UPRIGHT, descriptor_size=0, threshold=0.005)
    for book_id, book_metadata in log_progress(overview_df.iterrows(), every=1, size=len(overview_df)):
        image_dir = f"data/grey/{str(book_id)}/image"
        os.makedirs(image_dir, exist_ok=True)
        pps = book_metadata["Pages per Scan"]
        if pps not in (1, 2):
            raise ValueError(f"Unsupported number of pages per scan for book {book_id}: {pps}")
        keypoints_list = []
        descriptors_list = []
        index_col_page = []
        index_col_type = []
        index_col_i = []

        # Process the images, save them and calculate their features.
        for page_i, image_path in enumerate(book_metadata["Images"]):
            page_nr = page_i + 1
            image = read_image(image_path)
            if pps == 1:  # It's just one page
                sides = [(Page.unknown, image)]
            else:  # Two pages per scanned image -> split it!
                image_left, image_right = split_image(image)
                # Right page first: Japanese reading order is from right to left.
                sides = [(Page.right, image_right), (Page.left, image_left)]
            for side, side_image in sides:
                _write_grey_image(f"{image_dir}/{str(book_id)}_{page_nr:0>5}_{side.value}.jpg", side_image)
                features = _detect_page_features(detector, side_image)
                if features is None:
                    continue
                keypoints, descriptors = features
                nr_features = descriptors.shape[0]
                keypoints_list.append(keypoints)
                descriptors_list.append(descriptors)
                index_col_page.extend([page_nr] * nr_features)
                index_col_type.extend([side.value] * nr_features)
                index_col_i.extend(range(nr_features))

        if not descriptors_list:
            # np.concatenate cannot handle an empty list; there is nothing to store.
            print("No features found for book:", book_id, "(Skipping feature files)")
            continue

        index = pd.MultiIndex.from_arrays(
            [index_col_page, index_col_type, index_col_i],
            names=["page", "lr", "feature"])
        descriptors_df = pd.DataFrame(np.concatenate(descriptors_list), index=index)
        # Parquet requires string column names.
        descriptors_df.columns = descriptors_df.columns.map(str)
        # One row per keypoint, built in a single pass over all keypoints.
        keypoints_df = pd.DataFrame(
            [(kp.pt[0], kp.pt[1], kp.size, kp.angle, kp.response, kp.octave, kp.class_id)
             for kp in chain.from_iterable(keypoints_list)],
            index=index,
            columns=["x", "y", "size", "angle", "response", "octave", "class_id"])
        assert len(keypoints_df) == len(descriptors_df)
        descriptors_df.to_parquet(f"data/grey/{str(book_id)}/descriptors.parquet", engine="pyarrow")
        keypoints_df.to_parquet(f"data/grey/{str(book_id)}/keypoints.parquet", engine="pyarrow")

## Find matching features

In [20]:
def build_page_index(page_ids, nr_pages_df):
    """Build the (Book1, Book2, Page) index for all pairs of books.

    For every unordered pair of ids from ``page_ids`` (in the order produced
    by ``itertools.combinations``) there is one entry per page of the first
    book, with pages numbered from 1 to ``nr_pages_df[first_id]``.
    """
    entries = [
        (first_id, second_id, page_nr)
        for first_id, second_id in combinations(page_ids, 2)
        for page_nr in range(1, nr_pages_df[first_id] + 1)
    ]
    return pd.MultiIndex.from_tuples(entries, names=("Book1", "Book2", "Page"))

In [21]:
def build_page_df(bukan_df, radius=8):
    """Tabulate, for every page of Book1, the candidate pages of Book2.

    Rows follow ``build_page_index``. The columns are the offsets
    ``-radius .. radius``, and a cell holds the Book2 page number
    ``page + offset``. Candidates below 0 or beyond the number of pages of
    Book2 are stored as 0.

    Parameters
    ----------
    bukan_df : pandas.DataFrame
        Indexed by book id, with the column "NrImages".
    radius : int
        Number of neighbouring pages considered on each side.

    Returns
    -------
    pandas.DataFrame of int32
    """
    nr_pages = bukan_df["NrImages"]
    index = build_page_index(bukan_df.index, nr_pages)
    offsets = np.arange(-radius, radius + 1)
    page_array = np.empty((len(index), offsets.size), dtype=np.int32)
    for row, (_, book2_id, page) in enumerate(index):
        candidates = page + offsets
        in_range = (candidates >= 0) & (candidates <= nr_pages[book2_id])
        page_array[row, :] = np.where(in_range, candidates, 0)
    return pd.DataFrame(page_array, index=index, columns=pd.RangeIndex(-radius, radius + 1))

In [22]:
def descriptors_df_to_series(descriptors_df, page_enum: Page):
    """Group the descriptors of one page type by page.

    ``descriptors_df`` is indexed by ``(page, lr, feature)``. The result is a
    Series indexed by page number; each element is the array of descriptor
    rows of that page for ``lr == page_enum.value``.
    A page type that does not occur raises ``KeyError`` from the ``.loc`` lookup.
    """
    per_type = descriptors_df.swaplevel('lr', 'page').loc[page_enum.value]
    page_numbers = per_type.index.get_level_values(0).unique()
    arrays = []
    for page_nr in page_numbers:
        arrays.append(per_type.loc[page_nr].values)
    return pd.Series(arrays, index=page_numbers)
def read_descriptor_dict(book_ids, page_enum: Page):
    """Load the stored descriptors of several books.

    Reads ``data/grey/<book_id>/descriptors.parquet`` for every id and returns
    a dict mapping book id to the per-page Series produced by
    ``descriptors_df_to_series`` for ``page_enum``.
    """
    desc_dict = {}
    for book_id in book_ids:
        stored = pd.read_parquet(f"data/grey/{str(book_id)}/descriptors.parquet", engine="pyarrow")
        desc_dict[book_id] = descriptors_df_to_series(stored, page_enum)
    return desc_dict

In [23]:
def keypoints_of_page_to_list(keypoints_of_page):
    """Turn the keypoint rows of one page back into ``cv.KeyPoint`` objects.

    The values of every row are passed to ``cv.KeyPoint`` positionally, in
    column order.
    """
    column_values = [values for _, values in keypoints_of_page.items()]
    return [cv.KeyPoint(*fields) for fields in zip(*column_values)]
def keypoints_df_to_series(keypoints_df, page_enum: Page):
    """Group the keypoints of one page type by page.

    ``keypoints_df`` is indexed by ``(page, lr, feature)``. The result is a
    Series indexed by page number; each element is the list of
    ``cv.KeyPoint`` objects of that page for ``lr == page_enum.value``.
    """
    per_type = keypoints_df.swaplevel('lr', 'page').loc[page_enum.value]
    page_numbers = per_type.index.get_level_values(0).unique()
    keypoint_lists = []
    for page_nr in page_numbers:
        keypoint_lists.append(keypoints_of_page_to_list(per_type.loc[page_nr]))
    return pd.Series(keypoint_lists, index=page_numbers)
def read_keypoints_dict(book_ids, page_enum: Page):
    """Load the stored keypoints of several books.

    Reads ``data/grey/<book_id>/keypoints.parquet`` for every id and returns a
    dict mapping book id to the per-page Series produced by
    ``keypoints_df_to_series`` for ``page_enum``.
    """
    kps_dict = {}
    for book_id in book_ids:
        stored = pd.read_parquet(f"data/grey/{str(book_id)}/keypoints.parquet", engine="pyarrow")
        kps_dict[book_id] = keypoints_df_to_series(stored, page_enum)
    return kps_dict

In [24]:
# Half-width and half-height of the window used by is_near.
wradius = 200.
hradius = 200.


def is_near(pt1, pt2):
    """Tell whether ``pt2`` lies in the box of +-wradius x +-hradius around ``pt1``.

    Both points are ``(x, y)`` pairs; the borders of the box count as near.
    """
    (x1, y1), (x2, y2) = pt1, pt2
    within_width = (x1 - wradius) <= x2 <= (x1 + wradius)
    within_height = (y1 - hradius) <= y2 <= (y1 + hradius)
    return within_width and within_height

In [25]:
def get_relevant_matches(row_index, nr_matches,
                         filtered_masks_df, selected_keypoints_mask_df, matches_df, page_df):
    """Return the matches of one page pair that survived both filter stages.

    Parameters
    ----------
    row_index : tuple
        ``(book1_id, book2_id, book1_page, book2_offset)``. The first three
        entries select the row, the last one the column of the frames below.
    nr_matches : int
        Expected number of surviving matches (checked with an assert).
    filtered_masks_df : pandas.DataFrame
        Cells hold one flag per *selected* match.
    selected_keypoints_mask_df : pandas.DataFrame
        Cells hold one flag per match, telling whether the match was selected.
        NOTE(review): assumed to be 1-D boolean arrays — confirm against
        ``select_keypoints``.
    matches_df : pandas.DataFrame
        Cells hold the matches; only ``queryIdx``, ``trainIdx`` and
        ``distance`` of each match are read.
    page_df : pandas.DataFrame
        Cells hold the page number of Book2 for the given offset.

    Returns
    -------
    pandas.DataFrame
        Columns ``queryIdx``, ``trainIdx``, ``distance``; indexed by
        ``(book1, page1, book2, page2, match)``.
    """
    book1_id, book2_id, book1_page, book2_offset = row_index
    loc = (book1_id, book2_id, book1_page)
    book2_page = page_df.loc[loc, book2_offset]

    filtered_mask = filtered_masks_df.loc[loc, book2_offset]
    selected_keypoints_mask = selected_keypoints_mask_df.loc[loc, book2_offset]
    assert selected_keypoints_mask.sum() == filtered_mask.size

    # Spread the flags of the selected matches back over all matches.
    # Matches that were not selected stay False.
    # (The builtin ``bool`` is used: the ``np.bool`` alias was removed in NumPy 1.24.)
    selected = np.asarray(selected_keypoints_mask, dtype=bool)
    kp_mask = np.zeros(selected.shape, dtype=bool)
    kp_mask[selected] = np.asarray(filtered_mask, dtype=bool).ravel()

    matches = matches_df.loc[loc, book2_offset]
    assert len(matches) == kp_mask.size
    relevant_matches = [match for match, keep in zip(matches, kp_mask) if keep]

    index = pd.MultiIndex.from_product(
        [[book1_id], [book1_page], [book2_id], [book2_page], pd.RangeIndex(len(relevant_matches))],
        names=["book1", "page1", "book2", "page2", "match"])

    assert nr_matches == len(relevant_matches)
    return pd.DataFrame([(m.queryIdx, m.trainIdx, m.distance) for m in relevant_matches],
                        columns=["queryIdx", "trainIdx", "distance"], index=index)

In [26]:
def single_pipeline_step(bukan_title, page_type: Page, matcher: cv.BFMatcher, final_threshold: int):
    """Match the pages of all editions of one title and store the result.

    The result is written to ``data/grey/<bukan_title>_<page_type.name>.parquet``.
    Nothing is computed if that file already exists, if there are more than
    2,000,000 page combinations, if no pages of ``page_type`` are stored, or
    if no page pair ends up above ``final_threshold``.

    Parameters
    ----------
    bukan_title : str
        Title whose editions (rows of the global ``final_overview``) are compared.
    page_type : Page
        Page type whose stored features are used.
    matcher : cv.BFMatcher
        Matcher handed on to ``find_matches``.
    final_threshold : int
        A page pair is kept only if the sum of its filtered homography mask
        is larger than this value.
    """
    print("Processing:", bukan_title)
    basename = bukan_title + "_" + page_type.name
    filename = basename + ".parquet"
    filepath = os.path.join("data/grey", filename)
    if os.path.exists(filepath):
        print("Results already exist:", filepath, "(Skipping)")
        return
    subset = final_overview[final_overview["書名（統一書名）"] == bukan_title]
    subset = subset.sort_values("NrImages", ascending=False)
    subset_page_df = build_page_df(subset)
    # Drop rows that contain no candidate page at all.
    subset_page_df = subset_page_df[subset_page_df.sum(axis=1) != 0]
    stacked_pages = subset_page_df.stack()
    nr_combinations = stacked_pages[stacked_pages != 0].size
    del stacked_pages
    if nr_combinations > 2000000:
        print("Too many page combinations:", nr_combinations, "Skipping")
        return
    else:
        print("Number of page combinations:", nr_combinations)
    try:
        desc_dict = read_descriptor_dict(subset.index, page_type)
        kps_dict = read_keypoints_dict(subset.index, page_type)
    except KeyError:
        print("No pages with this orientation found:", page_type.name, "(Skipping)")
        return
    del subset
    match_time_s, matches_df = applymapi(subset_page_df, find_matches,
                                         max_distance=100, descriptors=desc_dict,
                                         matcher=matcher)
    del desc_dict
    nr_matches_total = matches_df.applymap(lambda x: len(x) if isinstance(x, Iterable) else 0).sum().sum()
    print("Total number of matches found:", nr_matches_total)
    select_time_s, selected_keypoints_df = applymapi(
        subset_page_df, select_keypoints,
        matches=matches_df, keypoints=kps_dict, filter=is_near)
    del kps_dict
    # Each cell of selected_keypoints_df is indexable: item 0 goes into the
    # homography estimation, item 1 is handed to get_relevant_matches as the
    # mask of selected matches.
    homography_time_s, (homography_df, hmask_df) = compute_homography_and_mask(
        selected_keypoints_df.applymap(itemgetter(0)))
    filter_time, filtered_masks_df = filter_out_bad_homographies(homography_df, hmask_df)
    del homography_df
    del hmask_df
    hmask_sum_df = sum_homography_mask(filtered_masks_df)
    flattened_sums = hmask_sum_df.stack().astype(np.int32)
    thresholded_sums = flattened_sums[flattened_sums > final_threshold]
    try:
        # Extract the mask frame once; it is the same for every row.
        selected_keypoints_mask_df = selected_keypoints_df.applymap(itemgetter(1))
        relevant_matches_df = pd.concat([
            get_relevant_matches(row_index, nr_matches, filtered_masks_df,
                                 selected_keypoints_mask_df,
                                 matches_df, subset_page_df)
            for row_index, nr_matches in thresholded_sums.items()
        ])
    except ValueError as e:
        # pd.concat raises ValueError when there is nothing to concatenate.
        print(e)
        print("Skipping")
        return
    relevant_matches_df.to_parquet(filepath, engine="pyarrow")
    print("Results written to:", filename)
    total_time = match_time_s + select_time_s + homography_time_s + filter_time
    print(f"Total time: {total_time} seconds ({total_time/60} minutes).")

In [27]:
def run_pipeline(page_type: Page):
    """Run ``single_pipeline_step`` for every title in ``final_overview_by_title``.

    All titles share one brute-force Hamming matcher and a final threshold of 40.
    """
    final_threshold = 40
    matcher = cv.BFMatcher_create(normType=cv.NORM_HAMMING)
    titles = final_overview_by_title.index
    for bukan_title in log_progress(titles, every=1, size=len(titles)):
        single_pipeline_step(bukan_title, page_type, matcher, final_threshold)

In [28]:
def get_nr_page_combinations(bukan_title, page_type: Page):
    """Count the page combinations that the pipeline compares for one title.

    Returns the number of non-zero cells of the page table built from the
    editions of ``bukan_title``, or 0 if loading the stored features for
    ``page_type`` raises ``KeyError``.
    """
    editions = final_overview[final_overview["書名（統一書名）"] == bukan_title]
    editions = editions.sort_values("NrImages", ascending=False)
    page_df = build_page_df(editions)
    page_df = page_df[page_df.sum(axis=1) != 0]
    stacked = page_df.stack()
    nr_combinations = stacked[stacked != 0].size
    try:
        # Loaded only to find out whether pages of this type exist.
        read_descriptor_dict(editions.index, page_type)
        read_keypoints_dict(editions.index, page_type)
    except KeyError:
        return 0
    return nr_combinations

In [29]:
def get_page_combination_df():
    """Tabulate the number of page combinations per title and page type.

    Rows are the titles of ``final_overview_by_title``, columns the page type
    names "right", "left" and "unknown".
    """
    titles = final_overview_by_title.index
    return pd.DataFrame({
        page_type.name: {
            title: get_nr_page_combinations(title, page_type) for title in titles
        }
        for page_type in (Page.right, Page.left, Page.unknown)
    })

# Postprocessing

In [30]:
# Result files written by single_pipeline_step: one "<title>_<page type>.parquet" per run.
all_match_df_paths = glob("data/grey/*.parquet")

In [31]:
def get_title_and_enum_from_path(path):
    """Recover title and page type from a result path ``.../<title>_<page type>.<ext>``.

    The name is split at the *last* underscore, so titles that contain
    underscores themselves are handled.

    Returns
    -------
    tuple
        ``(bukan_title, Page member)``.

    Raises
    ------
    ValueError
        If the file name contains no underscore.
    KeyError
        If the suffix is not the name of a ``Page`` member.
    """
    basename, _ = os.path.splitext(os.path.basename(path))
    bukan_title, page_enum = basename.rsplit("_", 1)
    return bukan_title, Page[page_enum]

In [32]:
def expand_index(bukan_title, page_enum, matches_df):
    """Prepend the levels "title" and "lr" to the index of ``matches_df``.

    Works in place: the index of ``matches_df`` is replaced and nothing is
    returned. "title" is filled with ``bukan_title``, "lr" with
    ``page_enum.value``.
    """
    levels = matches_df.index.to_frame()
    levels.insert(0, "title", bukan_title)
    levels.insert(1, "lr", page_enum.value)
    matches_df.index = pd.MultiIndex.from_frame(levels)

In [33]:
# Load every result file and tag its rows with title and page type (taken from the file name).
all_match_dfs = []
for match_df_path in all_match_df_paths:
    bukan_title, page_enum = get_title_and_enum_from_path(match_df_path)
    match_df = pd.read_parquet(match_df_path)
    # In place: adds the index levels "title" and "lr".
    expand_index(bukan_title, page_enum, match_df)
    all_match_dfs.append(match_df)
# From here on `all_match_dfs` is a single DataFrame, no longer a list.
all_match_dfs = pd.concat(all_match_dfs)

In [34]:
# Sort by the full index (title, lr, book1, page1, book2, page2, match).
all_match_dfs = all_match_dfs.sort_index()

In [35]:
# Index levels 2 and 4 are "book1" and "book2": every book that takes part in a match.
matched_book_ids = set(all_match_dfs.index.levels[2]) | set(all_match_dfs.index.levels[4])
# Metadata of exactly those books, without the (long) lists of image paths.
match_overview = overview.drop("Images", axis=1)
match_overview = match_overview[match_overview.index.isin(matched_book_ids)]

In [36]:
# Hiragana reading of a title -> romanised title (long vowels written with macrons).
# Fixed romanisations: たいへい is "taihei" (was "taihen"), たいせい is "taisei" (was "taisen").
hiragana_mapping = {
    'ほんちょうぶかん': "honchōbukan",
    'たいへいぶかん': "taiheibukan",
    'たいへいぶかんたいぜん': "taiheibukantaizen",
    'せいとうぶかん': "seitōbukan",
    'げんろくぶかんたいぜん': "genrokubukantaizen",
    'ほうえいぶかんたいせい': "hōeibukantaisei",
    'ごりんぶかん': "gorinbukan",
    'ほうえいぶかん': "hōeibukan",
    'しょうふうぶかん': "shōfūbukan",
    'しょうえんぶかん': "shōenbukan",
    'しょうとくぶかん': "shōtokubukan",
    'きょうほうぶかん': "kyōhōbukan",
    'えいせいぶかん': "eiseibukan",
    'げんぶんぶかん': "genbunbukan",
    'かんぽうぶかん': "kanpōbukan",
    'えんきょうぶかん': "enkyōbukan",
    'かんえんぶかん': "kanenbukan",
    'ゆうしぶかん': "yūshibukan",
    'ほうりゃくぶかん': "hōryakubukan",
    'たいせいぶかん': "taiseibukan",
    'めいわぶかん': "meiwabukan",
    'たいへいりゃくぶかん': "taiheiryakubukan",
    'しゅうぎょくぶかん': "shūgyokubukan",
    'あんえいぶかん': "aneibukan",
    'しゅうちんぶかん': "shūchinbukan",
    'てんめいぶかん': "tenmeibukan",
    'まんじゅぶかん': "manjubukan",
    'かんせいぶかん': "kanseibukan",
    'きょうわぶかん': "kyōwabukan",
    'ぶんかぶかん': "bunkabukan",
    'ぶんせいぶかん': "bunseibukan",
    'かまくらぶかん': "kamakurabukan",
    'かいほうりゃくぶかん': "kaihōryakubukan",
    'てんぽうぶかん': "tenpōbukan",
    'おうにんぶかん': "ōninbukan",
    'こうかぶかん': "kōkabukan",
    'ばんせいぶかん': "banseibukan",
    'かえいぶかん': "kaeibukan",
    'しょうえいぶかん': "shōeibukan",
    'あんせいぶかん': "anseibukan",
    'ぶんきゅうぶかん': "bunkyūbukan",
    'けいおうぶかん': "keiōbukan",
    'おくにわけぶかん': "okuniwakebukan",
    'かいほうおくにわけりゃくぶかん': "kaihō okuniwake ryakubukan"
}

In [37]:
# Title as written in the overview (kanji) -> its hiragana reading.
# The readings are the keys of `hiragana_mapping`.
label_mapping = {
     '鎌倉武鑑': 'かまくらぶかん',
     '応仁武鑑': 'おうにんぶかん',
     '本朝武鑑': 'ほんちょうぶかん',
     '太平武鑑': 'たいへいぶかん',
     '太平武鑑大全': 'たいへいぶかんたいぜん',
     '正統武鑑': 'せいとうぶかん',
     '元禄武鑑大全': 'げんろくぶかんたいぜん',
     '宝永武鑑大成': 'ほうえいぶかんたいせい',
     '御林武鑑': 'ごりんぶかん',
     '宝永武鑑': 'ほうえいぶかん',
     '正風武鑑': 'しょうふうぶかん',
     '賞延武鑑': 'しょうえんぶかん',
     '正徳武鑑': 'しょうとくぶかん',
     '享保武鑑': 'きょうほうぶかん',
     '永世武鑑': 'えいせいぶかん',
     '元文武鑑': 'げんぶんぶかん',
     '寛保武鑑': 'かんぽうぶかん',
     '延享武鑑': 'えんきょうぶかん',
     '寛延武鑑': 'かんえんぶかん',
     '宝暦武鑑': 'ほうりゃくぶかん',
     '大成武鑑': 'たいせいぶかん',
     '明和武鑑': 'めいわぶかん',
     '安永武鑑': 'あんえいぶかん',
     '天明武鑑': 'てんめいぶかん',
     '寛政武鑑': 'かんせいぶかん',
     '享和武鑑': 'きょうわぶかん',
     '文化武鑑': 'ぶんかぶかん',
     '文政武鑑': 'ぶんせいぶかん',
     '天保武鑑': 'てんぽうぶかん',
     '弘化武鑑': 'こうかぶかん',
     '嘉永武鑑': 'かえいぶかん',
     '安政武鑑': 'あんせいぶかん',
     '文久武鑑': 'ぶんきゅうぶかん',
     '有司武鑑': 'ゆうしぶかん',
     '太平略武鑑': 'たいへいりゃくぶかん',
     '袖玉武鑑': 'しゅうぎょくぶかん',
     '袖珍武鑑': 'しゅうちんぶかん',
     '万寿武鑑': 'まんじゅぶかん',
     '懐宝略武鑑': 'かいほうりゃくぶかん',
     '万世武鑑': 'ばんせいぶかん',
     '御国分武鑑': 'おくにわけぶかん',
     '懐宝御国分略武鑑': 'かいほうおくにわけりゃくぶかん',
     '昇栄武鑑': 'しょうえいぶかん',
     '慶応武鑑': 'けいおうぶかん'
}

In [38]:
# Add the hiragana reading and the romanised form of every title.
# The dict lookups raise KeyError for a title that is missing from the mappings.
match_overview = match_overview.assign(TitleHiragana=match_overview["書名（統一書名）"].apply(lambda x: label_mapping[x]))
match_overview = match_overview.assign(TitleRomanji=match_overview["TitleHiragana"].apply(lambda x: hiragana_mapping[x]))

In [39]:
# Switch to English names for index and columns.
# NOTE(review): the columns are renamed by position, so this list has to follow the column
# order of `match_overview` exactly — verify it whenever the CSV layout changes.
match_overview.index.name = "BookID"
match_overview.columns = ["Release", "Classification", "Title", "Type", "RequestID", "Publication", "Year",
                          "Count", "Unit", "PagesPerScan", "Aspect", "Scans", "TitleHiragana", "TitleRomanji"]

In [40]:
# Number of matched books per title, aspect and pages-per-scan (non-null entries of "Count").
match_title_overview = match_overview.groupby(["Title", "TitleHiragana", "TitleRomanji", "Aspect", "PagesPerScan"]).count()["Count"]

In [49]:
def merge_matches_and_keypoints():
    """Attach the keypoint data of both pages to every match in ``all_match_dfs``.

    For each match the keypoint row of the query feature (Book1) is added with
    the column prefix ``src_`` and that of the train feature (Book2) with the
    prefix ``dst_``. Keypoints are read from
    ``data/grey/<book_id>/keypoints.parquet``.

    Returns
    -------
    pandas.DataFrame
        Sorted by index, with the index level names of ``all_match_dfs`` and
        the columns cast to compact dtypes.
    """
    new_matches = {}
    for bukan_title in log_progress(all_match_dfs.index.get_level_values(0).unique(), every=1, name="Bukan Title"):
        match_by_title = all_match_dfs.loc[bukan_title]
        # After selecting the title, level 1 is "book1" and level 3 is "book2".
        match_book_ids = set(match_by_title.index.get_level_values(1)) | set(match_by_title.index.get_level_values(3))
        # Load the keypoints of every involved book once per title.
        match_book_kps = {book_id:pd.read_parquet(f"data/grey/{book_id}/keypoints.parquet") for book_id in match_book_ids}
        enum_dict = {}
        for page_enum_value in match_by_title.index.get_level_values(0).unique():
            match_by_title_and_enum = match_by_title.loc[page_enum_value]
            selected_matches_list = []
            # One iteration per matched page pair (book1, page1, book2, page2).
            for index in log_progress(match_by_title_and_enum.index.droplevel("match").unique(), every=100, name="Page matches"):
                (book1_id, book1_page, book2_id, book2_page) = index
                selected_matches = match_by_title_and_enum.loc[index]
                # Keypoints of the two pages, indexed by feature number.
                keypoints1 = match_book_kps[book1_id].loc[(book1_page, page_enum_value)]
                src_kps1 = selected_matches["queryIdx"].apply(lambda x: keypoints1.loc[x]).add_prefix("src_")
                keypoints2 = match_book_kps[book2_id].loc[(book2_page, page_enum_value)]
                dst_kps2 = selected_matches["trainIdx"].apply(lambda x: keypoints2.loc[x]).add_prefix("dst_")
                selected_matches = pd.concat([selected_matches, src_kps1, dst_kps2], sort=False, axis=1)
                # Restore the page-pair levels that the .loc lookup above removed.
                selected_matches.index = pd.MultiIndex.from_tuples([index + (match_id,) for match_id in selected_matches.index])
                selected_matches_list.append(selected_matches)
            enum_dict[page_enum_value] = pd.concat(selected_matches_list)
        # Concatenating a dict puts its keys in front as a new outer index level.
        new_matches[bukan_title] = pd.concat(enum_dict)
    new_matches = pd.concat(new_matches)
    new_matches.index.names = all_match_dfs.index.names
    # NOTE(review): the uint8 casts assume that octave and class_id lie in 0..255 — confirm.
    new_matches = new_matches.astype({
        'queryIdx': np.uint32,
        'trainIdx': np.uint32,
        'distance': np.float32,
        'src_x': np.float32,
        'src_y': np.float32,
        'src_size': np.float32,
        'src_angle': np.float32,
        'src_response': np.float32,
        'src_octave': np.uint8,
        'src_class_id': np.uint8,
        'dst_x': np.float32,
        'dst_y': np.float32,
        'dst_size': np.float32,
        'dst_angle': np.float32,
        'dst_response': np.float32,
        'dst_octave': np.uint8,
        'dst_class_id': np.uint8
    })
    return new_matches.sort_index()

In [48]:
#new_matches.to_parquet("output/grey_matches.parquet.gzip", engine="pyarrow", compression="gzip")

In [52]:
#match_overview.to_parquet("output/grey_overview.parquet.gzip", engine="pyarrow", compression="gzip")

In [43]:
# Peek at five random matches; no random_state is set, so the rows differ between runs.
all_match_dfs.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,queryIdx,trainIdx,distance
title,lr,book1,page1,book2,page2,match,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
文化武鑑,1,200018871,269,200018866,269,102,227,165,47.0
有司武鑑,0,200019602,116,200019580,112,208,1113,629,98.0
天保武鑑,2,200018923,422,200018921,420,18,34,28,85.0
文政武鑑,2,200018896,431,200018887,430,82,219,157,55.0
有司武鑑,0,200019557,86,200019571,86,53,158,231,60.0


In [86]:
# Example pair: right-hand page 269 of two editions of 文化武鑑.
# NOTE(review): cv.imread returns None if a file is missing; that is not checked here.
img_src = cv.imread("data/grey/200018871/image/200018871_00269_1.jpg", cv.IMREAD_GRAYSCALE)
img_dst = cv.imread("data/grey/200018866/image/200018866_00269_1.jpg", cv.IMREAD_GRAYSCALE)

In [56]:
# `new_matches` is not assigned by any earlier cell, so build it here; without this
# the cell fails with a NameError after "Restart Kernel -> Run All".
# This step is slow: it loads the keypoints of every matched book.
new_matches = merge_matches_and_keypoints()
# All matches (with keypoint coordinates) of the example page pair.
match_data = new_matches.loc[("文化武鑑", 1, 200018871, 269, 200018866, 269)]

In [78]:
# (x, y) coordinates of the matched keypoints, one row per match, in the same order for both pages.
src_kps = np.array([(x, y) for x, y in zip(match_data["src_x"], match_data["src_y"])])
dst_kps = np.array([(x, y) for x, y in zip(match_data["dst_x"], match_data["dst_y"])])

In [80]:
# Homography that maps the source points onto the destination points.
# Method 0 is passed; the returned mask is not needed.
homography, _ = cv.findHomography(src_kps, dst_kps, 0)

In [91]:
# Warp the source page into the frame of the destination page (output size = destination size).
dst_height, dst_width = img_dst.shape
img_src_warped = cv.warpPerspective(img_src, homography, (dst_width, dst_height))

In [112]:
# Channel order of OpenCV's BGR images; used in the names of the overlay files below.
channel_names = ["blue", "green", "red"]

In [113]:
# Overlays based on the destination page: one colour channel is replaced by the warped
# source page, so pixels where the two pages differ are no longer grey.
for channel in range(3):
    # Fresh BGR copies in every iteration, because bgr_dst is modified below.
    bgr_src = cv.cvtColor(img_src_warped, cv.COLOR_GRAY2BGR)
    bgr_dst = cv.cvtColor(img_dst, cv.COLOR_GRAY2BGR)
    bgr_dst[:,:,channel] = bgr_src[:,:,channel]
    assert cv.imwrite(f"200018866_269_200018871_269_{channel_names[channel]}.png", bgr_dst)

In [114]:
# The reverse overlays: based on the warped source page, with one colour channel
# replaced by the destination page.
for channel in range(3):
    # Fresh BGR copies in every iteration, because bgr_src is modified below.
    bgr_src = cv.cvtColor(img_src_warped, cv.COLOR_GRAY2BGR)
    bgr_dst = cv.cvtColor(img_dst, cv.COLOR_GRAY2BGR)
    bgr_src[:,:,channel] = bgr_dst[:,:,channel]
    assert cv.imwrite(f"200018871_269_200018866_269_{channel_names[channel]}.png", bgr_src)

In [115]:
# Also store the two greyscale inputs of the overlays: the warped source and the destination page.
assert cv.imwrite("200018866_269_200018871_269_src.png", img_src_warped)
assert cv.imwrite("200018866_269_200018871_269_dst.png", img_dst)