In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import cv2 as cv
import os.path
from glob import glob
from itertools import combinations, chain
from math import ceil
from operator import itemgetter

In [3]:
from helpers.features import read_offset_csv, transform_offsets_to_paths, show_image, precision_and_recall, timeit
from helpers.pipeline import read_image, detect_and_extract, filter_out_bad_homographies
from helpers.pipeline import applymapi, find_matches, select_keypoints, compute_homography_and_mask, sum_homography_mask, calculate_metrics

In [4]:
# Load the overview table of all books. The CSV column at position 1 is used
# as the row index (index_col=1); the two columns listed in `dtype` are
# requested as strings.
overview = pd.read_csv("bukan-overview.csv", index_col=1, dtype={
    "国文研書誌ID": str, "冊数等": str
})

In [5]:
# Keep only the books marked with "x" in the "K?" column, remove the two
# bookkeeping columns and make sure "Pages per Scan" is an integer.
# `columns=` is used instead of the positional axis argument
# (`.drop("K?", 1)`), which pandas 2.x no longer accepts.
data = (
    overview[overview["K?"] == "x"]
    .drop(columns=["K?", "Comment"])
    .astype({"Pages per Scan": int})
)

In [6]:
# Collect the scanned page images of every selected book.
# Expected layout on disk: data/<book_id>/image/*.jpg
images_per_book = []
for book_id in data.index:
    book_id_path = os.path.join("data", str(book_id))
    # isdir() is already False for a missing path, so one check is enough.
    assert os.path.isdir(book_id_path), f"Missing book directory: {book_id_path}"
    book_id_image_path = os.path.join(book_id_path, "image")
    assert os.path.isdir(book_id_image_path), f"Missing image directory: {book_id_image_path}"
    # glob() gives no ordering guarantee, but the position of an image in this
    # list is used as its page number further down (feature extraction stores
    # page = list position + 1), so the paths are sorted by file name.
    images = sorted(glob(f"{book_id_image_path}/*.jpg"))
    images_per_book.append(images)

In [7]:
# Attach the per-book image lists as a new "Images" column. The lists were
# collected while iterating over data.index, so they line up row by row.
data = data.assign(Images=images_per_book)

In [8]:
# Show the selected books together with their image lists.
data

Unnamed: 0_level_0,公開時期,オープンデータ分類,書名（統一書名）,刊・写,原本請求記号,刊年・書写年,（西暦）,冊数等,(単位),Pages per Scan,Aspect,Images
国文研書誌ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
200018462,H29.12,政治・法制,鎌倉武鑑,刊,ＭＹ－１４９０－４,文政２,1819,1,冊,2,Portrait,"[data\200018462\image\200018462_00001.jpg, dat..."
200018466,H29.12,政治・法制,応仁武鑑,刊,ＭＹ－１４９０－７,天保１５,1844,2,冊,2,Portrait,"[data\200018466\image\200018466_00001.jpg, dat..."
200018476,H29.12,政治・法制,応仁武鑑,刊,ＭＹ－１４９０－８,弘化３,1846,3,冊,2,Portrait,"[data\200018476\image\200018476_00001.jpg, dat..."
200018713,H29.12,政治・法制,本朝武鑑,刊,ＭＹ－１２０１－５３,［貞享３］,[1686],1,冊,2,Landscape,"[data\200018713\image\200018713_00001.jpg, dat..."
200018714,H29.12,政治・法制,本朝武鑑,刊,ＭＹ－１２０１－５４,貞享３,1686,1,冊,2,Landscape,"[data\200018714\image\200018714_00001.jpg, dat..."
...,...,...,...,...,...,...,...,...,...,...,...,...
200019654,H29.12,政治・法制,懐宝御国分略武鑑,刊,ＭＹ－１２０１－３１８,慶応４,1868,1,冊,2,Landscape,"[data\200019654\image\200019654_00001.jpg, dat..."
200019661,H29.12,政治・法制,昇栄武鑑,刊,ＭＹ－１２０１－３２４,嘉永６,1853,1,冊,2,Landscape,"[data\200019661\image\200019661_00001.jpg, dat..."
200019662,H29.12,政治・法制,昇栄武鑑,刊,ＭＹ－１２０１－３２５,安政３,1856,1,冊,2,Landscape,"[data\200019662\image\200019662_00001.jpg, dat..."
200019666,H29.12,政治・法制,昇栄武鑑,刊,ＭＹ－１２０１－３２７,元治１,1864,1,冊,2,Landscape,"[data\200019666\image\200019666_00001.jpg, dat..."


In [9]:
# Count the books in every (title, pages-per-scan) group and keep only the
# groups that contain more than one book, since a comparison needs a pair.
count_by_bukan = (
    data
    .groupby(["書名（統一書名）", "Pages per Scan"])
    .count()
    .loc[lambda counts: counts["Images"] > 1]
)
count_by_bukan

Unnamed: 0_level_0,Unnamed: 1_level_0,公開時期,オープンデータ分類,刊・写,原本請求記号,刊年・書写年,（西暦）,冊数等,(単位),Aspect,Images
書名（統一書名）,Pages per Scan,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
万世武鑑,1,3,3,3,3,3,3,3,3,3,3
享保武鑑,2,6,6,6,6,6,6,6,6,6,6
享和武鑑,2,3,3,3,3,3,3,3,3,3,3
元文武鑑,2,2,2,2,2,2,2,2,2,2,2
元禄武鑑大全,2,4,4,4,4,4,4,4,4,4,4
大成武鑑,2,24,24,24,24,24,24,24,24,24,24
天保武鑑,2,15,15,15,15,15,15,15,15,15,15
天明武鑑,2,7,7,7,7,7,7,5,5,7,7
太平武鑑,2,8,8,8,8,8,8,8,8,8,8
安政武鑑,2,3,3,3,3,3,3,3,3,3,3


In [10]:
def build_page_index(page_ids, nr_pages_df):
    """Build the row index for page-wise comparisons between books.

    For every unordered pair of ids taken from ``page_ids`` (in the order
    produced by ``itertools.combinations``) one entry is created per page of
    the first book of the pair, with pages counted from 1.

    Parameters:
        page_ids: iterable of book ids.
        nr_pages_df: mapping from book id to its number of pages; anything
            that supports ``nr_pages_df[book_id]`` works.

    Returns:
        A ``pd.MultiIndex`` with the levels ("Book1", "Book2", "Page").
    """
    entries = [
        (first_id, second_id, page_nr)
        for first_id, second_id in combinations(page_ids, 2)
        for page_nr in range(1, nr_pages_df[first_id] + 1)
    ]
    return pd.MultiIndex.from_tuples(entries, names=("Book1", "Book2", "Page"))

In [21]:
def build_page_df(bukan_df, radius_factor=0.025):
    """Build the table of candidate page pairings for a group of books.

    Each row is a (Book1, Book2, Page) entry produced by ``build_page_index``.
    Each column is an offset in ``-radius .. radius``. A cell holds the page
    number of Book2 that is compared with the given page of Book1, i.e.
    ``Page + offset``. Cells whose page number would be negative or larger
    than the number of pages of Book2 hold 0, which means "no page".

    The radius is half the difference between the largest and the smallest
    book of the group, plus ``radius_factor`` times the largest page count
    (rounded up).

    Parameters:
        bukan_df: DataFrame indexed by book id with an "Images" column whose
            cells are sized collections (``len`` is applied to them).
        radius_factor: fraction of the largest page count that is added to
            the search radius.

    Returns:
        An int32 DataFrame with the index described above and a
        ``pd.RangeIndex(-radius, radius + 1)`` as columns.
    """
    nr_pages = bukan_df["Images"].apply(len)
    index = build_page_index(bukan_df.index, nr_pages)

    # Sanity check: the later steps rely on plain Python ints in the index.
    for book1_id, book2_id, page in index:
        assert isinstance(book1_id, int)
        assert isinstance(book2_id, int)
        assert isinstance(page, int)

    min_nr_pages = nr_pages.min()
    max_nr_pages = nr_pages.max()
    min_radius = (max_nr_pages - min_nr_pages) // 2
    radius = min_radius + ceil(max_nr_pages * radius_factor)
    offsets = np.arange(-radius, radius + 1, dtype=np.int32)

    # All rows are computed at once instead of running a Python generator for
    # every single cell; the resulting values are the same.
    book1_pages = index.get_level_values("Page").to_numpy(dtype=np.int32)
    book2_nr_pages = nr_pages.loc[index.get_level_values("Book2")].to_numpy(dtype=np.int32)

    page_array = book1_pages[:, np.newaxis] + offsets[np.newaxis, :]
    # Pages outside of Book2 are replaced by the "no page" marker 0.
    out_of_range = (page_array < 0) | (page_array > book2_nr_pages[:, np.newaxis])
    page_array[out_of_range] = 0

    columns = pd.RangeIndex(-radius, radius+1)
    return pd.DataFrame(page_array, index=index, columns=columns)

In [None]:
# Detect AKAZE key points and compute their descriptors for every page of
# every book, then store both per book as parquet files:
#   data/<book_id>/descriptors.parquet  - one row per feature, one column per descriptor element
#   data/<book_id>/keypoints.parquet    - x, y, size, angle, response, octave, class_id
# Both tables share a (page, feature) MultiIndex; pages are counted from 1.
detector = cv.AKAZE_create(cv.AKAZE_DESCRIPTOR_MLDB_UPRIGHT, descriptor_size=0, threshold=0.005)
for book_id, image_paths in data["Images"].items():
    descriptors_path = f"data/{book_id}/descriptors.parquet"
    keypoints_path = f"data/{book_id}/keypoints.parquet"
    # The extraction is expensive, so books whose results already exist are
    # skipped. This replaces an unconditional `break` that disabled the whole
    # loop and made it impossible to regenerate missing files.
    if os.path.exists(descriptors_path) and os.path.exists(keypoints_path):
        continue
    print("Processing book:", book_id)
    features = map(lambda path: detector.detectAndCompute(read_image(path), None), image_paths)
    keypoints, descriptors = zip(*features)

    # Pages for which no descriptors were returned (None) contribute no rows.
    index = []
    for i, arr in enumerate(descriptors):
        if arr is not None:
            for j in range(arr.shape[0]):
                index.append((i + 1, j))
    index = pd.MultiIndex.from_tuples(index, names=["page", "feature"])

    descriptors_df = pd.DataFrame(np.concatenate([d for d in descriptors if d is not None]), index=index)
    # Column labels are converted to strings before writing the parquet file.
    descriptors_df.columns = descriptors_df.columns.map(str)

    # Flatten the cv.KeyPoint objects into plain numeric columns so that they
    # can be stored in a parquet file.
    keypoints_df = pd.Series(chain.from_iterable(keypoints), index=index)
    keypoints_df = pd.concat([
        keypoints_df.apply(lambda x: x.pt[0]),
        keypoints_df.apply(lambda x: x.pt[1]),
        keypoints_df.apply(lambda x: x.size),
        keypoints_df.apply(lambda x: x.angle),
        keypoints_df.apply(lambda x: x.response),
        keypoints_df.apply(lambda x: x.octave),
        keypoints_df.apply(lambda x: x.class_id)
    ], axis=1)
    keypoints_df.columns = ["x", "y", "size", "angle", "response", "octave", "class_id"]

    assert len(keypoints_df) == len(descriptors_df)

    descriptors_df.to_parquet(descriptors_path, engine="pyarrow")
    keypoints_df.to_parquet(keypoints_path, engine="pyarrow")

In [12]:
# Make sure the stored features are available for every book before matching.
required_files = ("descriptors.parquet", "keypoints.parquet")
for book_id in data["Images"].index:
    for required_file in required_files:
        assert os.path.exists(f"data/{str(book_id)}/{required_file}")

In [13]:
# Brute-force descriptor matcher that compares descriptors by Hamming distance.
matcher = cv.BFMatcher_create(normType=cv.NORM_HAMMING)

In [14]:
# Half width and half height of the box in which two points count as "near".
wradius = 280.
hradius = 164.
def is_near(pt1, pt2):
    """Tell whether ``pt2`` lies in the axis-aligned box centred on ``pt1``.

    The box reaches ``wradius`` to the left and right and ``hradius`` up and
    down from ``pt1``; points on the border count as near. Both arguments are
    ``(x, y)`` pairs.
    """
    centre_x, centre_y = pt1
    other_x, other_y = pt2
    within_width = (centre_x - wradius) <= other_x <= (centre_x + wradius)
    if not within_width:
        return within_width
    return (centre_y - hradius) <= other_y <= (centre_y + hradius)

In [15]:
# Only page pairs whose summed homography mask is strictly greater than this
# value are kept in the processing loop below.
final_threshold = 40

In [23]:
# Groups that the processing loop must not (re)compute. The entries have the
# form "<title>-<pages per scan>", matching the `basename` built in that loop.
basenames_to_skip = {
    "万世武鑑-1",
    "享保武鑑-2",
    "享和武鑑-2",
    "元文武鑑-2",
    "元禄武鑑大全-2",
    
    "大成武鑑-2",     # Skip this because of the size; look into how to speed this up
    "天保武鑑-2",     # Same
    "文化武鑑-2",     # Same
    
    "天明武鑑-2",
    "太平武鑑-2",
    "安政武鑑-2",
    
}

In [None]:
# Match the pages of all books that share a title and a pages-per-scan value
# and write the surviving matches to data/<title>-<pages per scan>.parquet.
for bukan_type, pages_per_scan in count_by_bukan.index:
    basename = f"{bukan_type}-{pages_per_scan}"
    if basename in basenames_to_skip:
        print("Skipping:", basename)
        continue
    print("Processing:", basename)
    subset = data[(data["書名（統一書名）"] == bukan_type) & (data["Pages per Scan"] == pages_per_scan)]
    subset_page_df = build_page_df(subset)
    # Cells equal to 0 mean "no page"; stack once and reuse the result.
    stacked_pages = subset_page_df.stack()
    print("Number of page combinations:", stacked_pages[stacked_pages != 0].size)
    desc_dict = read_descriptor_dict(subset.index)
    kps_dict = read_keypoints_dict(subset.index)
    # The pipeline helpers return the elapsed time together with their result.
    match_time_s, matches_df = applymapi(
        subset_page_df, find_matches, max_distance=100, descriptors=desc_dict, matcher=matcher)
    select_time_s, selected_keypoints_df = applymapi(
        subset_page_df, select_keypoints, matches=matches_df, keypoints=kps_dict, filter=is_near)
    # Each cell of selected_keypoints_df is a pair: element 0 is the input of
    # the homography estimation, element 1 is the mask that
    # get_relevant_matches expects.
    homography_time_s, (homography_df, hmask_df) = compute_homography_and_mask(
        selected_keypoints_df.applymap(itemgetter(0)))
    # This does not depend on the row, so it is computed once here instead of
    # once per thresholded row inside the list comprehension below.
    selected_keypoints_mask_df = selected_keypoints_df.applymap(itemgetter(1))
    filter_time, filtered_masks_df = filter_out_bad_homographies(homography_df, hmask_df)
    hmask_sum_df = sum_homography_mask(filtered_masks_df)
    flattened_sums = hmask_sum_df.stack().astype(np.int32)
    thresholded_sums = flattened_sums[flattened_sums > final_threshold]
    if thresholded_sums.empty:
        # pd.concat raises on an empty list, so report and move on.
        print("No page pairs above the threshold for:", basename)
        continue
    relevant_matches_df = pd.concat([
        get_relevant_matches(row_index, nr_matches, filtered_masks_df,
            selected_keypoints_mask_df,
            matches_df, subset_page_df)
        for row_index, nr_matches in thresholded_sums.items()
    ])
    filename = basename + ".parquet"
    relevant_matches_df.to_parquet(os.path.join("data", filename), engine="pyarrow")
    print("Results written to:", filename)
    # The reported time covers the four timed pipeline steps only.
    total_time = match_time_s + select_time_s + homography_time_s + filter_time
    print(f"Total time: {total_time} seconds ({total_time/60} minutes).")

Skipping: 万世武鑑-1
Skipping: 享保武鑑-2
Skipping: 享和武鑑-2
Skipping: 元文武鑑-2
Skipping: 元禄武鑑大全-2
Skipping: 大成武鑑-2
Skipping: 天保武鑑-2
Processing: 天明武鑑-2
Number of page combinations: 2196429
Results written to: 天明武鑑-2.parquet
Total time: 14027.265999999974 seconds (233.78776666666624 minutes).
Processing: 太平武鑑-2
Number of page combinations: 72249
Results written to: 太平武鑑-2.parquet
Total time: 208.76500000001397 seconds (3.4794166666668995 minutes).
Processing: 安政武鑑-2
Number of page combinations: 70171
Results written to: 安政武鑑-2.parquet
Total time: 1308.234999999986 seconds (21.803916666666435 minutes).
Processing: 安永武鑑-2
Number of page combinations: 1922882
Results written to: 安永武鑑-2.parquet
Total time: 22963.170999999973 seconds (382.7195166666662 minutes).
Processing: 宝暦武鑑-2
Number of page combinations: 552011
Results written to: 宝暦武鑑-2.parquet
Total time: 2060.1720000000205 seconds (34.33620000000034 minutes).
Processing: 宝永武鑑大成-2
Number of page combinations: 3578
Results written to: 宝永武鑑大成-2.par

# Testing whether this even works…

In [17]:
def descriptors_df_to_series(descriptors_df):
    """Split a descriptor table into one array per page.

    ``descriptors_df`` must have a MultiIndex whose first level is the page.
    The result is a ``pd.Series`` indexed by the values of that first level;
    each element is the 2-D value array of the rows that belong to the page.
    """
    page_numbers = descriptors_df.index.levels[0]
    per_page_arrays = []
    for page_number in page_numbers:
        per_page_arrays.append(descriptors_df.loc[page_number].values)
    return pd.Series(per_page_arrays, index=page_numbers)
def read_descriptor_dict(book_ids):
    """Load the stored descriptors of several books.

    Reads ``data/<book_id>/descriptors.parquet`` for every id in ``book_ids``
    and returns a dict that maps each book id to the per-page series built by
    ``descriptors_df_to_series``.
    """
    desc_dict = {}
    for book_id in book_ids:
        stored_descriptors = pd.read_parquet(
            f"data/{str(book_id)}/descriptors.parquet", engine="pyarrow")
        desc_dict[book_id] = descriptors_df_to_series(stored_descriptors)
    return desc_dict

In [18]:
def keypoints_of_page_to_list(keypoints_of_page):
    """Turn the key-point rows of one page back into ``cv.KeyPoint`` objects.

    Every row of ``keypoints_of_page`` is passed to ``cv.KeyPoint`` as
    positional arguments in column order, so the columns must be in the order
    that the constructor expects.
    """
    attribute_columns = [column for _, column in keypoints_of_page.items()]
    page_keypoints = []
    for attributes in zip(*attribute_columns):
        page_keypoints.append(cv.KeyPoint(*attributes))
    return page_keypoints
def keypoints_df_to_series(keypoints_df):
    """Split a key-point table into one list of key points per page.

    ``keypoints_df`` must have a MultiIndex whose first level is the page.
    The result is a ``pd.Series`` indexed by the values of that first level;
    each element is the list returned by ``keypoints_of_page_to_list`` for
    the rows of that page.
    """
    page_numbers = keypoints_df.index.levels[0]
    per_page_keypoints = []
    for page_number in page_numbers:
        rows_of_page = keypoints_df.loc[page_number]
        per_page_keypoints.append(keypoints_of_page_to_list(rows_of_page))
    return pd.Series(per_page_keypoints, index=page_numbers)
def read_keypoints_dict(book_ids):
    """Load the stored key points of several books.

    Reads ``data/<book_id>/keypoints.parquet`` for every id in ``book_ids``
    and returns a dict that maps each book id to the per-page series built by
    ``keypoints_df_to_series``.
    """
    kps_dict = {}
    for book_id in book_ids:
        stored_keypoints = pd.read_parquet(
            f"data/{str(book_id)}/keypoints.parquet", engine="pyarrow")
        kps_dict[book_id] = keypoints_df_to_series(stored_keypoints)
    return kps_dict

In [None]:
# Scratch cell: creates the same Hamming-distance brute-force matcher as the
# earlier `matcher` cell.
matcher = cv.BFMatcher_create(normType=cv.NORM_HAMMING)

In [None]:
# Scratch re-run of the matching step. `subset_page_df` and `desc_dict` are
# not defined in this cell; they must still be bound from the processing loop.
match_time_s, matches_df = applymapi(
    subset_page_df, find_matches, max_distance=100, descriptors=desc_dict, matcher=matcher)

In [None]:
# Scratch re-run of the key-point selection. The radii and `is_near` repeat
# the definitions of the earlier cell; `subset_page_df`, `matches_df` and
# `kps_dict` must already be bound.
wradius = 280.
hradius = 164.
def is_near(pt1, pt2):
    """Tell whether ``pt2`` lies in the axis-aligned box centred on ``pt1``.

    The box reaches ``wradius`` to the left and right and ``hradius`` up and
    down from ``pt1``; points on the border count as near.
    """
    centre_x, centre_y = pt1
    other_x, other_y = pt2
    within_width = (centre_x - wradius) <= other_x <= (centre_x + wradius)
    if not within_width:
        return within_width
    return (centre_y - hradius) <= other_y <= (centre_y + hradius)
select_time_s, selected_keypoints_df = applymapi(
    subset_page_df,
    select_keypoints,
    matches=matches_df,
    keypoints=kps_dict,
    filter=is_near,
)

In [None]:
# Scratch re-run of the homography step. Only element 0 of every cell of
# `selected_keypoints_df` is passed on; the frame must already be bound.
homography_time_s, (homography_df, hmask_df) = compute_homography_and_mask(
        selected_keypoints_df.applymap(itemgetter(0)))

In [None]:
# Scratch re-run of the homography filtering, followed by the per-cell sum of
# the remaining masks. Uses `homography_df` and `hmask_df` from the cell above.
filter_time, filtered_masks_df = filter_out_bad_homographies(homography_df, hmask_df)
hmask_sum_df = sum_homography_mask(filtered_masks_df)

In [19]:
def get_relevant_matches(row_index, nr_matches,
                         filtered_masks_df, selected_keypoints_mask_df, matches_df, page_df):
    """Collect the matches of one page pair that passed every filter.

    Parameters:
        row_index: tuple ``(book1_id, book2_id, book1_page, book2_offset)``.
            The first three values select the row and the last one the column
            in each of the frames below.
        nr_matches: expected number of surviving matches; checked by an
            assertion.
        filtered_masks_df: cells hold one mask entry per *selected* key
            point; non-zero entries mark the points to keep.
        selected_keypoints_mask_df: cells hold one boolean per match telling
            whether that match was selected.
        matches_df: cells hold the sequence of match objects, which provide
            ``queryIdx``, ``trainIdx`` and ``distance``.
        page_df: cells hold the page number of book 2 for the given offset.

    Returns:
        A DataFrame with the columns "queryIdx", "trainIdx" and "distance"
        and a MultiIndex (book1, page1, book2, page2, match), one row per
        surviving match.

    Raises:
        AssertionError: if the sizes of the masks and the match list do not
            agree, or if the number of surviving matches is not ``nr_matches``.
    """
    book1_id, book2_id, book1_page, book2_offset = row_index
    loc = (book1_id, book2_id, book1_page)
    book2_page = page_df.loc[loc, book2_offset]

    filtered_mask = filtered_masks_df.loc[loc, book2_offset]
    selected_keypoints_mask = selected_keypoints_mask_df.loc[loc, book2_offset]
    assert selected_keypoints_mask.sum() == filtered_mask.size

    # Expand the filtered mask, which only covers the selected key points,
    # to a mask over all matches. Matches that were not selected stay False.
    # The builtin `bool` is used because the `np.bool` alias was removed from
    # NumPy.
    selected = np.asarray(selected_keypoints_mask, dtype=bool)
    kp_mask = np.zeros(selected.shape, dtype=bool)
    kp_mask[selected] = np.asarray(filtered_mask, dtype=bool).ravel()

    matches = matches_df.loc[loc, book2_offset]
    assert len(matches) == kp_mask.size
    relevant_matches = [match for match, keep in zip(matches, kp_mask.ravel()) if keep]
    assert len(relevant_matches) == kp_mask.sum()

    index = pd.MultiIndex.from_product(
        [[book1_id], [book1_page], [book2_id], [book2_page], pd.RangeIndex(len(relevant_matches))],
        names=["book1", "page1", "book2", "page2", "match"])

    assert nr_matches == len(relevant_matches)
    return pd.DataFrame([(m.queryIdx, m.trainIdx, m.distance) for m in relevant_matches],
                        columns=["queryIdx", "trainIdx", "distance"], index=index)

In [None]:
# Scratch re-run of the final selection: keep the page pairs whose summed
# homography mask exceeds `final_threshold` and collect their matches.
flattened_sums = hmask_sum_df.stack().astype(np.int32)
# Use the shared constant instead of repeating its value here.
thresholded_sums = flattened_sums[flattened_sums > final_threshold]
# This does not depend on the row, so it is computed once instead of once per
# thresholded row inside the list comprehension.
selected_keypoints_mask_df = selected_keypoints_df.applymap(itemgetter(1))
relevant_matches_df = pd.concat([
    get_relevant_matches(row_index, nr_matches, filtered_masks_df,
        selected_keypoints_mask_df,
        matches_df, subset_page_df)
    for row_index, nr_matches in thresholded_sums.items()
])

In [None]:
# Scratch cell: store the matches under a fixed, hard-coded file name.
relevant_matches_df.to_parquet("data/booktitle.parquet", engine="pyarrow")

In [None]:
# Show the collected matches.
relevant_matches_df