# Page neighbors metadata

This notebook shows how to perform a basic task: gathering the metadata for the k images most similar to a given page. The example assumes that you are browsing HathiTrust for a volume within the project scope (one that is likely to have been processed during the project and thus to be indexed). You will need access to the index and metadata files from Zenodo:

https://zenodo.org/record/3940528#.XyRNSZ5KjIU

In [166]:
import pandas as pd
import numpy as np
import os, random, re, sys
from annoy import AnnoyIndex
from glob import glob

In [169]:
# Load the prebuilt Annoy index from disk.
# The full 17GB index is very fast, although the time to build it scales linearly;
# cf. https://markroxor.github.io/gensim/static/notebooks/annoytutorial.html
# NOTE(review): `f` (the vector dimensionality expected by AnnoyIndex) is not defined in
# any cell shown in this notebook, so this cell raises NameError on a fresh kernel.
# Define it before this line, matching the dimensionality the index was built with.
u2 = AnnoyIndex(f, 'angular')
u2.load('early-19C-illustrations_full-index.ann')

True

In [170]:
# Load the derived metadata summary file (also on Zenodo) into a DataFrame.
# NOTE(review): the lookup function below feeds Annoy item ids to `df_meta.iloc`, so the
# row order of this file presumably matches the item order of the index -- confirm.
df_meta = pd.read_csv('early-19C-illustrations_metadata.csv')

In [173]:
# Inspect which metadata columns are available.
df_meta.columns

Index(['htid', 'page_seq', 'page_label', 'crop_no', 'vector_path'], dtype='object')

In [174]:
def htid_page_seq_nns_metadata(htid, seq_num, nns_index, df_meta, k):
    """Return metadata rows for the k nearest neighbors of an htid:page_seq pair.

    Parameters
    ----------
    htid : str
        HathiTrust volume identifier, compared against ``df_meta['htid']``.
    seq_num : int
        Page sequence number, compared against ``df_meta['page_seq']``.
    nns_index : object
        Nearest-neighbor index exposing ``get_nns_by_item(item, k)``, e.g. a
        loaded ``AnnoyIndex``. Item ids are treated as row positions of
        ``df_meta``, because the neighbors are resolved with ``df_meta.iloc``.
    df_meta : pandas.DataFrame
        Project metadata table containing at least ``htid`` and ``page_seq``.
    k : int
        Number of neighbors to request from the index.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df_meta`` for the neighbors, in the order returned by the
        index. If the htid:page_seq pair is not present in ``df_meta``, an
        empty frame with the same columns is returned.
    """
    # Locate the page by row *position* rather than index label, so the item id
    # handed to the index lives in the same id space as the `.iloc` lookup below
    # even when df_meta does not carry a default RangeIndex.
    matches = (df_meta['htid'] == htid) & (df_meta['page_seq'] == seq_num)
    positions = matches.to_numpy().nonzero()[0]

    # Page not in the dataset: return an empty frame instead of failing with an
    # IndexError on positions[0].
    if len(positions) == 0:
        return df_meta.iloc[0:0]

    # multiple crops alert -- only the first crop is used as the query item
    if len(positions) > 1:
        print("Multiple crops for this page_seq")

    # the nearest neighbor ROI indices (cast to a plain int for the index API)
    nns = nns_index.get_nns_by_item(int(positions[0]), k)

    # return rows from the metadata table matching these indices
    return df_meta.iloc[nns]

In [176]:
# Look up the neighbors of a page using the htid and sequence number displayed while
# browsing HathiTrust. For example, the call below corresponds to this URL:
# https://babel.hathitrust.org/cgi/pt?id=uiug.30112003448526&view=1up&seq=28
#
# WARNING! No neighbors can be returned if the supplied htid + page_seq pair is not in the
# project dataset.

sample_page_nns = htid_page_seq_nns_metadata(
    htid='uiug.30112003448526',
    seq_num=28,
    nns_index=u2,
    df_meta=df_meta,
    k=10,
)
sample_page_nns

Unnamed: 0,htid,page_seq,page_label,crop_no,vector_path
1617907,uiug.30112003448526,28,inline_image,0,uiug/31042/uiug.30112003448526_00000028_00.npy
254046,ucm.5321309033,347,inline_image,0,ucm/5193/ucm.5321309033_00000347_00.npy
1613603,uiug.30112048888058,326,inline_image,0,uiug/31485/uiug.30112048888058_00000326_00.npy
1269805,chi.097881099,75,inline_image,0,chi/080/chi.097881099_00000075_00.npy
1844016,uc1.c046857802,166,inline_image,1,uc1/c672/uc1.c046857802_00000166_01.npy
1341309,chi.79355181,551,inline_image,0,chi/758/chi.79355181_00000551_00.npy
1724782,uc1.$b557159,190,plate_image,0,uc1/$55/uc1.$b557159_00000190_00.npy
1407896,njp.32101063578338,46,plate_image,0,njp/30673/njp.32101063578338_00000046_00.npy
2438718,hvd.hn5cxz,19,inline_image,0,hvd/hc/hvd.hn5cxz_00000019_00.npy
1454741,njp.32101080155110,80,plate_image,0,njp/30851/njp.32101080155110_00000080_00.npy
