# Extracted Catalogue Entry Analysis

Analyse catalogue entries extracted by main.py or extract_catalogue_entries.ipynb.

In [None]:
import sys
if "../" not in sys.path:
    sys.path.append("../")
import glob
import re
import os
import pickle
from IPython.display import display
from PIL import Image, ImageDraw
from functools import partial
import pandas as pd
import requests
import matplotlib.pyplot as plt
from matplotlib import colormaps
from cycler import cycler
import src.data.xml_extraction as xmle
from langdetect import detect, LangDetectException
from tqdm import tqdm
tqdm.pandas()

In [None]:
def reconstruct_word_coords(s):
    """Parse the stringified ``word_locations`` column back into nested lists.

    The input is the text form of a list of lines, where each line is a list
    of words and each word is a list of integers, e.g.
    ``"[[[1, 2, 3, 4], [5, 6, 7, 8]], [[9, 10, 11, 12]]]"``.

    :param s: stringified nested list as read from the CSV
    :return: list of lines -> list of words -> list of ints
    """
    line_pattern = re.compile(r"\[\[.*?\]\]")
    word_pattern = re.compile(r"\[.*?\]")
    digits_pattern = re.compile(r"\d+")

    parsed_lines = []
    # Drop the outermost brackets, then peel off one bracket level per step.
    for raw_line in line_pattern.findall(s[1:-1]):
        raw_words = word_pattern.findall(raw_line[1:-1])
        parsed_lines.append(
            [[int(token) for token in digits_pattern.findall(raw_word)] for raw_word in raw_words]
        )
    return parsed_lines

def reconstruct_en_entry(s):
    """Flatten the stringified ``en_only`` list into one space-separated string.

    Drops the first two and last two characters (the ``['`` and ``']`` of the
    list repr) and turns every ``', '`` element separator into a single space.
    """
    inner = s[2:-2]
    return inner.replace("', '", " ")

def reconstruct_xmls(s):
    """Turn the stringified ``xmls`` list back into a list of xml id strings.

    The surrounding brackets are dropped, all spaces and single quotes are
    removed, and the remainder is split on commas.
    """
    without_brackets = s[1:-1]
    cleaned = without_brackets.translate(str.maketrans("", "", " '"))
    return cleaned.split(",")
        
def reconstruct_xml_start_line(s):
    """Turn the stringified ``xml_start_line`` list back into a list of ints."""
    compact = s[1:-1].replace(" ", "")
    return list(map(int, compact.split(",")))

# Column name -> parser, for use as ``pd.read_csv(..., converters=converters)`` so
# that list-valued columns stored as strings are rebuilt when a CSV is read.
converters={"word_locations":reconstruct_word_coords, "en_only":reconstruct_en_entry, "xmls": reconstruct_xmls, "xml_start_line": reconstruct_xml_start_line}

In [None]:
# not using while only working w vol 3

# csv_path = r"..\data\processed\BMC_[0-9]*\catalogue_entries*.csv"
# entry_csv_paths = glob.glob(csv_path)

# entry_csvs = {p.split("\\")[-3]: pd.read_csv(p, converters={"entry": lambda x: x[2:-2].split("\', \'")}) for p in entry_csv_paths}

# for vol, df in entry_csvs.items():
#     df["vol"] = int(vol.split("_")[-1])

# entry_csv_paths

In [None]:
# entry_no_caps_df = pd.concat(list(entry_csvs.values())).rename_axis(index="volume_entry_num").reset_index()

In [None]:
# entry_df = pd.concat(list(entry_csvs.values())).rename_axis(index="volume_entry_num").reset_index().sort_values(by=["vol", "volume_entry_num"])
entry_df = pd.read_csv("..\\data\\processed\\BMC_3\\catalogue_entries_leading_caps.csv")
entry_sm_df = pd.read_csv("..\\data\\processed\\BMC_3\\catalogue_entries_bought_in.csv")

In [None]:
entry_df

In [None]:
entry_sm_df.loc[1, "entry_text"]

In [None]:
entry_sm_df

In [None]:
# all the xml ids present on tkb - if the code didn't find a shelfmark in an xml it won't be represented in matched_records
# use the complete list to convert our incomplete list into page numbers on tkb
with open("..\\data\\interim\\all_xml_ids.txt", "r") as f:
    # one xml id per line; drop the newline that terminates each line
    xml_ids = [line.strip("\n") for line in f]

In [None]:
all_xml_df = pd.DataFrame({"xml":xml_ids})

# Split each id on "_" once and reuse the pieces for all three derived columns.
xml_id_fields = all_xml_df["xml"].str.split("_")


def _xml_field_as_int(position):
    """Return the underscore-separated field at ``position`` of every xml id as an int."""
    return xml_id_fields.apply(lambda fields: int(fields[position]))


all_xml_df["vol"] = _xml_field_as_int(4)
all_xml_df["raw_tkb_page"] = _xml_field_as_int(5)
all_xml_df["num_cols"] = _xml_field_as_int(6)
all_xml_df = all_xml_df.sort_values(by=["vol", "num_cols", "raw_tkb_page"])

# Within each (vol, num_cols) group, number the pages from 1: the counter
# advances every time the raw page number increases from one row to the next.
raw_pages_by_group = all_xml_df.groupby(by=["vol", "num_cols"])["raw_tkb_page"]
all_xml_df["tkb_page"] = raw_pages_by_group.transform(lambda pages: (pages.diff() > 0).cumsum() + 1)
all_xml_df = all_xml_df.set_index("xml")

In [None]:
# Left-join the per-xml page information (minus ``raw_tkb_page``) onto the entries
# via "xml", then order the result by volume, column count and entry number.
entry_tkb_df = entry_df.set_index("xml").merge(right=all_xml_df.drop(columns="raw_tkb_page"), how="left", on="xml").sort_values(["vol", "num_cols", "vol_entry_num"])

### Trial regexes on all BLL shelfmarks

In [None]:
# Load the BLL01 index (latin-1 encoded); the three *_sm flag columns are read as booleans.
bll01_index_df = pd.read_csv("..\\data\\processed\\bll01_index.csv", encoding='latin-1', dtype={"c_sm": bool, "ig_sm": bool, "uncaptured_sm": bool})
# Give the two MARC-labelled columns short names for use below.
bll01_index_df.rename(columns={'British Library shelfmark (852 $j)': "bll01_shelfmark", 'Record IDs (001)': "record_id"}, inplace=True)
# merge_df = entry_tkb_df.merge(right=bll01_index_df, how="left", left_on="shelfmark", right_on="bll01_shelfmark")

In [None]:
# First, looser attempt at a "C." shelfmark pattern.
# NOTE: this assignment is overwritten by the stricter ``c_re`` immediately
# below, so this version is never the one in effect.
c_re = re.compile(r"""
 (?<![A-Za-z0-9\n\-\u201C.])   # Ensure no writing precedes re
 (?<=[( ])                     # Preceded by space or ( 
 C                             # C of a King Charles lib sm
 ([.,][ ]?[a-z0-9-*]+)+        # 1+ repeats of [\.,] ?[a-z0-9-]+ i.e. the characters in the sm
 ([. ]*[(][0-9. ]*[)])?        # allow followed by a bracketed sets of numbers
 (?=[., )]|\Z)                 # lookahead for [\.,][ )] or end of string
""", re.VERBOSE)

# Stricter "C." shelfmark pattern (the definition in effect): C, then a number,
# then letters, then one or more number groups, optionally a bracketed number set.
# NOTE: the in-pattern comment on the lookahead line mentions [\.,], but the
# lookahead itself only accepts a space, ")" or end of string.
c_re = re.compile(r"""
 (?<![A-Za-z0-9\n\-\u201C.])   # Ensure no writing precedes re
 (?<=[( ])                     # Preceded by space or ( 
 C                             # C of a King Charles lib sm
 \.[ ]?[0-9]+                  # stop, optional space, 1+ number
 \.[ ]?[a-z]+                  # stop, optional space, 1+ letter
 ([.,][ ]?[0-9-*]+)+           # 1+ (stop/comma, optional space, 1+ number)  
 ([. ]*[(][0-9. ]+[)])?        # allow followed by a bracketed sets of numbers
 (?=[ )]|\Z)                   # lookahead for [\.,][ )] or end of string
""", re.VERBOSE)

# Pattern for shelfmarks starting IA / IB / IC or G, followed by one or more
# stop/comma-separated groups, optional trailing asterisks and an optional
# bracketed group.
ig_re = re.compile(r"""
 (?<![A-Za-z0-9\n\-\u201C.])   # Ensure no writing precedes re, effectively only allow a space
 (I[ABC]|G)                    # Start chars for procter number of Grenville sm
 ([.,][ ]?[a-z0-9-/A]+)+       # 1+ (stop/comma, optional space, 1+ alphanumeric or dash/slash)
 \**                           # allow trailing *
 ([. ]*[(][0-9-., ]+[)])?        # allow followed by a bracketed sets of numbers
 (?=[.,) ]|\Z)                   # lookahead for [\.,][ )] or end of string
""", re.VERBOSE)

In [None]:
ig_re.search('G. 7726. (1. ) ; G. 7726. (2. )')

In [None]:
# c_re has a lookbehind that requires a space or "(" directly before the "C", so a
# "(" is prepended to let a shelfmark at the very start of the string match.
bll01_index_df["c_re"] = bll01_index_df["bll01_shelfmark"].apply(lambda x: c_re.search("(" + x))
# ig_re has no such requirement, so the shelfmark is searched unchanged.
bll01_index_df["ig_re"] = bll01_index_df["bll01_shelfmark"].apply(lambda x: ig_re.search(x))

In [None]:
bll01_index_df.head()

In [None]:
bll01_index_df[bll01_index_df["c_sm"]].iloc[50:100]

In [None]:
bll01_index_df[~bll01_index_df["c_re"].isna()]

In [None]:
len(set(bll01_index_df["ig_re"].dropna().apply(lambda x:x.group())) ^ set(bll01_index_df["bll01_shelfmark"][bll01_index_df["ig_sm"]]))

Styles of unmatched shelfmark
- \[number]. \[letter]*. \[number]
- \[number]/\[number]
- Acc|ad|\[name]\[. ] \[number]\[. ]\[number]

### Combined and split text for Rossitza's AntConc work

#### Remove non-english entries

In [None]:
def detect_en(s):
    """Return True when ``langdetect`` classifies ``s`` as English.

    Text that langdetect cannot classify (it raises ``LangDetectException``)
    is treated as not English.
    """
    try:
        language = detect(s)
    except LangDetectException:
        return False
    return language == "en"

def get_english_sections(entry):
    """Return the lines of an entry judged to be English, led by the title line.

    Every line is classified with ``detect_en``. A line is kept when the
    rolling mean of those flags (``window=2, closed='both'``, back-filled at
    the start) is above 0.6. The first line of the entry is treated as the
    title and is placed at the front of the result if the filter dropped it.

    :param entry: list of strings, one per line of the catalogue entry
    :return: list of kept lines, or ``[""]`` when no line is kept
    """
    if len(entry) == 0:
        return [""]

    title = entry[0]
    lines = pd.Series(entry)
    is_english = lines.apply(detect_en)
    english_sections = is_english.rolling(window=2, closed='both', center=False).mean().bfill() > 0.6
    english_only = lines[english_sections].to_list()
    if not english_only:
        return [""]

    # The earlier check compared a str with a one-element list, which is never
    # equal, so the title was prepended (and duplicated) even when it had been
    # kept. Test the title's own flag instead.
    if not english_sections.iloc[0]:
        english_only.insert(0, title)
    return english_only

In [None]:
# Add an English-only version of every entry and save it per volume.
# range(1,2) means only volume 1 is processed here.
for i in range(1,2):
    entry_df = pd.read_csv(f"..\\data\\processed\\BMC_{i}\\catalogue_entries.csv", converters={"word_locations":reconstruct_word_coords})
    # one list element per line of the entry text
    entry_df["entry"] = entry_df["entry_text"].str.split("\n")

    try:
        print(f"Vol {i}")
        # progress_apply is the tqdm-wrapped apply (shows a progress bar)
        entry_df["en_only"] = entry_df["entry"].progress_apply(lambda x: get_english_sections(x))
        entry_df.to_csv(f"..\\data\\processed\\BMC_{i}\\catalogue_entries_en_only.csv")
    except IndexError:
        # only IndexError is tolerated; any other error stops the loop
        print(f"{i} failed")
        continue

In [None]:
# Write each volume's English-only entries to a text file, one entry per line.
# range(1,2) means only volume 1 is processed here.
for i in range(1,2):
    entry_df = pd.read_csv(f"..\\data\\processed\\BMC_{i}\\catalogue_entries_en_only.csv", converters={"word_locations":reconstruct_word_coords, "en_only":reconstruct_en_entry})

    try:
        print(f"Vol {i}")
        # Build the text with join rather than Series.sum(): join is linear in
        # the output size and yields "" for an empty frame, where sum() would
        # return the int 0 and make f.write fail.
        export_text = "".join(text + "\n" for text in entry_df["en_only"])
        with open(f"..\\data\\processed\\BMC_{i}\\BMC_{i}_split_text_single_line_v1.2.txt", "w", encoding="utf-8") as f:
            f.write(export_text)
    except IndexError:
        print(f"{i} failed")
        continue

In [None]:
combined_text = "..\\data\\processed\\BMC_1_10_combined_split_text_single_line_v1.2.txt"
# Open the combined file once, in write mode, so re-running this cell rebuilds
# it instead of appending a second copy of every volume.
with open(combined_text, "w", encoding="utf-8") as g:
    # Volumes 1-10 inclusive, matching the "BMC_1_10" file name (range(1,10)
    # stopped at volume 9).
    for i in range(1, 11):
        print(f"Vol {i}")
        vol_text = f"..\\data\\processed\\BMC_{i}\\BMC_{i}_split_text_single_line_v1.2.txt"
        with open(vol_text, "r", encoding="utf-8") as f:
            g.write(f.read())

### Work with extracted entries

In [None]:
entry_df = pd.read_csv("..\\data\\processed\\BMC_1\\catalogue_entries_complete_xmls.csv", converters=converters)

In [None]:
entry_df.head()

In [None]:
def split_word_locs(row):
    """Split a row's ``word_locations`` into one chunk per ``xml_start_line`` element.

    :param row: mapping (e.g. DataFrame row) with ``word_locations`` and
        ``xml_start_line`` (a list of line indices)
    :return: list of chunks of ``word_locations``
    """
    boundaries = row["xml_start_line"]
    locations = row["word_locations"]

    if len(boundaries) == 1:
        # a single boundary: all locations are returned as one chunk
        return [locations]

    # Each chunk runs from the previous boundary (0 for the first) to the next.
    chunk_starts = [0] + boundaries
    return [locations[lo:hi] for lo, hi in zip(chunk_starts, boundaries)]

In [None]:
def gen_page_entries_lookup(df):
    """Build a per-page table of the word locations of every entry on that page.

    :param df: entries DataFrame with ``xmls``, ``xml_start_line`` and
        ``word_locations`` columns (lists, as produced by ``converters``)
    :return: DataFrame indexed by ``page`` with columns ``xml``, ``word_locs``
        (one element per entry on the page) and ``n_entries``
    """
    # Summing a column of lists concatenates them: one flat list of xml ids ...
    xml_list = df["xmls"].sum()
    # ... and one flat list of word-location chunks, in the same order.
    # NOTE(review): assumes split_word_locs returns one chunk per xml of the row,
    # otherwise the two lists differ in length - confirm.
    word_locs_split = df.apply(split_word_locs, axis=1).sum()

    page_entries_df = pd.DataFrame(data={"xml":xml_list, "word_locs":word_locs_split})
    # Wrap each chunk in a list so the group-wise sum below collects the chunks
    # side by side instead of merging their lines together.
    page_entries_df["word_locs"] = page_entries_df["word_locs"].apply(lambda x: [x])

    page_entries_lookup = page_entries_df.groupby(by="xml", as_index=False).sum()
    # The second-to-last "_" field of the xml id is read as the page number.
    pages_non_zeroed = page_entries_lookup["xml"].apply(lambda x: int(x.split("_")[-2]))
    # Shift so the lowest page number present becomes page 1.
    page_entries_lookup["page"] = (pages_non_zeroed - (pages_non_zeroed.min() - 1)).values
    page_entries_lookup["n_entries"] = page_entries_lookup["word_locs"].apply(len)
    return page_entries_lookup.set_index("page")

In [None]:
page_entries_lookup = gen_page_entries_lookup(entry_df)
page_entries_lookup

In [None]:
def get_concat_h(ims):
    """Paste images side by side, left to right, onto one new RGBA canvas.

    The images are not all exactly the same size (they differ by maybe 10%),
    so the canvas is as tall as the tallest input; sizing it from the first
    image cropped any taller image that followed.

    :param ims: non-empty sequence of PIL images
    :return: new RGBA image whose width is the sum of the input widths
    :raises ValueError: if ``ims`` is empty
    """
    if not ims:
        raise ValueError("get_concat_h needs at least one image")

    total_width = sum(im.width for im in ims)
    max_height = max(im.height for im in ims)
    dst = Image.new('RGBA', (total_width, max_height))

    # Keep a running x offset instead of re-summing the widths for every image.
    x_start = 0
    for im in ims:
        dst.paste(im, (x_start, 0))
        x_start += im.width
    return dst

In [None]:
def display_entry(df_row):
    """Display the page image(s) of one entry with its words highlighted.

    One image is loaded per xml id in ``df_row["xmls"]``; the word boxes of the
    matching slice of ``df_row["word_locations"]`` are drawn as translucent
    patches, each page is shrunk to a sixth of its size and the pages are shown
    side by side.

    :param df_row: entry row with ``xmls``, ``xml_start_line`` and ``word_locations``
    """
    xmls = df_row["xmls"]
    line_cutoffs = df_row["xml_start_line"]
    word_locs = df_row["word_locations"]

    # Resolve the jpg behind every xml id: the volume is the third-from-last "_"
    # field, the column count the last character, and the jpg name the id
    # without its last two characters.
    image_paths = []
    for xml in xmls:
        vol, col, jpg = xml.split("_")[-3], xml[-1], xml[:-2]
        image_paths.append(glob.glob(f"..\\data\\raw\\BMC_{vol}_{col}\\*\\{jpg}.jpg")[0])

    highlight = (237, 232, 74, 65)
    out_images = []
    for path, start, cutoff in zip(image_paths, [0] + line_cutoffs, line_cutoffs):
        with Image.open(path).convert("RGBA") as base:
            # transparent layer that receives the colour patches
            patches = Image.new("RGBA", base.size, (255, 255, 255, 0))
            draw = ImageDraw.Draw(patches)
            for line in word_locs[start:cutoff]:
                for word in line:
                    draw.rectangle(word, fill=highlight)

            highlighted = Image.alpha_composite(base, patches)
            shrunk_size = (base.width // 6, base.height // 6)
            out_images.append(highlighted.resize(shrunk_size))

    display(get_concat_h(out_images))


# First four colours of matplotlib's "Accent" colormap, scaled to 0-255 integer
# channels with a fixed fourth (alpha) value of 65 appended.
colours = [
    [int(255 * channel) for channel in rgb] + [65]
    for rgb in colormaps["Accent"].colors[:4]
]
pastel_cycler = cycler(color=colours)


def display_page(page, page_entry_lookup):
    """Display one page with each entry's words highlighted in its own colour.

    :param page: index label of the page in ``page_entry_lookup``
    :param page_entry_lookup: table with ``xml`` and ``word_locs`` columns
        (``word_locs`` holds one list of lines per entry on the page)
    :return: the highlighted page image, shrunk to an eighth of its size
    """
    xml = page_entry_lookup.loc[page, "xml"]
    entries = page_entry_lookup.loc[page, "word_locs"]
    vol = xml.split("_")[-3]
    col = xml[-1]
    jpg = xml[:-2]
    path = glob.glob(f"..\\data\\raw\\BMC_{vol}_{col}\\*\\{jpg}.jpg")[0]

    with Image.open(path).convert("RGBA") as base:
        # make a blank image for the colour patches, initialized to transparent
        patches = Image.new("RGBA", base.size, (255, 255, 255, 0))
        draw = ImageDraw.Draw(patches)

        # pastel_cycler() repeats its colours endlessly; zip stops with the entries
        for word_locs, style in zip(entries, pastel_cycler()):
            fill = tuple(style["color"])
            for line in word_locs:
                for word in line:
                    draw.rectangle(word, fill=fill)

        # Composite once, after every patch is drawn. Doing this inside the loop
        # repeated the work per entry and left ``out`` unbound for a page with
        # no entries.
        out = Image.alpha_composite(base, patches)
        resized = out.resize((base.width // 8, base.height // 8))

    display(resized)
    return resized

In [None]:
page_4, page_5 = display_page(4, page_entries_lookup), display_page(5, page_entries_lookup)

In [None]:
get_concat_h([page_4, page_5]).save("..\\reports\\figures\\categorised_spread.png")

In [None]:
display_entry(entry_df.loc[11])

## Entry length

In [None]:
entry_df["entry_length"] = entry_df["entry_text"].transform(lambda x: len(x))

In [None]:
entry_df.head()

In [None]:
ma = entry_df["entry_length"].rolling(window=100, center=True).mean()
mean = entry_df.groupby(by="vol")["entry_length"].mean()

In [None]:
mean

In [None]:
# mean.rename_axis("Volume").rename("Mean Entry Length").to_csv("..\\data\\processed\\mean_lengths.csv")

In [None]:
# Number of entries in each volume.
n_entrys = entry_df.groupby(by="vol")["vol"].count()
# Add a zero-count row labelled 0 so that, once sorted, the cumulative sum starts at 0.
n_entrys.loc[0] = 0
n_entrys.sort_index(inplace=True)
# Midpoint of each volume on the entry axis: its cumulative end minus half its
# own count (the row labelled 0 has no predecessor and comes out as NaN).
x_locs = n_entrys.cumsum() - n_entrys.cumsum().diff()/2

In [None]:
# Entry length per catalogue entry with a moving average, volume boundaries as
# dashed lines and each volume labelled with its mean entry length.
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(entry_df["entry_length"], lw=1)
ax.plot(ma, "black", label="Moving average")
ax.set_title("Catalogue Entry Length For Incunabula Volumes 1-10", fontsize='x-large')
ax.set_xlabel("Catalogue Entry Number (across all volumes)", fontsize='x-large')
ax.set_ylabel("Entry length (characters)", fontsize='x-large')
ax.tick_params(labelsize='large')
ax.vlines(n_entrys.cumsum(), 0, ax.get_ylim()[1], colors="black", linestyles="--")
ax.set_xlim(0, len(entry_df))
ax.set_ylim(0, entry_df["entry_length"].max() + 100)

# "\\mu" rather than "\mu": "\m" is not a valid escape in a normal string and
# raises a warning on recent Python versions. The rendered text is unchanged.
for i, x in enumerate(x_locs.dropna()[:8]):
    ax.text(x, 10600, f"BMC {i+1}\n$\\mu$: {mean.loc[i+1]:.0f}", ha="center")

# Volumes 9 and 10 get vertical labels, with the mean on a separate label lower down.
for late_vol in (9, 10):
    ax.text(x_locs[late_vol], 10600, f"BMC {late_vol}", rotation="vertical", ha="center")
    ax.text(x_locs[late_vol], 9100, f"$\\mu$: {mean.loc[late_vol]:.0f}", rotation="vertical", ha="center")
ax.legend()

Vols 5 and 8 were catalogued by the same person, so what we see here is possibly down to more errors in these volumes, or it may actually reflect that person's cataloguing style.

In [None]:
entry_df.query("entry_length > 6000")

In [None]:
# fig.savefig("..\\reports\\figures\\entry_length.png", dpi=300, bbox_inches="tight")

## Another copy

In [None]:
# Phrases that introduce a further copy / edition / issue, compiled once rather
# than on every call.
_ANOTHER_COPY_PATTERNS = tuple(
    re.compile(variant)
    for variant in (
        'Another cancelled',
        'A cancelled',
        'Another copy',
        'Another edition',
        'Another fragment,',
        'Another issue',
    )
)


def extract_another_copy(row):
    """Find every 'Another copy'-style phrase in an entry's text.

    :param row: the entry text to search (a string)
    :return: list of ``re.Match`` objects, grouped by phrase in the order the
        phrases are listed above (not by position in the text), or ``None``
        when no phrase occurs
    """
    matches = [
        match
        for pattern in _ANOTHER_COPY_PATTERNS
        for match in pattern.finditer(row)
    ]
    return matches or None

In [None]:
# method to find valid variants of another copy
"""
copy_re = re.compile("Another \S*")
anothers = entry_df["entry_text"].apply(lambda x: copy_re.search(x))
copy_variants = sorted(list(set(anothers.apply(lambda x: x.group() if x else None).dropna())))
copy_variants

entry_df["match"] = entry_df["entry_text"].apply(lambda x: copy_re.search(x))
entry_df["preceding_shelfmark"] = entry_df.apply(check_for_leading_shelfmark, axis=1)

x = 2
print(copy_variants[x])
entry_df[entry_df["entry_text"].str.contains(copy_variants[x])]
"""

All the matches of the "Another \S*" regex, each with a note on whether it actually introduces 'Another edition'-style information.


'Another (crown':  Not valid, referring to watermarks in the text  
'Another calendar':  Not valid, referring to calendars in the work  
'Another calligraphic':  Not valid, referring to calligraphic letters  
'Another cancelled':  Valid, has its own Proctor # and copy specific info. There's also a copy before this that's just "A cancelled copy", but there's only one occurrence of this.  
'A cancelled': Valid, see above entry.  
'Another closely':  Not valid, describes another edition that's similar  
'Another compartment':  Not valid, part of the information rather than about another copy  
'Another copy':  Valid  
'Another copy,':  Subset of Another copy  
'Another copy.':  Subset of Another copy  
'Another cut':  Not valid  
'Another edition':  Valid  
'Another edition,':  Subset  
'Another edition.':  Subset  
'Another fragment,':  Valid  
'Another full-page':  Not valid  
'Another issue':  Valid  
'Another issue,':  Subset  
'Another issue.':  Subset  
'Another metrical':  Not valid  
'Another reading.':  Not valid  
'Another recension':  Not valid  
'Another setting':  Not valid  
'Another setting-up':  Subset  
'Another version':  Not valid  

In [None]:
another_variants = [
    'Another cancelled',
    'A cancelled',
    'Another copy',
    'Another edition',
    'Another fragment,',
    'Another issue'
]

Having a leading shelfmark is highly indicative of an 'Another copy' entry actually being another copy. Of course this relies on the shelfmark detection being accurate. In some cases this isn't so, see analysis below for efforts to improve Issac's shelfmark finding.

In [None]:
# TODO some of the shelfmarks are absent - instead the Another copy has its location listed as "Print room"
# work out what to do with this

def check_for_leading_shelfmark(row, match_col, find_valid=True):
    """Keep the matches in ``row[match_col]`` that are (or are not) led by a shelfmark.

    For each match, the text from up to 100 characters before the match to the
    end of the match is passed to ``xmle.find_title_shelfmark``; its result is
    used as a truthy/falsy flag.

    :param row: entry row with ``entry_text`` and the ``match_col`` column
    :param match_col: name of the column holding a list of ``re.Match`` objects
        (or ``None``)
    :param find_valid: when true keep the matches with a leading shelfmark,
        otherwise keep the ones without
    :return: list of kept matches, or ``None`` when there are none
    """
    matches = row[match_col]
    if not matches:
        return None

    text = row["entry_text"]
    want_shelfmark = bool(find_valid)
    kept = []
    for match in matches:
        # Clamp the window start at 0: for a match in the first 100 characters
        # a negative start index would count from the end of the string and
        # produce an empty or unrelated window.
        window_start = max(0, match.start() - 100)
        has_shelfmark = bool(xmle.find_title_shelfmark(text[window_start: match.end()]))
        if has_shelfmark == want_shelfmark:
            kept.append(match)
    return kept or None

In [None]:
caps_regex = re.compile("[A-Z][A-Z](?!I)[A-Z]+")

In [None]:
# All 'Another copy'-style phrase matches per entry (None when there are none).
entry_df["other_copies"] = entry_df["entry_text"].apply(lambda x: extract_another_copy(x))
# Split those matches by whether a shelfmark is found just before them.
entry_df["valid_copies"] = entry_df.apply(check_for_leading_shelfmark, match_col="other_copies", axis=1)
entry_df["bad_copies"] = entry_df.apply(check_for_leading_shelfmark, match_col="other_copies", find_valid=False, axis=1)
# .match anchors at the start of the text, so this only records capitals that open the entry.
entry_df["leading_caps"] = entry_df["entry_text"].apply(lambda x: caps_regex.match(x))

In [None]:
idx = 4
entry_df.loc[idx, "entry_text"]

In [None]:
caps_regex.search(entry_df.loc[idx, "entry_text"])

In [None]:
entry_df.loc[idx]

In [None]:
entry_df.loc[idx, "entry_text"]

In [None]:
iiif_vol_manifests = {
    1: "http://api.bl.uk/metadata/iiif/ark:/81055/vdc_100187977347.0x000001/manifest.json", 
    2: "http://api.bl.uk/metadata/iiif/ark:/81055/vdc_100186434238.0x000001/manifest.json", 
    3: "http://api.bl.uk/metadata/iiif/ark:/81055/vdc_100186508207.0x000001/manifest.json", 
    4: "http://api.bl.uk/metadata/iiif/ark:/81055/vdc_100186435804.0x000001/manifest.json", 
    5: "http://api.bl.uk/metadata/iiif/ark:/81055/vdc_100186440144.0x000001/manifest.json", 
    6: "http://api.bl.uk/metadata/iiif/ark:/81055/vdc_100186441763.0x000001/manifest.json", 
    7: "http://api.bl.uk/metadata/iiif/ark:/81055/vdc_100186508670.0x000001/manifest.json", 
    8: "http://api.bl.uk/metadata/iiif/ark:/81055/vdc_100186436306.0x000001/manifest.json", 
    9: "http://api.bl.uk/metadata/iiif/ark:/81055/vdc_100187984642.0x000001/manifest.json", 
    10:"http://api.bl.uk/metadata/iiif/ark:/81055/vdc_100187985363.0x000001/manifest.json" 
}

In [None]:
# image_url_stems = []
# thumbnail_pattern = re.compile("https://api\.bl\.uk/image/iiif/ark:/81055/vdc_\d*")
# for url in iiif_vol_manifests.values():
#     manifest = requests.get(url)
#     image_url_stems.append(thumbnail_pattern.search(manifest.text).group())

In [None]:
iiif_image_url_stems = {
    1: 'https://api.bl.uk/image/iiif/ark:/81055/vdc_100188432072.0x',
    2: 'https://api.bl.uk/image/iiif/ark:/81055/vdc_100188433159.0x',
    3: 'https://api.bl.uk/image/iiif/ark:/81055/vdc_100188433874.0x',
    4: 'https://api.bl.uk/image/iiif/ark:/81055/vdc_100188438804.0x',
    5: 'https://api.bl.uk/image/iiif/ark:/81055/vdc_100188433623.0x',
    6: 'https://api.bl.uk/image/iiif/ark:/81055/vdc_100188434221.0x',
    7: 'https://api.bl.uk/image/iiif/ark:/81055/vdc_100188432690.0x',
    8: 'https://api.bl.uk/image/iiif/ark:/81055/vdc_100188432452.0x',
    9: 'https://api.bl.uk/image/iiif/ark:/81055/vdc_100188432911.0x',
    10:'https://api.bl.uk/image/iiif/ark:/81055/vdc_100188439091.0x'
}

In [None]:
image_api_options = "/full/800,/0/default.jpg"

In [None]:
def iiif_image_url(iiif_vol_url, page, image_api_options):
    """Build the IIIF image URL for one page of a volume.

    :param iiif_vol_url: URL stem of the volume
    :param page: page number (int or numeric string); written as six
        zero-padded lowercase hex digits
    :param image_api_options: IIIF image API suffix appended after the page
    :return: the complete image URL
    """
    page_number = int(page)
    return f"{iiif_vol_url}{page_number:06x}{image_api_options}"

In [None]:
# ``Image`` in this notebook is the PIL ``Image`` module (used for Image.open /
# Image.new), which cannot be called. IPython's Image class is what displays
# an image from a URL, so import it under its own name.
from IPython.display import Image as IPyImage

vol = entry_df.loc[idx, "vol"]
# second-to-last "_" field of the xml id is the page number
page = entry_df.loc[idx, "xml"].split("_")[-2]
image_url = iiif_image_url(iiif_image_url_stems[vol], page, image_api_options)
display(IPyImage(url=image_url))

In [None]:
# ``Image`` here is the PIL module and is not callable; use IPython's Image
# display class for showing the URL.
from IPython.display import Image as IPyImage

# the page after the one shown above
image_url = iiif_image_url(iiif_image_url_stems[vol], int(page) + 1, image_api_options)
display(IPyImage(url=image_url))

### Assess regex variants

Some of the 'another copy' leading shelfmarks aren't being picked up. Improve the original shelfmark detection, particularly C numbers (which are sometimes '1' numbers)

In [None]:
# Candidate patterns for locating shelfmarks and dates in entry text. All
# patterns are raw strings so that regex escapes such as "\." and "\-" are not
# read as (invalid) Python string escapes.
caps_regex = re.compile(r"[A-Z]{3,}")
# c_num_regex = re.compile("[^I]C\\.[0-9]")  # C number title references
# c_num_space_regex = re.compile("[^I]C\\.[ ]?[0-9]")  # C number title references
c_num_regex = re.compile(r"[^A-Za-z0-9\n\.\-\u201C]C\.[ ]?[0-9]")  # C number title references
c_date_regex = re.compile(r"[^I]C\.[ \t\r\f\v]?1[0-9]{3}[^0-9]")  # accidental date references
one_num_regex = re.compile(r"1\.\s[a-z]")
# NOTE(review): the "." after G is unescaped and so matches any character -
# confirm whether a literal stop was intended.
g_num_regex = re.compile(r"G.[ ]?[0-9]")
# In a raw string the stop is written "\."; the earlier "\\." matched a
# literal backslash followed by any character, so "IB. 123" never matched.
i_num_regex = re.compile(r"[I1][ABC]\.[ ]?[0-9]")  # I number title references
date_regex = re.compile(r"1[45][0-9][0-9]")

In [None]:
def _all_matches_or_none(pattern, text):
    """Return every match of ``pattern`` in ``text`` as a list, or None if there are none."""
    found = list(pattern.finditer(text))
    return found if len(found) > 0 else None


# Per entry: all C-number matches and all accidental-date matches.
c_nums = entry_df["entry_text"].apply(lambda text: _all_matches_or_none(c_num_regex, text))
c_dates = entry_df["entry_text"].apply(lambda text: _all_matches_or_none(c_date_regex, text))

entry_df["c_nums"] = c_nums
entry_df["c_dates"] = c_dates

In [None]:
def exclude_date_matches(row):
    """Drop the C-number matches that overlap an accidental date match.

    :param row: mapping with ``c_nums`` and ``c_dates``, each a list of
        ``re.Match`` objects or ``None``
    :return: the C-number matches that overlap none of the date matches; the
        unfiltered ``c_nums`` when there is nothing to compare or when every
        match overlaps a date
    """
    c_nums = row["c_nums"]
    c_dates = row["c_dates"]
    if not (c_dates and c_nums):
        return c_nums

    def overlaps_a_date(c_num):
        # Two spans [start, end) share a character when each starts before the
        # other ends.
        return any(
            c_num.start() < date.end() and date.start() < c_num.end()
            for date in c_dates
        )

    # Every date match is checked. The earlier loop overwrote its result on
    # each pass, so only the last date match had any effect.
    clean_cnums = [c_num for c_num in c_nums if not overlaps_a_date(c_num)]

    # NOTE(review): fallback kept from the original - if all matches overlap a
    # date, the unfiltered list is returned rather than None. Confirm intent.
    return clean_cnums if clean_cnums else c_nums

In [None]:
entry_df["clean_c_nums"] = entry_df.apply(exclude_date_matches, axis=1)

In [None]:
# pred_idx = entry_df["clean_cnums"].dropna().apply(lambda x: [x.span() for x in x]).index.difference(entry_df["let_cnums"].dropna().apply(lambda x: [x.span() for x in x]).index)

In [None]:
def find_matches(row, match_row):
    """Return the matched text plus the 20 characters after it, for each stored match.

    :param row: mapping with ``entry_text`` and a ``match_row`` column holding
        a list of ``re.Match`` objects (or ``None``)
    :param match_row: name of the column that holds the matches
    :return: list of text snippets, or ``None`` when the column is empty
    """
    found = row[match_row]
    if not found:
        return None
    text = row["entry_text"]
    return [text[hit.start(): hit.end() + 20] for hit in found]

In [None]:
i = 0


def twenty_plus_gen():
    """Advance the module-level counter ``i`` by 10 and return its new value."""
    global i
    i = i + 10
    return i

In [None]:
entry_df.apply(find_matches, match_row="clean_c_nums", axis=1).dropna().iloc[twenty_plus_gen()-10: twenty_plus_gen()]

In [None]:
idx = 4767
entry_df.loc[idx, "entry_text"]

In [None]:
# ``Image`` in this notebook is the PIL ``Image`` module (used for Image.open /
# Image.new), which cannot be called. IPython's Image class is what displays
# an image from a URL, so import it under its own name.
from IPython.display import Image as IPyImage

vol = entry_df.loc[idx, "vol"]
# second-to-last "_" field of the xml id is the page number
page = entry_df.loc[idx, "xml"].split("_")[-2]
image_url = iiif_image_url(iiif_image_url_stems[vol], page, image_api_options)
display(IPyImage(url=image_url))

### Outdated image loading from network drive rather than IIIF manifest

In [None]:
# Locate the page scan of entry ``idx`` in the Transkribus export on the network drive.
vol = entry_df.loc[idx, "vol"]
# last character of the xml id is the column count; the id minus its last two
# characters is the jpg name
col = int(entry_df.loc[idx, "xml"][-1])
jpg = entry_df.loc[idx, "xml"][:-2]
# [0] takes the first hit and raises IndexError when nothing matches
image_path = glob.glob(
    r"\\ad\collections\TwoCenturies\TwoCenturies IV\Incunabula"
    f"\\BMC_{vol} {col} column pages Transkribus export"
    f"\\*\\*\\*{jpg}.jpg"
)[0]

In [None]:
# ``Image`` in this notebook is the PIL module and is not callable; the bare
# ``except`` used to swallow the resulting TypeError ten times without showing
# anything. Use IPython's Image class and retry only on file-access errors.
from IPython.display import Image as IPyImage

attempts = 0
while attempts < 10:
    try:
        display(IPyImage(filename=image_path))
        break
    except OSError:
        # reading from the network drive failed; try again
        attempts += 1
else:
    print(f"Could not load {image_path} after {attempts} attempts")

In [None]:
# Work out the jpg name of the following page by incrementing the whole trailing
# page number and keeping its zero padding. Bumping only the last digit turned
# a name ending in 9 into one ending in 10 with the other digits left in place.
trailing_number = re.search(r"\d+$", jpg)
if trailing_number is None:
    raise ValueError(f"no trailing page number in {jpg!r}")
page_digits = trailing_number.group()
next_jpg = jpg[:trailing_number.start()] + str(int(page_digits) + 1).zfill(len(page_digits))


def _find_next_page_jpg(column_count):
    """Return the paths matching ``next_jpg`` in the export for ``column_count`` columns."""
    return glob.glob(
        r"\\ad\collections\TwoCenturies\TwoCenturies IV\Incunabula"
        f"\\BMC_{vol} {column_count} column pages Transkribus export"
        f"\\*\\*\\*{next_jpg}.jpg"
    )


candidates = _find_next_page_jpg(col)
if not candidates:
    # Not in the same export: look in the other one (2-column <-> 4-column).
    # Any other column count raises KeyError here instead of leaving
    # ``next_col`` unbound.
    next_col = {2: 4, 4: 2}[int(col)]
    candidates = _find_next_page_jpg(next_col)

# raises IndexError when the page is in neither export
next_image_path = candidates[0]

In [None]:
# ``Image`` in this notebook is the PIL module and is not callable; the bare
# ``except`` used to swallow the resulting TypeError ten times without showing
# anything. Use IPython's Image class and retry only on file-access errors.
from IPython.display import Image as IPyImage

attempts = 0
while attempts < 10:
    try:
        display(IPyImage(filename=next_image_path))
        break
    except OSError:
        # reading from the network drive failed; try again
        attempts += 1
else:
    print(f"Could not load {next_image_path} after {attempts} attempts")