# Extracted Catalogue Entry Analysis

Analyse catalogue entries extracted by main.py or extract_catalogue_entries.ipynb.

In [None]:
import sys
if "../" not in sys.path:
    sys.path.append("../")
import glob
import re
import os
import pickle
from IPython.display import Image, display
import pandas as pd
import matplotlib.pyplot as plt
import src.data.xml_extraction as xmle
from tqdm import tqdm
tqdm.pandas()

In [None]:
csv_path = r"\\ad\collections\TwoCenturies\TwoCenturies IV\Incunabula\split_data\BMC_[0-9]*\catalogue_entries.csv"
entry_csv_paths = glob.glob(csv_path)

In [None]:
def _parse_entry_lines(cell):
    """Split one raw "entry" CSV cell into a list of strings.

    Drops the first two and last two characters of the cell and splits the
    remainder on the separator "', '" (presumably undoing a stringified
    Python list of lines -- confirm against the extraction step).
    """
    return cell[2:-2].split("', '")


# One DataFrame per CSV, keyed by the name of the folder that contains the CSV
# (the second-to-last backslash-separated component of its path).
entry_csvs = {}
for csv_file in entry_csv_paths:
    volume_dir = csv_file.split("\\")[-2]
    entry_csvs[volume_dir] = pd.read_csv(csv_file, converters={"entry": _parse_entry_lines})

In [None]:
for vol, df in entry_csvs.items():
    df["vol"] = int(vol.split("_")[-1])

In [None]:
# entry_no_caps_df = pd.concat(list(entry_csvs.values())).rename_axis(index="volume_entry_num").reset_index()

In [None]:
entry_df = pd.concat(list(entry_csvs.values())).rename_axis(index="volume_entry_num").reset_index()

In [None]:
entry_df.shape

In [None]:
root = r"\\ad\collections\TwoCenturies\TwoCenturies IV\Incunabula\split_data"

In [None]:
# for x in range(1,11):
#     old = os.path.join(root, f"BMC_{x}\combinedrawtext_single_line_old_process.txt")
#     new = os.path.join(root, f"BMC_{x}\combinedrawtext_single_line_v1.0.txt")
#     os.rename(old, new)

In [None]:
entry_df.head()

In [None]:
# entry_df.to_csv(os.path.join(root, "all_volume_catalogue_entries.csv"))

In [None]:
# for x in range(1,11):
#     out_name = os.path.join(root, f"BMC_{x}\combinedrawtext_single_line_v1.1.txt")
#     vol_df = entry_df.query("vol == @x")
#     with open(out_name, "w", encoding="utf-8") as f:
#         grpby = vol_df.groupby(by="volume_entry_num")
#         for grp in grpby:
#             f.write(grp[1]["entry_text"].values[0].replace("\n", " ") + "\n")

In [None]:
# split = entry_df["entry"].progress_apply(lambda x: xmle.split_by_language(x))

In [None]:
# with open("split.p", "wb") as f:
#     pickle.dump(split, f)

In [None]:
# `split` is only computed by the commented-out `xmle.split_by_language` cell
# above, whose result the (also commented-out) dump cell caches in "split.p".
# When the kernel does not already hold `split`, reload that cache instead of
# failing with a NameError.
# NOTE(review): pickle.load can execute arbitrary code; only load a "split.p"
# that was produced by this notebook.
if "split" not in globals():
    with open("split.p", "rb") as f:
        split = pickle.load(f)
entry_df["split_text"] = split

In [None]:
entry_df["split_text"]

In [None]:
test = entry_df.query("vol == 1").groupby(by="volume_entry_num")["split_text"]

In [None]:
# For each volume 1-10, write a text file with one line per catalogue entry,
# built from every second language group of the entry's "split_text" value.
for x in range(1, 11):
    # The backslash is escaped explicitly: "\c" is an invalid escape sequence
    # and only worked because Python currently leaves it untouched.
    out_name = os.path.join(root, f"BMC_{x}\\combinedsplittext_single_line_v1.1.txt")
    vol_df = entry_df.query("vol == @x")

    grpby = vol_df.groupby(by="volume_entry_num")["split_text"]

    entry_lines = []
    for _, split_values in grpby:
        # Only the first row of each group is used. Its value is indexed as
        # [0] -> language flag, [1] -> list of line groups.
        split_text = split_values.values[0]
        language_en = split_text[0]
        lang_grpd_entry_lines = split_text[1]
        # Start at group 0 when the flag is truthy, otherwise at group 1,
        # then take every second group from there.
        start = int(not language_en)
        entry_txt = "".join(
            line.replace("\n", " ").replace('"', "").replace("'", "") + " "
            for lang_grp in lang_grpd_entry_lines[start::2]
            for line in lang_grp
        )
        entry_lines.append(entry_txt + "\n")

    with open(out_name, "w", encoding="utf-8") as f:
        f.writelines(entry_lines)

In [None]:
entry_no_caps_df.shape

## Entry length

In [None]:
entry_df.loc[0, "entry_text"]

In [None]:
len(entry_df.loc[0, "entry_text"])

In [None]:
entry_df["entry_length"] = entry_df["entry_text"].transform(lambda x: len(x))

In [None]:
entry_df.loc[0, "entry_length"]

In [None]:
entry_df.head()

In [None]:
ma = entry_df["entry_length"].rolling(window=100, center=True).mean()
mean = entry_df.groupby(by="vol")["entry_length"].mean()

In [None]:
mean

In [None]:
# mean.rename_axis("Volume").rename("Mean Entry Length").to_csv("..\\data\\processed\\mean_lengths.csv")

In [None]:
# Number of entries per volume, plus a zero-count row labelled 0 so that the
# cumulative sum below starts from 0 once the index is sorted.
n_entrys = entry_df.groupby(by="vol")["vol"].count()
n_entrys.loc[0] = 0
n_entrys = n_entrys.sort_index()

# Midpoint of each volume's span on the all-volumes entry axis: the running
# total minus half of that volume's own count (NaN for the first row).
cumulative_entries = n_entrys.cumsum()
x_locs = cumulative_entries - cumulative_entries.diff() / 2

In [None]:
# Plot every entry's length with its moving average, mark the volume
# boundaries and label each volume with its mean entry length.
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(entry_df["entry_length"], lw=1)
ax.plot(ma, "black", label="Moving average")
ax.set_title("Catalogue Entry Length For Incunabula Volumes 1-10", fontsize='x-large')
ax.set_xlabel("Catalogue Entry Number (across all volumes)", fontsize='x-large')
ax.set_ylabel("Entry length (characters)", fontsize='x-large')
ax.tick_params(labelsize='large')
ax.vlines(n_entrys.cumsum(), 0, ax.get_ylim()[1], colors="black", linestyles="--")
ax.set_xlim(0, len(entry_df))
ax.set_ylim(0, entry_df["entry_length"].max() + 100)

# NOTE(review): the label heights (10600 / 9100) and the split between the
# first eight volumes and volumes 9-10 are hard-coded for the current data.
# "\\mu" is written with an escaped backslash: a bare "\m" inside a normal
# (f-)string is an invalid escape sequence.
for i, x in enumerate(x_locs.dropna()[:8]):
    ax.text(x, 10600, f"BMC {i+1}\n$\\mu$: {mean.loc[i+1]:.0f}", ha="center")

# Volumes 9 and 10 get vertical labels, with the mean on a separate label.
for vol_num in (9, 10):
    ax.text(x_locs[vol_num], 10600, f"BMC {vol_num}", rotation="vertical", ha="center")
    ax.text(x_locs[vol_num], 9100, f"$\\mu$: {mean.loc[vol_num]:.0f}", rotation="vertical", ha="center")
ax.legend()

Volumes 5 and 8 were catalogued by the same person, so what we see here is possibly more errors in these volumes, or it may actually reflect that person's cataloguing style.

In [None]:
entry_df.query("entry_length > 6000")

In [None]:
# fig.savefig("..\\reports\\figures\\entry_length.png", dpi=300, bbox_inches="tight")

## Another copy

In [None]:
def extract_another_copy(row):
    """Find every 'another copy'-style phrase in an entry's text.

    :param row: the text to search (a string, despite the parameter name).
    :return: list of ``re.Match`` objects, ordered first by the position of
        the phrase in ``another_variants`` and then by position in the text,
        or ``None`` when none of the phrases occurs.
    """
    another_variants = [
        'Another cancelled',
        'A cancelled',
        'Another copy',
        'Another edition',
        'Another fragment,',
        'Another issue'
    ]

    # finditer returns an iterator, which is always truthy, so there is
    # nothing to check before collecting its matches.
    match = [m for v in another_variants for m in re.finditer(v, row)]
    return match or None

In [None]:
# method to find valid variants of another copy
# Kept as an unexecuted string. It is a raw string so that the "\S" in the
# regex below is not an invalid escape sequence; the string value is unchanged.
r"""
copy_re = re.compile("Another \S*")
anothers = entry_df["entry_text"].apply(lambda x: copy_re.search(x))
copy_variants = sorted(list(set(anothers.apply(lambda x: x.group() if x else None).dropna())))
copy_variants

entry_df["match"] = entry_df["entry_text"].apply(lambda x: copy_re.search(x))
entry_df["preceding_shelfmark"] = entry_df.apply(check_for_leading_shelfmark, axis=1)

x = 2
print(copy_variants[x])
entry_df[entry_df["entry_text"].str.contains(copy_variants[x])]
"""

All the matches of the "Another \S*" regex, each with a note on whether it actually introduces 'Another edition'-style information.


'Another (crown':  Not valid, referring to watermarks in the text  
'Another calendar':  Not valid, referring to calendars in the work  
'Another calligraphic':  Not valid, referring to calligraphic letters  
'Another cancelled':  Valid, has its own Proctor # and copy-specific info. There's also a copy before this that's just "A cancelled copy", but there's only one occurrence of this.  
'A cancelled': Valid, see above entry.  
'Another closely':  Not valid, describes another edition that's similar  
'Another compartment':  Not valid, part of the information rather than about another copy  
'Another copy':  Valid  
'Another copy,':  Subset of Another copy  
'Another copy.':  Subset of Another copy  
'Another cut':  Not valid  
'Another edition':  Valid  
'Another edition,':  Subset  
'Another edition.':  Subset  
'Another fragment,':  Valid  
'Another full-page':  Not valid  
'Another issue':  Valid  
'Another issue,':  Subset  
'Another issue.':  Subset  
'Another metrical':  Not valid  
'Another reading.':  Not valid  
'Another recension':  Not valid  
'Another setting':  Not valid  
'Another setting-up':  Subset  
'Another version':  Not valid  

In [None]:
another_variants = [
    'Another cancelled',
    'A cancelled',
    'Another copy',
    'Another edition',
    'Another fragment,',
    'Another issue'
]

Having a leading shelfmark is highly indicative of an 'Another copy' entry actually being another copy. Of course, this relies on the shelfmark detection being accurate. In some cases it isn't; see the analysis below for efforts to improve Issac's shelfmark finding.

In [None]:
# TODO some of the shelfmarks are absent - instead the Another copy has its location listed as "Print room"
# work out what to do with this

def check_for_leading_shelfmark(row, match_col, find_valid=True):
    """Select the matches in ``row[match_col]`` by whether a shelfmark precedes them.

    For every match, the slice of ``row["entry_text"]`` from (at most) 100
    characters before the match up to the end of the match is passed to
    ``xmle.find_title_shelfmark``; a truthy result marks the match as valid.

    :param row: row with an "entry_text" string and, under ``match_col``, an
        iterable of ``re.Match`` objects or a falsy value.
    :param match_col: name of the column holding the matches.
    :param find_valid: when truthy return the valid matches, otherwise return
        the matches that are not valid.
    :return: list of the selected matches, or ``None`` when there are none.
    """
    matches = row[match_col]
    if not matches:
        return None

    entry_text = row["entry_text"]
    want_valid = bool(find_valid)
    selected = []
    for match in matches:
        start, end = match.span()
        # Clamp at 0: for a match within the first 100 characters a negative
        # start would count from the END of the string and hand the shelfmark
        # finder the wrong (usually empty) context.
        context = entry_text[max(0, start - 100):end]
        if bool(xmle.find_title_shelfmark(context)) == want_valid:
            selected.append(match)

    return selected or None

In [None]:
# NOTE: `caps_regex` is defined in the cell that follows this one; that cell
# has to be run first for the last line here to work.
entry_texts = entry_df["entry_text"]

# All 'another copy' phrase matches per entry.
entry_df["other_copies"] = entry_texts.apply(extract_another_copy)

# Matches that are / are not accepted by the leading-shelfmark check.
entry_df["valid_copies"] = entry_df.apply(
    check_for_leading_shelfmark, axis=1, match_col="other_copies"
)
entry_df["bad_copies"] = entry_df.apply(
    check_for_leading_shelfmark, axis=1, match_col="other_copies", find_valid=False
)

# Match of `caps_regex` anchored at the start of the entry text, if any.
entry_df["leading_caps"] = entry_texts.apply(caps_regex.match)

In [None]:
caps_regex = re.compile("[A-Z][A-Z](?!I)[A-Z]+")

In [None]:
idx = 4
entry_no_caps_df.loc[idx, "entry_text"]

In [None]:
entry_df.loc[idx, "entry_text"]

In [None]:
caps_regex.search(entry_df.loc[idx, "entry_text"])

In [None]:
entry_df.loc[idx]

In [None]:
entry_df.loc[idx, "entry_text"]

In [None]:
# Locate the page image for entry `idx`. The last character of the "xml"
# value is used as the column count and everything except the last two
# characters as the image file stem.
xml_name = entry_df.loc[idx, "xml"]
vol = entry_df.loc[idx, "vol"]
col = int(xml_name[-1])
jpg = xml_name[:-2]
image_pattern = (
    r"\\ad\collections\TwoCenturies\TwoCenturies IV\Incunabula"
    f"\\BMC_{vol} {col} column pages Transkribus export"
    f"\\*\\*\\*{jpg}.jpg"
)
# First hit wins; raises IndexError when no file matches the pattern.
image_path = glob.glob(image_pattern)[0]

In [None]:
# Try to display the image up to 10 times.
# NOTE(review): presumably reads from the network share fail intermittently.
attempts = 0
while attempts < 10:
    try:
        display(Image(filename=image_path))
        break
    except Exception as err:  # not a bare except: Ctrl-C must still interrupt the cell
        attempts += 1
        last_error = err
else:
    # Every attempt failed: say so instead of finishing silently.
    print(f"Could not display {image_path} after {attempts} attempts: {last_error!r}")

In [None]:
# Locate the image of the page after `jpg`: look in the export folder for the
# current column count first, then in the folder for the other one (2 <-> 4).

# Increment the whole trailing page number and keep its zero padding. Bumping
# only the last digit turned a stem ending in 9 into one ending in "10"
# (e.g. "0019" -> "00110" instead of "0020").
page_match = re.search(r"[0-9]+$", jpg)
if page_match is None:
    raise ValueError(f"image stem {jpg!r} does not end in a page number")
page_number = page_match.group()
next_jpg = jpg[:page_match.start()] + str(int(page_number) + 1).zfill(len(page_number))

try:
    next_image_path = glob.glob(
        r"\\ad\collections\TwoCenturies\TwoCenturies IV\Incunabula"
        f"\\BMC_{vol} {col} column pages Transkribus export"
        f"\\*\\*\\*{next_jpg}.jpg"
    )[0]

except IndexError:
    # Not in the same folder: switch column layout. Any column count other
    # than 2 or 4 raises KeyError here rather than using an unset `next_col`.
    next_col = {2: 4, 4: 2}[int(col)]
    next_image_path = glob.glob(
        r"\\ad\collections\TwoCenturies\TwoCenturies IV\Incunabula"
        f"\\BMC_{vol} {next_col} column pages Transkribus export"
        f"\\*\\*\\*{next_jpg}.jpg"
    )[0]

In [None]:
# Try to display the next page's image up to 10 times.
# NOTE(review): presumably reads from the network share fail intermittently.
attempts = 0
while attempts < 10:
    try:
        display(Image(filename=next_image_path))
        break
    except Exception as err:  # not a bare except: Ctrl-C must still interrupt the cell
        attempts += 1
        last_error = err
else:
    # Every attempt failed: say so instead of finishing silently.
    print(f"Could not display {next_image_path} after {attempts} attempts: {last_error!r}")

### Assess regex variants

Some of the 'another copy' leading shelfmarks aren't being picked up. Improve the original shelfmark detection, particularly C numbers (which are sometimes '1' numbers)

In [None]:
# All patterns are raw strings so that regex escapes such as "\." reach the
# regex engine unchanged and are not treated as (invalid) string escapes.
caps_regex = re.compile(r"[A-Z]{3,}")
# c_num_regex = re.compile("[^I]C\\.[0-9]")  # C number title references
# c_num_space_regex = re.compile("[^I]C\\.[ ]?[0-9]")  # C number title references
c_num_regex = re.compile(r"[^A-Za-z0-9\n\.\-\u201C]C\.[ ]?[0-9]")  # C number title references
c_date_regex = re.compile(r"[^I]C\.[ \t\r\f\v]?1[0-9]{3}[^0-9]")  # accidental date references
one_num_regex = re.compile(r"1\.\s[a-z]")
# NOTE(review): the "." after G is unescaped and therefore matches any
# character; left as-is because it is unclear whether that is intentional.
g_num_regex = re.compile("G.[ ]?[0-9]")
# Was r"...\\.": inside a raw string that means "a literal backslash followed
# by any character", so it could never match a shelfmark such as "IB. 1234".
i_num_regex = re.compile(r"[I1][ABC]\.[ ]?[0-9]")  # I number title references
date_regex = re.compile(r"1[45][0-9][0-9]")

In [None]:
# For every entry collect all regex matches as a list, or None when the
# pattern does not occur in the entry at all.
entry_texts = entry_df["entry_text"]
c_nums = entry_texts.apply(lambda text: list(c_num_regex.finditer(text)) or None)
c_dates = entry_texts.apply(lambda text: list(c_date_regex.finditer(text)) or None)

entry_df["c_nums"], entry_df["c_dates"] = c_nums, c_dates

In [None]:
def exclude_date_matches(row):
    """Drop the "c_nums" matches that overlap any "c_dates" match.

    :param row: mapping/row with "c_nums" and "c_dates" entries, each a list
        of ``re.Match`` objects or a falsy value.
    :return: the "c_nums" matches whose span shares no character with the
        span of any "c_dates" match. ``row["c_nums"]`` is returned unchanged
        when either entry is falsy, and also when every match would be
        dropped.
    """
    c_nums = row["c_nums"]
    c_dates = row["c_dates"]
    if not (c_dates and c_nums):
        return c_nums

    date_spans = [date_match.span() for date_match in c_dates]

    def overlaps_a_date(match):
        # Two half-open spans share a character iff the later start lies
        # before the earlier end.
        start, end = match.span()
        return any(max(start, d_start) < min(end, d_end) for d_start, d_end in date_spans)

    # Check against ALL date matches; previously the overlap list was
    # overwritten on every loop pass, so only the last date was excluded.
    clean_cnums = [match for match in c_nums if not overlaps_a_date(match)]

    # NOTE(review): falling back to the unfiltered list when every match
    # overlaps a date is kept from the original code; confirm this is
    # intended rather than returning None.
    return clean_cnums or c_nums

In [None]:
entry_df["clean_c_nums"] = entry_df.apply(exclude_date_matches, axis=1)

In [None]:
# pred_idx = entry_df["clean_cnums"].dropna().apply(lambda x: [x.span() for x in x]).index.difference(entry_df["let_cnums"].dropna().apply(lambda x: [x.span() for x in x]).index)

In [None]:
def find_matches(row, match_row):
    """Return a text snippet for every match stored under ``row[match_row]``.

    Each snippet is cut from ``row["entry_text"]`` and runs from the start of
    the match to 20 characters past its end.

    :param row: row with an "entry_text" string and the match list.
    :param match_row: key of the entry holding the ``re.Match`` objects.
    :return: list of snippets, or ``None`` when ``row[match_row]`` is falsy.
    """
    found = row[match_row]
    if not found:
        return None

    text = row["entry_text"]
    spans = (hit.span() for hit in found)
    return [text[begin:finish + 20] for begin, finish in spans]

In [None]:
i = 0
def twenty_plus_gen():
    """Advance the module-level counter ``i`` by 10 and return the new value.

    This is a plain function with global state, not a generator: every call
    moves the counter on by another 10.
    """
    global i
    i = i + 10
    return i

In [None]:
entry_df.apply(find_matches, match_row="clean_c_nums", axis=1).dropna().iloc[twenty_plus_gen()-10: twenty_plus_gen()]

In [None]:
idx = 4767
entry_df.loc[idx, "entry_text"]

In [None]:
# Locate the page image for entry `idx`. The last character of the "xml"
# value is used as the column count and everything except the last two
# characters as the image file stem.
xml_name = entry_df.loc[idx, "xml"]
vol = entry_df.loc[idx, "vol"]
col = int(xml_name[-1])
jpg = xml_name[:-2]
image_pattern = (
    r"\\ad\collections\TwoCenturies\TwoCenturies IV\Incunabula"
    f"\\BMC_{vol} {col} column pages Transkribus export"
    f"\\*\\*\\*{jpg}.jpg"
)
# First hit wins; raises IndexError when no file matches the pattern.
image_path = glob.glob(image_pattern)[0]

In [None]:
# Try to display the image up to 10 times.
# NOTE(review): presumably reads from the network share fail intermittently.
attempts = 0
while attempts < 10:
    try:
        display(Image(filename=image_path))
        break
    except Exception as err:  # not a bare except: Ctrl-C must still interrupt the cell
        attempts += 1
        last_error = err
else:
    # Every attempt failed: say so instead of finishing silently.
    print(f"Could not display {image_path} after {attempts} attempts: {last_error!r}")

In [None]:
# Locate the image of the page after `jpg`: look in the export folder for the
# current column count first, then in the folder for the other one (2 <-> 4).

# Increment the whole trailing page number and keep its zero padding. Bumping
# only the last digit turned a stem ending in 9 into one ending in "10"
# (e.g. "0019" -> "00110" instead of "0020").
page_match = re.search(r"[0-9]+$", jpg)
if page_match is None:
    raise ValueError(f"image stem {jpg!r} does not end in a page number")
page_number = page_match.group()
next_jpg = jpg[:page_match.start()] + str(int(page_number) + 1).zfill(len(page_number))

try:
    next_image_path = glob.glob(
        r"\\ad\collections\TwoCenturies\TwoCenturies IV\Incunabula"
        f"\\BMC_{vol} {col} column pages Transkribus export"
        f"\\*\\*\\*{next_jpg}.jpg"
    )[0]

except IndexError:
    # Not in the same folder: switch column layout. This must ASSIGN
    # `next_col`; the previous `next_col == 4` / `next_col == 2` were
    # comparisons and left it unset or stale. Any column count other than
    # 2 or 4 raises KeyError here.
    next_col = {2: 4, 4: 2}[int(col)]
    next_image_path = glob.glob(
        r"\\ad\collections\TwoCenturies\TwoCenturies IV\Incunabula"
        f"\\BMC_{vol} {next_col} column pages Transkribus export"
        f"\\*\\*\\*{next_jpg}.jpg"
    )[0]

In [None]:
# Try to display the next page's image up to 10 times.
# NOTE(review): presumably reads from the network share fail intermittently.
attempts = 0
while attempts < 10:
    try:
        display(Image(filename=next_image_path))
        break
    except Exception as err:  # not a bare except: Ctrl-C must still interrupt the cell
        attempts += 1
        last_error = err
else:
    # Every attempt failed: say so instead of finishing silently.
    print(f"Could not display {next_image_path} after {attempts} attempts: {last_error!r}")