# Extracting catalogue entries from the Early Malay Printed Books catalogue

Collaboration with Annabel Gallop and Adi Keinan-Schoonbaert to create catalogue entries from the OCR of the Early Malay Printed Books catalogue (EMP). Match the extracted entries to a set of works AG is digitising to provide skeleton metadata records as part of the digitisation process.

In [None]:
from copy import copy
import json
from random import sample
import re

import pandas as pd
import pymupdf
from rapidfuzz import fuzz, utils, process

## Extract Description section

In [None]:
doc = pymupdf.open("../data/raw/emp.pdf") # open a document

In [None]:
len(doc)

In [None]:
# Collect the raw text of each page, keyed by section and then by 1-based pdf page number.
text = {"desc": {}, "bl_list": {}, "titles": {}}

# Printed page numbers are the numbers printed on the page (from title page, then i - 858)
# Actual page numbers are the 1 - 886 numbers of the pages in the pdf, do not correspond to number on the page
# Remember that all actual page numbers in the pdf are one greater than the Python indexing,
# hence enumerate(..., start=1) so that i can be compared directly with the pdf viewer.
section = None
for i, page in enumerate(doc, start=1):
    if i == 126:  # Page 98
        section = "desc"
    elif i == 596:  # Page 568
        section = None
    elif i == 711:  # Page 683
        section = "titles"
    elif i == 802:  # Page 774
        break

    if section:
        # get plain text (is in UTF-8)
        text[section][i] = page.get_text()

In [None]:
assert len(text["desc"]) == 470
assert len(text["titles"]) == 91

In [None]:
# Concatenate every description page, in insertion (page) order, into one string.
full_desc = "".join(text["desc"].values())

In [None]:
len(full_desc)

In [None]:
with open("../data/interim/full_description.txt", "w", encoding="utf8") as f:
    f.write(full_desc)

### Tidy desc page text

In [None]:
# First line of every second description page, starting from pdf page 126.
early_header = [text["desc"][i].split("\n")[0] for i in range(126, 596, 2)]

# First line of the alternate description pages, starting from pdf page 127.
desc_header = [text["desc"][i].split("\n")[0] for i in range(127, 597, 2)]

# How many times the printed page number (pdf page - 28) occurs in each page's text.
# The range now ends at 596 so the final description page (595) is counted too.
has_page_num = [text["desc"][i].count(str(i - 28)) for i in range(126, 596)]

In [None]:
pd.Series(has_page_num).value_counts().sort_index()

I've investigated the below: in each instance the header was not transcribed at all, rather than transcribed in the wrong place, so the current logic of removing the header only if it is present holds.

In [None]:
[(i, e) for i, e in enumerate(early_header) if "EARL" not in e]

In [None]:
emp_head_counts = pd.Series([e for e in early_header if "EARL" in e]).value_counts()
print(emp_head_counts.sum())
emp_head_counts

The below is just some extra mistranscribed lines due to lines at the top of the photocopy, DESCRIPTION appears on the third line

In [None]:
[(i, d) for i, d in enumerate(desc_header) if "DESC" not in d]

In [None]:
pd.Series([d for d in desc_header if "DESC" in d]).value_counts()

#### Pre-treat mistranscriptions

In [None]:
# bad header
# If pdf page 383 starts with "_", drop its first 10 characters; the assert then
# confirms the page starts with "DESCR". Once trimmed, the first character is no
# longer "_", so re-running the cell does not trim again.
if text["desc"][383][0] == "_":
    text["desc"][383] = text["desc"][383][10:]
assert text["desc"][383][:5] == "DESCR"

#### Parsing the first page

In [None]:
# Rebuild the first description page (pdf page 126) from two slices of its lines:
# lines 9-48 and lines 58 onwards. The check on line 1 only passes on the untouched
# page, so the slicing is not applied twice if the cell is re-run.
split = text["desc"][126].split("\n")
if split[1] == "It should be assumed that the author/editor ":
    text["desc"][126] = "\n".join(split[9:49] + split[58:])
# After trimming, the page must begin with "Abbas".
assert text["desc"][126][:5] == "Abbas"

#### Parse columns on remaining pages

In [None]:
def process_desc_page(page, page_num):
    """Split one description page into its non-empty lines.

    A space immediately before a line break is removed, empty lines are
    dropped, a first line containing "DESC" or "EARL" (the running header) is
    removed, and the printed page number is removed when it occurs exactly
    once as a line of its own.

    Args:
        page: Raw text of a single page.
        page_num: Printed page number, as a string.

    Returns:
        The remaining lines of the page, in order. An empty or blank page
        yields an empty list.
    """
    lines = [line for line in page.replace(" \n", "\n").split("\n") if line]

    # Guard on `lines` so a page with no text does not raise IndexError.
    if lines and ("DESC" in lines[0] or "EARL" in lines[0]):
        lines = lines[1:]

    # Only 20 out of 469 pages where count != 1; those pages are left untouched
    # rather than guessing which occurrence is the page number.
    if lines.count(page_num) == 1:
        lines.remove(page_num)

    return lines

In [None]:
# Parse every description page (pdf pages 126-595); the printed page number is pdf page - 28.
processed_desc_pages = [
    process_desc_page(page=text["desc"][pdf_page], page_num=str(pdf_page - 28))
    for pdf_page in range(126, 596)
]

#### Compare processed desc pages to ground truth

In [None]:
# Check the first five parsed pages line-by-line against the hand-made ground-truth files.
for gt_number, parsed_lines in enumerate(processed_desc_pages[:5], start=1):
    with open(f"../data/processed/ground_truth/p{gt_number}_column_parse.txt", encoding="utf8") as f:
        gt = [gt_line.strip("\n") for gt_line in f]
        print(gt_number)
        # Show the ground-truth lines that differ before failing on any mismatch.
        print([expected for expected, actual in zip(gt, parsed_lines) if expected != actual])
        assert gt == parsed_lines

## Extract catalogue entries

### Tidy title page text

In [None]:
# First line of every second titles page, starting from pdf page 713.
titles_early_header = [text["titles"][i].split("\n")[0] for i in range(713, 802, 2)]

# First line of the alternate titles pages, starting from pdf page 712.
titles_title_header = [text["titles"][i].split("\n")[0] for i in range(712, 802, 2)]

# How many times the printed page number (pdf page - 28) occurs in each titles page.
titles_has_page_num = [text["titles"][i].count(str(i - 28)) for i in range(711, 802)]

In [None]:
pd.Series(titles_has_page_num).value_counts().sort_index()

I've investigated the below, the page has been mis-transcribed as having two columns and needs correcting

In [None]:
[(i, e) for i, e in enumerate(titles_early_header) if "EARL" not in e]

In [None]:
emp_head_counts = pd.Series([e for e in titles_early_header if "EARL" in e]).value_counts()
print(emp_head_counts.sum())
emp_head_counts

The below is just some extra mistranscribed lines due to lines at the top of the photocopy; the TITLES header (rather than DESCRIPTION, as in the description section) appears a few lines down instead of on the first line

In [None]:
[(i, d) for i, d in enumerate(titles_title_header) if "LES" not in d]

In [None]:
pd.Series([d for d in titles_title_header if "LES" in d]).value_counts()

#### Pre-treat mistranscriptions

In [None]:
# bad reading order
# On these pdf pages the (mis-OCR'd) header text follows a line break inside the page; remove it.
misplaced_title_headers = {
    724: "\nTITI..ES ",
    768: "\nTfILES ",
    770: "\nTI1LES ",
    774: "\nTITLES ",
}
for pdf_page, stray_header in misplaced_title_headers.items():
    text["titles"][pdf_page] = text["titles"][pdf_page].replace(stray_header, "")

In [None]:
text["titles"][720][:1680]

In [None]:
# Each fix below is guarded by a check on the page's current text, so it is applied
# at most once even if the cell is re-run.

# semi-accidental double column
# the excluded section contains no main works among collected ceretera/cerita/ceritera/cetera
# Keep only the first 1680 characters of pdf page 720.
if text["titles"][720][-5:] == "692 \n":
    text["titles"][720] = text["titles"][720][:1680]
    
# accidental double column
# the small amount of extracted is the only main work among collected ceretera/cerita/ceritera/cetera
# Replace pdf page 721 with characters 2613-2656 of its text, with line breaks removed.
if text["titles"][721][:5] == '"Chre':
    text["titles"][721] = text["titles"][721][2613:2657].replace("\n", "")

# accidental double columns
# the small amount of extracted is the only main work among collected ceretera/cerita/ceritera/cetera
# Replace pdf page 722 wholesale with a hand-typed title line.
if text["titles"][722][:5] == "TI1LE":
    text["titles"][722] = "Cerita Rampai-Rampai 1916 (t) - see also Abu Nawas 1917"

In [None]:
text["titles"][722][:5]

### Create list of titles

In [None]:
# Ground-truth title lines for printed page 683, one list item per line of the file.
with open("../data/processed/ground_truth/raw_title_list_p683.txt", encoding="utf8") as f:
    raw_title_gt_lines_683 = [gt_line.strip("\n") for gt_line in f]

# Ground-truth title lines for printed page 688.
with open("../data/processed/ground_truth/raw_title_list_p688.txt", encoding="utf8") as f:
    raw_title_gt_lines_688 = [gt_line.strip("\n") for gt_line in f]

In [None]:
def process_titles_page(page, page_num):
    """Turn the raw text of one titles page into a list of title lines.

    Line breaks that fall in front of a four-digit number, after a dash, or in
    front of a lone "a" / "a," are closed up so those fragments stay on the
    same line as the text before them. The text is then split into stripped,
    originally non-empty lines; the printed page number is dropped when it
    occurs exactly once as a line of its own; and the first remaining line is
    discarded.

    Args:
        page: Raw text of a single page.
        page_num: Printed page number, as a string.

    Returns:
        The page's lines after the steps above.
    """
    # A four-digit number at the start of a line is pulled back onto the previous line.
    joined = re.sub(r"\n(\d{4,4})", r"\1", page)

    # Applied in this order: a dash alone on a line, a trailing dash, then "a " / "a, ".
    repairs = (
        ("\n-\n", "- "),
        ("-\n", "- "),
        ("\na ", "a "),
        ("\na, ", "a, "),
    )
    for broken, repaired in repairs:
        joined = joined.replace(broken, repaired)

    lines = [segment.strip() for segment in joined.split("\n") if segment]

    # Leave the page untouched when the page number is absent or ambiguous.
    if lines.count(page_num) == 1:
        lines.remove(page_num)

    # The first remaining line is always dropped.
    return lines[1:]

In [None]:
text["titles"][711][833:]

In [None]:
# Parse every titles page (pdf pages 711-801); the printed page number is pdf page - 28.
processed_title_pages = [
    process_titles_page(page=text["titles"][pdf_page], page_num=str(pdf_page - 28))
    for pdf_page in range(711, 802)
]

# Discard the first nine parsed lines of the first titles page.
del processed_title_pages[0][:9]

In [None]:
len(processed_title_pages[5]), len(raw_title_gt_lines_688)

In [None]:
# Flatten the per-page lists into one list of title lines, keeping page order.
all_titles = [title_line for page_lines in processed_title_pages for title_line in page_lines]

In [None]:
len(all_titles)

#### Manual extraction of lines that should be concatenated but haven't

I went through all 4.3k title lines in all_titles and checked whether the line was a title or was a continuation of the previous line. Extracted all the lines that should be continued into lines_to_concatenate_with_text.  

Decided this was a good trade-off as not too many lines to extract (~4% error) vs adding new rules to find these errors. 

In [None]:
# Used this to check all_titles 1k at a time, scanning quickly through the first letters of titles to check
# took maybe half an hour
[str(i + 0) + " " + t for i,t in enumerate(all_titles[0:])]

In [None]:
# Each row of the file is "<index into all_titles>\t<text of that line>".
with open("../data/processed/lines_to_concatenate_with_text.txt", encoding="utf8") as f:
    lines = [row.strip("\n").split("\t") for row in f]

bad_line_ids = [int(fields[0]) for fields in lines]
bad_line_texts = [fields[1] for fields in lines]

In [None]:
# check that the text of the lines we're going to merge with other lines matches our expectation
assert all([all_titles[line_id] == text for line_id, text in zip(bad_line_ids, bad_line_texts)])

In [None]:
# check I've coded the lines right
for l in bad_line_ids:
    print(l)
    print("\n".join(all_titles[l-2:l+3]))
    print("\n")

In [None]:
# Append each continuation line to the line before it. Working from the last id
# backwards means a run of consecutive continuation lines is folded up into the
# first non-continuation line (assumes bad_line_ids is in ascending order - TODO confirm).
all_titles_concatenated = copy(all_titles)
for l in reversed(bad_line_ids):
    all_titles_concatenated[l - 1] = all_titles_concatenated[l - 1] + " " + all_titles_concatenated[l]

# Drop the continuation lines themselves; a set gives O(1) membership tests.
bad_line_id_set = set(bad_line_ids)
all_titles_corrected = [t for i, t in enumerate(all_titles_concatenated) if i not in bad_line_id_set]

In [None]:
all_titles_corrected[62]

#### Select only main works

In [None]:
see_re = re.compile(r"see(?! also)")

In [None]:
see_re.search("hello see also")

In [None]:
# Keep only titles that are not cross-references: skip anything matching the
# "see" (but not "see also") pattern or containing "look".
main_works = [
    title
    for title in all_titles_corrected
    if not (see_re.search(title) or "look" in title)
]

# This cf is the only incorrect one not caught by the 'see' regex
main_works.remove('Adab Kesopanan bagi Orang Muda-Muda Anak yang Bangsawan - cf Adab aI-Fatiy 1916')

In [None]:
len(main_works)

In [None]:
# template for gt based on first ~30 works
"""
with open("../data/processed/ground_truth/28_main_titles.txt", "w", encoding="utf8") as f:
    for w in main_works[:30]:
        f.write(w + "\n")
"""
with open("../data/processed/ground_truth/28_main_titles.txt", encoding="utf8") as f:
    gt_main_works = [l.strip("\n") for l in f.readlines()]

In [None]:
assert all([w == gt_w for w, gt_w in zip(main_works, gt_main_works)])

In [None]:
# Matches a four-character year token made of digits and/or "l" (presumably "l" is
# an OCR misread of "1" - TODO confirm), preceded by one or two characters from the
# set: space, "Â", "±".
# NOTE(review): "Â±" looks like a mis-encoded "±"; confirm the intended character class.
works_date_re = re.compile(r"[ Â±]{1,2}[l0-9]{4,4}")
# Matches " a" followed by either a space or the end of the string.
trailing_a_re = re.compile(r" a( |$)")

In [None]:
# Short title = everything before the first date match, then everything before the
# first trailing " a" marker. No module-level variable called `clean_short_title` is
# created here: that name belongs to the function defined further down, and
# re-running this cell after it would otherwise rebind the function to a string.
main_work_short_titles = [
    trailing_a_re.split(works_date_re.split(w)[0])[0] for w in main_works
]

# Some works are duplicated due to line breaks converting "see <name of work>" to "see\n<name of work>", in which case the work gets picked up again
main_work_short_title_df = pd.Series(main_work_short_titles).drop_duplicates().reset_index(drop=True)

In [None]:
main_work_short_title_df.to_csv("../data/processed/all_short_titles.csv", encoding="utf8")

#### Compare main works to titles from BL shelf list

In [None]:
aac_list = pd.read_csv("../data/external/Proudfoot-BL collection-6.10.25.csv", header=None, names=["shelfmark", "short_title", "year"], usecols=[0,1,2])

In [None]:
trailing_abc_re = re.compile(r" [abc] ?$")

In [None]:
def clean_short_title(row):
    """Return a row's short title without its date and trailing a/b/c marker.

    Args:
        row: Mapping-like row with a "short_title" string.

    Returns:
        The text before the first `works_date_re` match, cut again at the
        first `trailing_abc_re` match.
    """
    title_without_year = works_date_re.split(row["short_title"])[0]
    return trailing_abc_re.split(title_without_year)[0]

In [None]:
aac_list["short_title_no_year"] = aac_list.apply(clean_short_title, axis=1)

435 unique titles in the AAC list. 174 of these don't appear identically in the main works list extracted from the titles list.

In [None]:
# Compute the unique AAC titles once, and use a set of the extracted short titles so
# each membership test is O(1) instead of a scan of the full list.
aac_unique_titles = aac_list["short_title_no_year"].unique()
known_short_titles = set(main_work_short_titles)

# Exact matches are stored as (AAC title, matched main-work title) pairs.
matched_works = [(w, w) for w in aac_unique_titles if w in known_short_titles]
missing_works = [w for w in aac_unique_titles if w not in known_short_titles]
len(missing_works), len(aac_unique_titles)

Check all missing works from the AAC list against the entire main works list, using the basic rapidfuzz ratio and returning up to 3 matches.

In [None]:
# For every unmatched AAC title, keep its three closest main-work short titles.
missing_work_matches = [
    [
        w,
        process.extract(w, main_work_short_titles, scorer=fuzz.ratio, limit=3, processor=utils.default_process),
    ]
    for w in missing_works
]

# Accept the best candidate when it scores at least 90; otherwise keep all candidates for review.
accepted_matches = []
failed_matches = []
for work, candidates in missing_work_matches:
    best = candidates[0]
    if best[1] >= 90:
        accepted_matches.append((work, best[0], best[1]))
    else:
        failed_matches.append((work, candidates))

In [None]:
len(missing_works) - len(accepted_matches), len(accepted_matches), len(missing_works), len(aac_list["short_title_no_year"].unique())

In [None]:
accepted_matches

In [None]:
failed_matches

In [None]:
matched_works += [(w[0], w[1]) for w in accepted_matches]
matched_works.sort()

#### Matched work ground truth

In [None]:
# One-off generation of the ground-truth sample, kept for reference.
# NOTE(review): if re-enabled as written, the first line rebinds `sample` (imported
# from random) to the sampled list.
# sample = sample(matched_works, 50)
# with open("../data/processed/ground_truth/50_matched_works.txt", "w", encoding="utf8") as f:
#     for s in sample:
#         f.write(f"{s[0]}, {s[1]}\n")

# Read the pairs back as tuples by splitting each line on ", ".
# NOTE(review): a title that itself contains ", " would split into more than two
# parts - confirm none of the sampled titles do.
with open("../data/processed/ground_truth/50_matched_works.txt", encoding="utf8") as f:
    gt_titles = [tuple(l.strip("\n").split(", ")) for l in f.readlines()]
    # gt_titles = [(w[0], w[1]) for w in ]

In [None]:
assert all([t in matched_works for t in gt_titles])

### Create catalogue entry ground truth

In [None]:
# Flatten the description pages into one list of lines, and record for every
# line index the printed page it came from (the first description page is 98).
all_processed_lines = []
line_page_lookup = {}
for page_offset, page_lines in enumerate(processed_desc_pages):
    first_line_idx = len(all_processed_lines)
    line_page_lookup.update(
        {first_line_idx + within_page: page_offset + 98 for within_page in range(len(page_lines))}
    )
    all_processed_lines.extend(page_lines)

line_count = len(all_processed_lines)

In [None]:
# Recreate OCR errors from text in the GT
entry_gt = {
    "Abbas": all_processed_lines[1:42],
    "Abdau": all_processed_lines[43:91],
    "Abdullah": all_processed_lines[92:572],
    "Abdullah dan Sabat": all_processed_lines[573:677],
    "AbdulMuluk": all_processed_lines[678:1091]
}

# with open("../data/processed/ground_truth/entry.json", "w", encoding="utf8") as f:
#     json.dump(entry_gt, f, indent=4)

# with open("../data/processed/ground_truth/entry.json", encoding="utf8") as f:
#     entry_gt = json.load(f)

In [None]:
for name, lines in entry_gt.items():
    print(lines[-10:])

### Search for main work short titles in all description lines

#### Longest catalogue entry analysis to provide limit for difference between min_line and max_line

main_work_short_titles contains (to a degree of accuracy) all the short titles in the description section of the EMP. Use these to split up the lines of the description section into individual catalogue entries. Due to the quality of the OCR in the description section, I expect that a fair number of the main work short titles will not be found by exact matching. Will have to use fuzzy string matching/manual intervention for the rest.

In [None]:
# Walk through the short titles in order, looking for each one as an exact line
# within a fixed-size window that starts at the last title found.
title_locs = []
title_line_tracker = 0  # This has to be accurate for it to work, otherwise can get too large too quickly
# only use title_line_tracker as validity check once location of all headings confirmed

# TODO work out longest entry to use that as a limit on how far ahead to search for the next heading
# Answer: 2000 is ~2x the 95% of +ve valid hits using naive search algorithm (which includes some very large incorrect values)
search_window = 2000

for w in main_work_short_title_df:
    window_end = title_line_tracker + search_window
    try:
        # list.index with start/end searches the same slice and returns an absolute index
        line_loc = all_processed_lines.index(w, title_line_tracker, window_end)
    except ValueError:
        line_loc = None

    title_locs.append((w, None, line_loc, title_line_tracker, window_end))
    if line_loc is not None:
        # the next search starts from the title just found
        title_line_tracker = line_loc

title_loc_df = pd.DataFrame(title_locs, columns=["short_title", "short_title_ocr_alias", "line_start", "min_line", "max_line"])
title_loc_df["line_start"] = title_loc_df["line_start"].astype("Int64")

In [None]:
valid_lengths = title_loc_df["line_start"].iloc[1:] - title_loc_df["line_start"].shift(1).dropna()
valid_lengths[valid_lengths >= 0].dropna().describe(percentiles=[0.9,0.95])
valid_lengths[valid_lengths >= 0].dropna().hist(bins=20)

In [None]:
# Just shy of 600 missing values at the moment
title_loc_df.info()

In [None]:
# Sanity check: the matched line_start values (missing ones dropped) must be strictly
# increasing. For every matched title except the last, the next matched line_start is
# subtracted from it; any result that is not negative (a repeat or a step backwards)
# is collected in out_of_sequence_match, which must be empty.
out_of_sequence_match = title_loc_df["line_start"].dropna().iloc[:-1][~((title_loc_df["line_start"].dropna().iloc[:-1] - title_loc_df["line_start"].dropna().shift(-1)).dropna() < 0)]
assert out_of_sequence_match.empty

In [None]:
def find_nearest_line(row, all_lines):
    """Locate the line that best matches a title row.

    Args:
        row: Mapping-like row with "short_title", "line_start", "min_line"
            and "max_line".
        all_lines: List of all description lines.

    Returns:
        A tuple (line text, similarity, line index). When the row already has
        a line_start, that is (short_title, 100.0, line_start). Otherwise it
        is the best rapidfuzz match inside all_lines[min_line:max_line], with
        the index shifted by min_line so it refers to all_lines.
    """
    # pd.isna recognises pd.NA as well as NaN/None; an identity test against
    # pd.NA would treat NaN/None as an exact match.
    if not pd.isna(row["line_start"]):
        return (row["short_title"], 100.0, row["line_start"])

    possible_lines = all_lines[row["min_line"]:row["max_line"]]
    nearest_line = process.extract(row["short_title"], possible_lines, scorer=fuzz.ratio, limit=1, processor=utils.default_process)[0]
    return (nearest_line[0], nearest_line[1], nearest_line[2] + row["min_line"])

In [None]:
title_loc_df.iloc[480:520]

In [None]:
nearest_apply = title_loc_df.apply(find_nearest_line, all_lines=all_processed_lines, axis=1)
title_loc_df["nearest_line"] = nearest_apply.apply(lambda x: x[0])
title_loc_df["similarity"] = nearest_apply.apply(lambda x: x[1])
title_loc_df["nearest_line_idx"] = nearest_apply.apply(lambda x: x[2])

In [None]:
# >90% similarity matches for 63.5% of unmatched names
title_loc_df["similarity"].describe(percentiles=[0.345,0.6,0.65,0.7])

In [None]:
# Rows whose best line scores at least 90 are accepted: the alias is set to the
# short title and line_start to the index of the nearest line.
high_similarity = title_loc_df["similarity"] >= 90
title_loc_df.loc[high_similarity, "short_title_ocr_alias"] = title_loc_df.loc[high_similarity, "short_title"]
title_loc_df.loc[high_similarity, "line_start"] = title_loc_df.loc[high_similarity, "nearest_line_idx"]

In [None]:
# After selecting >90% matches, 250 more matches, leaving 324 unmatched
title_loc_df.info()

#### Manually check unmatched titles

In [None]:
# For every title still without a line_start, collect its index label together
# with the labels immediately before and after it.
unmatched_labels = title_loc_df.index[title_loc_df["line_start"].isna()]
missing_with_adjacent = [
    label for t in unmatched_labels for label in (t - 1, t, t + 1)
]

In [None]:
# De-duplicate and sort the labels, dropping the largest one, then take an explicit
# copy so that adding a column below works on an independent frame rather than on a
# selection of title_loc_df (avoids SettingWithCopyWarning).
missing_with_adjacent_df = title_loc_df.loc[sorted(set(missing_with_adjacent))[:-1]].copy()
# Printed page on which each row's search window starts.
missing_with_adjacent_df["min_line_page"] = missing_with_adjacent_df["min_line"].map(line_page_lookup)
missing_with_adjacent_df.to_csv("../data/interim/missing_title_adjacent.csv", encoding="utf8")

In [None]:
missing_with_adjacent_df

In [None]:
manual_check_df = pd.read_csv("../data/interim/missing_title_adjacent_manual_check.csv", encoding="UTF8", index_col=0)

# check all missing titles have been manually checked
unresolved = manual_check_df["line_start"].isna()
assert manual_check_df.loc[unresolved, "approve"].notna().all()

# check all line_start match nearest_line_idx
resolved_df = manual_check_df.dropna(subset="line_start")
assert (resolved_df["line_start"].astype(int) == resolved_df["nearest_line_idx"]).all()

In [None]:
# 297 corrected, 29 to exclude
manual_check_df["approve"].value_counts()

In [None]:
manually_approved_df = manual_check_df[manual_check_df["approve"] != -1]
manually_approved_idx = manually_approved_df.index
to_exclude_idx = manual_check_df[manual_check_df["approve"] == -1].index

In [None]:
# For the manually approved rows, take the line index from the manual check file and
# look up that line's text in all_processed_lines.
title_loc_df.loc[manually_approved_idx, "nearest_line"] = manually_approved_df["nearest_line_idx"].apply(lambda x: all_processed_lines[x])
title_loc_df.loc[manually_approved_idx, "nearest_line_idx"] = manually_approved_df["nearest_line_idx"]

# Then copy the (now updated) nearest line index and text into line_start and the OCR
# alias. The right-hand sides are whole columns; assignment through .loc aligns on the
# index, so only the approved rows are written.
title_loc_df.loc[manually_approved_idx, "line_start"] = title_loc_df["nearest_line_idx"]
title_loc_df.loc[manually_approved_idx, "short_title_ocr_alias"] = title_loc_df["nearest_line"]

In [None]:
title_loc_df.drop(index=to_exclude_idx, inplace=True)

In [None]:
title_loc_df.info()

In [None]:
title_loc_df["entry_start"] = title_loc_df["nearest_line_idx"]
title_loc_df["entry_end"] = title_loc_df["nearest_line_idx"].shift(-1).astype("Int64") - 1

#### Manual fixes identified

In [None]:
title_loc_df.loc[title_loc_df.query("short_title_ocr_alias == 'IlmuFalak'").index, "entry_start"] = 18278 - 2  # Fix an entry starting two lines late due to bad title OCR
title_loc_df.loc[title_loc_df.query("short_title_ocr_alias == 'Ilmu Bintang'").index, "entry_end"] = 18277 - 2

title_loc_df.loc[title_loc_df.query("short_title_ocr_alias == 'Sirat al-Mustakim'").index, "entry_start"] = 41676 - 1  # Fix an entry starting two lines late due to bad title OCR
title_loc_df.loc[title_loc_df.query("short_title_ocr_alias == 'Slraj aI-KalbI'").index, "entry_end"] = 41675 - 1

In [None]:
title_loc_df.loc[title_loc_df.query("short_title_ocr_alias == 'Akhbar'").index, "entry_end"] = 2520 - 64
title_loc_df.loc[title_loc_df.query("short_title == 'Akidat al-Munajjin'").index, "entry_start"] = 2521 - 64  # Fix an entry starting late due to bad title OCR
title_loc_df.loc[title_loc_df.query("short_title == 'Akidat al-Munajjin'").index, "entry_end"] = 2540 + 1  # Fix an entry starting late due to bad title OCR
title_loc_df.loc[title_loc_df.query("short_title_ocr_alias == 'Alauddln'").index, "entry_start"] = 2541 + 1

In [None]:
# Manually correct end of final entry. The column is addressed by name so the fix
# still hits entry_end if further columns have been appended to title_loc_df.
title_loc_df.loc[title_loc_df.index[-1], "entry_end"] = 51208

In [None]:
title_loc_df.tail()

In [None]:
# Each ground-truth entry must equal the lines extracted for the title of the same
# name, excluding the title line itself (hence start + 1) and including entry_end.
for gt_title, gt_text in entry_gt.items():
    # A boolean mask is used instead of an f-string query so that titles containing
    # quote characters cannot break the expression.
    gt_rows = title_loc_df.loc[title_loc_df["nearest_line"] == gt_title, ["entry_start", "entry_end"]]
    start, end = gt_rows.iloc[0]
    extracted_text = all_processed_lines[start + 1:end + 1]
    assert gt_text == extracted_text

Manual fix for a section that hadn't been picked up due to the title_line_tracker being pushed too far

In [None]:
# manual_title_locs = []
# manual_title_line_tracker = 28825  # This has to be accurate for it to work, otherwise can get too large too quickly
# # only use title_line_tracker as validity check once location of all headings confirmed

# # TODO work out longest entry to use that as a limit on how far ahead to search for the next heading
# # Answer: 2000 is ~2x the 95% of +ve valid hits using naive search algorithm (which includes some very large incorrect values)

# for w in main_work_short_title_df.loc[505:525]:
#     line_window = all_processed_lines[manual_title_line_tracker: manual_title_line_tracker + 2000]
#     if w in line_window:
#         line_loc = line_window.index(w) + manual_title_line_tracker
#         manual_title_locs.append((w, None, line_loc, manual_title_line_tracker, manual_title_line_tracker + 2000))
#         manual_title_line_tracker = line_loc
#     else:
#         manual_title_locs.append((w, None, None, manual_title_line_tracker, manual_title_line_tracker + 2000))

# manual_title_loc_df = pd.DataFrame(manual_title_locs, columns=["short_title", "short_title_ocr_alias", "line_start", "min_line", "max_line"])
# manual_title_loc_df["line_start"] = manual_title_loc_df["line_start"].astype("Int64")

# manual_nearest_apply = manual_title_loc_df.apply(find_nearest_line, all_lines=all_processed_lines, axis=1)
# manual_title_loc_df["nearest_line"] = manual_nearest_apply.apply(lambda x: x[0])
# manual_title_loc_df["similarity"] = manual_nearest_apply.apply(lambda x: x[1])
# manual_title_loc_df["nearest_line_idx"] = manual_nearest_apply.apply(lambda x: x[2])

In [None]:
all_processed_lines[51200:51208]

#### Map titles to canonical titles

In [None]:
# The canonical title defaults to the short title; known OCR misreadings are overridden.
title_loc_df["correct_title"] = title_loc_df["short_title"]
is_misread_ilmu_falak = title_loc_df["short_title"] == "I1mu Falak"
title_loc_df.loc[is_misread_ilmu_falak, "correct_title"] = "Ilmu Falak"

#### Check catalogue entry coverage

In [None]:
# Easy manual approval reduces remaining to 81
title_loc_df.info()

In [None]:
entry_lines_set = set()
for s in title_loc_df.dropna(subset="line_start").apply(lambda x: set(range(x["entry_start"], x["entry_end"] + 1)), axis=1).values:
    entry_lines_set |= s

In [None]:
# all but two lines included in title_loc_df entries
print(len(all_processed_lines), len(entry_lines_set))
set(range(0, 51207)) - entry_lines_set

In [None]:
all_processed_lines[26005: 26007]

In [None]:
title_loc_df.tail()

In [None]:
title_loc_df["entry_text"] = title_loc_df.apply(lambda x: "\n".join(all_processed_lines[x["entry_start"]: x["entry_end"] + 1]), axis=1)

In [None]:
raw_entry_text_df = title_loc_df.rename(columns={"short_title": "titles_ocr_short_title", "short_title_ocr_alias": "desc_ocr_short_title"})[["titles_ocr_short_title", "desc_ocr_short_title", "correct_title", "entry_start", "entry_end", "entry_text"]]
# prog_report_df.to_csv("../data/processed/raw_entry_text.csv", encoding="utf-8-sig")

In [None]:
# raw_entry_text = pd.read_csv("../data/processed/raw_entry_text.csv", encoding="utf-8-sig")
raw_entry_text_df.loc[24, "entry_text"]

### Parse ground truth catalogue entry

#### Parse AG ground truth sample

In [None]:
# Load the AG sample, remove the row labelled 2, then skip the first two remaining rows.
raw_ag_entry_gt_df = pd.read_csv("../data/external/ALEPH_sample_Bollinger_EMP.csv", encoding="utf8")
raw_ag_entry_gt_df.drop(index=2, inplace=True)
raw_ag_entry_gt_df = raw_ag_entry_gt_df.iloc[2:]
# cols lists the spreadsheet column letters corresponding to the zero-based positions
# in col_nums (A=0, E=4, ... CM=90); only col_nums is used for the selection below.
cols = ["A", "E", "Q", "U", "AB", "AC", "AD", "AH", "AJ", "AQ", "AR", "CM"]
col_nums = [0, 4, 16, 20, 27, 28, 29, 33, 35, 42, 43, 90]
ag_entry_gt_df = raw_ag_entry_gt_df.iloc[:, col_nums].reset_index(drop=True)
ag_entry_gt_df.loc[5, "Bibliography etc. note"] = "Proudfoot 1993: San Guo 1892-96"  # Functionally the same, this is how it's listed in EMP

In [None]:
raw_ag_entry_gt_df.iloc[:, 24:]

In [None]:
gt_title_edition = ag_entry_gt_df["Bibliography etc. note"].str.split(": ").apply(lambda x: x[1]).str.split(" ").apply(lambda x: (" ".join(x[:-1]), x[-1]))
gt_title_edition = pd.DataFrame(data={"target_title": gt_title_edition.apply(lambda x: x[0]), "target_edition": gt_title_edition.apply(lambda x: x[1])})
gt_title_edition.sort_values(by="target_title")

In [None]:
gt_entry_text_df = pd.merge(left=raw_entry_text_df, right=gt_title_edition, left_on="correct_title", right_on="target_title")
gt_entry_text_df

#### Create prompt

In [None]:
# Every edition field is a required string, so the per-edition schema is generated
# from this ordered list of field names instead of being written out by hand.
edition_fields = (
    "edition_name",
    "title",
    "author",
    "editor",
    "translator",
    "assistant_translator",
    "proprietor",
    "publisher",
    "printer",
    "copyist",
    "contents",
    "place_of_publication",
    "printing_medium",
    "script",
    "dimensions",
    "extent",
    "Notes",
    "References",
    "Location",
    "unclassified_text",
)

# Schema for the structured output: an object holding an "editions" array.
# NOTE(review): "items" is a one-element list, which in JSON Schema draft-04 describes
# only the first array element; a single schema object would apply to every edition.
# Left as a list here so anything indexing items[0] keeps working - confirm intent.
json_schema = {
    "$schema": "http://json-schema.org/draft-04/schema#",
    "type": "object",
    "properties": {
        "editions": {
            "type": "array",
            "items": [
                {
                    "type": "object",
                    "properties": {name: {"type": "string"} for name in edition_fields},
                    "required": list(edition_fields),
                }
            ],
        }
    },
    "required": ["editions"],
}

In [None]:
def gen_prompt(entry_text, book_title, json_schema):
    """Build the metadata-extraction prompt for one catalogue entry.

    Args:
        entry_text: OCR text of the catalogue entry; inserted at the end of
            the prompt.
        book_title: Title of the book; the prompt tells the model to use it
            when an edition's text has no title of its own.
        json_schema: JSON schema the output must follow; interpolated into
            the prompt as-is.

    Returns:
        The complete prompt as a single string.
    """
    # The list of output fields includes 'proprietor' so that it agrees with
    # the field headings listed earlier in the prompt.
    prompt = f"""Please extract structured metadata from the following text. The text is an entry for a particular book from a catalogue of books printed before 1925 in Malaysia.
    The text has been extracted from a pdf using optical character recognition and may contain errors. Do not correct these errors, but attempt to understand the correct words when extracting information.
    The text is split using line breaks. These separate lines in the OCR, but extra, unnecessary line breaks have sometimes been added between text from the same line.
    Each book entry begins with the book title, then is split into one or more editions. Each edition starts with an edition name in one of three formats:
    1) A year
    2) A year followed by a full stop then a letter (if there are multiple editions for one year)
    3) A letter (if the date of publication is unknown)

    The text for each edition normally reprints the edition date within it. The text for each edition contains different fields you should extract.
    These fields are marked by the field heading, and fields may run over multiple lines. All text before the next field heading belongs to that field. Not every entry has every field.
    Field headings are case insensitive. The Reference and Location fields are not usually followed by a colon. The other fields are followed by a colon.
    Sometimes fields are combined, such as 'author & proprietor', or 'publisher & printer'. In these cases repeat the information in text in the author and proprietor fields of the output.
    Field headings are:
    - author
    - editor
    - translator
    - assistant translator
    - proprietor
    - publisher
    - printer
    - copyist
    - contents
    - Notes
    - Reference(s)
    - Location(s)

    There is text between the edition name and the first field. There may also be text between fields that does not belong to that field. Both these types of text should be treated together as follows.
    This text may contain a title, a place of publication, the date of publication, the printing medium, the script of the text, the number of pages, the number of volumes, the dimensions of the edition.
    If the title is missing use the title provided later on in this prompt, otherwise use the title from the text. Use this text to extract the following fields:
    - title
    - place_of_publication
    - printing_medium
    - script
    - dimensions
    - extent (the number of volumes and number of pages) 
    
    Please extract the following information in json format. Only use the fields listed below. Not every entry has every field. If a field is missing represent it as <empty> in the output json.
    - edition name
    - title
    - author
    - editor
    - translator
    - assistant translator
    - proprietor
    - publisher
    - printer
    - copyist
    - contents
    - place_of_publication
    - printing_medium
    - script
    - dimensions
    - extent (the number of volumes and number of pages) 
    - notes
    - references
    - locations
    Any text not in these fields include in the output json as a separate field called 'unclassified_text'
        
    Please format the output as valid json using the schema below. Make sure to provide a valid and well-formatted JSON adhering to the given schema. Do not make up any information, only use what is provided in the text.
    {json_schema}    

    First, split the text into editions using the edition names, then assign the text for each edition to the appropriate fields.
    The title of this book is: {book_title}
    
    Book entry text:
    {entry_text}
    """
    return prompt

In [None]:
# for name, (_, title, _) in gt_entry_text_df.loc[:, ["entry_text", "correct_title", "target_edition"]].iterrows():
#     fout = title.lower().replace(" ", "_")
#     with open(f"../data/processed/model_outputs/{fout}.json", "w") as f:
#         f.write("")

In [None]:
# for name, (text, title, target_ed) in gt_entry_text_df.loc[:, ["entry_text", "correct_title", "target_edition"]].iterrows():
#     prompt = gen_prompt(entry_text=text, book_title=title, json_schema=json_schema)
#     fout = title.lower().replace(" ", "_")
#     with open(f"../models/prompts_outputs/{fout}_prompt.txt", "w", encoding="utf8") as f:
#         f.write(prompt)

In [None]:
# Load the running log of prompts and model outputs; the first CSV column becomes the index.
prompt_log = pd.read_csv("../models/prompts_outputs/prompt_output_log.csv", encoding="utf8", index_col=0)
prompt_log

In [None]:
# Roughly 550k tokens

In [None]:
# NOTE(review): back-of-envelope sum (= 537,650), presumably a token-count estimate —
# confirm what 661, 250 and 372400 stand for.
661 * 250 + 372400

In [None]:
title_loc_df.shape

In [None]:
# Total character count of all entry texts (newlines swapped for spaces), scaled by 250/918.
# NOTE(review): 250/918 looks like a tokens-per-character ratio taken from one sample — confirm.
title_loc_df["entry_text"].str.replace("\n", " ").apply(len).sum() * (250/918)

In [None]:
# Count the whitespace-separated words of `prompt`, ignoring its last 193 words.
# NOTE(review): 193 is presumably the word count of the entry text at the end of that prompt — confirm.
len(prompt.split()[:-193])

In [None]:
# Run settings written into every new row of the prompt/output log.
thinking = False  # stored in the log's "thinking" column
model = "Qwen3-235B-A22B-2507"  # stored in the log's "model" column
notes = "entry text start fix"  # stored in the log's "notes" column

#### Semi-auto logging

In [None]:
# Rebuild the prompt for each ground-truth entry, pair it with the saved model
# output and append one log row per entry to the existing prompt log.
new_logs = []
# Index label of the first new row; each later row takes the next label so
# that new_log_df does not end up with duplicate index labels.
next_index = prompt_log.index[-1] + 1
# NOTE(review): .loc[3:] skips the rows labelled below 3 — presumably because
# they are already in the log; confirm before re-running.
for name, (text, title, target_ed) in gt_entry_text_df.loc[3:, ["entry_text", "correct_title", "target_edition"]].iterrows():
    print(title)
    prompt = gen_prompt(entry_text=text, book_title=title, json_schema=json_schema)
    # Path built separately so the f-string does not reuse its own quote
    # character inside a replacement field (only valid from Python 3.12).
    output_path = f"../data/processed/model_outputs/{title.lower().replace(' ', '_')}.json"
    # Context manager closes the file handle; encoding matches the other
    # reads and writes in this notebook.
    with open(output_path, encoding="utf8") as f:
        output = json.load(f)
    new_log = pd.DataFrame(
        data={'model': model, 'prompt': prompt, 'output': str(output), 'thinking': thinking, "notes": notes},
        index=[next_index + len(new_logs)],
    )
    new_logs.append(new_log)

new_log_df = pd.concat(new_logs)
# Renumber the combined log 0..n-1.
updated_log = pd.concat([prompt_log, new_log_df]).reset_index(drop=True)

#### Manual logging

In [None]:
# Log a single prompt/output pair identified by its file stem.
fout = "sirat_al-mustakim"

with open(f"../models/prompts_outputs/{fout}_prompt.txt", encoding="utf8") as f:
    prompt = f.read()

# Context manager closes the file handle; encoding matches the prompt read above.
with open(f"../data/processed/model_outputs/{fout}.json", encoding="utf8") as f:
    output = json.load(f)

# The new row takes the index label following the last one in the existing log.
new_log = pd.DataFrame(data={'model': model, 'prompt': prompt, 'output': str(output), 'thinking': thinking, "notes": notes}, index=[prompt_log.index[-1] + 1])
updated_log = pd.concat([prompt_log, new_log])
updated_log

In [None]:
updated_log.to_csv("../models/prompts_outputs/prompt_output_log.csv", encoding="utf8")

#### Parse model output

In [None]:
gt_entry_text_df

In [None]:
def extract_bl_shelfmark(location_str):
    """Return the shelfmark of the first "BL" location in *location_str*.

    The string is split on ";" and the first part containing "BL" is searched
    for a shelfmark of the form digits, ".", one lowercase letter, ".", then
    word characters or parentheses (for example "14620.g.1(2)").

    Args:
        location_str: Semicolon-separated location text for one edition.

    Returns:
        The matched shelfmark string, or None when the input is empty, no
        part mentions "BL", or the "BL" part holds no shelfmark.
    """
    if not location_str:
        return None
    shelfmark_re = re.compile(r"\d+\.[a-z]\.[\w()]+")
    # First ";"-separated part mentioning BL; None rather than an IndexError
    # when there is none (e.g. a "<empty>" Location field).
    bl_loc = next((loc for loc in location_str.split(";") if "BL" in loc), None)
    if bl_loc is None:
        return None
    found = shelfmark_re.search(bl_loc)
    return found.group() if found else None

In [None]:
# For each ground-truth entry, load the saved model output, pick the edition
# whose name equals the target edition and turn it into a one-row metadata frame.
metadata_lines = []
for target_ed, title in gt_entry_text_df[["target_edition", "correct_title"]].values:
    # Path built separately so the f-string does not reuse its own quote
    # character inside a replacement field (only valid from Python 3.12).
    output_path = f"../data/processed/model_outputs/gt_outputs/{title.lower().replace(' ', '_')}.json"
    # Context manager closes the file handle; encoding matches the other
    # reads and writes in this notebook.
    with open(output_path, encoding="utf8") as f:
        output = json.load(f)
    eds = [e["edition_name"] for e in output["editions"]]
    print(eds)
    for e in output["editions"]:
        if e["edition_name"] == target_ed:
            shelfmark = extract_bl_shelfmark(e["Location"])
            # The edition name is "<year>", "<year>.<letter>" or a bare letter;
            # keep the part before the first full stop, normalised through
            # int() when it is numeric.
            try:
                date_1 = str(int(e["edition_name"].split(".")[0]))
            except ValueError:
                date_1 = target_ed.split(".")[0]
            # Both date columns carry the same value.
            date_of_publication_in_arabic_or_roman_numerals = date_1
            name = e["author"]
            extracted_title = e["title"]
            place_of_publication = e["place_of_publication"]
            publisher = e["publisher"]
            extent = e["extent"]
            dimensions = e["dimensions"]
            # The printing medium is carried in the general notes column.
            general_notes = e["printing_medium"]
            bibliography_etc_note = f"Proudfoot 1993: {title} {e['edition_name']}"
            unclassified_text = e["unclassified_text"]
            method_of_acquisition = None

            metadata = pd.DataFrame(
                data={
                    "shelfmark": shelfmark,
                    "date_1": date_1,
                    "name": name,
                    "title": extracted_title,
                    "place_of_publication": place_of_publication,
                    "publisher": publisher,
                    "date_of_publication_in_arabic_or_roman_numerals": date_of_publication_in_arabic_or_roman_numerals,
                    "extent": extent,
                    "dimensions": dimensions,
                    "general_notes": general_notes,
                    "bibliography_etc_note": bibliography_etc_note,
                    "method_of_acquisition": method_of_acquisition,
                    "unclassified_text": unclassified_text
                },
                index=[0]
            )
            metadata_lines.append(metadata)
            break
    else:
        # Reached only when no edition matched, i.e. the inner loop never hit `break`.
        print(f"{target_ed} absent for {title} ")

In [None]:
# Stack the per-edition rows, ordered by date_1 (the values are strings, so the sort is lexicographic).
metadata_lines_df = pd.concat(metadata_lines).sort_values(by="date_1")
# Rename the columns positionally to ag_entry_gt_df's column names plus a trailing "Unclassified content".
metadata_lines_df.columns = ag_entry_gt_df.columns.to_list() + ["Unclassified content"]
# Concatenate below the label-3 slice of raw_ag_entry_gt_df, drop the first row again with iloc[1:],
# then select raw_ag_entry_gt_df's columns plus "Unclassified content" and renumber the index.
# NOTE(review): the borrowed row presumably exists only to bring in raw_ag_entry_gt_df's full column set — confirm.
gt_comparison_df = pd.concat([raw_ag_entry_gt_df.loc[3:3], metadata_lines_df], sort=False).iloc[1:].loc[:, raw_ag_entry_gt_df.columns.to_list() + ["Unclassified content"]].reset_index(drop=True)

In [None]:
gt_comparison_df

In [None]:
# gt_comparison_df.to_csv("../data/processed/ground_truth_output_batch.csv", index=False, encoding="utf8")

A second post-processing step will likely be needed, in which the model is asked to extract the putative place and date of publication from the unclassified text when they are given in square brackets.

### Parse non GT catalogue entries

In [None]:
# Ten further entries to parse; each string is "<title> <edition name>".
further_ten_sample = [
    "Faiz al-Rahman 1894",
    "Abdau 1896",
    "Akidat al-Munajjin 1893",
    "Durrat al-Mudhiyat 1893",
    "Barzanji Makna Bugis 1896",
    "Maulud 1871.a",
    "Maulud 1871.b",
    "Safinah 1873.b",
    "Saif Allah 1900",
    "Ghayat al-Takrib 1893",
]

In [None]:
# Split every sample string on whitespace: the last word is the target edition,
# the remaining words (re-joined with single spaces) are the target title.
sample_df = pd.DataFrame(
    [
        {"target_title": " ".join(words[:-1]), "target_edition": words[-1]}
        for words in (raw_text.split() for raw_text in further_ten_sample)
    ]
)

In [None]:
# Join the entry texts to the sample on title (pd.merge defaults to an inner join),
# so only entries whose correct_title equals a sampled target_title are kept.
non_gt_entry_text_df = pd.merge(left=raw_entry_text_df, right=sample_df, left_on="correct_title", right_on="target_title")
non_gt_entry_text_df

In [None]:
# Write one prompt file per sampled entry, named after the lower-cased title with spaces turned into underscores.
for name, (text, title, target_ed) in non_gt_entry_text_df.loc[:, ["entry_text", "correct_title", "target_edition"]].iterrows():
    prompt = gen_prompt(entry_text=text, book_title=title, json_schema=json_schema)
    fout = title.lower().replace(" ", "_")
    # Rows sharing a title map to the same path, so that file is simply rewritten.
    with open(f"../models/prompts_outputs/{fout}_prompt.txt", "w", encoding="utf8") as f:
        f.write(prompt)
    # with open(f"../data/processed/model_outputs/{fout}.json", "w") as f:
    #     f.write("")

In [None]:
# For each sampled entry, load the saved model output, pick the edition whose
# name equals the target edition and turn it into a one-row metadata frame.
non_gt_metadata_lines = []
for target_ed, title in non_gt_entry_text_df[["target_edition", "correct_title"]].values:
    # Path built separately so the f-string does not reuse its own quote
    # character inside a replacement field (only valid from Python 3.12).
    output_path = f"../data/processed/model_outputs/non_gt_outputs/{title.lower().replace(' ', '_')}.json"
    # Context manager closes the file handle; encoding matches the other
    # reads and writes in this notebook.
    with open(output_path, encoding="utf8") as f:
        output = json.load(f)

    eds = [e["edition_name"] for e in output["editions"]]
    print(eds)
    for e in output["editions"]:
        if e["edition_name"] == target_ed:
            # A missing shelfmark is tolerated here. Both failure modes are
            # caught: no location mentioning "BL" (IndexError) and a "BL"
            # location without a shelfmark (AttributeError).
            try:
                shelfmark = extract_bl_shelfmark(e["Location"])
            except (AttributeError, IndexError):
                shelfmark = None
            # The edition name is "<year>", "<year>.<letter>" or a bare letter;
            # keep the part before the first full stop, normalised through
            # int() when it is numeric.
            try:
                date_1 = str(int(e["edition_name"].split(".")[0]))
            except ValueError:
                date_1 = target_ed.split(".")[0]
            # Both date columns carry the same value.
            date_of_publication_in_arabic_or_roman_numerals = date_1
            name = e["author"]
            extracted_title = e["title"]
            place_of_publication = e["place_of_publication"]
            publisher = e["publisher"]
            extent = e["extent"]
            dimensions = e["dimensions"]
            # The printing medium is carried in the general notes column.
            general_notes = e["printing_medium"]
            bibliography_etc_note = f"Proudfoot 1993: {title} {e['edition_name']}"
            unclassified_text = e["unclassified_text"]
            method_of_acquisition = None

            metadata = pd.DataFrame(
                data={
                    "shelfmark": shelfmark,
                    "date_1": date_1,
                    "name": name,
                    "title": extracted_title,
                    "place_of_publication": place_of_publication,
                    "publisher": publisher,
                    "date_of_publication_in_arabic_or_roman_numerals": date_of_publication_in_arabic_or_roman_numerals,
                    "extent": extent,
                    "dimensions": dimensions,
                    "general_notes": general_notes,
                    "bibliography_etc_note": bibliography_etc_note,
                    "method_of_acquisition": method_of_acquisition,
                    "unclassified_text": unclassified_text
                },
                index=[0]
            )
            non_gt_metadata_lines.append(metadata)
            break
    else:
        # Reached only when no edition matched, i.e. the inner loop never hit `break`.
        print(f"{target_ed} absent for {title} ")

In [None]:
# Stack the per-edition rows, ordered by date_1 (the values are strings, so the sort is lexicographic).
non_gt_metadata_lines_df = pd.concat(non_gt_metadata_lines).sort_values(by="date_1")
# Rename the columns positionally to ag_entry_gt_df's column names plus a trailing "Unclassified content".
non_gt_metadata_lines_df.columns = ag_entry_gt_df.columns.to_list() + ["Unclassified content"]
# Concatenate below the label-3 slice of raw_ag_entry_gt_df, drop the first row again with iloc[1:],
# then select raw_ag_entry_gt_df's columns plus "Unclassified content" and renumber the index.
# NOTE(review): the borrowed row presumably exists only to bring in raw_ag_entry_gt_df's full column set — confirm.
non_gt_comparison_df = pd.concat([raw_ag_entry_gt_df.loc[3:3], non_gt_metadata_lines_df], sort=False).iloc[1:].loc[:, raw_ag_entry_gt_df.columns.to_list() + ["Unclassified content"]].reset_index(drop=True)
non_gt_comparison_df

In [None]:
non_gt_comparison_df.to_csv("../data/processed/non_ground_truth_output_batch.csv", encoding="utf8", index=False)

#### Get AG's columns

In [None]:
# alphabet = {l:i for i,l in enumerate(sorted(list(set(full_desc)))[33:59])}

# Map each capital letter A-Z to its zero-based position in the alphabet.
alphabet = {chr(ord("A") + position): position for position in range(26)}

# Load the ALEPH sample CSV. NOTE: this rebinds sample_df, replacing the frame built from further_ten_sample.
sample_df = pd.read_csv("../data/external/ALEPH_sample_Bollinger_EMP.csv", encoding="utf8")
sample_df.drop(index=2, inplace=True)  # remove the row labelled 2
# Spreadsheet-style column letters and the matching zero-based column positions (A=0 ... CM=90).
cols = ["A", "E", "Q", "U", "AB", "AC", "AD", "AH", "AJ", "AQ", "AR", "CM"]
col_nums = [0, 4, 16, 20, 27, 28, 29, 33, 35, 42, 43, 90]

All items dated before 1887 were purchased; those from 1888 onwards were acquired by legal deposit (per AG).

In [None]:
# Convert spreadsheet-style column letters ("A", "AB", "CM", ...) into
# zero-based column positions.
col_nums = []
for c in cols:
    if not c:
        # An empty label contributes nothing.
        continue
    # Bijective base-26: each letter is a digit 1..26, most significant first.
    # This handles labels of any length, so a label of three or more letters
    # is no longer dropped, which would leave col_nums misaligned with cols.
    position = 0
    for letter in c:
        position = position * 26 + alphabet[letter] + 1
    col_nums.append(position - 1)

In [None]:
col_nums

In [None]:
sample_df.columns[col_nums]

In [None]:
gt_df = sample_df.iloc[2:, col_nums].copy()
gt_df