In [None]:
import pandas as pd
import pymupdf
import re

In [None]:
# Load the source PDF; the path is relative to the notebook's working directory.
doc = pymupdf.open("../data/raw/emp.pdf") # open a document

In [None]:
# Number of pages in the opened PDF (displayed as the cell output).
len(doc)

In [None]:
# Collect the plain text of every page that belongs to a section of interest.
# text[section] maps the 1-based PDF page number to that page's extracted text.
text = {"desc": {}, "bl_list": {}}
section = None  # no section is active until its first page is reached
for i, page in enumerate(doc, start=1):
    # i starts at 1, just for ease when comparing indexing to the pdf pages
    # Printed page numbers are the numbers printed on the page (from title page, then i - 858)
    # Actual page numbers are the 1 - 886 numbers of the pages in the pdf, do not correspond to number on the page
    # Remember that all actual page numbers in the pdf are one greater than the Python indexing
    if i == 126:  # Page 98
        section = "desc"
    if i == 596:  # Page 568
        section = None
    # if i == 834:  # Page 806
    #     section = "bl_list"
    # if i == 849:  # Page 820
    #     break

    if section:
        # Keyed by PDF page number; within "desc" the printed number is i - 28.
        text[section][i] = page.get_text()  # get plain text (is in UTF-8)

In [None]:
# PDF pages 126-595 inclusive make up the description section: 470 pages.
assert len(text["desc"]) == 470

In [None]:
# Concatenate the description pages, in page order, into one string.
# str.join builds the result in a single pass instead of re-copying the
# accumulated text on every "+=".
full_desc = "".join(text["desc"].values())

In [None]:
# Total number of characters in the concatenated description text.
len(full_desc)

In [None]:
# Persist the concatenated description text as an interim artefact (UTF-8).
with open("../data/interim/full_description.txt", "w", encoding="utf8") as f:
    f.write(full_desc)

In [None]:
# First line of every even-numbered PDF page (126, 128, ..., 594). The checks
# further down expect these running headers to contain "EARL".
early_header = [text["desc"][i].split("\n")[0] for i in range(126, 596, 2)]

# First line of every odd-numbered PDF page (127, 129, ..., 595). The checks
# further down expect these running headers to contain "DESC".
desc_header = [text["desc"][i].split("\n")[0] for i in range(127, 597, 2)]

# For each of the 470 pages (126-595 inclusive), how many times the printed
# page number (PDF page number - 28) occurs anywhere in the page text.
# The upper bound is 596 so that the final page, 595, is included.
has_page_num = [text["desc"][i].count(str(i - 28)) for i in range(126, 596)]

In [None]:
# Distribution of "occurrences of the printed page number per page", ordered by count.
pd.Series(has_page_num).value_counts().sort_index()

I've investigated the cases below: in each instance the header was not transcribed at all (rather than being transcribed in the wrong place), so the current logic of removing the header only when it is present still holds.

In [None]:
# (position in early_header, first line) for every even page whose first line lacks "EARL".
[(i, e) for i, e in enumerate(early_header) if "EARL" not in e]

In [None]:
# Tally the distinct transcriptions of the first lines that do contain "EARL".
emp_head_counts = pd.Series([e for e in early_header if "EARL" in e]).value_counts()
# Total number of even pages whose first line contains "EARL".
print(emp_head_counts.sum())
emp_head_counts

The entries below are just extra mistranscribed lines caused by lines at the top of the photocopy; in these cases DESCRIPTION appears on the third line.

In [None]:
# (position in desc_header, first line) for every odd page whose first line lacks "DESC".
[(i, d) for i, d in enumerate(desc_header) if "DESC" not in d]

In [None]:
# Tally the distinct transcriptions of the first lines that do contain "DESC".
pd.Series([d for d in desc_header if "DESC" in d]).value_counts()

### Pre-treat mistranscriptions

In [None]:
# bad header
# PDF page 383: when its text starts with "_", drop the first 10 characters.
# The guard makes the cell safe to re-run: once trimmed, the text no longer
# starts with "_", so it is not trimmed a second time.
if text["desc"][383][0] == "_":
    text["desc"][383] = text["desc"][383][10:]
# After the fix the page must open with its running header ("DESCR...").
assert text["desc"][383][:5] == "DESCR"

### Parsing the first page

In [None]:
# The first page of the section (PDF page 126) needs special handling.
split = text["desc"][126].split("\n")
# Only rewrite the page while it is still in its raw form (second line is the
# known preamble sentence), so re-running the cell does not trim it again.
if split[1] == "It should be assumed that the author/editor ":
    # Keep lines 9-48 and 58 onward, dropping lines 0-8 and 49-57.
    # NOTE(review): presumably the dropped ranges are the preamble and other
    # non-entry text on this page - confirm against the PDF.
    text["desc"][126] = "\n".join(split[9:49] + split[58:])
# The page should now begin with the text "Abbas".
assert text["desc"][126][:5] == "Abbas"

### Parse columns on remaining pages

In [None]:
def process_page(page, page_num):
    """Split one page of extracted text into its non-empty content lines.

    Parameters
    ----------
    page : str
        Raw text of a single page.
    page_num : str
        The page number as a string, compared against whole lines.

    Returns
    -------
    list of str
        The page's non-empty lines, in order, with a single trailing space
        before each newline removed. The first line is dropped when it
        contains "DESC" or "EARL" (the running header), and the line equal
        to ``page_num`` is dropped when exactly one such line exists.
        A page with no non-empty lines yields an empty list.
    """
    trimmed = page.replace(" \n", "\n")
    lines = [line for line in trimmed.split("\n") if line]

    # Guard against pages with no content: there is no first line to inspect.
    if lines and ("DESC" in lines[0] or "EARL" in lines[0]):
        lines = lines[1:]

    # 20 out of 469 pages where this isn't the case
    # Only remove the number when it is unambiguous (exactly one matching line).
    if lines.count(page_num) == 1:
        lines.remove(page_num)

    return lines

In [None]:
# Run the column parser over PDF pages 126-595; the printed page number
# passed alongside each page is the PDF page number minus 28.
processed_pages = [
    process_page(page=text["desc"][pdf_page], page_num=str(pdf_page - 28))
    for pdf_page in range(126, 596)
]

### Compare processed pages to ground truth

In [None]:
# Check the first five parsed pages against their hand-checked reference files.
for page_no, p in enumerate(processed_pages[:5], start=1):
    gt_path = f"../data/processed/ground_truth/p{page_no}_column_parse.txt"
    with open(gt_path, encoding="utf8") as f:
        gt = [row.strip("\n") for row in f]
    print(page_no)
    # Reference lines that differ from the parsed line at the same position.
    # zip stops at the shorter list, so a pure length difference is only
    # caught by the assert below.
    mismatched = [expected for expected, parsed in zip(gt, p) if expected != parsed]
    print(mismatched)
    assert gt == p

### Look for letter page headings

In [None]:
# A line consisting solely of "C" followed by at most two whitespace characters.
a_re = re.compile(r"^C\s{0,2}$", flags=re.MULTILINE)

# Every page whose text contains at least one such line.
matches = [page_text for page_text in text["desc"].values() if a_re.search(page_text)]

print(len(matches))

In [None]:
# Character count of each description page, in page order.
page_lengths = pd.Series([len(v) for v in text["desc"].values()])

In [None]:
# Histogram of page lengths, to spot unusually short or long pages.
page_lengths.hist(bins=20)

In [None]:
# Print every page with fewer than 1000 characters of text, with its PDF page
# number. A plain loop is used so the cell does not also echo a list of None.
for short_page_num, short_page_text in text["desc"].items():
    if len(short_page_text) < 1000:
        print(short_page_num, "content:", short_page_text)

In [None]:
# Same pattern as in the letter-heading search cell: a line that is just "C"
# plus at most two whitespace characters.
a_re = re.compile(r"^C\s{0,2}$", flags=re.MULTILINE)

In [None]:
# All matches of the heading pattern across the concatenated description text.
a_re.findall(full_desc)

In [None]:
# Last 100 characters of the chunk that follows the first "A " line, i.e. the
# text just before the next "A " line (or the end of the text if there is none).
# Raises IndexError if no line consisting of "A " exists.
full_desc.split("\nA \n")[1][-100:]

In [None]:
# text["desc"] is a dict keyed by page number and cannot be sliced; the
# concatenated description text lives in full_desc.
full_desc[1100:1300]

In [None]:
# text["desc"] is a dict keyed by page number and cannot be sliced; the
# concatenated description text lives in full_desc.
print(full_desc[1100:5000])

In [None]:
# text["desc"] is a dict keyed by page number and cannot be sliced; show the
# last 1000 characters of the concatenated description text instead.
print(full_desc[-1000:])

In [None]:
# text["bl_list"] is a dict keyed by page number and cannot be sliced, so join
# its pages first. It is empty (prints a blank line) while the "bl_list"
# section is disabled in the extraction loop.
print("".join(text["bl_list"].values())[:2000])

In [None]:
# text["bl_list"] is a dict keyed by page number and cannot be sliced, so join
# its pages first. It is empty (prints a blank line) while the "bl_list"
# section is disabled in the extraction loop.
print("".join(text["bl_list"].values())[-1000:])

#### Get AG's columns

In [None]:
# Zero-based position of each capital letter: "A" -> 0, "B" -> 1, ..., "Z" -> 25.
# (An earlier version derived the letters from the characters found in full_desc;
# spelling the alphabet out keeps the mapping independent of the PDF text.)
alphabet = {
    letter: position
    for position, letter in enumerate("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
}

# Read the first 13 data rows of the sample export, then discard the row labelled 2.
sample_df = pd.read_csv(
    "../data/external/ALEPH sample Bollinger EMP.csv",
    encoding="utf8",
    nrows=13,
).drop(index=2)

# Spreadsheet column letters of the fields to keep.
cols = ["A", "E", "Q", "U", "AB", "AC", "AD", "AH", "AJ", "AQ", "AR", "CM"]

All before 1887 purchased, 1888 onwards legal deposit - AG

In [None]:
# Translate spreadsheet column letters into zero-based column positions:
# "A" -> 0, "Z" -> 25, "AA" -> 26, "AB" -> 27, ... Labels of any length are
# handled, so a three-letter label is converted rather than silently dropped.
col_nums = []
for c in cols:
    if not c:
        continue  # an empty label names no column; skipped, as before
    position = 0
    for letter in c:
        # Each letter is a digit 1-26 in a base-26 numeral without a zero.
        position = 26 * position + alphabet[letter] + 1
    col_nums.append(position - 1)  # shift from 1-based to 0-based

In [None]:
# Names of the columns picked out by the spreadsheet letters, as a sanity check.
sample_df.columns[col_nums]

In [None]:
# Keep only the selected columns, skipping the first two remaining rows (by
# position); .copy() makes gt_df independent of sample_df.
gt_df = sample_df.iloc[2:, col_nums].copy()
gt_df