In [4]:
import re
import pandas as pd

In [9]:
# Load the processed CSV; index_col=0 makes the file's first column the DataFrame index.
entry_df = pd.read_csv("../data/processed/incu_ocr_bll_matching_v1.4.csv", index_col=0)

In [13]:
# (number of rows, number of columns) of the loaded frame.
entry_df.shape

(8774, 11)

In [14]:
# Raw string: "\S" inside a plain string literal is an invalid escape sequence
# (DeprecationWarning today, SyntaxWarning on newer Pythons).
copy_re = re.compile(r"Another \S*")

# First "Another <token>" match in each entry text; None where the phrase is absent.
anothers = entry_df["entry_text"].apply(copy_re.search)

# Sorted list of the distinct matched phrases, used below to inspect variants one by one.
copy_variants = sorted({m.group() for m in anothers.dropna()})

# Store the match objects on the frame, reusing the search results computed above
# instead of running the same regex over the column a second time.
entry_df["match"] = anothers
# entry_df["preceding_shelfmark"] = entry_df.apply(check_for_leading_shelfmark, axis=1)

In [27]:
# Print the full text of the entry whose index label is 5400.
# NOTE(review): 5400 is presumably hard-coded from the result of the
# str.count("Another copy").idxmax() cell — confirm, and consider reusing that value.
print(entry_df.loc[5400, "entry_text"])

THEOCRITUS. EiSvANta. (With other tracts)
February, 14956.
r. TITLE: TAAE ENEETI'ENTHIAETHIBIBAOL.
Oeonoiro id  a  ra
... (l. 18) Haec insunt in hoc libro. Theocriti Ecloga
triginta. Genus Theocriti & de inuentione bucolicorum.
Catonis Romani sententiae paræneticae distichi.e
tentiae septem sapientum. De Inuidia. Theognidis
megarensis siculi sententiae elegiacae. Sententiae mono¬
stichi per Capita ex uariis poetis. Aurea Carmina
Pythagoroe. Phocylidae Poema admonitorium. Carmina
Sibyllæ erythroea de Christo lesu domino nro. Differetia
uocis. Hesiodi Theogonia. Eiusdem scutum Herculis.
Eiusdem georgicon libri duo. 1408. COLOPHON : Im¬
pressum Venetiis characteribus ac studio Aldi Manuc
Ro mani cum gratia &c. . M. CCCC. XCV. Mense febru¬
ario. 1406. TITLE: In hoc volumine continentur haec
Hesiodi Ascraei poetae Theogonia. hoc est de generatione
deoru opusculum. Eiusde Aspis. hoc est de scuto
Herculis opusculum. Eiusdem Georgicorum libri duo dicti
Erga & Himerae. idest opera & dies.
Folio

In [24]:
# Index label of the entry whose text has the highest count of "Another copy".
entry_df["entry_text"].str.count("Another copy").idxmax()

5400

In [18]:
# Distinct phrases captured in the "match" column (rows without a match are dropped first).
entry_df["match"].dropna().apply(lambda x: x.group()).unique()

array(['Another edition.', 'Another edition,', 'Another issue.',
       'Another copy', 'Another copy.', 'Another cancelled',
       'Another issue,', 'Another copy,', 'Another issue',
       'Another calendar', 'Another fragment,', 'Another setting-up',
       'Another edition', 'Another (crown', 'Another metrical',
       'Another reading.', 'Another cut', 'Another calligraphic',
       'Another setting', 'Another closely', 'Another full-page',
       'Another version', 'Another recension', 'Another compartment'],
      dtype=object)

In [None]:
# Pick one variant by its position in the sorted copy_variants list and show
# the entries whose text contains it.
x = 2
print(copy_variants[x])
# regex=False: the variants are literal phrases. Interpreted as patterns,
# 'Another (crown' is an invalid regex and 'Another edition.' would also match
# 'Another edition,'.
entry_df[entry_df["entry_text"].str.contains(copy_variants[x], regex=False)]

Another calligraphic


Unnamed: 0,xmls,vol,col_pages,vol_entry_num,shelfmark,bll01_shelfmark,record_id,uncaptured_sm,entry_text,entry_text_spaces,match
7681,"['J_2704_aa_30_8_0260_4', 'J_2704_aa_30_8_0261...",8,"4-36, 4-37,",463,IA. 40393,IA. 40393,1151969,0.0,DATES. Tréperel is not known to have printed a...,DATES. Tréperel is not known to have printed a...,"<re.Match object; span=(3670, 3690), match='An..."


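Every phrase matched by the "Another \S*" regex, with a note on whether it actually signals another copy, edition or issue of the work ("Valid") or is merely part of the description ("Not valid"). "Subset" means the phrase is already covered by a shorter variant in the list.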

'Another (crown': Not valid, referring to watermarks in the text  
'Another calendar': Not valid, referring to calendars in the work  
'Another calligraphic': Not valid, referring to calligraphic letters  
'Another cancelled': Valid, has its own Proctor # and copy-specific info. There's also a copy before this that's just "A cancelled copy", but there's only one occurrence of this.  
'A cancelled': Valid, see above entry.  
'Another closely': Not valid, describes another edition that's similar  
'Another compartment': Not valid, part of the information rather than about another copy  
'Another copy': Valid  
'Another copy,': Subset of Another copy  
'Another copy.': Subset of Another copy  
'Another cut': Not valid  
'Another edition': Valid  
'Another edition,': Subset  
'Another edition.': Subset  
'Another fragment,': Valid  
'Another full-page': Not valid  
'Another issue': Valid  
'Another issue,': Subset  
'Another issue.': Subset  
'Another metrical': Not valid  
'Another reading.': Not valid  
'Another recension': Not valid  
'Another setting': Not valid  
'Another setting-up': Subset  
'Another version': Not valid  


In [1]:
# Important 'Another' variants: the phrases marked "Valid" in the notes above.

another_variants = [
    'Another cancelled',
    'A cancelled',        # cannot be produced by the "Another \S*" regex; listed by hand
    'Another copy',
    'Another edition',
    'Another fragment,',  # trailing comma kept: the only form seen in the regex matches
    'Another issue'
]

Having a leading shelfmark is highly indicative of an 'Another copy' entry actually being another copy. Of course, this relies on the shelfmark detection being accurate. In some cases it is not; see the analysis below for efforts to improve Issac's shelfmark finding.

In [2]:
# TODO: some of the shelfmarks are absent - instead the "Another copy" has its location listed as "Print room".
# Work out what to do with these.

### Function Development

In [3]:
def extract_another_copy(row):
    """
    Find every occurrence of the accepted 'Another ...' phrases in a text.

    :param row: the text to search. Despite the name this must be a string,
        not a DataFrame row (presumably one value of the "entry_text"
        column - confirm against the caller).
    :return: a list of re.Match objects, grouped by phrase in the order of
        ``another_variants`` (not sorted by position in the text), or None
        when no phrase occurs or ``row`` is not a string.
    """
    another_variants = [
        'Another cancelled',
        'A cancelled',
        'Another copy',
        'Another edition',
        'Another fragment,',
        'Another issue'
    ]

    # Missing values (None, or NaN as pandas reads empty cells) cannot be
    # searched; report them as "no match" instead of raising TypeError.
    if not isinstance(row, str):
        return None

    matches = []
    for variant in another_variants:
        # The variants are literal phrases, so escape them before matching.
        # finditer returns an iterator, which is always truthy, so there is
        # nothing to test before extending the list.
        matches.extend(re.finditer(re.escape(variant), row))

    # An empty list means nothing was found; callers expect None in that case.
    return matches or None