In [1]:
import pandas as pd
import glob
import os
import xml.etree.ElementTree as ET
import re

# Load Data

The BVB export contains a lot of additional information that can be useful

In [5]:
# Full bvb export
path = '../data/hertziana_exp/b3kat_export/'
files = glob.glob(os.path.join(path, '*.csv'))

li = []

for filename in files: 
    frame = pd.read_csv(filename, header=0)
    li.append(frame)

df_full = pd.concat(li, axis=0, ignore_index=True)

# No need for all the columns

df_full= df_full[['id', 'lang', 'authors', 'parentId']]

# drop duplicated ids

df_full = df_full.drop_duplicates(subset=['id'])

  frame = pd.read_csv(filename, header=0)


In [6]:
df_full

Unnamed: 0,id,lang,authors,parentId
0,BV037157247,ger,,
1,BV037157467,ger,"Seybold, Dietrich",BV035421022
2,BV037157474,swe,,
3,BV037157481,ger,,BV040272353
4,BV037157488,ger,,BV040272353
...,...,...,...,...
1185197,BV019670717,ger,,BV019337850
1185198,BV019670843,ita,,
1185199,BV019670978,eng,"Stremitzer, Alexander",BV019734490
1185200,BV019671499,ita,,


df_sig contains the list of signatures and corresponding textual columns decoding the meaning of each signature

In [3]:
# signatures
df_sig = pd.read_csv('data/csv/sig_lookup.csv')

df_sig = df_sig[['sys', 'text', 'text_3', 'text_2', 'text_1']]

  df_sig = pd.read_csv('data/csv/sig_updated.csv')


df_freihand contains the list of documents physically present in the Bibliotheca Hertziana

In [4]:
df_freihand = pd.read_csv('data/Freihand.csv', sep=';', header=0, 
                          names = ['bvb', 'title', 'year', 'signature', 'collection', 'inventory_nr', 'inventory_date'])

In [5]:
print(df_freihand.columns)
print(df_full.columns)
print(df_sig.columns)

Index(['bvb', 'title', 'year', 'signature', 'collection', 'inventory_nr',
       'inventory_date'],
      dtype='object')
Index(['id', 'lang', 'authors', 'parentId'], dtype='object')
Index(['sys', 'text', 'text_3', 'text_2', 'text_1'], dtype='object')


In [6]:
df_sig = df_sig.fillna('')
df_freihand = df_freihand.fillna('')
df_full = df_full.fillna('')

### Merging Signatures and Books

In this part,the signature of each document in the library is matched with the decoded signatures. To get the labels for the documents, the signature has to be 'decoded', which has been done in the 'signatures_processing.ipynb'. This way, for each document, 3-4 textual columns describing the subject matter of the document can be extracted and later on be used for dimensionality reduction. 

In [7]:
# Extracting the known signature types

# these patterns are the ones that can be decoded so far
patterns = [
    #Generic patter e.g. Mk 1000
    r'^([a-zA-Z]{1,2}) (\d+)',
    # Artists and other People, e.g. Ca-BER 1920
    r'^[CWZ][amopsu]-[A-Z]{3}\s\d{1,4}',
    # Topography, e.g. E-BOL 60
    r'^[EXY][a-z]?-[A-Z]{3}\s\d{1,4}'
]

# Extracting the known patterns from the signatures, otherwise None

def clean_signature(signature):
    match = None
    for pattern in patterns:
        match = re.search(pattern, signature)
        if match:
            break
    
    return match.group() if match else None

df_freihand['signature_clean'] = df_freihand['signature'].apply(clean_signature)

# Merging the library content and the decoded signatures on the signatures

merged = pd.merge(df_freihand, df_sig, left_on='signature_clean', right_on='sys', how= 'outer', indicator=True)

# both contains the intersection of the two, meaning documents with signatures that can be decoded

both = merged.loc[merged['_merge'] == 'both']
both.drop('_merge' , axis=1, inplace=True)

# percentage of documents with recognized signatures

rec_sigs = len(both) / len(df_freihand) * 100
print(f'Books in Freihand matched with signature: {rec_sigs:.1f} %')

# signatures that can't be decoded are marked with left_only

left_only = merged.loc[merged['_merge'] == 'left_only']

# Keeping this to check which ones are still missing

left_only.groupby(left_only.signature[:6]).size().to_csv('data/left_only.csv')


Books in Freihand matched with signature: 74.4 %


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  both.drop('_merge' , axis=1, inplace=True)


In [8]:
# Merging on the BVB export to get more metadata

merged = pd.merge(both, df_full, left_on='bvb', right_on='id', how= 'left')

In [9]:
#Joining each string in the text columns with '_' to treat them as tokens later on
text_cols = ['text', 'text_1', 'text_2', 'text_3']
for col in text_cols: 
    merged[col] = merged[col].str.replace(' ', '_')

# Acquisition date

In [11]:
# Regular expression pattern to match the year (four digits, starting with 19 or 20 etc.)
year_pattern = r"\b(19[0-9]{2}|20[0-2][0-9])\b"

merged['inventory_year'] = merged['inventory_nr'].apply(lambda x: re.search(year_pattern, str(x)).group(0) if re.search(year_pattern, str(x)) else None)

# Signature levels


In [12]:
#Extracting the first and second letter of each signature to later use in the dimensionality reduction step

merged['sig_lev_1'] = merged.signature_clean.str[0]
merged['sig_lev_2'] = merged.signature_clean.str[:2]

In [13]:
# Export

export_columns = ['bvb', 'title', 'signature', 'inventory_nr','inventory_date', 'inventory_year', 'lang', 'year', 'sys', 'text', 'text_1', 'text_2', 'text_3', 'sig_lev_1', 'sig_lev_2']

merged[export_columns].to_csv('data/csv/freihand_signatures.csv', index=False)