N.B. Keep CSV updates separate to track errors.

CSV 1: Extract the content

In [None]:
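# Import Libraries:
    # 1. Requests - for HTTP requests to fetch data from a URL
    # 2. ET - for parsing data from XML files
    # 3. csv - for reading and writing CSV files
    # 4. re - for pattern matching
    # 5. pandas (pd) - for loading and updating the CSV files as DataFrames in the later steps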
import csv
import re
import xml.etree.ElementTree as ET

import pandas as pd
import requests

# Base URL: needs to be replaced with ALTO-XML file link for each volume 
#           without the individual page - this will be iterated through later
# Headers: accepts XML content (this was an addition from ChatGPT following errors)
base_url = 'https://raw.githubusercontent.com/MoMu-Antwerp/melijn/main/altofiles/T94_192/alto/MOMU_T94_192_'
headers = {'accept': 'application/xml;q=0.9, */*;q=0.8'}

# Create a list to store the extracted transcription data:
all_entries = []

# Regular expression to pattern match the transcriptions for “Adij” (and HTR errors), the abbreviated form of 
# Anno Domini, which precedes entries and thus acts as a marker of new entries:
# N.B. Regular expression must be adapted according to volume.
regex_pattern = re.compile(r'^(adij|adj|adiy|ady|addyy|aeij|aey)', re.IGNORECASE)

# For-loop to iterate through the range of files in each volume, before constructing URL for each file by 
# appending the base URL with the file number:
for page_number in range(#set number of altofiles in folder):
    file_number = f'{page_number:04d}'
    url = f'{base_url}{file_number}.xml'

    # Use the contructued URLs to fetch the XML content:
    response = requests.get(url, headers=headers)

    # If-statement to ensure HTTP request was successful - this is important to monitor in automated iterations:
    if response.status_code == 200:
    
        # Parse the XML content:
        root = ET.fromstring(response.text)
    
        # Create variables to store the URL of the current file and the transcription details:
        current_filename = url
        current_entry_content = []
        first_hpos, first_vpos, first_width, first_height = None, None, None, None
        last_hpos, last_vpos, last_width, last_height = None, None, None, None
        first_textline_id = None

        # For-loop to iterate through each TextLine in the XML content and extract the content for above variables:
        for text_line in root.findall('.//{http://www.loc.gov/standards/alto/ns-v4#}TextLine'):
            content = text_line.find('.//{http://www.loc.gov/standards/alto/ns-v4#}String').get('CONTENT', '')
            textline_id = text_line.get('ID', '')
            hpos = int(text_line.get('HPOS', 0))
            vpos = int(text_line.get('VPOS', 0))
            width = int(text_line.get('WIDTH', 0))
            height = int(text_line.get('HEIGHT', 0))
            
            # Skip empty lines:
            if content.strip() == '':
                continue

            # If the content matches the regular expression pattern indicating a new entry, 
            # start a new entry and append the created list:
            if regex_pattern.match(content):
                if current_entry_content:
                    all_entries.append({
                        'filename': current_filename,
                        'content': '\n'.join(current_entry_content),
                        'textline_id': first_textline_id,
                        'first_hpos': first_hpos,
                        'first_vpos': first_vpos,
                        'first_width': first_width,
                        'first_height': first_height,
                        'last_hpos': last_hpos,
                        'last_vpos': last_vpos,
                        'last_width': last_width,
                        'last_height': last_height
                    })
                    current_entry_content = []  

                first_hpos, first_vpos, first_width, first_height = hpos, vpos, width, height
                last_hpos, last_vpos, last_width, last_height = hpos, vpos, width, height
                first_textline_id = textline_id
            else:
                last_hpos, last_vpos, last_width, last_height = hpos, vpos, width, height

            current_entry_content.append(content)

        # Store the information from the current entry in the all_entries list:
        if current_entry_content:
            all_entries.append({
                'filename': current_filename,
                'content': '\n'.join(current_entry_content),
                'textline_id': first_textline_id,
                'first_hpos': first_hpos,
                'first_vpos': first_vpos,
                'first_width': first_width,
                'first_height': first_height,
                'last_hpos': last_hpos,
                'last_vpos': last_vpos,
                'last_width': last_width,
                'last_height': last_height
            })

    # If HTTP request was unsuccessful, return the page causing error to allow human inspection:
    else:
        print(f"Failed to retrieve the XML content for Page {page_number}. Status code: {response.status_code}")

# Write every collected entry to a new CSV file, one column per type of data:
csv_filename = 'MoMu_T94192v2.csv' #replace with name of volume
fieldnames = [
    'filename', 'content', 'textline_id',
    'first_hpos', 'first_vpos', 'first_width', 'first_height',
    'last_hpos', 'last_vpos', 'last_width', 'last_height',
]
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    # Header row first, then one row per entry in the order the entries were collected:
    writer.writeheader()
    writer.writerows(all_entries)

print(f"CSV file '{csv_filename}' has been created successfully.")

CSV 2: Identifying languages 

In [None]:
# Import Library (gcld3) for guessing languages:
!pip install gcld3
import gcld3

# Define a function to take text and detect its language (written with support from ChatGPT):
def detect_language_gcld3(text):
    """Guess the language of a transcription with gcld3.

    text: the transcription to analyse. Anything that is not a string (for example the NaN that
          pandas uses for an empty CSV cell) is not passed on to gcld3.

    Returns the `language` attribute of the gcld3 result, or None when `text` is not a string
    or when gcld3 raises an error (the error is printed so the row can be inspected).
    """
    # Empty cells arrive as non-string values; there is nothing to detect in them.
    if not isinstance(text, str):
        return None

    try:
        # Build the detector once and reuse it on later calls instead of re-creating it per row.
        detector = getattr(detect_language_gcld3, '_detector', None)
        if detector is None:
            detector = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=1000)
            detect_language_gcld3._detector = detector

        result = detector.FindLanguage(text=text)
        return result.language
    except Exception as e:
        # Deliberately best-effort: one failing row must not stop the whole column.
        print("An error occurred:", e)
        return None

# Load the CSV file into a DataFrame (replace the placeholder with the path to the CSV created in CSV 1).
# NOTE: `pd` must refer to pandas (import pandas as pd) before this cell is run.
df = pd.read_csv(#path to CSV1)

# Create a language column by applying the function to the transcription in the 'content' column;
# the function is called once per row and its return value becomes that row's 'language' value:
df['language'] = df['content'].apply(detect_language_gcld3)

# Write the changes using a new CSV file name (replace the placeholder with the new file name);
# index=False keeps the DataFrame's row index out of the file:
df.to_csv(#updated_csv, index=False)


CSV 3: Adding persistent identifiers (PID)

In [None]:
# Define the file paths:
melijn_csv = #path to file above
updated_csv = #new csv name
id_txt = #MelijnNOIDs.txt (text file of generated PIDs)
updated_ids = #updatedNOID txt filename

# Read the latest version of the CSV file and add each row to a new list:
existing_csv = []
with open(melijn_csv, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        existing_csv.append(row)

# Read the text file of PIDs and add the four-character ID to a list
with open(id_txt, 'r') as f:
    ids = [line.strip()[4:] for line in f.readlines()]

# Create a new CSV file to add the PID values:
with open(updated_csv, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    
    # For-loop to iterate through each row in the CSV file:
    for i, row in enumerate(existing_csv):
        
        # If-statement to ensure there are enough IDs for each row
        if i < len(ids):
            
            # Add the new PID to the existing MoMu system and update the CSV file:
            new_id = "http://data.momu.be/ark:34546/m63xtm_" + ids[i]
            row.append(new_id)
        writer.writerow(row)

# Calculate the number of remaining unused IDs depending on the last used ID
# and write a new tect file saving the remaining IDs:
unused_ids = ids[len(existing_csv):]
with open(updated_ids, 'w') as f:
    for id_ in unused_ids:
        f.write("id: " + id_ + "\n")
        

# Update the CSV file with PIDs to rename the column for PIDs
df = pd.read_csv(updated_csv)
df.rename(columns={"http://data.momu.be/ark:34546/m63xtm_XXXXX": "pid"}, inplace=True)
#XXXXX needs to be replaced by first PID in textfile
df.to_csv(updated_csv, index=False)

Then used OpenRefine to do the following:
<li>Add resourceTemplate column with "Melijn entry" as content for all rows</li>
<li>Add title column with "Letter from Melijn archive" as content for all rows</li>
<li>Add dateCreated column using transcription date in volume manifest</li>
<li>Add description column based on altofile (filename) column using value.replace("raw_github_link up to alto/", "").replace(".xml", "")</li>
<li>Add partOf column using the Omeka ID for the volume as content for all rows</li>
<li>Add pageStart column based on altofile (filename) column using value.slice(value.lastIndexOf('_')+1, value.lastIndexOf('.'))</li>
<li>Add reviewStatus column with "unreviewed" as content for all rows</li>
<li>Use text facet to check irregularities in language column</li>


CSV 4: Adding Universal Viewer link

In [None]:
# Read the updated CSV from the OpenRefine edits (replace the placeholder with the path to that file).
# The file must contain the 'pageStart' column, which is used below to compute the canvas number:
df = pd.read_csv(#filepath)

# Define a function to extract the canvas view value depending on the page number:
def extract_cv(pages):
    """Return the canvas view (cv) value for a page number.

    pages: the page number, as anything int() accepts (e.g. 12, "0012" or 12.0).

    The cv value in the viewer link is one less than the page number, taken over the last four
    digits; a page number whose last four digits are 0000 therefore maps to 9999.
    """
    page_number = int(pages)

    # Stepping back one page before wrapping to four digits gives last-four-digits minus one,
    # and turns 0 into 9999.
    return (page_number - 1) % 10000

# Compute the canvas view value of every row from its page number and store it in a new 'cv' column:
df['cv'] = df['pageStart'].map(extract_cv)

# Define a function to construct a link to the universal viewer using each page's CV number:
def construct_link(cv):
    """Return the Universal Viewer link that opens the volume at canvas `cv`."""
    # Link needs to be replaced by the correct universal viewer link for that volume.
    # Ensure that "cv={cv}" is intact
    viewer = "https://universalviewer.io/uv.html"
    manifest_link = "https://museumstichting.resourcespace.com/iiif/11524/manifest"
    view_state = f"c=0&m=0&s=0&cv={cv}&xywh=-1391%2C0%2C10494%2C4670"
    return viewer + "?manifest=" + manifest_link + "#?" + view_state

# Apply the function to each CV value in the DataFrame and add the link under a new column:
df['edm:isShownBy'] = df['cv'].apply(construct_link)

df.to_csv(, index=False)

CSV 5: Adding IIIF manifest information

In [None]:
# Read the updated CSV 4:
df = pd.read_csv(#filepath)

manifest_url = #raw json manifest link from the file

# Fetch the manifest from the URL link:
response = requests.get(manifest_url)
if response.status_code == 200:
    manifest = response.json()
else:
    print("Failed to retrieve the IIIF manifest.")
    exit()

# Define a function to extract the IIIF ID for each canvas (page) number by retrieving the canvas number 
# and constructing the IIIF ID:
def extract_iiif_id(canvas_number):
    """Return the IIIF ID of the canvas at position `canvas_number` in the manifest.

    The canvas is looked up in the first sequence of the global `manifest`; the ID is the '@id'
    of the canvas' thumbnail service followed by "/info.json".
    """
    canvases = manifest['sequences'][0]['canvases']
    thumbnail_service = canvases[canvas_number]['thumbnail']['service']
    return thumbnail_service['@id'] + "/info.json"

# Create a list to store IIIF ID:
iiif_ids = []

# For-loop to iterate through each row to get the canvas number:
for index, row in df.iterrows():
    page_number = row['pagestart'] - 1  
    
    # Apply the function to get the IIIF ID and append the IIIF ID list:
    iiif_id = extract_iiif_id(page_number)
    iiif_ids.append(iiif_id)

# Add a new column to the DataFrame and create a new CSV file:
df['IIIF_id'] = iiif_ids
output_csv_filename = #new filename
df.to_csv(output_csv_filename, index=False)