# *MunchXMLmuncher* **1.2.0**

## CMIF Metadata

In [16]:
# XML, TEI, CMI/F and data handling
from bs4 import BeautifulSoup # Hent BeautifulSoup-modulen (https://www.crummy.com/software/BeautifulSoup/) for XML
from bs4 import Comment # BS4-addon for å håndtere kommentarer <!-- X -->
import re # Regex
import pandas as pd
import collections # Facilitate dynamic dict

# Time and date
import datetime # Dates
from datetime import date
import time # Time

# File and folder handling
import glob # The yeast of thought and mind
import os # Filsystem; mapper, lagring, åpning, etc...
import shutil # Se os+

from string import punctuation # Useful library for strings that made me scream

import configparser # Used to easily get statements from the config file

In [21]:
# Version of this notebook; appended to the CMIF title and used in the run-folder name.
version = "1.2.0"

# Read the CMIF header statements from config.ini (section [statements]).
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check it instead of failing later with a
# misleading NoSectionError.
if not config.read("config.ini"):
    raise FileNotFoundError("config.ini was not found in the working directory; it must provide a [statements] section.")
cmifTitle = config.get("statements", "cmifTitle")
editorName = config.get("statements", "editorName")
editorMail = config.get("statements", "editorMail")
cmifUid = config.get("statements", "cmifUid")
publisherURL = config.get("statements", "publisherURL")
publisherName = config.get("statements", "publisherName")
cmifURL = config.get("statements", "cmifURL")
typeOfBibl = config.get("statements", "typeOfBibl")
publicationStatementFull = config.get("statements", "publicationStatementFull")
cmifTitle += " "+str(version) # Title of the resulting CMIF, suffixed with the version

### Init, metadata, etc

In [8]:
# Today's date as YYYY-MM-DD. Uses the fully qualified datetime.date because the
# processing cell further down rebinds the bare name `date`, which would make a
# re-run of this cell fail.
today = datetime.date.today().strftime("%Y-%m-%d")
currVer = version+" "+today # Version + run date; also used to name the run folder

previouslyRun = "Last executed code was version "+str(currVer)+". All OUTPUT files are current to that version on that date.\n"+str(cmifUid)+"."
print("Version",currVer)

Version 1.2.0 2022-10-18


In [41]:
# Containers are created unconditionally so that later cells can rely on these
# names even when preprocessed.csv is absent.
augments = collections.defaultdict(dict) # document ID -> {'sender': location, 'date': date}
ppdocs = [] # Documents that have additional information from preprocessing
docIDs_placenames = [] # Document IDs that get a new place
docIDs_singledates = [] # Document IDs that get a new SINGLE date
docIDs_fromtodates = [] # Document IDs that get a FROM-TO date range

flagPreprocessor = os.path.exists("preprocessed.csv") # True when preprocessed data is used
if flagPreprocessor:
    print("MxmlM located and will use preprocessed data (dates, places).")
    # Read every column as text so that the "%" range test below cannot hit a
    # number; empty cells become the marker string "N/A".
    dfPP = pd.read_csv("preprocessed.csv",sep=",",dtype=str).fillna("N/A")

    ppdocs = dfPP['document'].tolist()

    for doc,dat,loc in zip(dfPP['document'],dfPP['date'],dfPP['location']):
        if dat == "N/A":
            # No date supplied for this document; nothing to classify.
            pass
        elif "%" in dat:
            # "%" separates the two ends of a date range (from%to).
            docIDs_fromtodates.append(doc)
        else:
            docIDs_singledates.append(doc)
        if loc != "N/A":
            docIDs_placenames.append(doc)
        # Values are stored as read, including the "N/A" marker and unsplit ranges.
        augments[doc]['sender'] = loc
        augments[doc]['date'] = dat
else:
    print("WARNING MxmlM is running WITHOUT preprocessed data (dates, places)!")

MxmlM located and will use preprocessed data (dates, places).


In [46]:
# Number of documents receiving a date range, then the number receiving a single date.
print(*(len(idList) for idList in (docIDs_fromtodates, docIDs_singledates)))

60 2594


## Program
### Setup

In [None]:
# Prepare the run folder ("MXMLM <version> <date>") with its sourcefiles/ and
# output/ subfolders, stage the two expected source files into sourcefiles/,
# and report what is available for processing.
hasCorrespondenceXML,hasTEIXML = False,False
programfolder = "MXMLM "+currVer
if os.path.exists(programfolder):
    print("Found directory '% s'" % programfolder)
else:
    os.mkdir(programfolder)
    print("Directory '% s' created" % programfolder)
inputfolder = os.path.join(programfolder,"sourcefiles") # The folder containing the TEI/XML files to be transformed.
paths = ['correspondence.xml','register_tei.xml']
if not os.path.exists(inputfolder):
    os.mkdir(inputfolder)
    print("Directory '% s' created" % inputfolder)
outputfolder = os.path.join(programfolder,"output") # Output folder
if not os.path.exists(outputfolder):
    os.mkdir(outputfolder)
    print("Directory '% s' created" % outputfolder)

availableSources = set() # Source files that ended up present in sourcefiles/
for item in paths:
    stagedPath = os.path.join(inputfolder,item)
    if os.path.isfile(item):
        if os.path.isfile(stagedPath):
            # Both copies exist: keep whichever was modified most recently.
            sourceMtime = os.path.getmtime(item)
            stagedMtime = os.path.getmtime(stagedPath)
            print("Found existing instance of",stagedPath)
            if sourceMtime > stagedMtime:
                shutil.copy2(item, stagedPath)
                print("\tReplaced older version of",item,"in",inputfolder,"(file's last modified date difference is +"+str(sourceMtime-stagedMtime)+str(")"))
            else:
                print("\tUsing existing version of",item,"(file's last modified date difference is",str(sourceMtime-stagedMtime)+str(")"))
        else:
            shutil.copy2(item, stagedPath)
            print("\tYoink! Copied",item,"to",inputfolder)
        availableSources.add(item)
    elif os.path.isfile(stagedPath):
        print("\tFound existing file",item,"in the sourcefiles directory, but not in the main directory.")
        availableSources.add(item)
    else:
        print("\nWARNING Didn't find",item,"in any of the working directories!\n")
hasCorrespondenceXML = 'correspondence.xml' in availableSources
hasTEIXML = 'register_tei.xml' in availableSources

filesForMunching = glob.glob(os.path.join(inputfolder,"*"))
if len(filesForMunching) == 0:
    raise FileNotFoundError("Stop! You need to put some file/s (correspondence.xml,register_tei.xml) in the sourcefiles folder for me to eat!")
elif not (hasTEIXML or hasCorrespondenceXML):
    raise FileNotFoundError("Critical error! I didn't find either of the correspondence.xml OR register_tei.xml files. Means I don't have anything to munch!")
else:
    print("\nSummary:")
    for name in filesForMunching:
        # Compare by file name so the check does not depend on the path separator.
        if os.path.basename(name) in paths:
            print('\t'+name,'will be used')
        else:
            print("\tWARNING Detected unusual file!",name,"may not be a file I can munch!")

In [None]:
# Debug input list, also iterated by the processing cell below.
# NOTE(review): this points at the folder of an earlier run ("1.b.2 2022-10-04"),
# not at `inputfolder` - confirm that this is intended.
# os.path.join avoids the backslash escapes of the original literal and works on any OS.
testFilePath = [os.path.join("MXMLM 1.b.2 2022-10-04","sourcefiles","correspondence.xml")]
inputfilepath = testFilePath[0] # Defined here so the cell runs on a fresh kernel
with open(inputfilepath, "r", encoding="utf-8") as file:
    tei = file.read() # Whole file as one string
# The text is already decoded, so no from_encoding is passed (bs4 ignores it
# for str input and only emits a warning).
soup = BeautifulSoup(tei, features="xml") # It is now soup

In [None]:
# Debug lookup: fetch the first element whose xml:id is "No-MM_K2564"; the cell displays the result.
soup.find(attrs={'xml:id':"No-MM_K2564"})

### Read/process TEI-XML

In [None]:
# Create the CMIF boilerplate document: a TEI header filled from the config
# statements, a <profileDesc> holding a <dummy/> placeholder, and an empty text body.
# The closing tags must match their opening tags exactly (XML is case-sensitive):
# </teiHeader>, <text>...</text> and </TEI>.
# NOTE(review): the config values are inserted verbatim, without XML escaping,
# so they must not contain a bare "&" or "<" unless markup is intended.
CMIFstring = "".join([
    '<?xml-model href="https://raw.githubusercontent.com/TEI-Correspondence-SIG/CMIF/master/schema/cmi-customization.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>',
    '<TEI xmlns="http://www.tei-c.org/ns/1.0"><teiHeader><fileDesc>',
    '<titleStmt><title>', str(cmifTitle), '</title>',
    '<editor>', str(editorName), '<email>', str(editorMail), '</email></editor></titleStmt>',
    '<publicationStmt><publisher><ref target="', str(publisherURL), '">', str(publisherName), '</ref></publisher>',
    '<idno type="url">', str(cmifURL), '</idno> <date when="', str(today), '"/>',
    '<availability><licence target="https://creativecommons.org/licenses/by/4.0/">This file is licensed under the terms of the Creative-Commons-License CC-BY 4.0</licence></availability></publicationStmt>',
    '<sourceDesc><bibl type="', str(typeOfBibl), '" xml:id="', str(cmifUid), '">', str(publicationStatementFull), '</bibl></sourceDesc>',
    '</fileDesc><profileDesc><dummy/></profileDesc></teiHeader>',
    '<text><body><p/></body></text></TEI>',
])
CMIF = BeautifulSoup(CMIFstring,"xml") # Read as XML, not HTML

In [None]:
# Convert every letter <div> of the input TEI file(s) into a CMIF <correspDesc>
# appended to CMIF's <profileDesc>. Counters and debug lists filled here are
# read by the cleanup and summary cells below.
# NOTE: this cell appends to CMIF; recreate CMIF before running it a second
# time, otherwise every correspDesc is added again.
start = time.time()

# Debug lists: IDs of documents with data problems.
documentsWithBadDates = []
documentsWithNoAuthor = []
documentsWithNoRecipient = []
documentsLackingTargetReference = []

# Counters and accumulators
# ---------------------------------------------------------------
noOfLettersToMunch = 0
changedDatesList,addedPlaceList = [],[]
errors_found = [] # Errors and warnings found during execution
letterCount = 0 # Letters processed (letter-type items that have a recipient)
miscCount = 0 # Letter-type items without a recipient
addresseesUnique = [] # Unique recipient names
datetype = 0 # Kind of date of the current document
noOfRecipients = 0 # Non-unique recipients
otherMiscDocCount = 0 # Items that are not letters
authorID = "" # Reference (VIAF etc.) of the last sender handled
# ---------------------------------------------------------------
munchViaf = "https://viaf.org/viaf/61624802/"


def _isComment(text):
    """Return True when a text node is an XML comment."""
    return isinstance(text, Comment)


def _appendNamedElement(parent, tagName, text, ref):
    """Append <tagName>text</tagName> to parent; @ref is set unless ref is "NONE"."""
    if ref != "NONE":
        element = CMIF.new_tag(tagName, attrs={"ref":ref})
    else:
        element = CMIF.new_tag(tagName)
    element.string = str(text)
    parent.append(element)
    return element


# TODO(review): iterates the debug list; the intended production input appears
# to be glob.glob(inputfolder+"/*.xml") - confirm before switching.
for path in testFilePath:
    inputfilepath = path
    with open(inputfilepath, "r", encoding="utf-8") as file:
        tei = file.read()
    # The text is already decoded, so from_encoding is not passed (bs4 would
    # ignore it and warn).
    soup = BeautifulSoup(tei, features="xml")
    print("Souped",inputfilepath)

    # Remove all <!--comments--> before handling the data.
    commentDocs = 0 # Comments that contained an xml:id
    comments = 0 # Comments removed
    for comment in soup.find_all(string=_isComment):
        if "xml:id=\"" in comment:
            commentDocs+=1
        comment.extract()
        comments+=1
    if comments > 0:
        print("Destroyed",comments,"<!--comments-->, of which",commentDocs,"contained an @XML:ID.")
    # Verify that none are left.
    comments = soup.find_all(string=_isComment)
    if comments:
        print("There are still",len(comments),"comments present.")
    else:
        print("All comments destroyed.")

    profileDescElement = CMIF.find('profileDesc') # Wrapper that receives the correspDesc elements
    isCorrespondenceFile = "correspondence" in path

    # Each <div> carrying an xml:id is treated as one document.
    for document in soup.find_all("div", {"xml:id":True}):
        authorID,authorName,recipient,recipientID = False,False,False,False
        targetRef = "NONE" # Reset per document so a reference never leaks from the previous one

        # Document type: first element inside <list type="objectType">.
        typeList = document.find("list", {"type" : "objectType"})
        documentType = typeList.find(True) if typeList is not None else None
        # NOTE(review): `in` on a bs4 Tag tests its direct children, so this
        # matches only when a child text node equals "brev" or "letter"
        # exactly - confirm that a substring match is not what was meant.
        if documentType is None or not ("brev" in documentType or "letter" in documentType):
            otherMiscDocCount += 1
            continue

        documentID = document["xml:id"]

        # Collect the sender elements together with their references.
        senders = []
        if isCorrespondenceFile:
            senderItem = document.find("item", {"n":"sender"})
            authorNameList = senderItem.find_all(True) if senderItem is not None else []
            for senderTag in authorNameList:
                targetRef = senderTag.get('target', "NONE")
                if targetRef == "NONE" and documentID not in documentsLackingTargetReference:
                    documentsLackingTargetReference.append(documentID)
                senders.append((senderTag, targetRef))
            recipient = "Edvard Munch"
        else:
            authorItem = document.find("item", {"n":"author"})
            if authorItem is not None:
                senders.append((authorItem, targetRef))
            recipient = document.find("item", {"n":"recipient"})

        # A sender's name is the first child of its element; elements without
        # content are dropped.
        namedSenders = [(tag.contents[0], ref) for tag, ref in senders if tag.contents]
        if not namedSenders:
            if senders:
                print("WARNING:",documentID,"suffered code 201881X - no author!")
            else:
                print("WARNING:",documentID,"suffered code 201881 no author found!")
            errors_found.append("INFO 201881 in "+str(documentID))
            documentsWithNoAuthor.append(documentID)
            namedSenders = [("No author", targetRef)]
        # Edvard Munch always gets his VIAF reference.
        namedSenders = [(name, munchViaf if name == "Edvard Munch" else ref) for name, ref in namedSenders]
        authorName,authorID = namedSenders[-1]

        # Work out the date or date range. Assumes one date (or one range) per document.
        dateWhen = fromDate = toDate = None
        if document.find("item", {"n":"undated"}) is not None:
            datetype = "none"
        else:
            fromElement = document.find("date", {"from":True})
            toElement = document.find("date", {"to":True})
            if fromElement is not None:
                fromDate = fromElement['from']
                if toElement is not None:
                    # Prefer @to of the element carrying @from; fall back to
                    # the element that was found with @to.
                    toDate = fromElement.get('to', toElement['to'])
                    datetype = "range"
                else:
                    # @from without any @to.
                    datetype = "fromRange"
                    if documentID not in documentsWithBadDates:
                        documentsWithBadDates.append(documentID)
            else:
                yearSent = document.find("date", {"type":"year","when":True})
                monthSent = document.find("date", {"type":"month","when":True})
                daySent = document.find("date", {"type":"day","when":True})
                if yearSent is not None:
                    datetype = "exact"
                    dateWhen = yearSent.attrs["when"]
                    if monthSent is not None: # A month only counts together with a year
                        dateWhen += "-"+re.sub('[-]', '', monthSent.attrs["when"]) # YYYY-MM, stray '-' stripped
                        if daySent is not None: # A day only counts together with a month
                            dateWhen += "-"+re.sub('[-]', '', daySent.attrs["when"]) # YYYY-MM-DD
                else:
                    if documentID not in documentsWithBadDates:
                        documentsWithBadDates.append(documentID)
                    if toElement is not None:
                        # Only an end date is known.
                        datetype = "toRange"
                        toDate = toElement['to']
                    else:
                        print("WARNING:",documentID,"suffered code 301881 - no year found in a specific-year element. Expected in MM_N1071 and MM_N3734.")
                        errors_found.append("INFO 301881 in "+str(documentID))
                        datetype = "none"

        # correspDesc with a single "sent" action holding one persName per sender.
        correspDescElement = CMIF.new_tag("correspDesc", attrs={"key":str(documentID), "ref":"https://www.emunch.no/HYBRID"+str(documentID)+".xhtml", "source":cmifUid})
        profileDescElement.append(correspDescElement)
        targetElementCorrespDesc = correspDescElement
        correspActionTarget = CMIF.new_tag("correspAction", attrs={'type':'sent'})
        targetElementCorrespDesc.append(correspActionTarget)
        for senderName, senderRef in namedSenders:
            _appendNamedElement(correspActionTarget, "persName", senderName, senderRef)

        # Place name and date augmentation from the preprocessed data. The
        # `augments` entries hold 'sender' (location) and 'date'; "N/A" marks a
        # missing value and "%" separates the two ends of a date range.
        if flagPreprocessor is True:
            extra = augments.get(documentID) # .get() avoids creating empty defaultdict entries
            if extra:
                senderLoc = extra.get('sender')
                if senderLoc and senderLoc != "N/A":
                    _appendNamedElement(correspActionTarget, "placeName", senderLoc, "PLACEIDREF"+str(senderLoc))
                    addedPlaceList.append(documentID)
                senderDate = extra.get('date')
                if senderDate and senderDate != "N/A":
                    senderDate = str(senderDate)
                    if "%" in senderDate:
                        senderFromDate,senderToDate = [part.strip() for part in senderDate.split("%", 1)]
                        if senderFromDate and senderToDate:
                            # Overwrite the TEI date with the preprocessed range.
                            datetype = "range"
                            fromDate = senderFromDate
                            toDate = senderToDate
                            changedDatesList.append(documentID)
                    else:
                        # Overwrite the TEI date with the preprocessed single date.
                        datetype = "exact"
                        dateWhen = senderDate
                        changedDatesList.append(documentID)

        # Date element of the "sent" action; built only for a recognized date
        # type so that nothing stale from an earlier document is appended.
        if datetype == "exact":
            dateAttrs = {"when":dateWhen}
        elif datetype == "range":
            dateAttrs = {"from":fromDate,"to":toDate}
        elif datetype == "fromRange":
            dateAttrs = {"from":fromDate}
        elif datetype == "toRange":
            dateAttrs = {"to":toDate}
        elif datetype == "none":
            dateAttrs = None
        else:
            print("ERROR 2839 - Unrecognized datetype!")
            errors_found.append("2839")
            dateAttrs = None
        if dateAttrs is not None:
            dateSentElement = CMIF.new_tag("date", attrs=dateAttrs)
            correspActionTarget.append(dateSentElement)

        if recipient: # The document has at least a recipient item
            letterCount += 1
            if recipient == "Edvard Munch":
                recipientID = munchViaf
                recipientType = "persName"
                recipientName = recipient
                noOfRecipients += 1
                noOfLettersToMunch += 1
                if recipientName not in addresseesUnique:
                    addresseesUnique.append(recipientName)
                correspActionElement = CMIF.new_tag("correspAction", attrs={'type':'received'})
                targetElementCorrespDesc.append(correspActionElement)
                _appendNamedElement(correspActionElement, recipientType, recipientName, recipientID)
            else:
                # Every element inside the recipient item is one recipient and
                # gets its own "received" action.
                for each in recipient.find_all(True):
                    recipientName = str(each.contents[0])
                    noOfRecipients += 1
                    if recipientName not in addresseesUnique:
                        addresseesUnique.append(recipientName)
                    recipientID = each.attrs["target"]

                    if "institution" in recipientID:
                        recipientType = "orgName"
                    elif "person" in recipientID:
                        recipientType = "persName"
                    else:
                        print("WARNING:",documentID,"suffered error 20191. Defaulted to person.")
                        recipientType = "persName"
                        errors_found.append("WARNING 20191 in "+str(documentID))

                    correspActionElement = CMIF.new_tag("correspAction", attrs={'type':'received'})
                    targetElementCorrespDesc.append(correspActionElement)
                    _appendNamedElement(correspActionElement, recipientType, recipientName, recipientID)
        else: # Letter-type document without a recipient
            miscCount+=1
            if documentID not in documentsWithNoRecipient:
                documentsWithNoRecipient.append(documentID)
end = time.time()

In [None]:
# Remove every correspDesc whose document had no author, then the <dummy/>
# placeholder of the boilerplate. Missing elements are skipped so the cell can
# be run more than once.
for authorlessID in documentsWithNoAuthor:
    erasing = CMIF.find("correspDesc",attrs={"key":authorlessID})
    if erasing is None:
        continue # Already removed (or never created)
    print("Decomposing",erasing)
    erasing.decompose()
erasing = CMIF.find("dummy")
if erasing is not None:
    erasing.decompose()

In [None]:
# Print the complete CMIF document as text for inspection.
print(CMIF)

In [None]:
# Report the run statistics, explain the collected warnings/errors, and write
# the CMIF document to the output folder.
totalDocuments = otherMiscDocCount+letterCount
# Guard the ratios so an empty run reports 0 instead of dividing by zero.
letterShare = round(letterCount/totalDocuments*100) if totalDocuments else 0
lettersPerAddressee = round(letterCount/len(addresseesUnique)) if addresseesUnique else 0
print("Processed",totalDocuments,"documents.",str(letterCount)+"("+str(letterShare)+"%) were letters addressed to "+str(noOfRecipients)+" recipients, of which "+str(len(addresseesUnique))+" were unique (meaning each person received avg. "+str(lettersPerAddressee)+" letters), and",miscCount,"letters without recipients (if this > 0, there's a problem) in",round(end - start,1),"seconds.")

# Explanations per error code; the order is significant, the first code found
# in an entry wins.
errorExplanations = [
    ("201881", "\n\tDocument has no author. Registered as \"No author\"."),
    ("301881", "\n\tDocument has a specific date type, but does not specify or suggest a year (MM-DD/MM). Document has been given \"undated\" status."),
    ("301882", "\n\tCatastrophic failure in date format or harvesting. The script was not designed for this."),
    ("30190", "\n\tCatastrophic failure in recipient list processing. I don't think the script will run to this point with such an error."),
    ("20191", "\n\tThe recipient is not a person or an organization. Suggests error in reference XMLURI. Defaulted to person."),
]
if len(errors_found) > 0:
    print("\n"+str(len(errors_found)),"data warnings and errors, listed as INFO, WARNING, and ERROR in order of severity:")
    for i, error in enumerate(errors_found, start=1):
        for code, explanation in errorExplanations:
            if code in error:
                print(i,error,explanation)
                break
        else:
            print("There is an error that is not indexed. :(")
        print("\n")
else:
    print("No warnings or errors found.")
print("Saving to disk.")
# Separate timer names keep the processing duration (start/end) intact, so
# re-running this cell still reports the right figure above.
saveStart = time.time()
# os.path.join instead of a "\C..." literal: no invalid escape sequence, and
# the file lands inside the output folder on every OS.
with open(os.path.join(outputfolder,"CMIF_Unified.xml"), "w", encoding="utf-8") as output_file:
    output_file.write(CMIF.prettify())
saveEnd = time.time()
print("Prettified CMIF file created in",round(saveEnd - saveStart,1),"seconds.")
print("Process complete.")