# Data processor for MXMLM v2
Please read the readme.

In [None]:
# XML, TEI, CMI/F and data handling
from bs4 import BeautifulSoup # Hent BeautifulSoup-modulen (https://www.crummy.com/software/BeautifulSoup/) for XML
from bs4 import Comment # BS4-addon for å håndtere kommentarer <!-- X -->
import re # Regex
import pandas as pd
import collections # Facilitate dynamic dict

# Time and date
import datetime # Dates
#from datetime import date
import time # Time

# File and folder handling
import glob # The yeast of thought and mind
import os # Filsystem; mapper, lagring, åpning, etc...
#import shutil # Se os+
import json # JSON!

from string import punctuation

#Check if *keys (nested) exists in dict
def keys_exists(element, *keys):
    """Return True if the nested key path ``keys`` exists in ``element``.

    The path is followed one key at a time, e.g.
    ``keys_exists(d, "a", "b")`` checks that ``d["a"]["b"]`` can be read.

    Dict levels are tested with ``in`` rather than by indexing, so probing a
    ``collections.defaultdict`` neither reports a missing key as present nor
    inserts it as a side effect.

    Args:
        element: The dict to search.
        *keys: One or more keys describing the nested path.

    Returns:
        True if every key on the path exists, otherwise False.

    Raises:
        AttributeError: If ``element`` is not a dict or no keys are given.
    """
    if not isinstance(element, dict):
        raise AttributeError('keys_exists() expects dict as first argument.')
    if not keys:
        raise AttributeError('keys_exists() expects at least two arguments, one given.')

    current = element
    for key in keys:
        if isinstance(current, dict):
            if key not in current:
                return False
            current = current[key]
        else:
            # Non-dict levels (lists, strings, scalars): a failed lookup means
            # the path does not exist, not that the caller made an error.
            try:
                current = current[key]
            except (KeyError, IndexError, TypeError):
                return False
    return True
# *MunchXMLmuncher* **X2**
# MXMLMx2 reads and preprocesses files. First off, create subdirectories and locate our files:

In [None]:
# Pipeline switch read by the gate cell further down: when it is 0 that cell
# raises KeyboardInterrupt, otherwise it continues with the full processing.
runFullProcessor = 1 # Set 0 to only run chrono and placename processing.

In [None]:
# Chrono
# Next, let's get the chrono if it exists, and then read the data out of it to a dict.

## Chronology v2
IMPORTANT: v2 **requires** modifications to the chronology file. All MM N/K/T objects must be replaced by No-MM_N/K/T. All objects must have excess spaces removed. PN objects may be written without leading zeroes after the prefix (PN99 will be read as PN0099). Do not under any circumstances attempt to use this script with the vanilla "MM N 188" format.

In [None]:
def normalize_document_id(document):
    """Normalise a chronology object ID towards ``PN0000`` / ``No-MM_N0000``.

    Spaces are removed first. Then, depending on the two-character prefix:

    * ``PN``: zeroes are inserted right after the prefix until the ID is
      6 characters long; longer IDs are cut to 6 characters.
    * ``No``: everything from the first comma onwards is dropped, zeroes are
      inserted at position 7 until the ID is 11 characters long, and for
      longer IDs zeroes at position 7 are removed before cutting to 11
      characters (this turns ``No-MM_N03101`` into ``No-MM_N3101``).

    IDs with any other prefix are returned with only the spaces removed.
    The result is not guaranteed to be valid; see ``is_valid_document_id``.
    """
    document = str(document).replace(" ", "")
    prefix = document[0:2]
    if prefix == "PN":
        while len(document) < 6:
            document = document[0:2] + "0" + document[2:]
        document = document[0:6]
    elif prefix == "No":
        # Drop the comma part before padding, so "No-MM_N188,189" is padded
        # like "No-MM_N188" instead of being rejected.
        if "," in document:
            document = document.split(",")[0]
        while len(document) < 11:
            document = document[0:7] + "0" + document[7:]
        # The length test comes first, so document[7] is always in range.
        while len(document) > 11 and document[7] == "0":
            document = document[0:7] + document[8:]
        document = document[0:11]
    return document


def is_valid_document_id(document):
    """Return True if ``document`` is 6 or 11 characters long and ends in 4 digits."""
    return document[-4:].isnumeric() and len(document) in (6, 11)


def assemble_date(parts, separator, stop_on_unknown=False):
    """Build a year-first date string from day/month/year ``parts``.

    Starts from the last element of ``parts`` and walks the list backwards:
    4-character elements are skipped, 2-character elements are appended after
    ``separator``. Elements of any other length are skipped, or end the walk
    when ``stop_on_unknown`` is True.
    """
    assembled = parts[-1]
    for part in reversed(parts):
        if len(part) == 4:
            continue
        if len(part) == 2:
            assembled += separator + part
        elif stop_on_unknown:
            break
    return assembled


def parse_chrono_date(rawdate, document=""):
    """Convert one chronology date cell into ``(date, datetype)``.

    * ``datetime`` / ``date`` objects become ``("YYYY-MM-DD", "instance")``.
    * Text without a dash, e.g. ``"12.05.1922"``, becomes
      ``("1922-05-12", "instance")``.
    * Text with a dash is a range and becomes ``("<from>%<to>", "fromTo")``,
      each side written year-first with dots, e.g. ``"04.-05.1922"`` gives
      ``"1922.04%1922.05"``. Parts missing on the from side are borrowed from
      the to side.

    Question marks are removed before parsing. ``document`` is only used in
    the diagnostic messages printed for unbalanced ranges.
    """
    if isinstance(rawdate, datetime.date):
        return rawdate.strftime("%Y-%m-%d"), "instance"
    if isinstance(rawdate, float) and rawdate.is_integer():
        # A bare year read as 1922.0 would otherwise be split on the dot.
        rawdate = int(rawdate)

    dateobject = str(rawdate)
    if ".-" in dateobject: # Like: 04.-05.1922.
        dateobject = dateobject.replace(".-","-")
    if "?" in dateobject:
        # Remove the ?s, then the punctuation left dangling around them.
        dateobject = dateobject.replace("?","").strip(punctuation)
        dateobject = dateobject.replace("..",".")

    if "-" not in dateobject:
        return assemble_date(dateobject.split("."), "-", stop_on_unknown=True), "instance"

    splitToFrom = dateobject.split("-")
    fromDate = splitToFrom[0]
    toDate = splitToFrom[1]
    newFromDate = [part for part in fromDate.split(".") if part != ""]
    newToDate = [part for part in toDate.split(".") if part != ""]

    mismatch = len(newToDate) != len(newFromDate)
    if mismatch:
        print("Detected unbalanced date in",document,len(newFromDate),"vs",len(newToDate),newFromDate,newToDate)
        if len(newToDate) > len(newFromDate):
            itemsToGet = len(newToDate)-len(newFromDate)-1
            if itemsToGet == 1: # Borrow everything after the first to-element
                newFromDate.extend(newToDate[1:])
            elif itemsToGet == 0: # Borrow only the last to-element
                newFromDate.append(newToDate[-1])
        else:
            print("WARNING: Unable to resolve instances where From date is more specific than To date!")

    if len(newFromDate) > 1:
        newdateF = assemble_date(newFromDate, ".")
    elif newFromDate:
        newdateF = newFromDate[0]
    else:
        newdateF = fromDate

    if len(newToDate) > 1:
        newdateT = assemble_date(newToDate, ".")
    elif newToDate:
        # Previously read newFromDate[0], which made "1920-1922" end in 1920.
        newdateT = newToDate[0]
    elif newFromDate:
        # Nothing after the dash: keep the earlier fallback to the from side.
        newdateT = newFromDate[0]
    else:
        newdateT = toDate

    newdate = newdateF+"%"+newdateT
    if mismatch:
        print("Resolution:",newdate,newFromDate,newToDate)
    return newdate, "fromTo"


hasChrono,hasXMLs = False,False
listofbaddies = [] # Object IDs from the chronology that failed validation
# Sorted by modification time, so the newest chronology file is last.
lookupChrono = sorted(glob.glob("*Kronologi_Munchs_brev*.xlsx"), key=os.path.getmtime)
if lookupChrono:
    chronologyFile = lookupChrono[-1]
    print("Newest chronology file:",chronologyFile)
    hasChrono = True

    chronology = pd.read_excel(chronologyFile).dropna(axis=1, how='all').dropna(axis=0, how='all').reset_index(drop=True)
    chronology = chronology.fillna("N/A")

    # Reuse CHRONODICT if an earlier run of the notebook left one behind.
    try:
        if CHRONODICT:
            print("CHRONODICT found with content")
        else:
            print("CHRONODICT found without content")
    except NameError:
        CHRONODICT = collections.defaultdict(dict)
        print("CHRONODICT created")

    for idx,row in chronology.iterrows():
        document = row['Objektnr.']
        rawdate = row['Dato']
        if rawdate == "N/A" or document == "N/A":
            continue # Rows need both an object ID and a date

        document = normalize_document_id(document)
        if not is_valid_document_id(document):
            listofbaddies.append(document)
            print(document,"is not a valid document ID and was excluded.")
            continue

        # Stored for every valid row, including dates that arrive as
        # datetime objects (those were previously formatted and then dropped).
        newdate, datetype = parse_chrono_date(rawdate, document)
        CHRONODICT[document]["date"] = newdate
        CHRONODICT[document]["datetype"] = datetype

In [None]:
# Bare expression so the notebook displays the object IDs that the chronology
# cell rejected as invalid (an empty list if none were rejected).
listofbaddies

# Placenames v2

In [None]:
## Places
# With the dates safely stored away, let's go for some place names.
# Placename augmentation needs two inputs: the place-ID spreadsheet and a
# folder of letter XMLs. hasXMLs is only switched on when both are present.
if not os.path.isfile("ID_sted-verdier.xlsx"):
    print("No ID_sted-verdier file provided. Skipping placename augmentation.")
else:
    print("Detected ID_sted-verdier.xlsx")
    if not os.path.exists("xml-filer"):
        print("No XML files provided. XML>CMIF placename augmentation disabled")
    else:
        print("XML>CMIF placename augmentation enabled")
        # Collect every XML file below xml-filer, including subfolders.
        listXMLfiles = glob.glob("xml-filer/**/*.xml",recursive=True)
        hasXMLs = True

In [None]:
#n = 0
if hasXMLs == True: # If the XML files should be used for updating
    # Reuse CHRONODICT if the chronology cell created it, otherwise start a new one.
    try:
        if CHRONODICT:
            print("CHRONODICT found with content")
        else:
            print("CHRONODICT found without content")
    except NameError:
        CHRONODICT = collections.defaultdict(dict)
        print("CHRONODICT created")

    addrsFoundInXMLs = [] # Short names of every document an address was found for
    xmlswithnoaddress = [] # Short names of documents whose XML had no usable dateline address
    plainnames = [] # Short names of every document seen
    i = 0 # Number of files visited; read again by the report cell
    for item in listXMLfiles:
        i += 1 # Counted up front so that skipped files are included
        # os.path.basename works with both "/" and "\" separated paths.
        findFileName = os.path.basename(item)

        # The ID is either PN + 4 positions, or No + 9 positions long.
        chkStr = findFileName[0:2]
        if chkStr == "PN": # 6 positions
            filenamePlain = findFileName[0:6] # Bounded to drop .xml and pagination
        elif chkStr == "No": # 11 positions
            filenamePlain = findFileName[0:11] # Bounded to drop .xml and pagination
        else: # This doesn't occur unless you've got files that don't belong here
            print("PROBLEM IN NAME PROCESSING",findFileName,chkStr)
            # Skip the file: carrying on would reuse the previous file's ID
            # and credit this file's address to the wrong document.
            continue

        if filenamePlain not in plainnames:
            plainnames.append(str(filenamePlain))
        print ("\r","Progress:",round((i-1)/len(listXMLfiles)*100),"%", end='\t')

        if filenamePlain in addrsFoundInXMLs:
            continue # An earlier page of this document already gave an address

        with open(item, "r", encoding="utf-8") as file:
            letterfile = file.read()
        soup = BeautifulSoup(letterfile, features="xml")

        # An address inside a dateline element is understood to be the sender's.
        dateline = soup.find("dateline")
        placeTag = dateline.findChild("placeName", recursive=True) if dateline is not None else None
        if placeTag is None:
            # No dateline, or a dateline without a placeName.
            if filenamePlain not in xmlswithnoaddress:
                xmlswithnoaddress.append(filenamePlain)
            continue

        addrKey = placeTag.get('key') # Internal ID of the placename
        if addrKey is None:
            print("Problem with appending")
            continue
        addrKey = addrKey.replace("pl","") # Remove "pl" prefix
        addrsFoundInXMLs.append(filenamePlain)
        # setdefault also works if CHRONODICT is a plain dict without this key.
        CHRONODICT.setdefault(filenamePlain, {})["location"] = addrKey
print("\t")

In [None]:
if hasXMLs == True:
    placenamedf = pd.read_excel("ID_sted-verdier.xlsx").dropna(axis=1, how='all').dropna(axis=0, how='all').reset_index(drop=True)
    placenamedf = placenamedf.fillna("N/A")
    # Name columns in order of preference: place, national region, country,
    # international region. The first one that is filled in wins.
    placeColumns = ("sted", "region, nasjonal", "land", "region, internasjonal")
    for item in CHRONODICT:
        if not keys_exists(CHRONODICT,item,'location'):
            continue
        locationKey = str(CHRONODICT[item]['location'])
        try:
            placenameSearch = placenamedf[placenamedf['ID'].astype(str) == locationKey]
            # .values[0] raises IndexError when no row has this ID.
            candidates = [placenameSearch[column].values[0] for column in placeColumns]
        except (KeyError, IndexError):
            print("\t\tNo match for",locationKey)
            continue
        for candidate in candidates:
            if candidate != "N/A":
                # Replace the numeric place ID with the readable name.
                CHRONODICT[item]['location'] = candidate
                break
        else:
            # Row found, but all four name columns are empty; the ID is kept.
            print("\tFant ikke stedsnavn.")
        #print("\n")

In [None]:
# CHRONODICT is only created by the chronology and XML cells when their input
# files exist; fall back to an empty one so the report can still be written.
try:
    CHRONODICT
except NameError:
    CHRONODICT = collections.defaultdict(dict)

df = pd.DataFrame.from_dict(CHRONODICT)
chronoDF = df.T.fillna("N/A").reset_index(drop=False)
# A column is missing when its source was not used (no chronology -> no date,
# no letter XMLs -> no location). Fill with the same "N/A" marker as every
# other missing value so the counts below treat the cells as empty.
for column in ('date', 'location'):
    if column not in chronoDF.columns:
        chronoDF[column] = "N/A"
        print("Filled",column,"column with N/A")
chronoDF = chronoDF.rename(columns={"index": "document"}).sort_values(by=['document']).reset_index(drop=True)

places = 0
dates = 0
items = 0
for idx,row in chronoDF.iterrows():
    hasDate = row['date'] != "N/A"
    hasPlace = row['location'] != "N/A"
    if not hasDate and not hasPlace:
        print(idx) # Row carries neither a date nor a location
        continue
    items += 1
    if hasDate:
        dates += 1
    if hasPlace:
        places += 1
print(places,"places and",dates,"dates added over a total of",items,"items.")
chronoDF.to_csv("preprocessed.csv", sep=',', encoding='utf-8',index=False)
print("Preprocessing complete. Saved to preprocessed.csv. Printing report as Preprocessor Report.txt.")

# i is the file counter of the XML scan; it does not exist if that scan was skipped.
try:
    i
except NameError:
    i = 0
if not i > 0:
    i = 0

goodstring = "MXMLM Preprocessor"
try:
    scanSummary = f"Checked {i} files, of which {len(plainnames)} were identified as letters, of which {len(addrsFoundInXMLs)} had addresses."
    if plainnames:
        goodstring += "\n"+scanSummary
    else:
        goodstring += f"\nCAUTION Checked {i} (zero?) files, of which {len(plainnames)} were identified as letters, of which {len(addrsFoundInXMLs)} had addresses."
except NameError:
    # plainnames / addrsFoundInXMLs only exist after the XML scan has run.
    goodstring+= "\nDidn't use placename augmentation?"
goodstring+="\n"+str(places)+" places and "+str(dates)+" dates added over a total of "+str(items)+" items."

if len(listofbaddies) > 0:
    errorstring = "Bad document IDs from the Chronology that could not be resolved:\n"
    errorstring += " ".join(f"\"{baddie}\"" for baddie in listofbaddies)
else:
    errorstring = "Didn't identify any bad document IDs that couldn't be resolved."
outputstring = goodstring+"\n"+errorstring

with open("Preprocessor Report.txt", "w", encoding="utf-8") as output_file:
    output_file.write(outputstring)

In [None]:
# Gate between preprocessing and the full run: raising here stops the
# notebook before the correspondence and registry cells are reached.
if runFullProcessor != 0:
    print("Proceeding with full data processing.")
else:
    raise KeyboardInterrupt

# Correspondence

In [None]:
## Main file: Correspondence
# Decrypting XML files is slow, and doing it while constructing an XML is unnecessarily complex.
# Let's just process the XML files here.

# Because Correspondence and Register_Tei are slightly different, I'm writing specific code for each despite
# the fact that it's somewhat WET programming.

if os.path.isfile("correspondence.xml"):
    CorrespDict = collections.defaultdict(dict)
    print("CorrespDict initiated")
    print("Melting correspondence.xml")
    with open("correspondence.xml", "r", encoding="utf-8") as file:
        tei = file.read()
    # The markup is already decoded text, so no from_encoding is passed
    # (passing one only produced a warning that it would be ignored).
    soup = BeautifulSoup(tei, features="xml")
    print("Correspondence is now soup.")

    # Remove every XML comment so commented-out documents cannot be matched.
    comments = 0
    commentDocs = 0
    for comment in soup.findAll(string=lambda text: isinstance(text, Comment)):
        if "xml:id=\"" in comment:
            commentDocs+=1
        comment.extract()
        comments+=1
    if comments > 0:
        print("Destroyed",comments,"<!--comments-->, of which",commentDocs,"contained an @XML:ID.")
    # ... and checking it twice.
    comments = soup.findAll(string=lambda text: isinstance(text, Comment))
    if comments:
        print("There are still",len(comments),"comments present.")
    else:
        print("All comments destroyed.")

    print("\nInitializing documentID scan.")
    documentIDs = []
    for document in soup.findAll("div", {"xml:id":True}):
        # Look for the document type assignment.
        documentType = document.find("list", {"type" : "objectType"}).findChild(True, recursive=True)
        # NOTE(review): `in` on a bs4 Tag tests its direct children for
        # equality, not for a substring - confirm this matches the data.
        if "brev" in documentType or "letter" in documentType:
            # Read the ID by name rather than assuming it is the first attribute.
            documentIDs.append(document["xml:id"])
    print("Acquired",len(documentIDs),"documents classed as letters.\n")

    # Munch is the recipient of everything in correspondence.xml.
    recipient = "Edvard Munch"

    # Only the IDs collected above are processed, so only letters are treated.
    for eachID in documentIDs:
        docAuthors = [] # Author names to be included in the dict
        docAuthorRefs = [] # Authors' reference targets (collected, not stored in the dict)

        document = soup.find("div", {"xml:id":eachID})

        # Every tag below the sender item is treated as one author.
        authorNameList = document.find("item", {"n":"sender"}).findChildren(True, recursive=True)
        for name in authorNameList:
            if name.contents:
                authorName = name.contents[0]
                # Read from the tag itself; the text node has no attributes,
                # so the earlier lookup on it always fell back to "N/A".
                # NOTE(review): assumes 'target' sits on this tag - confirm.
                targetRef = name.get('target', "N/A")
            else:
                authorName = "N/A"
                targetRef = "N/A"

            # Data cleaning: commas to spaces, collapse runs of spaces, trim.
            authorName = authorName.replace(","," ")
            authorName = re.sub(' +', ' ',authorName)
            authorName = authorName.strip()

            docAuthors.append(authorName)
            docAuthorRefs.append(targetRef)

        authorLabels = [str(author) if author != "N/A" else "N/A (error?)" for author in docAuthors]
        statusMessage = str(eachID)+" "+str(len(authorNameList))+" author"
        if len(docAuthors) > 1:
            statusMessage += "s"
        statusMessage += " ("+",".join(authorLabels)+") addressed to: "+str(recipient)

        # Augmentation data gathered by the chronology / placename cells.
        chronoEntry = CHRONODICT[eachID] if eachID in CHRONODICT else {}
        gotDate = 'date' in chronoEntry
        gotPlace = 'location' in chronoEntry
        newdate = chronoEntry.get('date', "N/A")
        place = chronoEntry.get('location', "N/A")

        if document.find("item", {"n":"undated"}):
            # Document is straight up undated.
            date = "s.d."
            datetype = "N/A"
        else:
            # "from" is checked by itself because not every range carries
            # both attributes (see PN1350 and No-MM_T1296).
            fromElement = document.find("date", {"from":True})
            toElement = document.find("date", {"to":True})
            if fromElement:
                if toElement:
                    # Both from and to attributes are present. Prefer the 'to'
                    # on the 'from' element; fall back to the element that
                    # has it, instead of failing with a KeyError.
                    fromDate = fromElement['from']
                    toDate = fromElement.get('to', toElement['to'])
                    datetype = "fromTo"
                    date = str(fromDate)+"%"+str(toDate)
                else:
                    # 'from' without 'to' is read as "not before this date".
                    fromDate = fromElement['from']
                    date = fromDate
                    datetype = "notBefore"
            else:
                yearSent = document.find("date", {"type":"year","when":True})
                monthSent = document.find("date", {"type":"month","when":True})
                daySent = document.find("date", {"type":"day","when":True})
                if yearSent:
                    datetype = "instance"
                    date = yearSent.attrs["when"]
                    if monthSent: # A month only counts if there is a year
                        # Strip the '-' characters found in these values.
                        date+="-"+monthSent.attrs["when"].replace("-","") # YYYY-MM
                        if daySent: # A day only counts if there is a month
                            date+="-"+daySent.attrs["when"].replace("-","") # YYYY-MM-DD
                elif toElement:
                    # 'to' without 'from' is read as "not after this date".
                    datetype = "notAfter"
                    date = toElement['to']
                else:
                    # All else has failed. This data is expunged.
                    datetype = "N/A"
                    date = "s.d."

        if datetype == "N/A":
            statusMessage+="\n(undated)"
        else:
            statusMessage+="\nDate type: "+str(datetype)+" ("+str(date)+")"

        if gotDate == True:
            statusMessage+=f" (augmented: {newdate})."
        if gotPlace == True:
            statusMessage+=f"Place augmentation: {place})"
        print(statusMessage)

        CorrespDict[eachID]['authors'] = docAuthors
        CorrespDict[eachID]['date'] = date
        CorrespDict[eachID]['datetype'] = datetype
        CorrespDict[eachID]['recipients'] = recipient
        CorrespDict[eachID]['place'] = place if gotPlace else "N/A"
        if gotDate == True:
            CorrespDict[eachID]['newdate'] = newdate
            CorrespDict[eachID]['newdatetype'] = chronoEntry["datetype"]
        else:
            CorrespDict[eachID]['newdate'] = "s.d."
            CorrespDict[eachID]['newdatetype'] = "N/A"

    # Written once after the loop; it used to be re-serialised and rewritten
    # in full after every single document.
    json_object = json.dumps(CorrespDict, indent=4)
    with open("correspondence.json", "w", encoding="utf-8") as outfile:
        outfile.write(json_object)

else:
    print("No correspondence.xml file provided. MXML will not munch letters to Munch.")

In [None]:
# Inspection cell: touch both dicts (this fails early if either is missing),
# then list every letter with its date and a numbered list of its authors.
CHRONODICT

CorrespDict

for letterID, letter in CorrespDict.items():
    print(letterID+",",letter['date'])
    i=1
    for author in letter['authors']:
        print(i,author)
        i+=1
    print("\n")

# Main Registry

In [None]:
# Parse register_tei.xml (letters written by Munch) into RegDict, keyed by
# document ID, and save the result as registry.json.
# Every entry holds: authors, date, datetype, recipients, place, newdate,
# newdatetype. "newdate"/"newdatetype"/"place" come from CHRONODICT when the
# document is listed there; otherwise they are "s.d." / "N/A" / "N/A".
if os.path.isfile("register_tei.xml"):
    RegDict = collections.defaultdict(dict)
    print("RegDict initiated")
    print("Melting register_tei.xml")
    with open("register_tei.xml", "r", encoding="utf-8") as file: # Open the file
        tei = file.read() # Read the whole document into one string
    # The markup is already decoded text, so from_encoding is not passed:
    # Beautiful Soup ignores it for str input and only emits a warning.
    soup = BeautifulSoup(tei, features="xml") # It is now soup
    print("Registry is now soup.")

    # Remove every XML comment so commented-out documents are never parsed.
    comments = 0
    commentDocs = 0
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        if "xml:id=\"" in comment:
            commentDocs += 1
        comment.extract()
        comments += 1
    if comments > 0:
        print("Destroyed",comments,"<!--comments-->, of which",commentDocs,"contained an @XML:ID.")
    # ... and checking it twice.
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    if comments:
        print("There are still",len(comments),"comments present.")
    else:
        print("All comments destroyed.")

    print("\nInitializing documentID scan.")
    documentIDs = []
    for document in soup.find_all("div", {"xml:id":True}):

        # Look for the document type assignment; skip divs that have none
        # instead of crashing on a missing element.
        typeList = document.find("list", {"type" : "objectType"})
        documentType = typeList.find(True, recursive=True) if typeList is not None else None
        if documentType is None:
            continue
        # NOTE(review): `in` on a bs4 Tag tests membership among its direct
        # children, so this matches only when a child string equals "brev" or
        # "letter" exactly - confirm that is the intended check.
        if "brev" in documentType or "letter" in documentType:
            # Read the ID by attribute name rather than by attribute position.
            documentIDs.append(document["xml:id"])
    print("Acquired",len(documentIDs),"documents classed as letters.\n")

    # Slight repetition - breaking DRY, I know - but by referencing documentIDs we're 100% only treating letters,
    # not wasting time on irrelevant documents
    for eachID in documentIDs:
        docRecipRefs = [] # List of recipients' reference targets.
        docRecipients = [] # List of recipients to be included in the dict.

        # Munch is the author of everything in the registry.
        author = "Edvard Munch"

        # Target the document as var "document"
        document = soup.find("div", {"xml:id":eachID})

        # Target the recipient(s) as recipNameList; a document without a
        # recipient item simply gets an empty recipient list.
        recipientItem = document.find("item", {"n":"recipient"})
        recipNameList = recipientItem.find_all(True, recursive=True) if recipientItem is not None else []
        for name in recipNameList:
            try:
                recipName = name.contents[0]
            except IndexError:
                # Empty element: no name and no reference.
                recipName = "N/A"
                targetRef = "N/A"
            else:
                # NOTE(review): the target is looked up on the element's first
                # child, not on the element itself; for a plain text child this
                # always yields "N/A" - confirm where @target actually lives.
                try:
                    targetRef = recipName['target']
                except (KeyError, TypeError):
                    targetRef = "N/A"

            # Data cleaning: escape commas, trim, collapse runs of spaces.
            recipName = recipName.replace(",","&#44;")
            recipName = recipName.strip()
            recipName = re.sub(' +', ' ',recipName)
            if eachID == "No-MM_N0725":
                # Debug output kept for this one known-problematic document.
                print(name,recipName)

            docRecipients.append(recipName)
            docRecipRefs.append(targetRef)

        # Build the per-document status line. The noun is pluralised here in
        # one place (the old code appended "s" to "recipients").
        recipientWord = "recipient" if len(docRecipients) == 1 else "recipients"
        shownRecipients = [
            str(recipient) if recipient != "N/A" else "N/A (error?)"
            for recipient in docRecipients
        ]
        statusMessage = (
            str(eachID)+" "+str(len(docRecipients))+" "+recipientWord
            +" ("+",".join(shownRecipients)+") from: "+str(author)
        )

        isDocumentUndated = document.find("item", {"n":"undated"})

        # Chronology lookup: a document may be absent from CHRONODICT, or be
        # present without a 'date' and/or 'location'.
        chronoEntry = CHRONODICT[eachID] if eachID in CHRONODICT else {}
        gotDate = 'date' in chronoEntry
        gotPlace = 'location' in chronoEntry
        newdate = chronoEntry['date'] if gotDate else "N/A"
        place = chronoEntry['location'] if gotPlace else "N/A"

        if isDocumentUndated:
            # Document is straight up undated.
            date = "s.d."
            datetype = "N/A"

        else:
            isDocumentFromTo = document.find("date", {"from":True}) # Does the date element have a from assignment?
            # ! Using "from" because PN1350 does not have a fromTo attr despite using fromTo. Uses "from", though. Works fine.
            if isDocumentFromTo: # If it does, and thus has a range (JK, No-MM_T1296 has FROM attr but not a TO attr.)
                doesDocumentHaveToDate = document.find("date", {"to":True})
                if doesDocumentHaveToDate:
                    # Both from and to attributes are present; the range is
                    # stored as "from%to".
                    fromDate = isDocumentFromTo['from'] # Extract 'from' date.
                    toDate = isDocumentFromTo['to'] # Extract 'to' date.
                    datetype = "fromTo"
                    date = str(fromDate)+"%"+str(toDate)
                else:
                    # If the 'from' attribute is present without the 'to', it's interpreted as "not before this date".
                    fromDate = isDocumentFromTo['from']
                    date = fromDate
                    datetype = "notBefore"

            else: # If it doesn't:
                yearSent = document.find("date", {"type":"year","when":True}) # Check for year element
                monthSent = document.find("date", {"type":"month","when":True}) # Check for month element
                daySent = document.find("date", {"type":"day","when":True}) # Check for day element
                if yearSent:
                    datetype = "instance"
                    date = yearSent.attrs["when"]
                    if monthSent: # Only look for a month if there's a year. That 1 letter with just month/day, tho...
                        # Strip the stray '-' characters, then join as YYYY-MM.
                        date += "-"+monthSent.attrs["when"].replace("-", "")
                        if daySent: # Only applies if there is a month AND a day. No point having a day if you don't have a month.
                            # Same clean-up, joined as YYYY-MM-DD.
                            date += "-"+daySent.attrs["when"].replace("-", "")

                else:
                    # If it doesn't have a year, make one last check
                    doesDocumentHaveToDate = document.find("date", {"to":True}) # if the date just has a to date...

                    if doesDocumentHaveToDate:
                        # If the 'to' attribute is present without the 'from', it's interpreted as "not after this date".
                        datetype = "notAfter"
                        date = doesDocumentHaveToDate['to']

                    else:
                        # All else has failed. This data is expunged.
                        datetype = "N/A"
                        date = "s.d."

        if datetype == "N/A":
            statusMessage += "\n(undated)"
        else:
            statusMessage += "\nDate type: "+str(datetype)+" ("+str(date)+")"
        print(statusMessage)

        RegDict[eachID]['authors'] = author
        RegDict[eachID]['date'] = date
        RegDict[eachID]['datetype'] = datetype
        RegDict[eachID]['recipients'] = docRecipients
        RegDict[eachID]['place'] = place
        if gotDate:
            RegDict[eachID]['newdate'] = newdate
            RegDict[eachID]['newdatetype'] = CHRONODICT[eachID]["datetype"]
        else:
            RegDict[eachID]['newdate'] = "s.d."
            RegDict[eachID]['newdatetype'] = "N/A"

        # Drop the processed document from the tree.
        document.decompose()

    # Serialise once after the loop instead of rewriting the file for every
    # document; as before, nothing is written when no letters were found.
    if documentIDs:
        json_object = json.dumps(RegDict, indent=4)
        with open("registry.json", "w") as outfile:
            outfile.write(json_object)

else:
    print("No register_tei.xml file provided. MXML will not munch letters from Munch.")
    # Keep RegDict defined (empty) so later cells that read it do not fail
    # with a NameError.
    RegDict = collections.defaultdict(dict)

# Optional augmentation data: preprocessed.csv supplies a date and a location
# per document. flagPreprocessor records whether the file was available.
if not os.path.exists("preprocessed.csv"):
    print("WARNING MxmlM is running WITHOUT preprocessed data (dates, places)!")
    flagPreprocessor = False
else:
    print("MxmlM located and will use preprocessed data (dates, places).")
    flagPreprocessor = True # We're using preprocessed data
    dfPP = pd.read_csv("preprocessed.csv",sep=",").fillna("N/A") # Empty cells become "N/A"

    # Per-document values, plus quick-lookup lists of which documents receive
    # a place, a single date, or a from-to range.
    augments = collections.defaultdict(dict)
    docIDs_placenames, docIDs_singledates, docIDs_fromtodates = [], [], []

    # Every document mentioned in the preprocessed file.
    ppdocs = list(dfPP['document'])

    for _, record in dfPP.iterrows():
        dat = record['date']
        loc = record['location']
        doc = record['document']
        if dat != "N/A":
            # A "%" separator marks a from-to range; anything else is a
            # single date. "N/A" dates are not classified at all.
            bucket = docIDs_fromtodates if "%" in dat else docIDs_singledates
            bucket.append(doc)
        if loc != "N/A":
            docIDs_placenames.append(doc)
        # Both values are stored as-is, "N/A" included.
        augments[doc].update(sender=loc, date=dat)

# Report every registry document whose chronology date ("newdate") differs
# from the date found in the registry itself, and try to copy the new date
# and date type onto the matching dfCombo row.
for item in RegDict:
    entry = RegDict[item]
    if entry['newdate'] != "s.d." and entry['date'] != entry['newdate']:
        try:
            index = dfCombo.loc[item]
        except (NameError, KeyError):
            # dfCombo does not exist yet, or has no row labelled with this
            # document ID: report it and leave the row update out.
            print("Failed to locate",item)
        else:
            # NOTE(review): `index` is the row returned by .loc, so these
            # writes may land on a copy rather than on dfCombo - confirm
            # whether dfCombo itself is meant to change here.
            index['date'] = entry['newdate']
            index['datetype'] = entry['newdatetype']
        print(item,entry['date'],entry['datetype'],"->",entry['newdate'],entry['newdatetype'])

In [None]:
# Combine the registry (RegDict) and the correspondence (CorrespDict) into one
# table with one row per document; the former index becomes the 'document'
# column.
df1 = pd.DataFrame.from_dict(RegDict).T
df2 = pd.DataFrame.from_dict(CorrespDict).T
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
dfCombo = pd.concat([df1, df2]).reset_index(drop=False).rename(columns={'index':'document'})
dfComboFull = dfCombo.copy() # Full copy including old and new dates.
# The 'authors' / 'recipients' lists are deliberately left as lists here:
# they are practical for creating the CMIF.

# Save the combined table as indented JSON.
result = dfCombo.to_json(orient="index")
parsed = json.loads(result)
dumped = json.dumps(parsed, indent=4)
with open("MXMLM_Output_Combined.json", "w") as outfile:
    outfile.write(dumped)

# For every document listed in CHRONODICT, take date and place from the
# chronology; all other documents keep their own date and get place "N/A".
newDates,newDateTypes,newPlaces,i = [],[],[],0
for _, row in dfCombo.iterrows():
    # The ID column is called 'document' after the rename above
    # (row['index'] no longer exists).
    document,date,datetype = row['document'],row['date'],row['datetype']
    if document in CHRONODICT:
        print(document)
        try:
            newDate = CHRONODICT[document]['date']
            # A "%" separator marks a from-to range.
            newDateType = "fromTo" if "%" in newDate else "instance"
        except (KeyError, TypeError):
            newDate,newDateType = "N/A","N/A" # No usable date
        try:
            newPlace = CHRONODICT[document]['location']
        except KeyError:
            newPlace = "N/A" # No location
        newDates.append(newDate)
        newDateTypes.append(newDateType)
        newPlaces.append(newPlace)
        i += 1
    else:
        newDates.append(date)
        newDateTypes.append(datetype)
        newPlaces.append("N/A")
print(f"{i} out of {len(CHRONODICT)} entries passed on")
dfCombo['dates'] = newDates
dfCombo['datetypes'] = newDateTypes
dfCombo['place'] = newPlaces

# List every document whose date was changed by the chronology.
for _, row in dfCombo.iterrows():
    if row['date'] != row['dates']:
        print(row['document'],row['date'], row['dates'])

In [None]:
dfCombo

# Production Core

In [None]:
import configparser # Used to easily get statements from the config file
from datetime import date

version = "2.0" # Describes the "program's" state of completion and versioning.

config = configparser.ConfigParser()
config.read("config.ini")


def _statement(option):
    """Return the value of *option* from the "statements" section of the config."""
    return config.get("statements", option)


# Title of the resulting CMIF: the configured title followed by the version.
cmifTitle = f'{_statement("cmifTitle")} {version}'
editorName = _statement("editorName")
editorMail = _statement("editorMail")
cmifUid = _statement("cmifUid")
publisherURL = _statement("publisherURL")
publisherName = _statement("publisherName")
cmifURL = _statement("cmifURL")
typeOfBibl = _statement("typeOfBibl")
publicationStatementFull = _statement("publicationStatementFull")

# Today's date, formatted as YYYY-MM-DD.
today = date.today().strftime("%Y-%m-%d")
currVer = f"{version} {today}"

previouslyRun = (
    f"Last executed code was version {currVer}. All OUTPUT files are current to that version on that date.\n"
    f"{cmifUid}."
)
print("Version",currVer)

In [None]:
# Create CMIF boilerplate object.
# The template is assembled from the config statements and today's date. Its
# closing tags now mirror the opening ones exactly (XML names are
# case-sensitive): </teiHeader> and </TEI>, with <body> wrapped in the <text>
# element that the old template closed but never opened.
# NOTE(review): the config values are inserted verbatim, so they must already
# be valid XML text / attribute content.
CMIFstring = (
    '<?xml-model href="https://raw.githubusercontent.com/TEI-Correspondence-SIG/CMIF/master/schema/cmi-customization.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>'
    '<TEI xmlns="http://www.tei-c.org/ns/1.0">'
    '<teiHeader><fileDesc>'
    f'<titleStmt><title>{cmifTitle}</title>'
    f'<editor>{editorName}<email>{editorMail}</email></editor></titleStmt>'
    f'<publicationStmt><publisher><ref target="{publisherURL}">{publisherName}</ref></publisher>'
    f'<idno type="url">{cmifURL}</idno> <date when="{today}"/>'
    '<availability><licence target="https://creativecommons.org/licenses/by/4.0/">This file is licensed under the terms of the Creative-Commons-License CC-BY 4.0</licence></availability>'
    '</publicationStmt>'
    f'<sourceDesc><bibl type="{typeOfBibl}" xml:id="{cmifUid}">{publicationStatementFull}</bibl></sourceDesc>'
    '</fileDesc>'
    '<profileDesc><dummy/></profileDesc>'
    '</teiHeader>'
    '<text><body><p/></body></text>'
    '</TEI>'
)
CMIF = BeautifulSoup(CMIFstring,"xml") # Read as XML, not HTML

profileDescElement = CMIF.find('profileDesc') # Target correspondence wrapper

In [None]:
for idx,row in dfCombo.iterrows():
    document,date,datetype,authors,recipients,place = row['document'],row['date'],row['datetype'],row['authors'],row['recipients'],row['place']
    metadata = [date,datetype,authors,recipients,place]
    if authors != "Edvard Munch":
        nAuthor = ' & '.join(authors)
    else:
        nAuthor = authors
    if recipients != "Edvard Munch":
        nRecip = ' & '.join(recipients)
    else:
        nRecip = recipients
    
    print(f"{document}\tby {nAuthor} to {nRecip}\nDated {date} ({datetype}), {place}\n")