In [1]:
# XML, TEI, CMI/F and data handling
from bs4 import BeautifulSoup # Hent BeautifulSoup-modulen (https://www.crummy.com/software/BeautifulSoup/) for XML
from bs4 import Comment # BS4-addon for å håndtere kommentarer <!-- X -->
import re # Regex
import pandas as pd

# Time and date
from datetime import date # Dates
import time # Time

# File and folder handling
import glob # The yeast of thought and mind
import os # Filsystem; mapper, lagring, åpning, etc...
import shutil # Se os+

# *MunchXMLmuncher*
Developed by research assistant Loke Sjølie for the University of Oslo

This script currently consumes one file in eMunch's TEIXML format and converts it to a complete CMIF/TEIXML file. The script is customized to ensure high precision, with fallbacks designed around eMunch's particular TEIXML files. Be aware that CMIF is purely intended to represent correspondence between individuals, and as such there is *significant* (intentional) data loss in converting to the format.

The script targets documents that have been tagged with **"brev"** or **"letter"**, and extracts from these:
1. Document ID, which is extrapolated to form an eMunch URL
2. Document Author tends to be Edvard Munch, and he is given his customary VIAF ID
3. Document Authored Date, which is converted to YYYY-MM-DD (or YYYY-MM, or YYYY), or to a date range that is either open-ended ("from") or closed ("from–to")
4. Document Recipient(s), names and IDs

... and then places these in a hierarchy: <CorrespDesc(DocumentID)><*Author*><*Date*/><*/Author*><*Recipient*(s)/>.

The file I was provided does **not** specify locations, but I'm sure that we'll be able to work out how to add those if such information is available. Further development: use glob.glob<sup>(the yeast of thought and mind)</sup> to consume files by folder. Add location data? Redirect recipient IDs to VIAF?

The script can be altered to target all documents with one or more recipients, but many of the documents that meet that criterion are drafts and/or notes. Alternatively, the script can be further restricted to target only documents that have the "brev"/"letter" type *and* one or more recipients - but in the test file this restriction would not change the result.
___
Users:
I ask that you do not touch anything below the header **Program** unless you *sort of* know what you're doing. :)

## CMIF Metadata & options

In [2]:
# User-editable settings. Both values are plain strings.
version = "0.9" # Version label describing the notebook's state of completion.
cmifUid = "a403c593-09df-4538-8acf-8d459339fca8" # Unique ID. Used in sourceDesc of CMIF. Don't change it without a good reason.
# cmifUid is also used as "source" for the time being in each object. Read more about this in the CMIF docs.

### Init, metadata, etc

In [4]:
# Build the run label "<version> <YYYY-MM-DD>" from the configured version and today's date.
today = date.today().strftime("%Y-%m-%d")
currVer = " ".join([version, today])

# Human-readable note stating which version/date produced the output, followed by the CMIF UID.
previouslyRun = (
    f"Last executed code was version {currVer}. "
    "All OUTPUT files are current to that version on that date.\n"
    f"{cmifUid}."
)
print("Version", currVer)

Version 0.9 2022-09-09


In [22]:
# Prepare the working folders, register the available input files and report
# which of them the rest of the notebook can use.
#
# Sets: programfolder, inputfolder, outputfolder, paths, the has* flags,
# chronologyFile (only when a chronology workbook exists), listXMLfiles (only
# when the "xml-filer" folder exists) and filesForMunching.
# Raises KeyboardInterrupt to halt the notebook when there is nothing to process.
hasCorrespondenceXML,hasTEIXML,hasChrono,hasXMLs = False,False,False,False

# --- Working folders -------------------------------------------------------
programfolder = "MXMLM "+currVer
if os.path.exists(programfolder):
    print("Found directory '% s'" % programfolder)
else:
    os.mkdir(programfolder)
    print("Directory '% s' created" % programfolder)
inputfolder = os.path.join(programfolder,"sourcefiles") # The folder containing the TEI/XML-files to be transformed.
paths = ['correspondence.xml','register_tei.xml']
if not os.path.exists(inputfolder):
    os.mkdir(inputfolder)
    print("Directory '% s' created" % inputfolder)
outputfolder = os.path.join(programfolder,"output") # Output folder
if not os.path.exists(outputfolder):
    os.mkdir(outputfolder)
    print("Directory '% s' created" % outputfolder)

# --- Chronology workbook ---------------------------------------------------
# Sorted by modification time, so the last entry is the most recently modified.
lookupChrono = sorted(glob.glob("*Kronologi_Munchs_brev*.xlsx"), key=os.path.getmtime)
if lookupChrono:
    print("Newest chronology file:",lookupChrono[-1])
    hasChrono = True
    shutil.copy2(lookupChrono[-1], inputfolder+"/kronologi.xlsx")
    chronologyFile = inputfolder+"/kronologi.xlsx" # From here on, point at the copy in sourcefiles

# --- Registry of individual XML files --------------------------------------
if os.path.exists("xml-filer"):
    print("Registered XML files from xml-filer")
    listXMLfiles = glob.glob("xml-filer/**/*.xml",recursive=True)
    registryPath = programfolder+"/xml_directory.txt"
    registryText = str(listXMLfiles)
    previousRegistry = None
    if os.path.isfile(registryPath):
        # Context manager so the read handle is closed before the file is rewritten.
        # The default encoding is kept on purpose so registries written by earlier runs still compare equal.
        with open(registryPath, "r") as f:
            previousRegistry = f.read()
    if previousRegistry == registryText:
        # Same file list as the last run: flag the XMLs as not needing another scan.
        print("XML files already registered")
        hasXMLs = False
    else:
        with open(registryPath, "w") as f:
            f.write(registryText)
        print("\tUpdated registry")
        hasXMLs = True
else:
    print("No XML files provided")

# --- correspondence.xml / register_tei.xml ---------------------------------
for item in paths:
    target = inputfolder+"/"+item
    inMainFolder = os.path.isfile(item)
    inSourceFolder = os.path.isfile(target)
    if inMainFolder and inSourceFolder:
        # Both copies exist: keep whichever was modified last.
        a = os.path.getmtime(item)
        b = os.path.getmtime(target)
        print("Found existing instance of",target)
        if a>b:
            shutil.copy2(item, target)
            print("\tReplaced older version of",item,"in",inputfolder,"(file's last modified date difference is +"+str(a-b)+str(")"))
        else:
            print("\tUsing existing version of",item,"(file's last modified date difference is",str(a-b)+str(")"))
    elif inMainFolder:
        shutil.copy2(item, target)
        print("\tYoink! Copied",item,"to",inputfolder)
    elif inSourceFolder:
        print("\tFound existing file",item,"in the sourcefiles directory, but not in the main directory.")
    else:
        print("\nWARNING Didn't find",item,"in any of the working directories!\n")
        continue # Nothing available for this file, so its flag stays False

    if item == "correspondence.xml":
        hasCorrespondenceXML = True
    elif item == "register_tei.xml":
        hasTEIXML = True

# --- Summary ---------------------------------------------------------------
filesForMunching = glob.glob(inputfolder+"/*")
if len(filesForMunching) == 0:
    print("Stop! You need to put some file/s (correspondence.xml,register_tei.xml) in the sourcefiles folder for me to eat!")
    raise KeyboardInterrupt # Deliberately halts the notebook run
elif not hasTEIXML and not hasCorrespondenceXML:
    print("Critical error! I didn't find either of the correspondence.xml OR register_tei.xml files. Means I don't have anything to munch!")
    raise KeyboardInterrupt # Deliberately halts the notebook run
else:
    print("\nSummary:")
    for name in filesForMunching:
        # Compare on the bare file name so the check does not depend on the
        # path separator used by the operating system.
        shortName = os.path.basename(name)
        if shortName in paths:
            print('\t'+name,'will be used')
        elif shortName == "kronologi.xlsx":
            print('\tChronology file will be used')
        else:
            print("\tWARNING Detected unusual file!",name,"may not be a file I can munch!")

Found directory 'MXMLM 0.9 2022-09-09'
Newest chronology file: Kronologi_Munchs_brev_20220831.xlsx
	Updated XML files.
Found existing instance of MXMLM 0.9 2022-09-09\sourcefiles/correspondence.xml
	Using existing version of correspondence.xml (file's last modified date difference is 0.0)
Found existing instance of MXMLM 0.9 2022-09-09\sourcefiles/register_tei.xml
	Using existing version of register_tei.xml (file's last modified date difference is 0.0)

Summary:
	MXMLM 0.9 2022-09-09\sourcefiles\correspondence.xml will be used
	Chronology file will be used
	MXMLM 0.9 2022-09-09\sourcefiles\register_tei.xml will be used


In [118]:
# Scan the individual letter XML files for place keys.
#
# For every file in listXMLfiles the <address> elements tagged n="sender" or
# n="recipient" are inspected first; when no recipient address was found, the
# first <dateline> element is used as a fallback.
#
# Produces:
#   placenamedict[<short document ID>][<role>] = <key of the first placename>
#       where <role> is "sender<n>" / "recipient<n>" for <address> elements
#       and "recipient" for the dateline fallback
#   addrsFoundInXMLs  - short document IDs for which a place key was recorded
#   xmlswithnoaddress - file names that have neither a recipient address nor a dateline
#   countAddrsA / countAddrsD - number of keys taken from address / dateline elements
if hasXMLs: # If the XML files should be used for updating
    import collections # Facilitates the nested dict below
    placenamedict = collections.defaultdict(dict)
    addrsFoundInXMLs = [] # Short names of every document we have found an address for
    xmlswithnoaddress = []
    i = 0 # Number of existing files visited
    countAddrsD = 0
    countAddrsA = 0
    for item in listXMLfiles:
        # basename() copes with whichever path separator glob returned on this OS.
        findFileName = os.path.basename(item)

        if not os.path.isfile(item): # Double-check that the path still points at a file
            print(item,"error")
            continue
        i += 1

        # Derive the short document ID from the file name prefix.
        chkStr = findFileName[0:2]
        if chkStr == "PN":
            filenamePlain = findFileName[0:6]
        elif chkStr == "No":
            filenamePlain = findFileName[0:11]
        else:
            # Without this guard the ID of the previous file would be reused
            # (or the name would be undefined on the very first file).
            print("Skipped",findFileName,"(unrecognised file name prefix)")
            continue

        if filenamePlain in addrsFoundInXMLs:
            continue # Skip: an address was already found for this document ID

        with open(item, "r", encoding="utf-8") as file:
            letterfile = file.read()
        # The parser is named explicitly so results do not depend on which optional
        # parsers are installed. html.parser lower-cases tag names, which the
        # lower-case lookups below ("placename", "dateline") rely on.
        soup = BeautifulSoup(letterfile, "html.parser")

        find_address = soup.findAll("address")
        foundSender,foundRecipient = False,False # Reset for this file
        if find_address:
            print("Address in",findFileName)
            recipNo = 0
            senderNo = 0
            for addr in find_address:
                if len(find_address) > 1:
                    print("\n\n",findFileName,"\n\n")
                isSender = addr.get("n") # Role of the address: sender / recipient / discussed / missing
                if not isSender or isSender == "discussed":
                    continue # Untagged or merely discussed addresses are ignored as irrelevant
                print(isSender,addr)

                placeTag = addr.findChild("placename", recursive=True) # First placename inside the address
                if placeTag is None:
                    print("No key in placename",placeTag)
                    continue
                addrKey = placeTag.get('key')
                if not addrKey:
                    print("Accessed key, but it was counted as false?",addrKey)
                    continue

                countAddrsA += 1
                if isSender == "sender":
                    senderNo += 1
                    placenamedict[filenamePlain]["sender"+str(senderNo)] = addrKey
                    foundSender = True
                elif isSender == "recipient":
                    recipNo += 1
                    placenamedict[filenamePlain]["recipient"+str(recipNo)] = addrKey
                    foundRecipient = True
                else:
                    print("\nWTF?\n",addrKey)
                    countAddrsA -= 1 # Unknown role: undo the count
                addrsFoundInXMLs.append(filenamePlain) # Add the document to the list of XMLs already handled
            print("\n")

        if not foundRecipient: # Fall back to the dateline when no recipient address was found
            dateline = soup.find("dateline")
            if dateline is not None:
                print("Dateline in",findFileName)
                placeTag = dateline.findChild("placename", recursive=True)
                if placeTag is not None: # There are datelines without any placename in them
                    addrKey = placeTag.get('key') # Internal ID of the placename
                    if addrKey:
                        addrsFoundInXMLs.append(filenamePlain)
                        # NOTE(review): the dateline place is stored under "recipient";
                        # confirm that this is the intended role for a dateline.
                        placenamedict[filenamePlain]["recipient"] = addrKey
                        foundRecipient = True
                        print("\tRECIPIENT",addrKey)
                        countAddrsD += 1
                    else:
                        # A placename without a key is reported instead of being stored as None.
                        print("Found address without key",findFileName)
            else:
                xmlswithnoaddress.append(findFileName)
    print("\r")

Dateline in No-MM_N0033-01.xml
	RECIPIENT pl155
Dateline in No-MM_N0057-00-01r.xml
	RECIPIENT pl148
Dateline in No-MM_N0063-00-01r.xml
	RECIPIENT pl155
Address in No-MM_N0080-02.xml


Dateline in No-MM_N0120-04.xml
	RECIPIENT pl16
Address in No-MM_N0341-00-01v.xml
sender <address n="sender"><addrline><placename key="pl41">Jarlsborgveien 14</placename></addrline></address>


Dateline in No-MM_N0341-00-01v.xml
	RECIPIENT pl2
Dateline in No-MM_N0395-00-01r.xml
	RECIPIENT pl20
Dateline in No-MM_N0490-01.xml
	RECIPIENT pl17
Dateline in No-MM_N0526-01.xml
Address in No-MM_N0552-00-01r.xml
recipient <address n="recipient"><addrline>Hr <persname key="pe504">Kai Christensen</persname></addrline>
<addrline><placename key="pl30">Rantzhansgade 72 <hi rend="raised">4</hi></placename></addrline>
<addrline><placename key="pl22">Kjøbenhavn</placename></addrline></address>


Dateline in No-MM_N0561-00-01r.xml
	RECIPIENT pl41
Dateline in No-MM_N0570-01.xml
	RECIPIENT pl20
Dateline in No-MM_N0668-01.xml


Dateline in No-MM_N0776-00-01r.xml
	RECIPIENT pl70
Dateline in No-MM_N0777-01.xml
	RECIPIENT pl70
Dateline in No-MM_N0780-01.xml
	RECIPIENT pl70
Address in No-MM_N0781-00-01r.xml


 No-MM_N0781-00-01r.xml 


sender <address n="sender"><addrline><seg type="typographicalSpace"></seg><seg type="typographicalSpace"></seg><seg type="typographicalSpace"></seg><orgname key="i104">Cafe Bau<del rend="overwritten"><gap></gap></del>er</orgname></addrline></address>
No key in placename None


 No-MM_N0781-00-01r.xml 




Address in No-MM_N0781-00-01v.xml
recipient <address n="recipient"><addrline>Fraulein <persname key="pe495">Karen Bjølstad</persname></addrline>
<addrline><placename key="pl60">Nordstrand</placename></addrline>
<addrline><placename key="pl20">Kristiania</placename></addrline>
<addrline><placename key="pl15">Norwegen</placename></addrline></address>


Dateline in No-MM_N0783-00-01r.xml
	RECIPIENT pl17
Address in No-MM_N0784-01.xml
sender <address n="sender" rend="extraAir"><addrlin

Address in No-MM_N0899-01.xml
sender <address n="sender" rend="extraAir"><addrline>Hrr <persname key="pe118">Herbert Esche</persname></addrline>
<addrline><placename key="pl455"><seg type="printedText">PARKSTRASSE 58</seg></placename></addrline>
<addrline><placename key="pl146">Chemnitz</placename> <placename key="pl321">Sachsen</placename></addrline></address>


Address in No-MM_N0900-03.xml
sender <address n="sender"><addrline>Adr. <placename key="pl152">Bad Kösen</placename></addrline>
<addrline><placename key="pl145">Thüringen</placename>.</addrline></address>


Dateline in No-MM_N0901-00-01r.xml
	RECIPIENT pl105
Address in No-MM_N0903-00-01v.xml
recipient <address n="recipient"><addrline>Fraulein <persname key="pe495">Karen Bjølstad</persname></addrline>
<addrline><del rend="overwritten">Kristi</del><placename key="pl15">Norwegen</placename></addrline>
<addrline><placename key="pl60">Nordstrand</placename></addrline>
<addrline><placename key="pl1">Aker</placename></addrline></addr

Address in No-MM_N0966-00-02r.xml
recipient <address n="recipient"><addrline rend="extraAir">Frøken <persname key="pe495">Karen Bjølstad</persname></addrline>
<addrline><placename key="pl60">Nordstrand</placename></addrline></address>


Address in No-MM_N0967-00-01v.xml
recipient <address n="recipient"><addrline>Frøken <persname key="pe495">Karen Bjølstad</persname></addrline>
<addrline><placename key="pl60">Nordstrand</placename></addrline>
<addrline><placename key="pl1">Aker</placename></addrline></address>


Address in No-MM_N0968-00-01v.xml


 No-MM_N0968-00-01v.xml 


recipient <address n="recipient"><addrline>Frøken <persname key="pe495">Karen Bjølstad</persname></addrline>
<addrline><placename key="pl566">Solveien</placename></addrline>
<addrline><placename key="pl60">Nordstrand</placename></addrline>
<addrline><handshift new="NN1"></handshift><placename key="pl15">norge</placename></addrline></address>


 No-MM_N0968-00-01v.xml 


sender <address n="sender"><addrline>Adresse: <

Address in No-MM_N1000-00-01v.xml


 No-MM_N1000-00-01v.xml 


sender <address n="sender"><addrline><handshift new="EM2"></handshift>adr. <orgname key="i126">Grand Hotel Dolder</orgname> <placename key="pl31">Zürich</placename></addrline></address>


 No-MM_N1000-00-01v.xml 


recipient <address n="recipient"><addrline>Frøken <persname key="pe495">Karen Bjølstad</persname></addrline>
<addrline rend="extraAir"><placename key="pl60">Nordstrand</placename></addrline>
<addrline><placename key="pl20">Kristiania</placename></addrline></address>


Address in No-MM_N1001-00-01v.xml
recipient <address n="recipient">
<addrline>Fr. <persname key="pe495">Karen Bjølstad</persname></addrline>
<addrline><placename key="pl60">Nordstrand</placename></addrline>
<addrline>pr. <placename key="pl20">Oslo</placename> –</addrline>
</address>


Address in No-MM_N1003-00-01v.xml
recipient <address n="recipient"><addrline rend="extraAir">Frøken <persname key="pe495">Karen Bjølstad</persname></addrline>
<addrlin

Dateline in No-MM_N1092-01.xml
	RECIPIENT pl16
Dateline in No-MM_N1093-01.xml
	RECIPIENT pl155
Address in No-MM_N1094-00-01v.xml
recipient <address n="recipient"><addrline><seg type="printedText">M</seg>lle <persname key="pe348">Inger Munch</persname></addrline>
<addrline><placename key="pl593">Hauketo</placename>, <placename key="pl395">Lian</placename> <placename key="pl20">Kristiania</placename></addrline>
<addrline><placename key="pl15">Norwege</placename></addrline></address>


Dateline in No-MM_N1095-01.xml
	RECIPIENT pl70
Address in No-MM_N1097-00-01r.xml
sender <address n="sender"><addrline rend="noLineBreakBefore">Docent <persname key="pe63">Bäckstrom</persname></addrline>
<addrline><del rend="overstrike">Vestma<del rend="overwritten"><gap></gap></del>nnagaten 10</del></addrline>
<addrline><placename key="pl172">Vestmannagatan 10</placename></addrline></address>


Dateline in No-MM_N1098-01.xml
	RECIPIENT pl38
Dateline in No-MM_N1099-01.xml
	RECIPIENT pl17
Address in No-MM_N11

Address in No-MM_N1154-00-01v.xml
recipient <address n="recipient"><addrline rend="noIndent">Frøken <persname key="pe348">Inger Munch</persname></addrline>
<addrline><placename key="pl242">Munkeengsveien 8</placename></addrline>
<addrline><placename key="pl568">Sm<del rend="overwritten"><gap></gap></del>estad st</placename>.</addrline>
<addrline><placename key="pl1">Vestre Aker</placename></addrline></address>


Address in No-MM_N1155-00-01r.xml
sender <address n="sender"><addrline><placename key="pl2">Skøien</placename></addrline></address>


Dateline in No-MM_N1156-00-01r.xml
Dateline in No-MM_N1162-00-01r.xml
	RECIPIENT pl2
Dateline in No-MM_N1163-00-01r.xml
	RECIPIENT pl2
Dateline in No-MM_N1164-00-01r.xml
	RECIPIENT pl2
Dateline in No-MM_N1165-00-01r.xml
	RECIPIENT pl2
Dateline in No-MM_N1166-01.xml
	RECIPIENT pl25
Dateline in No-MM_N1167-00-01r.xml
	RECIPIENT pl2
Address in No-MM_N1168-00-01v.xml
recipient <address n="recipient"><addrline rend="noIndent">Frøken <persname key="pe3

Address in No-MM_N1380-00-02r.xml
recipient <address n="recipient"><addrline>Frk <persname key="pe348">Inger Munch</persname></addrline>
<addrline><placename key="pl414">Sorgenfrigt 1</placename></addrline>
<addrline>Oslo</addrline></address>


Address in No-MM_N1383-00-01v.xml
recipient <address n="recipient"><addrline>Frk. <persname key="pe348">Inger Munch</persname></addrline>
<addrline><placename key="pl566">Solveien</placename></addrline>
<addrline><placename key="pl60">Nordstrand</placename></addrline>
<addrline><add place="inBetweenLines"><placename key="pl20">Oslo</placename></add></addrline>
<addrline>Norwege</addrline></address>


Address in No-MM_N1390-00-02r.xml
recipient <address n="recipient"><addrline rend="extraAir">Frøken <persname key="pe348">Inger Munch</persname></addrline>
<addrline><placename key="pl414">Sorgenfrigd. 1</placename></addrline>
<addrline><placename key="pl20">Oslo</placename></addrline></address>


Address in No-MM_N1405-00-01v.xml
recipient <address

Address in No-MM_N1644-00-01r.xml
recipient <address n="recipient"><addrline>Hr Chauffør <persname key="pe492">Bellet</persname></addrline>
<addrline><placename key="pl531">Thomas Heftysgt</placename></addrline>
<addrline>43 eller 44</addrline>
<addrline><placename key="pl20">Kristiania</placename></addrline></address>


Dateline in No-MM_N1649-02.xml
	RECIPIENT pl29
Dateline in No-MM_N1651-00-01v.xml
	RECIPIENT pl2
Dateline in No-MM_N1655-01.xml
	RECIPIENT pl29
Dateline in No-MM_N1657-01.xml
	RECIPIENT pl213
Dateline in No-MM_N1658-00-01v.xml
	RECIPIENT pl2
Address in No-MM_N1660-04.xml
sender <address n="sender"><addrline rend="noIndent"><placename key="pl269">Lützowstr. 82</placename>.</addrline></address>


Dateline in No-MM_N1661-00-01r.xml
	RECIPIENT pl148
Dateline in No-MM_N1662-01.xml
Dateline in No-MM_N1662-02.xml
	RECIPIENT pl148
Dateline in No-MM_N1663-00-01r.xml
	RECIPIENT pl148
Dateline in No-MM_N1664-00-01r.xml
	RECIPIENT pl2
Dateline in No-MM_N1669-01.xml
	RECIPIENT pl26

Dateline in No-MM_N2039-01.xml
	RECIPIENT pl16
Address in No-MM_N2126-00-01v.xml
recipient <address n="recipient"><addrline>Consærvator <persname key="pe458">Jens Thiis</persname></addrline>
<addrline><del rend="overwritten">D</del><placename key="pl35">Trontheim</placename></addrline>
<addrline><placename key="pl15">Norwegen</placename></addrline></address>


Dateline in No-MM_N2163-00-01r.xml
	RECIPIENT pl2
Address in No-MM_N2170-00-01v.xml
recipient <address n="recipient"><addrline>Frau <persname key="pe18">Anna Auerbach</persname></addrline>
<addrline>Hr. Professor <persname key="pe19">Auerbach</persname></addrline>
<addrline><placename key="pl344">Mozartstrazse</placename></addrline>
<addrline><placename key="pl252">Jena</placename></addrline></address>


Dateline in No-MM_N2173-00-01v.xml
	RECIPIENT pl17
Address in No-MM_N2175-00-01v.xml
recipient <address n="recipient"><addrline>Hr Docent</addrline>
<addrline><persname key="pe63">Helge Bæckström</persname></addrline>
<addrline><

Address in No-MM_N2263-00-01r.xml
sender <address n="sender"><addrline><placename key="pl220">Kochstrasse 21</placename> <placename key="pl22">Kopenhagen</placename></addrline></address>


Dateline in No-MM_N2267-00-01v.xml
	RECIPIENT pl2
Address in No-MM_N2280-00-01v.xml
recipient <address n="recipient"><addrline>Hr Journalist <persname key="pe536">Kirkeby</persname></addrline>
<addrline>Adr. <orgname key="i67">Politiken</orgname></addrline>
<addrline><placename key="pl22">Kjøbenhavn</placename></addrline></address>


Address in No-MM_N2283-02.xml
sender <address n="sender"><addrline><orgname key="i331">Hotel Sanssouci</orgname></addrline>
<addrline><placename key="pl417">Linkstrasse</placename></addrline>
<addrline><placename key="pl17">Berlin</placename> –</addrline></address>


Dateline in No-MM_N2287-02.xml
	RECIPIENT pl42
Address in No-MM_N2289-00-01r.xml
sender <address n="sender"><addrline>Adr. <placename key="pl32">Moss</placename></addrline></address>


Address in No-MM_N2294

Dateline in No-MM_N2453-00-01r.xml
	RECIPIENT pl32
Address in No-MM_N2463-00-01v.xml
recipient <address n="recipient"><addrline>Hrrn Landgerichts</addrline>
<addrline>direktor <persname key="pe416">Gustav Schiefler</persname></addrline>
<addrline><placename key="pl443">Mellingstedt</placename></addrline>
<addrline>post <placename key="pl442">Bergstedt</placename></addrline>
<addrline><placename key="pl441">Holstein</placename></addrline>
<addrline><placename key="pl51">Tyskland</placename></addrline></address>


Address in No-MM_N2464-00-01v.xml
recipient <address n="recipient"><addrline>Hrr Landgerichtsdirektor</addrline>
<addrline><persname key="pe416">Gustav Schiefler</persname></addrline>
<addrline><placename key="pl443">Mellingstedt</placename></addrline>
<addrline>Post <placename key="pl442">Bergstedt</placename></addrline>
<addrline>pr <placename key="pl441">Holstein</placename></addrline>
<addrline><placename key="pl51">Deutschland</placename></addrline></address>


Dateline in

Address in No-MM_N2660-01.xml


Dateline in No-MM_N2664-01.xml
	RECIPIENT pl16
Dateline in No-MM_N2665-03.xml
	RECIPIENT pl2
Address in No-MM_N2716-00-01r.xml
sender <address n="sender"><addrline><orgname key="i231">Hotel Stadt Riga</orgname> <placename key="pl178">Mittelstrasse</placename></addrline>
<addrline><placename key="pl17">Berlin</placename></addrline></address>


Dateline in No-MM_N2719-00-01v.xml
	RECIPIENT pl2
Dateline in No-MM_N2724-03.xml
	RECIPIENT pl2
Dateline in No-MM_N2738-00-01r.xml
Dateline in No-MM_N2739-01.xml
	RECIPIENT pl32
Address in No-MM_N2741-04.xml
sender <address n="sender"><addrline>Adr. <placename key="pl32">Moss</placename>.</addrline></address>


Address in No-MM_N2745-03.xml


 No-MM_N2745-03.xml 




 No-MM_N2745-03.xml 




Dateline in No-MM_N2746-00-01r.xml
	RECIPIENT pl152
Dateline in No-MM_N2750-00-01r.xml
	RECIPIENT pl2
Dateline in No-MM_N2753-02.xml
	RECIPIENT pl29
Address in No-MM_N2755-00-01r.xml
sender <address n="sender" rend="extraAir"><a

Address in No-MM_N2818-00-01r.xml
recipient <address n="recipient"><addrline rend="extraAir">Maleren <persname key="pe392">Ludvig Ravensberg</persname></addrline>
<addrline><placename key="pl176">Stensgd 1</placename></addrline>
<addrline><placename key="pl20">Kristiania</placename></addrline>
<addrline><placename key="pl15">Norwegen</placename></addrline></address>


Address in No-MM_N2819-00-01r.xml
recipient <address n="recipient"><addrline>Hrr Maler <persname key="pe392">Ludvig Ravensberg</persname></addrline>
<addrline><add place="inBetweenLines">Kgl. Fuldm. <persname key="pe1019">Grønvold</persname></add></addrline>
<addrline><placename key="pl176">Stensgd 1</placename></addrline>
<addrline><placename key="pl20">Kristiania</placename></addrline></address>


Address in No-MM_N2820-00-01r.xml
recipient <address n="recipient"><addrline>Hrr Maler <persname key="pe392">Ludvig Ravensberg</persname></addrline>
<addrline><add place="inBetweenLines">Kgl. Fuldm. <persname key="pe1019">Grøn

Address in No-MM_N2864-00-01v.xml
recipient <address n="recipient"><addrline>Maleren <persname key="pe392">Ludvig Ravensberg</persname></addrline>
<addrline><placename key="pl176">Stensgd 1</placename></addrline>
<addrline><placename key="pl20">Kristiania</placename></addrline></address>


Address in No-MM_N2865-00-01r.xml
recipient <address n="recipient"><addrline>Maleren <persname key="pe392">Ludvig Ravensberg</persname></addrline>
<addrline><placename key="pl176">Stensgade 1</placename></addrline>
<addrline><placename key="pl20">Kristiania</placename></addrline></address>


Address in No-MM_N2866-00-01r.xml
recipient <address n="recipient"><addrline>Maleren</addrline>
<addrline>Hrr <persname key="pe392">Ludvig Ravensberg</persname></addrline>
<addrline>Speilsalen (Aften)</addrline>
<addrline><orgname key="i83">Grand Hotel</orgname></addrline>
<addrline><placename key="pl20">Kristiania</placename></addrline></address>


Address in No-MM_N2867-00-01r.xml
recipient <address n="recipien

Address in No-MM_N2908-00-01v.xml
recipient <address n="recipient"><addrline rend="extraAir">Maleren <persname key="pe392">Ludvig Ravensberg</persname></addrline>
<addrline><placename key="pl176">Stensgate 1</placename></addrline>
<addrline><placename key="pl20">Kristiania</placename></addrline>
<addrline><placename key="pl15">Norwegen</placename></addrline></address>


Address in No-MM_N2909-00-01r.xml
sender <address n="sender"><addrline>Adresse <orgname key="i126">Hotel Dolder</orgname> <placename key="pl31">Zurich</placename></addrline></address>


Address in No-MM_N2910-00-01v.xml
recipient <address n="recipient"><addrline rend="extraAir">Maleren <persname key="pe392">Ludvig Ravensberg</persname></addrline>
<addrline><placename key="pl176">Stensgd 1</placename></addrline>
<addrline><placename key="pl20">Kristiania</placename></addrline></address>


Address in No-MM_N2911-00-01r.xml
recipient <address n="recipient"><addrline>Herr Maler <persname key="pe392">Ludvig Ravensberg</persn

Dateline in No-MM_N2967-00-01r.xml
	RECIPIENT pl41
Dateline in No-MM_N2968-00-01r.xml
	RECIPIENT pl41
Address in No-MM_N2973-00-01v.xml
recipient <address n="recipient"><addrline>Musæumsdirektør <persname key="pe458">Jens Thiis</persname></addrline>
<addrline><placename key="pl245">Staffeldtsgd 4 b</placename></addrline></address>


Dateline in No-MM_N2986-00-01r.xml
Dateline in No-MM_N3004-00-01r.xml
	RECIPIENT pl105
Address in No-MM_N3005-00-01v.xml
recipient <address n="recipient"><addrline rend="extraAir">Hr Redaktør <persname key="pe139">Chr Gierløff</persname></addrline>
<addrline><placename key="pl265">Haugesund</placename></addrline>
<addrline><orgname key="i382">Bergensbanen</orgname></addrline></address>


Dateline in No-MM_N3006-00-01r.xml
	RECIPIENT pl41
Address in No-MM_N3007-00-01v.xml
recipient <address n="recipient"><addrline rend="extraAir">Hrr redaktør <persname key="pe139">Chr. Gierløff</persname></addrline>
<addrline><placename key="pl265">Haugesund</placename></add

Address in No-MM_N3066-00-01v.xml
recipient <address n="recipient"><addrline><handshift new="EM1"></handshift>Forfatteren <persname key="pe146">Emanuel Goldstein</persname></addrline>
<addrline>Hr Grosserer <persname key="pe932">Goldstein</persname></addrline>
<addrline><del hand="NN1" rend="overstrike"><placename key="pl278">Vesterbrogad</placename></del> <del hand="NN2" rend="overstrike">23(?)</del></addrline>
<addrline><add hand="NN1" place="inBetweenLines"><placename key="pl279">Taarbæk</placename></add></addrline>
<addrline><del hand="NN1" rend="overstrike"><placename key="pl22">Kjøbenhavn</placename></del></addrline></address>


Address in No-MM_N3067-00-01v.xml
recipient <address n="recipient"><addrline>Til forfatteren</addrline>
<addrline rend="extraAir"><persname key="pe146">Emanuel Goldstein</persname></addrline>
<addrline><placename key="pl280">Kristianiagade</placename> 3</addrline>
<addrline><placename key="pl22">Kjøbenhavn</placename></addrline></address>


Address in No-

	RECIPIENT pl29
Address in No-MM_N3167-00-01r.xml
recipient <address n="recipient"><addrline>Hrrn <persname key="pe257">Albert Kollmann</persname></addrline>
<addrline><placename key="pl45">Florenz</placename></addrline>
<addrline><placename key="pl258">via Taddea No 2</placename> <unclear reason="writing">pp</unclear> 2</addrline>
<addrline><placename key="pl90">Italien</placename></addrline></address>


Address in No-MM_N3168-00-01r.xml
recipient <address n="recipient"><addrline>Hrn <persname key="pe257">Albert Kollmann</persname></addrline>
<addrline><orgname key="i186">Beyers Hotel</orgname></addrline>
<addrline><placename key="pl253">Schadowstr. 1A</placename></addrline>
<addrline><placename key="pl17">Berlin</placename></addrline></address>


Address in No-MM_N3169-00-01r.xml
recipient <address n="recipient"><addrline>Hrn <persname key="pe257">Albert Kollmann</persname></addrline>
<addrline><orgname key="i186">Beyers Hotel</orgname></addrline>
<addrline><placename key="pl253">Sch

Address in No-MM_N3209-00-01v.xml
recipient <address n="recipient"><addrline>Hrrn <persname key="pe257">A. Kollmann</persname></addrline>
<addrline>Adr. Hrr. Maler <persname key="pe514">von Flotow</persname></addrline>
<addrline><add place="supralinear"><hi rend="underlined"><placename key="pl204">Grussow</placename></hi></add><placename key="pl201">Mecklenburg</placename></addrline>
<addrline><handshift new="NN1"></handshift><placename key="pl203">Malchow</placename></addrline></address>


Address in No-MM_N3210-00-01r.xml
recipient <address n="recipient"><addrline rend="extraAir">Hrn <persname key="pe257">Albert Kollmann</persname></addrline>
<addrline>Adr. Hrrn Maler <persname key="pe514">von Flotow</persname></addrline>
<addrline><placename key="pl204">Grüssow</placename> in <placename key="pl203">Malchow</placename></addrline>
<addrline><placename key="pl201">Mecklenburg</placename></addrline></address>


Address in No-MM_N3211-00-01r.xml
recipient <address n="recipient"><addrline

Address in No-MM_N3252-00-01r.xml
recipient <address n="recipient" rend="extraAir"><addrline><handshift new="EM2"></handshift>Hr overlærer <persname key="pe217">Sigurd Høst</persname></addrline>
<addrline><del hand="NN1" rend="overstrike"><placename key="pl217">Borre</placename></del></addrline>
<addrline><handshift new="NN1"></handshift><del rend="overstrike"><placename key="pl213">Meltzers gt 7</placename></del></addrline>
<addrline><handshift new="NN2"></handshift><hi rend="underlined"><placename key="pl217">Borre</placename>.</hi> <del rend="overstrike"><handshift new="NN1"></handshift><placename key="pl20">Oslo</placename>.</del></addrline></address>


Address in No-MM_N3257-03.xml
sender <address n="sender"><addrline><placename key="pl220">Kocksvei 21</placename></addrline>
<addrline><placename key="pl22">K<del rend="overwritten">openhagen</del>jøbenhavn</placename></addrline></address>


Address in No-MM_N3258-00-01r.xml
recipient <address n="recipient"><addrline>Hr Adjunkt <per

Address in No-MM_N3311-00-01r.xml
recipient <address n="recipient"><addrline>Hr professor <persname key="pe422">Schreiner</persname></addrline>
<addrline><orgname key="i375">Den anatomiske afdeling</orgname></addrline></address>
No key in placename None


Address in No-MM_N3312-00-01r.xml
recipient <address n="recipient"><addrline rend="extraAir">Hr professor <persname key="pe422">Kristian Schreiner</persname></addrline>
<addrline><orgname key="i113">Universitet</orgname></addrline>
<addrline><orgname key="i375">Anatomisk institut</orgname></addrline>
<addrline><placename key="pl20">Oslo</placename></addrline></address>


Address in No-MM_N3313-00-01r.xml
recipient <address n="recipient"><addrline rend="extraAir">Hr professor <unclear reason="writing">A</unclear> <persname key="pe422">Schreiner</persname></addrline>
<addrline><placename key="pl160">Hofchef Løwenskjolds vei</placename></addrline>
<addrline><placename key="pl161">Lilleaker</placename></addrline></address>


Address in No

Address in No-MM_N3381-00-01r.xml
recipient <address n="recipient"><addrline rend="extraAir"><orgname key="i46">Munchener Neue Secission</orgname></addrline>
<addrline><placename key="pl187">Ainmillerstr. 43</placename></addrline>
<addrline><placename key="pl19">München</placename></addrline></address>


Address in No-MM_N3382-00-01r.xml
recipient <address n="recipient"><addrline>Hrrn Prof. <persname key="pe880">A Skinnerer</persname></addrline>
<addrline><orgname key="i46">Münchener neue Secission</orgname></addrline>
<addrline><placename key="pl187">Ainmillerstr 43</placename></addrline>
<addrline><placename key="pl19">Munchen</placename></addrline></address>


Address in No-MM_N3383-00-01r.xml
recipient <address n="recipient"><addrline rend="extraAir">Hrrn <persname key="pe880">A. Skinnerer</persname></addrline>
<addrline><del rend="overwritten">Neue</del><orgname key="i46">Munchener neue Secission</orgname></addrline>
<addrline><placename key="pl187">A<del rend="overwritten"><gap><

Address in No-MM_N3512-00-01v.xml
recipient <address n="recipient"><addrline>Maleren <persname key="pe627">E Munch</persname></addrline>
<addrline>Villa <placename key="pl41">Ekely</placename></addrline>
<addrline><placename key="pl147">Skøien St</placename></addrline></address>


Address in No-MM_N3513-00-01v.xml
recipient <address n="recipient"><addrline rend="extraAir"><del rend="overwritten">M</del>Maler <persname key="pe627">Edv. Munch</persname></addrline>
<addrline>Villa <placename key="pl41">Ekely</placename> <placename key="pl2">Skøien</placename></addrline></address>


Dateline in No-MM_N3514-04.xml
	RECIPIENT pl79
Dateline in No-MM_N3516-00-01r.xml
	RECIPIENT pl70
Dateline in No-MM_N3527-00-01r.xml
	RECIPIENT pl25
Dateline in No-MM_N3542-06.xml
	RECIPIENT pl20
Dateline in No-MM_N3550-04.xml
	RECIPIENT pl41
Dateline in No-MM_N3551-08.xml
	RECIPIENT pl1
Dateline in No-MM_N3552-08.xml
	RECIPIENT pl41
Dateline in No-MM_N3553-04.xml
	RECIPIENT pl1
Dateline in No-MM_N3557-00-01r.x

Address in PN0218_56227A00003.xml
recipient <address n="recipient"><addrline rend="extraAir">Hr Konsul <persname key="pe277">Harald Kaarbø</persname></addrline>
<addrline><orgname key="i83">Grand Hotel</orgname></addrline></address>
No key in placename None


Dateline in PN0223_56237A00001.xml
	RECIPIENT pl41
Dateline in PN0224_56238A00002.xml
	RECIPIENT pl2
Dateline in PN0228_56242A00001.xml
	RECIPIENT pl20
Dateline in PN0229_56243A00001.xml
	RECIPIENT pl41
Address in PN0231_56245A00001.xml
sender <address n="sender"><addrline rend="noLineBreakBefore">Brevadresse <placename key="pl41">Ekely</placename></addrline></address>


Dateline in PN0231_56245A00001.xml
Dateline in PN0232_56246A00001.xml
	RECIPIENT pl20
Dateline in PN0233_56248A00002.xml
	RECIPIENT pl41
Address in PN0235_56249A00001.xml
recipient <address n="recipient"><addrline rend="extraAir">Hr konsul <persname key="pe277">H. Kaarbø</persname></addrline>
<addrline><placename key="pl389">Svolvær</placename></addrline>
<addrlin

Address in PN0333_56217A00001.xml
recipient <address n="recipient"><addrline>Maleren <persname key="pe203">Th. Holmboe</persname></addrline>
<addrline><placename key="pl106">Olaf Kyrresgd 11</placename></addrline>
<addrline><placename key="pl20">Oslo</placename></addrline></address>


Address in PN0706_1_57223A00001.xml
sender <address n="sender" rend="extraAir"><addrline><placename key="pl269">Lutzowstr. 82</placename></addrline>
<addrline><placename key="pl17">Berlin</placename></addrline></address>


Address in PN0707_57224A00004.xml
recipient <address n="recipient"><addrline rend="extraAir"><handshift new="EM1"></handshift>Forfatteren <persname key="pe366">Jappe Nielsen</persname></addrline>
<addrline><del rend="overwritten"><gap></gap></del>Adr Maleren <persname key="pe203">Thorolf Holmboe</persname></addrline>
<addrline><placename key="pl106">Olaf Kyrres Gade</placename></addrline>
<addrline><placename key="pl20">Kristiania</placename></addrline></address>


Address in PN0708_4_5

Address in PN0751_57667A00002.xml
recipient <address n="recipient"><addrline>Forfatteren <persname key="pe366">Jappe Nilssen</persname></addrline>
<addrline><add place="inBetweenLines">Adr. Maleren <persname key="pe203">Th. Holmboe</persname></add></addrline>
<addrline rend="extraAir"><placename key="pl106">Olaf Kyrresgd 11</placename></addrline>
<addrline><placename key="pl20">Kristiania</placename></addrline></address>


Address in PN0752_57668A00002.xml
sender <address n="sender"><addrline rend="noLineBreakBefore">Adr. Cöln poste restante</addrline></address>
No key in placename None


Dateline in PN0756_56178A00001.xml
	RECIPIENT pl2
Dateline in PN0760_56173A00001.xml
	RECIPIENT pl41
Address in PN0761_3_57701A00002.xml
sender <address n="sender"><addrline><orgname key="i331">Hotel Sansousci</orgname></addrline>
<addrline><placename key="pl417">Linkstrasse</placename> <placename key="pl17">Berlin</placename></addrline></address>


Dateline in PN0766_1_56161A00001.xml
	RECIPIENT pl42

In [119]:
# Display placenamedict as a plain dict; judging by the output below it maps
# document ID -> {role: place key}.
# NOTE(review): a cell further down assigns to the name `dict`, which shadows the
# builtin; after that cell has run, this call fails until the kernel is restarted.
dict(placenamedict)

{'No-MM_N0033': {'recipient': 'pl155'},
 'No-MM_N0057': {'recipient': 'pl148'},
 'No-MM_N0063': {'recipient': 'pl155'},
 'No-MM_N0120': {'recipient': 'pl16'},
 'No-MM_N0341': {'sender1': 'pl41', 'recipient': 'pl2'},
 'No-MM_N0395': {'recipient': 'pl20'},
 'No-MM_N0490': {'recipient': 'pl17'},
 'No-MM_N0552': {'recipient1': 'pl30'},
 'No-MM_N0561': {'recipient': 'pl41'},
 'No-MM_N0570': {'recipient': 'pl20'},
 'No-MM_N0668': {'recipient': 'pl30'},
 'No-MM_N0679': {'recipient': 'pl2'},
 'No-MM_N0685': {'recipient': 'pl42'},
 'No-MM_N0687': {'recipient': 'pl2'},
 'No-MM_N0703': {'recipient': 'pl41'},
 'No-MM_N0711': {'recipient': 'pl41'},
 'No-MM_N0717': {'recipient': 'pl331'},
 'No-MM_N0719': {'recipient1': 'pl543'},
 'No-MM_N0721': {'recipient': 'pl16'},
 'No-MM_N0722': {'recipient1': 'pl395'},
 'No-MM_N0723': {'recipient': 'pl315'},
 'No-MM_N0724': {'recipient': 'pl119'},
 'No-MM_N0725': {'recipient': 'pl151'},
 'No-MM_N0728': {'recipient': 'pl315'},
 'No-MM_N0729': {'recipient': 'pl11

In [107]:
# Summarise how many address lines and datelines were counted, plus their sum.
print(
    "Address lines:", countAddrsA,
    "\nDatelines:", countAddrsD,
    "\n\tTotal count:", countAddrsA + countAddrsD,
)

Address lines: 784 
Datelines: 479 
	Total count: 1263


In [None]:
# Users: only edit things that exist WITHIN double quotation marks ("").
# Every value below is interpolated as text into the CMIF header string (CMIFstring)
# that is built in the "Read/process TEI-XML" cell.
# `version` is not defined in this cell; it must already exist when this cell runs.
cmifTitle = "MunchXMLmuncher version "+str(version) # Title of resulting CMIF
editorName = "Loke Sjølie" # The name to issue to the CMIF file as "editor" (responsible for this file)
editorMail = "loke.sjolie@ub.uio.no" # The e-mail associated with the above.

#publishers = 1 # How many publishers? Add later if required.
publisherURL = "eMunch.no" # Website of publisher #1; written into <ref target="...">
# NOTE(review): no scheme (https://) in the two URL values - confirm this is what CMIF consumers expect.
publisherName = "eMunch" # Name of publisher #1

cmifURL = "eMunch.no" # URL where this file is located; written into <idno type="url">
typeOfBibl = "online" # The type of bibliography that is being described; written into <bibl type="...">
# Placeholder text; becomes the content of the <bibl> element.
publicationStatementFull = "[Full bibliographical statement of the scholarly edition or repository where this file points to]"


## Program

### Chronology read

In [None]:
# Load the chronology spreadsheet, drop columns and rows that are entirely empty,
# renumber the rows and mark every remaining gap with "N/A".
chronology = (
    pd.read_excel(chronologyFile)
    .dropna(axis=1, how='all')
    .dropna(axis=0, how='all')
    .reset_index(drop=True)
    .fillna("N/A")
)
chronology

### Read/process TEI-XML

In [None]:
# Create CMIF boilerplate object
# The header is assembled from the user settings (title, editor, publisher, URL,
# bibliography) plus `today` and `cmifUid`, which must be defined before this cell runs.
# Closing tags are written with the same case as their opening tags so that the
# literal itself is well-formed XML.
CMIFstring = (
    '<?xml-model href="https://raw.githubusercontent.com/TEI-Correspondence-SIG/CMIF/master/schema/cmi-customization.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>'
    + '<TEI xmlns="http://www.tei-c.org/ns/1.0"><teiHeader><fileDesc>'
    + '<titleStmt><title>' + str(cmifTitle) + '</title>'
    + '<editor>' + str(editorName) + '<email>' + str(editorMail) + '</email></editor></titleStmt>'
    + '<publicationStmt><publisher><ref target="' + str(publisherURL) + '">' + str(publisherName) + '</ref></publisher>'
    + '<idno type="url">' + str(cmifURL) + '</idno> <date when="' + str(today) + '"/>'
    + '<availability><licence target="https://creativecommons.org/licenses/by/4.0/">This file is licensed under the terms of the Creative-Commons-License CC-BY 4.0</licence></availability></publicationStmt>'
    + '<sourceDesc><bibl type="' + str(typeOfBibl) + '" xml:id="' + str(cmifUid) + '">' + str(publicationStatementFull) + '</bibl></sourceDesc></fileDesc>'
    + '<profileDesc></profileDesc></teiHeader></TEI>'
)
# NOTE(review): no parser is named, so BeautifulSoup picks one itself. The processing
# loop looks the wrapper up as 'profiledesc' (lower case), i.e. it relies on a parser
# that lower-cases tag names - confirm before switching to an XML parser.
CMIF = BeautifulSoup(CMIFstring)
start = time.time() # Start of the processing timer; read by the reporting cell

# DEBUGGER LISTS
# Each list collects document IDs during the processing loop and is printed in the debug cell.
documentsWithBadDates = [] # Letters with only a "from" date, or without a year element
documentsWithNoAuthor = [] # Letters for which no author/sender name could be read
documentsWithNoRecipient = [] # Letters for which no recipient item was found
documentsLackingTargetReference = [] # Letters with a sender element that has no "target" attribute

# Declare variables
# ---------------------------------------------------------------
# All counters are reset here so that re-running the cell starts from zero.
noOfLettersToMunch = 0 # Letters whose recipient is "Edvard Munch"
errors_found = [] # List of errors found during execution
letterCount = 0 # Number of letter-typed documents that have a recipient
miscCount = 0 # Number of letter-typed documents that have NO recipient
addresseesUnique = [] # List of unique recipients
datetype = 0 # Var for the type of date we're dealing with; the loop replaces it with a string label
noOfRecipients = 0 # Counting non-unique recipients
otherMiscDocCount = 0 # Counting objects that are not letters.
authorID = "" # Reserved for VIAF etc.
# ---------------------------------------------------------------
iii = 0 # Debug counter; not read by any active code in this cell
# Walk every TEI-XML register file in `inputfolder` and add one <correspDesc> per
# letter to the CMIF document. For each letter the senders, the date and the
# recipients are read from the register entry and written as <correspAction> elements.

munchViafURL = "https://viaf.org/viaf/61624802/" # Used whenever a name is exactly "Edvard Munch"


def _appendCorrespAction(correspDescElement, actionType, nameTag, name, ref):
    """Append <correspAction type=actionType> holding one named person/organisation.

    correspDescElement: the <correspDesc> element to extend.
    actionType: "sent" or "received".
    nameTag: "persName" or "orgName".
    name: text content of the name element (converted with str()).
    ref: value for the @ref attribute; left out when empty or the placeholder "NONE".
    Returns the new correspAction element so the caller can add e.g. a <date>.
    """
    correspActionElement = CMIF.new_tag("correspAction", attrs={"type": actionType})
    correspDescElement.append(correspActionElement)
    if ref and ref != "NONE":
        nameElement = CMIF.new_tag(nameTag, attrs={"ref": ref})
    else:
        nameElement = CMIF.new_tag(nameTag)
    nameElement.string = str(name)
    correspActionElement.append(nameElement)
    return correspActionElement


# Looked up once: the wrapper is the same for every file. The lower-case name
# relies on the parser having lower-cased <profileDesc> in the boilerplate.
profileDescElement = CMIF.find('profiledesc') # Target correspondence wrapper

for path in glob.glob(inputfolder+"/*.xml"):
    inputfilepath = path
    with open(inputfilepath, "r", encoding="utf-8") as file: # Open a file
        tei = file.read() # Whole file as one string
    # The markup is already a str, so no from_encoding is passed (it would be
    # ignored and only trigger a warning).
    soup = BeautifulSoup(tei) # It is now soup
    print("Souped",inputfilepath)
    # NOTE(review): this tests the whole path, so a folder name containing
    # "correspondence" marks every file - confirm that is intended.
    isCorrespondenceFile = "correspondence" in path

    # Before handling the data: remove all comments
    commentDocs = 0 # Comments that contained an xml:id
    comments = 0 # Comments removed in total
    for comment in soup.findAll(string=lambda text: isinstance(text, Comment)):
        if "xml:id=\"" in comment:
            commentDocs+=1
        comment.extract()
        comments+=1
    if comments > 0:
        print("Destroyed",comments,"<!--comments-->, of which",commentDocs,"contained an @XML:ID.")
    # ... and checking it twice.
    remainingComments = soup.findAll(string=lambda text: isinstance(text, Comment))
    if remainingComments:
        print("There are still",len(remainingComments),"comments present.")
    else:
        print("All comments destroyed.")

    # For each Div element with an XML:ID (should be each document)
    for document in soup.findAll("div", {"xml:id":True}):
        authorID,authorName,recipient,recipientID = False,False,False,False # debug

        # Look for the document type assignment. A document without a type list
        # (or with an empty one) is counted as a non-letter instead of crashing.
        typeList = document.find("list", {"type" : "objectType"})
        documentType = typeList.findChild(True, recursive=True) if typeList is not None else None
        # `in` on a Tag tests membership among its direct children, so this matches
        # when one child equals "brev" or "letter".
        isLetter = documentType is not None and ("brev" in documentType or "letter" in documentType)
        if not isLetter:
            otherMiscDocCount += 1
            continue

        # Read the ID by attribute name rather than by attribute position.
        documentID = document["xml:id"]

        # ---- Senders / author -------------------------------------------------
        if isCorrespondenceFile:
            # Letters TO Munch: every child element of the sender item is a sender.
            senderItem = document.find("item", {"n":"sender"})
            senderTags = senderItem.findChildren(True, recursive=True) if senderItem is not None else []
            recipient = "Edvard Munch"
        else:
            authorItem = document.find("item", {"n":"author"})
            senderTags = [authorItem] if authorItem is not None else []
            recipient = document.find("item", {"n":"recipient"})

        senders = [] # One (name, reference) pair per sender
        for senderTag in senderTags:
            # Reset per sender so a reference never leaks in from another sender or document.
            targetRef = "NONE"
            if isCorrespondenceFile:
                targetRef = senderTag.get("target", "NONE")
                if targetRef == "NONE" and documentID not in documentsLackingTargetReference:
                    documentsLackingTargetReference.append(documentID)
            if senderTag.contents:
                authorName = senderTag.contents[0]
            else:
                # The element exists but is empty.
                authorName = "No author"
                print("WARNING:",documentID,"suffered code 201881X - really bad!")
                errors_found.append("INFO 201881 in "+str(documentID))
                documentsWithNoAuthor.append(documentID)
            authorID = munchViafURL if authorName == "Edvard Munch" else targetRef
            senders.append((authorName, authorID))
        if not senders:
            authorName, authorID = "No author", "NONE"
            print("WARNING:",documentID,"suffered code 201881 no author found!")
            errors_found.append("INFO 201881 in "+str(documentID))
            documentsWithNoAuthor.append(documentID)
            senders.append((authorName, authorID))

        # ---- Date -------------------------------------------------------------
        # Assumes that each document only has 1 date (or 1 range).
        # The result is kept in `dateSent` so the imported datetime.date is not overwritten.
        dateAttrs = {} # Attributes of the CMIF <date>; stays empty for undated letters
        dateSent = "s.d."
        fromDateElement = document.find("date", {"from":True}) # PN1350 has "from" without a fromTo type
        toDateElement = document.find("date", {"to":True})
        if document.find("item", {"n":"undated"}) is not None:
            datetype = "none"
        elif fromDateElement is not None:
            fromDate = fromDateElement['from']
            if toDateElement is not None:
                # Prefer the "to" on the same element; fall back to the element that
                # actually carries it.
                toDate = fromDateElement.get('to', toDateElement['to'])
                datetype = "range"
                dateAttrs = {"from":fromDate,"to":toDate}
            else:
                # A start without an end (e.g. No-MM_T1296).
                dateSent = fromDate
                datetype = "fromRange"
                dateAttrs = {"from":fromDate}
                if documentID not in documentsWithBadDates:
                    documentsWithBadDates.append(documentID)
        else:
            yearSent = document.find("date", {"type":"year","when":True})
            monthSent = document.find("date", {"type":"month","when":True})
            daySent = document.find("date", {"type":"day","when":True})
            if yearSent is not None:
                datetype = "exact"
                dateSent = yearSent.attrs["when"]
                if monthSent is not None: # Only look for a month if there's a year
                    dateSent += "-"+monthSent.attrs["when"].replace("-", "") # YYYY-MM
                    if daySent is not None: # Only look for a day if there's a month
                        dateSent += "-"+daySent.attrs["when"].replace("-", "") # YYYY-MM-DD
                dateAttrs = {"when":dateSent}
            else:
                if documentID not in documentsWithBadDates:
                    documentsWithBadDates.append(documentID)
                if toDateElement is not None:
                    # Only an end date is known.
                    toDate = toDateElement['to']
                    datetype = "toRange"
                    dateAttrs = {"to":toDate}
                else:
                    datetype = "none"
                    print("WARNING:",documentID,"suffered code 301881 - no year found in a specific-year element. Expected in MM_N1071 and MM_N3734.")
                    errors_found.append("INFO 301881 in "+str(documentID))

        # ---- Build the CMIF entry ----------------------------------------------
        # The new elements are used directly, so nothing is ever appended to an
        # entry belonging to another document.
        correspDescElement = CMIF.new_tag("correspDesc", attrs={"key":str(documentID), "ref":"https://www.emunch.no/HYBRID"+str(documentID)+".xhtml", "source":cmifUid})
        profileDescElement.append(correspDescElement)
        # One "sent" action per sender, mirroring the one "received" action per recipient.
        sentActions = [
            _appendCorrespAction(correspDescElement, "sent", "persName", senderName, senderRef)
            for senderName, senderRef in senders
        ]
        if dateAttrs:
            # The date belongs to the first "sent" action.
            sentActions[0].append(CMIF.new_tag("date", attrs=dateAttrs))

        # ---- Recipients ---------------------------------------------------------
        if recipient: # If there are more than 0 recipients:
            letterCount += 1
            if isinstance(recipient, str):
                # Correspondence files: the recipient is always Munch.
                recipientID = munchViafURL
                recipientType = "persName"
                recipientName = recipient
                noOfRecipients += 1
                noOfLettersToMunch += 1
                if recipientName not in addresseesUnique:
                    addresseesUnique.append(recipientName)
                _appendCorrespAction(correspDescElement, "received", recipientType, recipientName, recipientID)
            else:
                # Every child element of the recipient item is a recipient. Might be 2+!
                for each in recipient.findChildren(True):
                    if not each.contents:
                        print("WARNING:",documentID,"suffered error 30190 - recipient element without content skipped.")
                        errors_found.append("ERROR 30190 in "+str(documentID))
                        continue
                    recipientName = str(each.contents[0])
                    noOfRecipients += 1
                    if recipientName not in addresseesUnique:
                        addresseesUnique.append(recipientName)
                    recipientID = each.get("target", "NONE") # "NONE" -> no @ref is written

                    if "institution" in recipientID:
                        recipientType = "orgName"
                    elif "person" in recipientID:
                        recipientType = "persName"
                    else:
                        print("WARNING:",documentID,"suffered error 20191. Defaulted to person.")
                        recipientType = "persName"
                        errors_found.append("WARNING 20191 in "+str(documentID))

                    _appendCorrespAction(correspDescElement, "received", recipientType, recipientName, recipientID)
        else: # Letter without a recipient: counted and listed for debugging
            miscCount+=1
            if documentID not in documentsWithNoRecipient:
                documentsWithNoRecipient.append(documentID)
end = time.time()

In [None]:
# Report the processing statistics, explain every collected warning/error code,
# then write the CMIF document to disk.

# Guard the two ratios so that an empty run prints 0 instead of raising ZeroDivisionError.
totalDocuments = otherMiscDocCount+letterCount
letterShare = round(letterCount/totalDocuments*100) if totalDocuments else 0
avgLettersPerAddressee = round(letterCount/len(addresseesUnique)) if addresseesUnique else 0
print(
    "Processed",totalDocuments,"documents.",
    str(letterCount)+"("+str(letterShare)+"%) were letters addressed to "+str(noOfRecipients)
    +" recipients, of which "+str(len(addresseesUnique))
    +" were unique (meaning each person received avg. "+str(avgLettersPerAddressee)+" letters), and",
    miscCount,"letters without recipients (if this > 0, there's a problem) in",
    round(end - start,1),"seconds.",
)
print("The register file had 5443 documents of which 2711 were letters. Munch received",noOfLettersToMunch,"letters according to my script.")

# Known codes and their explanations, checked in this order.
errorExplanations = [
    ("201881", "\n\tDocument has no author. Registered as \"No author\"."),
    ("301881", "\n\tDocument has a specific date type, but does not specify or suggest a year (MM-DD/MM). Document has been given \"undated\" status."),
    ("301882", "\n\tCatastrophic failure in date format or harvesting. The script was not designed for this."),
    ("30190", "\n\tCatastrophic failure in recipient list processing. I don't think the script will run to this point with such an error."),
    ("20191", "\n\tThe recipient is not a person or an organization. Suggests error in reference XMLURI. Defaulted to person."),
]
if len(errors_found) > 0:
    print("\n"+str(len(errors_found)),"data warnings and errors, listed as INFO, WARNING, and ERROR in order of severity:")
    for i, error in enumerate(errors_found, start=1):
        # Entries look like "<SEVERITY> <code> in <documentID>"; only the part before
        # " in " is searched so digits inside a document ID cannot match a code.
        errorHead = error.partition(" in ")[0]
        explanation = next((text for code, text in errorExplanations if code in errorHead), None)
        if explanation is not None:
            print(i,error,explanation)
        else:
            # Name the entry so it can be traced.
            print(i,error,"There is an error that is not indexed. :(")
        print("\n")
else:
    print("No warnings or errors found.")

print("Saving to disk.")
# Separate timer names: `start`/`end` keep the processing duration, so this cell can be re-run.
saveStart = time.time()
# os.path.join gives the right separator on every platform (the previous "\C" was
# also an invalid escape sequence).
with open(os.path.join(outputfolder, "CMIF_Unified.xml"), "w", encoding="utf-8") as output_file:
    output_file.write(CMIF.prettify())
saveEnd = time.time()
print("Prettified CMIF file created in",round(saveEnd - saveStart,1),"seconds.")
print("Process complete.")

In [None]:
# Debug: show the ID left in `documentID` by the processing loop, which only assigns it
# for documents typed as letters (so this is the last letter handled).
documentID

#### Debug stuff 

In [None]:
# Dump the four debug lists filled by the processing loop.
# To inspect the whole CMIF document instead, run: print(CMIF.prettify())
print(
    "Bad/irregular dates:", documentsWithBadDates,
    "\nBad/no author:", documentsWithNoAuthor,
    "\nBad/no recipient:", documentsWithNoRecipient,
    "\nBad/no target reference:", documentsLackingTargetReference,
)

In [None]:
def _soupFromFile(xmlPath):
    """Read one UTF-8 encoded XML file and return it parsed by BeautifulSoup.

    The markup is passed as str, so no from_encoding is given (it would be ignored
    and only trigger a warning).
    """
    with open(xmlPath, "r", encoding="utf-8") as file:
        return BeautifulSoup(file.read())


# Parse the first two input files for manual inspection. glob is called once so that
# both soups come from the same listing.
# NOTE(review): glob does not promise an order, so which file becomes soupA may vary.
debugPaths = glob.glob(inputfolder+"/*.xml")
if len(debugPaths) < 2:
    raise FileNotFoundError("Expected at least two .xml files in "+str(inputfolder)+", found "+str(len(debugPaths)))
soupA = _soupFromFile(debugPaths[0]) # It is now soup
soupB = _soupFromFile(debugPaths[1]) # It is now soup

In [None]:
# Debug: list every sender element of one known document together with its target reference.
debugDocumentID = "No-MM_K5796"
doc = soupA.find("div", {"xml:id" : debugDocumentID})
senderItem = doc.find("item", {"n":"sender"}) if doc is not None else None
if senderItem is None:
    # Either the document is not in soupA or it has no sender item.
    authorNameList = []
    print(debugDocumentID,"has no sender item in soupA.")
else:
    authorNameList = senderItem.findChildren(True, recursive=True)
for ji, authorName in enumerate(authorNameList):
    # A missing attribute gives the placeholder "NONE"; nothing else is swallowed.
    targetRef = authorName.get("target", "NONE")
    print(ji,authorName,targetRef)

# MM_K3421 viser en instans av at det er kodet inn item n@sender og ref, men mangler innhold i begge. Dårlig. Har jo ikke recipient helle

# MM_N3734 er notater på MM_K4982. MM_K4982 opptrer ikke som objekt i XML-filen jeg har fått. Hmm.


In [None]:
# Debug: inventory every distinct tag / attribute-name combination in `soup`, print the
# known meaning of each (or flag it as undescribed), then report how many comments exist.
seenCombos = set() # Unique "tag @attr @attr..." combinations
for element in soup.find_all(): # For every tag in the soup
    combo = str(element.name) # Start from the tag name
    for attribute in element.attrs: # For every attribute belonging to the tag
        combo = combo+" @"+attribute # Append attribute with " @" as separator
    seenCombos.add(combo) # The set drops combinations seen before
tagsAttrs = sorted(seenCombos) # We do a little sorting
# Known tag/attribute pairings and understood meanings.
# Named tagDescriptions (not `dict`) so the builtin dict type stays usable in the rest of the kernel.
tagDescriptions = {
    "tei":"The TEI element - is where our file actually begins.",
    "teiheader":"The TEI header contains metadata (titleStmt, publicationStmt, sourceDesc...).",
    "p":"P is a paragraph. This is used in the TEIheader to contain the actual strings for publication & source desc. And a single, random </p> element later.",
    "body":"Body is used as a sub-element of <text> to contain all the metadata for all letters. I am personally offended by this practice. BS4 adds one, too.",
    "text":"Text appears to be a wrapper for the body tag, which contains all the texts' metadata.",
    "div":"Div, with @xml:id, is used to contain the metadata of a single letter.",
    "date":"Date is a date element. It seems to have the @when attribute very often, as well as enclosed text. Often has @type(year/fromTo, etc.)",
    "table":"Table is the primary data structure in which information about each letter is stored. This is a *table*.",
    "row":"Row is a sub-element of the table element. It defines a new X-axis in a table.",
    "seg":"Seg appears to be some kind of ID attached to each letter. The ID is used as an @xml:id attribute in div, and the element appears in references to other letters.",
    "cell":"Cell is a sub-element of the table element also. A single cell appears to be an entry into a row element.",
    "ref":"Ref appears to contain references to other XML items.",
    "item":"Item is a generic element that has multiple @attributes, such as owner, owner signature, author, paper type... This is evidently a very important tag.",
    "list":"List is a list. Often, the list only has one item. The list is used as description tag, containing other lists, and describes anything between dates to material type.",
    "html":"The HTML tag can be ignored. BS4 adds this.",
    "filedesc":"Filedesc contains title, publication, source statements.",
    "sourcedesc":"Sourcedesc describes the source of the whole document.",
    "publicationstmt":"Publication statement for the whole document.",
    "title":"Title for the whole document.",
    "titlestmt":"Titlestmt is a wrapper for the title tag (whole document).",
    "div @xml:id":"Div has an attribute @xml:id. This describes the unique ID of the item in question.",
    "list @type":"List's @type attribute describes whether the list is wrapped around an object/physical description, a date, or other category.",
    "item @n":"Item's @n attribute describes role, library sorting, language, measures, dated, notes and so on. Very... multipurpose.",
    "tei @xmlns @xml:id":"tei @xmlns @xml:id is functionally identical to TEI tag. Just the one.",
    "date @type @from @to":"date @type @from @to describes the sequence type=fromTo, from, to. A date range.",
    "ref @target":"ref's @target attribute describes a URL to another XML.",
    "date @type @when":"Date with attributes type and when. Single date/year.",
    "ref @type @target":"Seems to contain URL to eMunch's web pages for a 'Read More' function.",
    "date @type @from":"date @type @from is an open-ended date.",
    "date @type":"Caution: date @type is a date with just a type. The date itself might be enclosed...? Potentially misleading. Investigate.",
    "ref @target @n":"ref @target @n - like ref @target, but @n tends to be the name of an institution or so.",
    "row @n":"row @n describes parts of the text. Inventory number, paper type, etc.",
    "ref @type @target @n":"ref @type @target @n - Working off of previous information, I'll infer that ref @type @target @n describes a Read More, with URL, with name."
}
print("Listing all unique tags and attribute combinations found with mapped, understood meanings.\n")
for combo in tagsAttrs: # For every tag/attr combination registered
    if combo not in tagDescriptions: # No description known for this combo
        print("\n"+str(combo),"has no description. What is this?\n") # Print error
    elif "@" in combo: # An attribute is involved
        print("ATTR ["+str(combo)+"]",tagDescriptions[combo]) # Print with attribute focus
    else: # Plain tag, no attribute involved
        print("TAG ["+str(combo)+"]",tagDescriptions[combo]) # Print with tag focus
comments = soup.find_all(string=lambda text: isinstance(text, Comment)) # Find all comments in soup
if comments: # If there are comments
    n = len(comments) # Check how many comments
    print("\n> Detected",n,"comments (<!-- -->, etc). These should be eradicated before tag extraction.") # Print message
else: # If there are no comments
    print("\n> There are no (0) comments to worry about in this document.") # There are no comments

We have notes as well as letters. The notes generally do not have an item @recipient tag, while the letters generally do.

Every div has an xml:id, and an enclosed ID.

Every item then has a list with items in it.

#### Debug stuff - Types of attributes
There's a whole lot of item *n* tags. What are they? Let's find out. The following extracts list and item tags with unique attribute texts. We have to filter out a loooot of tags that're IDs, dates etc. And look - we got cells, too!

In [None]:
# Debug: walk every tag in `soup`, print the descendants of list-like tags, and sort each
# unique "tag @attribute = value [first content]" description into one of four buckets.
itemNs = []
itemXmlIds = []
itemDates = []
itemTargets = []
lists = itemNs,itemXmlIds,itemDates,itemTargets
for x in soup.findAll(True):
    name = x.name
    if "list" in name:
        # Print every descendant tag of this list, numbered from 0.
        for childIndex, child in enumerate(x.findChildren(True, recursive=True)):
            if len(child.attrs) == 0:
                print("Child of",name,childIndex,child.name,"\n")
            else:
                print("Child of",name,childIndex,child.name,child.attrs,"\n")
    for attribute, value in x.attrs.items():
        if x.contents:
            # Tag has content: include its first child in the description.
            fullTag = str(name)+" @"+str(attribute)+" = "+str(value)+" "+str(x.contents[0])
        else:
            # Empty tag: describe it by name, attribute and value only.
            fullTag = str(name)+" @"+str(attribute)+ "= "+str(value)
        # Choose the bucket; the first matching rule wins.
        if "@xml:id" in fullTag:
            bucket = itemXmlIds
        elif "date" in fullTag:
            bucket = itemDates
        elif "@target" in fullTag:
            bucket = itemTargets
        else:
            bucket = itemNs
        if fullTag not in bucket: # Keep only the first occurrence of each description
            bucket.append(fullTag)
#for x in lists:
#    x.sort()
#itemNs

### CMIF

https://correspsearch.net/en/documentation.html

/correspAction/@type == correspAction element with attribute type="xyz"

/correspAction/persName == correspAction element with persName child element

@X == attribute of element

*Each letter, postcard - document - that is to be described features its own **correspDesc element**.* There are as many correspDescs as there are items. *A particular correspDesc element in CMI format is more restrictive and reduced with regard to its vocabulary than the TEI Guidelines generally allow. This enables interchange between the respective TEI documents.*

for each in letters:
    create correspDesc wrapper
    
<correspDesc>
    <correspAction type="sent">
        <persName ref="VIAFetc url">NAME</>
        <placeName ref="Geonames url">NAME</>
    <correspAction type="received">
        <persName ref="url">NAME</>
        <placeName ref="Geonames url">NAME</>

### Mapping tags
*Italics* == Tag is category/folder only, does not contain text in itself

#### TEI-Header (metadata)
1. *Titlestmt* {Title, Editor(email)}
2. *Publicationstmt* {*Publisher* (Ref @target), idno@url, date@when, *Availability*(licence@target)}
3. *Sourcedesc* {Bibl@type@xml:id} - type="online" xml:id="cmifUid"

The header mostly features direct correlation, or items where the program will directly inject new information.

Now, because nothing is easy, the example file is just all TEI header including the letters it wants to describe. There is a body tag with a random \<p/>, which just serves absolutely no purpose. Why?

#### "profileDesc" (data)
1. correspDesc @key @ref @source {correspAction @type (persname @ref, placename @ref, *date @when*), correspAction @type (persname @ref, placename @ref, *date @when*)}

Dates need to be YYYY-MM-DD, dropping DD and/or MM if required. Unknown dates should be skipped as per CMIF documentation. 


# Replaced, functional(?) code

# Limit workspace to individual div (document) here.
# Conversion loop: for every <div xml:id=...> in `soup` that is typed as a letter, append a
# <correspDesc> to the CMIF <profileDesc> holding one "sent" correspAction (author + date)
# and one "received" correspAction per recipient. Warnings are printed and collected in
# errors_found; the counters letterCount, noOfRecipients, miscCount and otherMiscDocCount
# are updated along the way.
profileDescElement = CMIF.find('profiledesc') # Target correspondence wrapper
# For each Div element with an XML:ID (should be each document)
for document in soup.findAll("div", {"xml:id":True}):
    # Look for the document type assignment.
    documentType = document.find("list", {"type" : "objectType"}).findChild(True, recursive=True)#.attrs['n']
    # NOTE(review): `in` on a bs4 Tag tests membership among its direct children, so this
    # presumably matches a text node equal to "brev"/"letter" rather than a substring - confirm.
    if "brev" in documentType or "letter" in documentType: # Checks if the words "letter" or "brev" appear in the type
        # This code applies to letters as directed by the data type.
        # Read the ID by attribute name rather than by position in the attribute dict.
        documentID = document["xml:id"]
        #print(documentType)
        #print("DEBUG Checking",documentID)
        # Check if the document has more than 0 recipients. If there are no recipients, there is no correspAction required.
        recipient = document.find("item", {"n":"recipient"})

        # Check if the document has an author.
        authorName = document.find("item", {"n":"author"})
        if authorName:
            #print(authorName)
            authorName = authorName.contents[0]
        else:
            authorName = "No author"
            print("WARNING:",documentID,"suffered code 201881 no author found!")
            errors_found.append("INFO 201881 in "+str(documentID))
        if authorName == "Edvard Munch":
            authorID = "https://viaf.org/viaf/61624802/"
        else:
            authorID = "Add author ID mechanism."

        # Attempt to divine the date or date range of the document. Assumes that each document only has 1 date (or 1 range).
        isDocumentUndated = document.find("item", {"n":"undated"})
        if isDocumentUndated:
            date = "s.d."
            datetype = "none"
        else:
            isDocumentFromTo = document.find("date", {"from":True}) # Does the date element have a from assignment?
            # Using "from" because PN1350 does not have a fromTo attr despite using fromTo. Uses "from", though. Works fine.
            if isDocumentFromTo: # If it does, it is a range - possibly open-ended (No-MM_T1296 has FROM attr but not a TO attr.)
                fromDate = isDocumentFromTo['from'] # Extract 'from' date.
                # Look for 'to' on the same element that supplied 'from', so the two values
                # always belong together and a missing 'to' cannot raise.
                if isDocumentFromTo.has_attr('to'):
                    toDate = isDocumentFromTo['to'] # Extract 'to' date.
                    datetype = "range"
                else:
                    # Open-ended range: fromDate is what the element builder below reads.
                    date = fromDate
                    datetype = "fromRange"
            else: # No from/to range: look for year / month / day elements instead.
                yearSent = document.find("date", {"type":"year","when":True}) # Check for year element
                monthSent = document.find("date", {"type":"month","when":True}) # Check for month element
                daySent = document.find("date", {"type":"day","when":True}) # Check for day element
                if yearSent:
                    datetype = "exact"
                    date = yearSent.attrs["when"]
                    if monthSent: # Only look for a month if there's a year. That 1 letter with just month/day, tho...
                        M = re.sub('[-]', '', monthSent.attrs["when"]) # Strip the random '-' characters in here.
                        date+="-"+str(M) # Join month to year by YYYY-MM.
                        if daySent: # Only applies if there is a month AND a day. No point having a day if you don't have a month.
                            M = re.sub('[-]', '', daySent.attrs["when"]) # Strip the random '-' characters in here, too.
                            date+="-"+str(M) # Join month to year-month by YYYY-MM-DD.
                else:
                    datetype = "none"
                    date = "s.d."
                    print("WARNING:",documentID,"suffered code 301881 - no year found in a specific-year element. Expected in MM_N1071 and MM_N3734.")
                    errors_found.append("INFO 301881 in "+str(documentID))

        # Construct CMIF author ("sent") element
        correspDescElement = soup.new_tag("correspDesc", attrs={"key":str(documentID), "ref":"https://www.emunch.no/HYBRID"+str(documentID)+".xhtml", "source":cmifUid})
        profileDescElement.append(correspDescElement)

        # Work on the freshly created elements directly instead of searching CMIF for them
        # again; a search by key would pick up an earlier correspDesc if an ID were repeated.
        correspActionElement = soup.new_tag("correspAction", attrs={'type':'sent'})
        correspDescElement.append(correspActionElement)
        persNameElement = soup.new_tag("persName", attrs={"ref":authorID})
        persNameElement.string = str(authorName)
        correspActionElement.append(persNameElement)

        # Build the date element; it stays None when there is nothing valid to append.
        dateSentElement = None
        if datetype == "exact":
            dateSentElement = soup.new_tag("date", attrs={"when":date})
            #print(datetype,date)
        elif datetype == "range":
            dateSentElement = soup.new_tag("date", attrs={"from":fromDate,"to":toDate})
            #print(datetype,fromDate,toDate)
        elif datetype == "fromRange":
            dateSentElement = soup.new_tag("date", attrs={"from":fromDate})
            #print(datetype,fromDate)
        elif datetype == "none":
            #print("> NO DATE!",documentID)
            pass
        else:
            print("ERROR 2839 - Unrecognized datetype!")
            errors_found.append("2839")
        if dateSentElement is not None:
            # Append date element to correspAction @sent
            correspActionElement.append(dateSentElement)

        if recipient: # If there are more than 0 recipients:
            letterCount += 1
            for each in recipient.findChildren(True): # Get ALL children of the recipient item element. Might be 2+!
                recipientName = str(each.contents[0]) # Assign a name
                noOfRecipients += 1
                if recipientName not in addresseesUnique:
                    addresseesUnique.append(recipientName)
                recipientID = each.attrs["target"] # Assign an ID

                # The reference URL decides whether the recipient is an organisation or a person.
                if "institution" in recipientID:
                    recipientType = "orgName"
                elif "person" in recipientID:
                    recipientType = "persName"
                else:
                    print("WARNING:",documentID,"suffered error 20191. Defaulted to person.")
                    recipientType = "persName"
                    errors_found.append("WARNING 20191 in "+str(documentID))

                # One "received" correspAction per recipient, each holding a single name element.
                correspActionElement = soup.new_tag("correspAction", attrs={'type':'received'})
                correspDescElement.append(correspActionElement)
                persNameElement = soup.new_tag(recipientType, attrs={"ref":recipientID})
                persNameElement.string = str(recipientName)
                correspActionElement.append(persNameElement)
        else: # If document does not have a recipient, what do we do?
            miscCount+=1
    else:
        otherMiscDocCount += 1
        #print("Skipped item",documentID,"as it is not a letter.")
#print("</profileDesc>")
end = time.time()