# Menota-extract

Converts the Menota XML files in `menota/xml/*.xml` (manually downloaded for now) into plain text: diplomatic transcriptions in `menota/dipl/*.txt` and normalized transcriptions in `menota/norm/*.txt`.

In [1]:
import os,glob,re,json
from lxml import etree
from pathlib import Path
from urllib.request import urlretrieve

# Generate menota-entities.json, a cached {"&name;": "replacement"} mapping built
# from the <!ENTITY ...> declarations in menota-entities.txt. The text file is
# downloaded only if it is missing, and the JSON is rebuilt only if it is missing.
if not os.path.isfile('menota/menota-entities.json'):
    if not os.path.isfile('menota/menota-entities.txt'):
        # urlretrieve() does not create missing parent directories.
        os.makedirs('menota', exist_ok=True)
        urlretrieve('https://www.menota.org/menota-entities.txt', 'menota/menota-entities.txt')
    entities = dict()
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding (assumes the downloaded list is ASCII/UTF-8 -- TODO confirm).
    with open('menota/menota-entities.txt', encoding='utf-8') as entities_file:
        entities_raw = entities_file.read()
    entities_hits = re.findall("<!ENTITY[^>]*", entities_raw)
    for hit in entities_hits:
        # Expected fields: ['<!ENTITY', name, '"value"', ...]
        rubble = hit.split()
        if len(rubble) < 3:
            # Malformed or incomplete declaration: nothing usable to record.
            continue
        entities['&' + rubble[1] + ';'] = rubble[2].strip('"')
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding must be able to represent them.
    with open('menota/menota-entities.json', 'w', encoding='utf-8') as outfile:
        json.dump(entities, outfile, ensure_ascii=False, indent=4)

# Always load from the JSON cache so that fresh and cached runs use the same data.
with open('menota/menota-entities.json', encoding='utf-8') as entities_file:
    entities = json.load(entities_file)

# Shared lxml parser: keep whitespace-only text nodes, load the DTD and
# expand the entities it declares.
parser = etree.XMLParser(
    remove_blank_text=False,
    resolve_entities=True,
    load_dtd=True,
)

# Namespace prefixes for the TEI and Menota vocabularies.
ns = dict(
    tei='http://www.tei-c.org/ns/1.0',
    me='http://www.menota.org/ns/1.0',
)

# Make sure every output directory exists before anything is written to it.
for out_dir in ("menota/noent", "menota/dipl", "menota/norm"):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

# I'm reserving some normalization for downstream (stylometry, tf-idf):
substitutions = {
    'ſ': 's',
    'ᴍ': 'm',
    'ɴ': 'nn',
    'ɢ': 'gg',
    'j': 'i',
    '': 'ú', # &ucurl;
    'ʀ': 'rr',
    '-': '',
    '–': '',
    'ŭ': 'u',
    'ꝼ': 'f',
    'ꞇ': 't',
    'ꝩ': 'u',
    'ı': 'i',
    'ꝇ': 'll',
    'ꝛ': 'r',
    '': 'i', # &jacute;
    '': 'm', # &muncdes;
    '': 'e', # &Euncclose;
    '': 'oc', # &etslash;
    '': 'aa', # &aaligdblac;
    'ƶ': 'z',
    'ꜱ': 'ss', # &sscap;
    # 'k': 'c', # rather than vice versa, because of Latin (e.g. Lucifer)
    'ꝺ': 'd',
    'ꜹ': 'au',
    'ę': 'æ',
    # 'á': 'a',
    # 'ǽ': 'æ',
    # 'é': 'e',
    # 'í': 'i',
    # 'ó': 'o',
    # 'ú': 'u',
    # 'ý': 'y',
    # 'ǿ': 'ø',
    '': 'ø', # &oslashogon;
    '': 'ǫ', # &ocurl; this normalization is not everywhere correct, 
              #but we need to coordinate it with how I treat Unger's.
    # 'v': 'u',
    # 'ð': 'þ'
}

def normalize(txt, table=None):
    """Apply a table of literal string substitutions to `txt`.

    Replacements are applied one after another in the table's iteration
    order, so a later entry also sees the output of earlier entries.

    Parameters:
        txt: the string to normalize.
        table: optional mapping of search string -> replacement. Defaults to
            the module-level `substitutions` dict.

    Returns:
        The string with every substitution applied.
    """
    if table is None:
        table = substitutions
    for k, v in table.items():
        # str.replace('', v) inserts v between every character; an empty key
        # (e.g. a special character lost in copy/paste) must never be applied.
        if k:
            txt = txt.replace(k, v)
    return txt

In [2]:
# Create a duplicate Menota corpus without custom entities,
# because lxml doesn't do SYSTEM entity files:
# every key of `entities` (e.g. "&name;") is replaced literally by its value and
# the result is written under the same file name in menota/noent/.
for infile in glob.glob('menota/xml/*xml'):
    # Explicit UTF-8 so the copy does not depend on the platform's default
    # encoding (assumes the Menota XML files are UTF-8 -- TODO confirm).
    with open(infile, encoding='utf-8') as xml_doc:
        doc = xml_doc.read()
    for k,v in entities.items():
        doc = doc.replace(k, v)
    # Only the first 'xml' in the path (the directory name) is swapped.
    outfile = infile.replace('xml', 'noent', 1)
    with open(outfile, 'w', encoding='utf-8') as f:
        f.write(doc)


In [3]:
# Now retrieve tokens and output to plaintext file.
# Since two dg4-7 files lack `<me:dipl>` encoding, while the third lacks `<me:facs>`
# and `<me:norm>`, we `tostring()` their `<w>` nodes directly for our diplomatic corpus.
TEI = '{http://www.tei-c.org/ns/1.0}'
MENOTA = '{http://www.menota.org/ns/1.0}'

# TEI elements that are removed from <text> before tokens are collected.
delenda = ['sic', 'note', 'rdg']


def _remove_keeping_tail(node):
    """Detach `node` from its parent without losing the text that follows it.

    lxml stores the text after an element's end tag in `node.tail`, and
    `parent.remove(node)` drops that text together with the element. The tail
    is therefore moved onto the previous sibling (or the parent's text) first.
    """
    parent = node.getparent()
    if parent is None:
        return
    if node.tail:
        previous = node.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + node.tail
        else:
            parent.text = (parent.text or '') + node.tail
    parent.remove(node)


def _token_text(node):
    """Return the text inside `node`, lower-cased, with spaces, tabs and newlines removed.

    `with_tail=False` keeps text that follows the element's end tag out of the token.
    """
    flat = etree.tostring(node, method='text', encoding='unicode', with_tail=False)
    return flat.lower().replace(' ', '').replace('\t', '').replace('\n', '')


def _write_tokens(tokens, infile, target):
    """Write `tokens`, space-separated, to the `target` sibling directory of menota/noent."""
    outfile = infile.replace('noent', target, 1).replace('.xml', '.txt')
    # Explicit UTF-8: the tokens contain non-ASCII characters.
    with open(outfile, 'w', encoding='utf-8') as f:
        f.write(' '.join(tokens))


dipl_processed = 0
norm_processed = 0
docs_without_dipl = []
docs_without_norm = []
# sorted() makes the processing order, and thus the report below, reproducible.
for infile in sorted(glob.glob('menota/noent/*xml')):
    dipl_tokens = []
    norm_tokens = []
    tree = etree.parse(infile, parser=parser)
    root = tree.getroot()
    text = root.find(f'.//{TEI}text')
    # A document without <text> yields no tokens and is reported as lacking
    # both transcriptions instead of aborting the whole run.
    if text is not None:
        for category in delenda:
            for hit in text.findall(f'.//{TEI}{category}'):
                _remove_keeping_tail(hit)
        for word in text.findall(f'.//{TEI}w'):
            if 'dg4-7' in infile:
                # No usable <me:dipl> level here: take the whole <w> content.
                dipl_tokens.append(normalize(_token_text(word)))
            else:
                dipl = word.find(f'.//{MENOTA}dipl')
                if dipl is not None:
                    dipl_tokens.append(normalize(_token_text(dipl)))
            norm = word.find(f'.//{MENOTA}norm')
            if norm is not None:
                # Normalized forms are not passed through normalize().
                norm_tokens.append(_token_text(norm))
    if len(dipl_tokens) > 0:
        _write_tokens(dipl_tokens, infile, 'dipl')
        dipl_processed += 1
    else:
        docs_without_dipl.append(os.path.basename(infile))
    if len(norm_tokens) > 0:
        _write_tokens(norm_tokens, infile, 'norm')
        norm_processed += 1
    else:
        docs_without_norm.append(os.path.basename(infile))

print(f"Processed {dipl_processed} diplomatic transcriptions and {norm_processed} normalized transcriptions.")
if len(docs_without_dipl) > 0:
    print(f"Ignored {len(docs_without_dipl)} XML documents lacking diplomatic transcriptions:")
    for i in docs_without_dipl:
        print(f"• {i}")

if len(docs_without_norm) > 0:
    print(f"Ignored {len(docs_without_norm)} XML documents lacking normalized transcriptions:")
    for i in docs_without_norm:
        print(f"• {i}")

Processed 86 diplomatic transcriptions and 64 normalized transcriptions.
Ignored 5 XML documents lacking diplomatic transcriptions:
• holmD4_various.xml
• holmA10_religiosa_texter.xml
• holmA49_barlaams_saga.xml
• am60_kristinrettir.xml
• holmA80_bonbok.xml
Ignored 27 XML documents lacking normalized transcriptions:
• holmPerg30_landslog.xml
• am242_codex_wormianus.xml
• am305_landslog.xml
• am56_landslog.xml
• dg8II_olafs_saga.xml
• am35_heimskringla1.xml
• am302_landslog.xml
• am243balpha_konungs_skuggsja.xml
• am28_codex_runicus.xml
• holmD4_various.xml
• dg4-7_eliss_saga.xml
• dg4-7_strengleikar.xml
• skbA120_marys_complaint.xml
• am78_kristinrettir.xml
• holmPerg6_barlaams_saga.xml
• am619_norwegian_homily_book.xml
• dg4-7_pamphilius_saga.xml
• holmA10_religiosa_texter.xml
• am63_heimskringla3.xml
• holmA49_barlaams_saga.xml
• am544_voluspa.xml
• am36_heimskringla2.xml
• dg8I_landslog.xml
• am60_kristinrettir.xml
• holmPerg17_thomass_saga.xml
• am178_thidreks_saga.xml
• holmA80_bo