# Menota-extract

Converts `menota/xml/*xml` (manually downloaded for now) into plaintext at `menota/dipl/*txt` and `menota/norm/*txt`.

In [None]:
import os,glob,re,json
from lxml import etree
from pathlib import Path
from urllib.request import urlretrieve

# Generate menota-entities.json, a cached {"&name;": replacement} map built
# from the entity declarations in menota-entities.txt, unless it already exists:
if not os.path.isfile('menota/menota-entities.json'):
    if not os.path.isfile('menota/menota-entities.txt'):
        # urlretrieve does not create missing directories.
        os.makedirs('menota', exist_ok=True)
        urlretrieve('https://www.menota.org/menota-entities.txt', 'menota/menota-entities.txt')
    # Read with an explicit encoding so the result does not depend on the
    # platform's default locale.
    with open('menota/menota-entities.txt', encoding='utf-8') as entities_file:
        entities_raw = entities_file.read()
    entities = {}
    # Capture the entity name and its quoted replacement text.  Matching the
    # quotes explicitly keeps values that contain whitespace intact and skips
    # declarations without a literal value (parameter or SYSTEM entities).
    declaration = r'<!ENTITY\s+([^\s%>]+)\s+(["\'])(.*?)\2'
    for name, _quote, value in re.findall(declaration, entities_raw):
        entities['&' + name + ';'] = value
    with open('menota/menota-entities.json', 'w', encoding='utf-8') as outfile:
        json.dump(entities, outfile, ensure_ascii=False, indent=4)

# The JSON is written as UTF-8 with ensure_ascii=False, so it must be read
# back as UTF-8 as well.
with open('menota/menota-entities.json', encoding='utf-8') as entities_file:
    entities = json.load(entities_file)

# Parser used for every corpus file: blank text nodes are kept, the DTD is
# loaded and entity references are resolved.
parser = etree.XMLParser(
    load_dtd=True,
    resolve_entities=True,
    remove_blank_text=False,
)

# Namespace prefixes for the TEI and Menota vocabularies.
ns = dict(
    tei='http://www.tei-c.org/ns/1.0',
    me='http://www.menota.org/ns/1.0',
)

# Make sure every output directory exists before anything is written to it.
for subdir in ('noent', 'dipl', 'norm'):
    Path('menota', subdir).mkdir(parents=True, exist_ok=True)

# I'm reserving some normalization for downstream (stylometry, tf-idf):
substitutions = {
    'ſ': 's',
    'ɴ': 'nn',
    'ɢ': 'gg',
    'j': 'i',
    '': 'ú', # &ucurl;
    'ʀ': 'rr',
    '-': '',
    '–': '',
    'ŭ': 'u',
    'ꝼ': 'f',
    'ꞇ': 't',
    'ꝩ': 'u',
    'ı': 'i',
    'ꝇ': 'll',
    'ꝛ': 'r',
    '': 'i', # &jacute;
    '': 'm', # &muncdes;
    '': 'e', # &Euncclose;
    '': 'oc', # &etslash;
    'ƶ': 'z',
    'ꜱ': 'ss', # &sscap;
    # 'k': 'c', # rather than vice versa, because of Latin (e.g. Lucifer)
    'ꝺ': 'd',
    'ꜹ': 'au',
    # 'á': 'a',
    # 'ǽ': 'æ',
    # 'é': 'e',
    # 'í': 'i',
    # 'ó': 'o',
    # 'ú': 'u',
    # 'ý': 'y',
    # 'ǿ': 'ø',
    '': 'ø', # &oslashogon;
    '': 'ǫ', # &ocurl; this normalization is not everywhere correct, 
              # but we need to coordinate it with how I treat Unger's.
    # 'v': 'u',
    # 'ð': 'þ'
}

def normalize(txt):
    """Apply every entry of the module-level ``substitutions`` table to *txt*.

    The replacements run one after another in the table's insertion order,
    each operating on the result of the previous one.

    Args:
        txt: The string to normalize.

    Returns:
        The string with every non-empty key of ``substitutions`` replaced by
        its associated value.
    """
    for old, new in substitutions.items():
        # str.replace('', new) inserts `new` between every character, so an
        # empty key (for instance a glyph lost to an encoding mishap) is
        # skipped rather than allowed to mangle the whole token.
        if old:
            txt = txt.replace(old, new)
    return txt

In [None]:
# Create a duplicate Menota corpus in menota/noent with the custom entities
# replaced textually, because (per the original author) lxml doesn't do
# SYSTEM entity files:
for infile in glob.glob('menota/xml/*xml'):
    with open(infile, 'r', encoding='UTF-8') as xml_doc:
        doc = xml_doc.read()
    for entity, replacement in entities.items():
        doc = doc.replace(entity, replacement)
    # Build the target from the file name instead of substituting the first
    # 'xml' substring found anywhere in the path.
    outfile = Path('menota/noent') / Path(infile).name
    # Write with the same encoding the source was read with; the platform
    # default may not be able to represent the replaced characters.
    with open(outfile, 'w', encoding='UTF-8') as f:
        f.write(doc)


In [3]:
# Now retrieve tokens and output to plaintext file:
TEI_NS = '{http://www.tei-c.org/ns/1.0}'
MENOTA_NS = '{http://www.menota.org/ns/1.0}'


def _remove_keeping_tail(element):
    """Detach *element* from its parent without losing the text that follows it.

    In lxml the text after an element's end tag is stored on that element
    (``.tail``) and disappears together with it on ``remove()``.  The tail is
    therefore handed over to the preceding sibling, or to the parent's text
    when there is no preceding sibling, before the element is removed.
    """
    parent = element.getparent()
    if parent is None:
        return
    if element.tail:
        previous = element.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + element.tail
        else:
            parent.text = (parent.text or '') + element.tail
    parent.remove(element)


def _token_text(element):
    """Return the lowercased text of *element* without spaces, tabs or newlines."""
    raw = etree.tostring(element, method='text', encoding='unicode')
    return raw.lower().replace(' ', '').replace('\t', '').replace('\n', '')


for infile in glob.glob('menota/noent/*xml'):
    tree = etree.parse(infile, parser=parser)
    text = tree.getroot().find('.//' + TEI_NS + 'text')
    if text is None:
        # Nothing to extract from a file that has no TEI <text> element.
        continue

    # Drop material that must not end up in the token stream.
    for tag in ('sic', 'note', 'rdg'):
        for unwanted in text.findall('.//' + TEI_NS + tag):
            _remove_keeping_tail(unwanted)

    dipl_tokens = []
    norm_tokens = []
    for word in text.findall('.//' + TEI_NS + 'w'):
        dipl = word.find('.//' + MENOTA_NS + 'dipl')
        if dipl is not None:
            # Only the diplomatic level goes through the substitution table.
            dipl_tokens.append(normalize(_token_text(dipl)))
        norm = word.find('.//' + MENOTA_NS + 'norm')
        if norm is not None:
            norm_tokens.append(_token_text(norm))

    # A level without any tokens produces no output file.
    txt_name = Path(infile).with_suffix('.txt').name
    for level, tokens in (('dipl', dipl_tokens), ('norm', norm_tokens)):
        if tokens:
            # Explicit encoding: the tokens contain non-ASCII characters that
            # the platform default encoding may not be able to represent.
            with open(Path('menota', level) / txt_name, 'w', encoding='utf-8') as f:
                f.write(' '.join(tokens))

    