# CMS Latin Exam Lemmatizer

To do:

- The biggest issue right now is that Latin WordNet's `lemmatize/` endpoint is defunct. This can probably be solved by cloning https://github.com/latinwordnet/latinwordnet-archive . If the service comes back online, `from latinwordnet import LatinWordNet` would probably save some syntax.
- The other major issue is that I have no way of selecting the correct (usually the most frequent) lexeme from among the lemmatizer's hits; this can probably be solved by querying an online frequency database. Ideally I would address that after regaining access to Latin WordNet.
- Not sure why it's assigning `ymago` when my manual list clearly maps that form to `imago`.
- how best to address the fact that at least one of my lemmatizers assigns infinitives while others don't? Probably run all NLTK results through Latin WordNet, once it comes back online.

In [1]:
import os,glob,re,json,requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize
from CustomLatinBackoffLemmatizer import CustomLatinBackoffLemmatizer
# Lemmatizer instance used as the fallback in lemmatize().
lem = CustomLatinBackoffLemmatizer()

# One shared HTTP session; the same retrying adapter serves both URL schemes.
session = requests.Session()
retry = Retry(connect=10, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
for scheme in ('http://', 'https://'):
    session.mount(scheme, adapter)

In [2]:
# Spelling substitutions applied, in this order, by normalize().
normalization = [("ę", "ae"), ("æ", "ae"), ("j", "i")]


def normalize(string):
    """Return *string* with the substitutions above applied and consonantal <u> printed as <v>.

    After each substitution pair is applied, a <u> standing between two
    vowels and a string-initial <u> followed by a vowel are rewritten as <v>.
    """
    # TO DO: ensure reuuixit is correctly normalized without affecting Erkenuualdus
    between_vowels = re.compile('(?<=[aeiouy])u(?=[aeiouy])')
    before_vowel_at_start = re.compile('^u(?=[aeiouy])')
    for old, new in normalization:
        string = string.replace(old, new)
        # The <u> rewrites run once per pair, interleaved with the substitutions.
        string = between_vowels.sub('v', string)
        string = before_vowel_at_start.sub('v', string)
    return string

In [3]:
# Tokenizer used by lemmatize(): yields runs of word characters (\w+), so punctuation is not returned as tokens.
tokenizer = RegexpTokenizer(r'\w+')

def isolate(doc):
    """Strip an exam of its preamble and return its identifier, titles and passages.

    Everything before the first line starting with ``1.`` is discarded. The
    four title lines (``1. ...`` to ``4. ...``) are located, and each passage
    is the text between the end of its title and the start of the next title;
    the fourth passage runs to the end of the file.

    Args:
        doc: Path of the exam text file.

    Returns:
        A dict with keys ``'id'`` (file name without directory and ``.txt``
        suffix), ``'titles'`` (the four title lines) and ``'passages'`` (the
        four passage texts, stripped of surrounding newlines and spaces).

    Raises:
        ValueError: If the start of the exam or one of the four titles
            cannot be found.
    """
    with open(doc) as f:
        raw = f.read()
    first = re.search(r'\n1\.\s*', raw)
    if first is None:
        raise ValueError('no line starting with "1." found in {}'.format(doc))
    content = raw[first.start():]

    titles = []
    title_spans = []
    for i in range(1, 5):
        # Remember where each title starts and ends; those positions also
        # tell us where the text of the passages begins and ends.
        title = re.search(rf'(?<=\n){i}\.\ .*(?=\n)', content)
        if title is None:
            raise ValueError('title {} not found in {}'.format(i, doc))
        titles.append(title.group(0))
        title_spans.append(title.span())

    # A passage ends where the next title starts. The last one ends at the end
    # of the content (not at index -1, which would drop the final character).
    passage_stops = [start for start, _ in title_spans[1:]] + [len(content)]
    texts = [
        content[title_end:stop].strip('\n ')
        for (_, title_end), stop in zip(title_spans, passage_stops)
    ]

    return {
        # basename() gives the same id as before for 'exams/<name>.txt' and
        # also works for other directory depths and path separators.
        'id': os.path.basename(doc).removesuffix('.txt'),
        'titles': titles,
        'passages': texts,
    }

def load_corpus(data_set):
    """Run isolate() on every exam file of one data set and return the results as a list.

    The files are those matching ``exams/<data_set>*txt``, processed in
    sorted path order.
    """
    pattern = 'exams/{}*txt'.format(data_set)
    return [isolate(path) for path in sorted(glob.glob(pattern))]

def ner_generate(data_sets):
    """Load or generate the named-entity list of each data set.

    For every ``(name, exams)`` pair, ``ner_<name>.json`` is loaded if it
    exists. Otherwise the list is generated from capitalization and
    punctuation, which required tweaking corpus capitalization to suit these
    NER specifications. A token counts as a name when it starts with an
    uppercase letter and the preceding token ends in a letter or is followed
    by a comma boundary (ends in ``,``), i.e. it is capitalized mid-sentence.
    The generated list is written to ``ner_<name>.json``.

    Args:
        data_sets: Iterable of pairs whose first item is the data-set name
            and whose second item is a list of exam dicts with a
            ``'passages'`` list.

    Returns:
        A dict mapping each data-set name to its sorted list of normalized,
        lower-cased entity strings.
    """
    all_ner_lists = dict()
    for corpus in data_sets:
        name, exams = corpus[0], corpus[1]
        ner_path = 'ner_{}.json'.format(name)
        if os.path.exists(ner_path):
            with open(ner_path) as f:
                entities = json.load(f)
        else:
            found = set()
            for exam in exams:
                tokens = word_tokenize(' '.join(exam['passages']))
                # Pair every token with its predecessor. The first token has
                # none, so it is skipped instead of being compared with the
                # last token of the exam (index -1 wrap-around).
                for previous, token in zip(tokens, tokens[1:]):
                    if token[0].isupper() and (previous[-1].isalpha() or previous[-1] == ','):
                        found.add(normalize(token.lower().strip('.')))
            entities = sorted(found)
            with open(ner_path, 'w') as f:
                json.dump(entities, f)
        all_ner_lists[name] = entities
    return all_ner_lists

In [4]:
# https://latinwordnet.exeter.ac.uk/lemmatize/{} is defunct!
_LATINWORDNET_LEMMATIZE_URL = 'https://latinwordnet.exeter.ac.uk/lemmatize/{}'


def _read_lines(path):
    """Return the lines of *path*, or an empty list if the file does not exist."""
    if not os.path.isfile(path):
        return []
    with open(path) as f:
        return f.read().splitlines()


def _spelling_variants(token):
    """Yield *token*, then its "pre"->"prae" and "y"->"i" respellings where applicable."""
    yield token
    if 'pre' in token:
        # Normalize "pre"-spellings:
        yield token.replace('pre', 'prae')
    if 'y' in token:
        # Normalize "y"-spellings:
        yield token.replace('y', 'i')


def _latinwordnet_hits(token):
    """Return the first non-empty Latin WordNet response for *token* or a respelling of it."""
    hits = []
    for form in _spelling_variants(token):
        hits = session.get(_LATINWORDNET_LEMMATIZE_URL.format(form)).json()
        if len(hits) != 0:
            break
    return hits


def _cltk_lemma(token):
    """Return the backoff lemmatizer's lemma for *token* or a respelling of it, else None."""
    for form in _spelling_variants(token):
        lemma = lem.lemmatize([form])[0][1]
        if lemma is not None:
            return lemma
    return None


def lemmatize(corpus):
    """Lemmatize every not-yet-processed exam of one data set and write the results to disk.

    For each token, the first matching source wins, in this order: the manual
    lemma list (``lemmata_manual_<name>.txt``, lines of ``form : lemma``),
    the named-entity lists (module-level ``ner[<name>]`` plus
    ``manual_ner_<name>.txt``, recorded as ``'NAME'``), Latin WordNet, and
    finally the backoff lemmatizer ``lem``. Both lemmatizers are retried with
    "pre"->"prae" and "y"->"i" respellings. Unrecognized tokens are recorded
    as ``''``.

    Per exam, the lemmata are written to ``lemmata/<exam id>.json`` (one list
    per passage). Exams whose output file already exists are skipped, so
    changes to the manual lists only take effect after deleting that file.
    Unrecognized tokens are printed and merged into
    ``lemmata/unidentified_<name>.json``.

    Args:
        corpus: A pair whose first item is the data-set name and whose second
            item is a list of exam dicts with ``'id'`` and ``'passages'``.

    Raises:
        ValueError: If a non-blank line of the manual lemma file lacks the
            `` : `` separator.
    """
    name = corpus[0]
    ner_supplement = _read_lines('manual_ner_{}.txt'.format(name))

    lemma_supplement = dict()
    for line in _read_lines('lemmata_manual_{}.txt'.format(name)):
        if not line.strip():
            # Blank lines (e.g. a trailing one) carry no mapping.
            continue
        form, separator, lemma = line.partition(' : ')
        if not separator:
            raise ValueError('malformed line in lemmata_manual_{}.txt: {!r}'.format(name, line))
        # Strip stray whitespace so that it cannot silently break a lookup.
        lemma_supplement[form.strip()] = lemma.strip()

    unrecognized = []
    unidentified_path = 'lemmata/unidentified_{}.json'.format(name)
    for exam in corpus[1]:
        outfile = 'lemmata/{}.json'.format(exam['id'])
        if os.path.isfile(outfile):
            continue
        # Set membership instead of scanning two lists for every token.
        names = set(ner[name]).union(ner_supplement)
        lemmatized = []
        for text in exam['passages']:
            identified = []
            for token in tokenizer.tokenize(text):
                token = normalize(token.lower())
                if token in lemma_supplement:
                    identified.append(lemma_supplement[token])
                    continue
                if token in names:
                    identified.append('NAME')
                    continue
                hits = _latinwordnet_hits(token)
                if len(hits) != 0:
                    # Normalizing lexical forms because some lemmatizers normalize to consonantal <u>.
                    # Assumes each hit has the shape {'lemma': {'lemma': ...}} -- TODO confirm
                    # against the Latin WordNet archive.
                    identified.append(normalize(hits[0]['lemma']['lemma']).rstrip('0123456789'))
                    continue
                lemma = _cltk_lemma(token)
                if lemma is None:
                    unrecognized.append(token)
                    identified.append('')
                else:
                    identified.append(lemma)
            lemmatized.append(identified)
        print(sorted(set(unrecognized)))
        os.makedirs(os.path.dirname(outfile), exist_ok=True)
        with open(outfile, 'w') as f:
            json.dump(lemmatized, f)
        previously_unidentified = []
        if os.path.isfile(unidentified_path):
            with open(unidentified_path) as f:
                previously_unidentified = json.load(f)
        unidentified = sorted(set(unrecognized + previously_unidentified))
        with open(unidentified_path, 'w') as f:
            json.dump(unidentified, f)

In [5]:
# Load both exam levels as (name, corpus) pairs, build the named-entity lists, then lemmatize one level.
data_sets = tuple((level, load_corpus(level)) for level in ('l1', 'l2'))
ner = ner_generate(data_sets)
lemmatize(data_sets[0]) # set to [0] to lemmatize the Level One set, [1] for Level Two

IndexError: list index out of range