In [1]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.stop.greek.stops import STOPS_LIST as greek_stops
from cltk.stop.latin.stops import STOPS_LIST as latin_stops
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.tokenize.sentence import TokenizeSentence
import logging
import os
import re
import time

# Configure logging once for the whole notebook: INFO level, with each record
# rendered as "<time> : <level> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
def gen_docs(corpus, lemmatize, rm_stops, testing):
    """Yield one cleaned, tokenized document per author file of a corpus.

    Every file is read, cleaned with the corpus-specific cleanup function,
    split into sentences and then into words. Words are lower-cased and the
    characters listed in ``punctuation`` are removed from them. Words,
    sentences and documents that end up empty are dropped.

    Args:
        corpus: ``'phi5'`` (handled as Latin) or ``'tlg'`` (handled as Greek).
        lemmatize: If true, each sentence is passed through ``LemmaReplacer``.
        rm_stops: If true, words found in the stop list of the corpus
            language are removed.
        testing: If true, only the first 20 author files are processed.

    Yields:
        list: One document, i.e. a list of sentences, each sentence being a
        list of word strings.

    Raises:
        ValueError: If ``corpus`` is neither ``'phi5'`` nor ``'tlg'``. As this
            is a generator, the error is raised when iteration starts.
    """
    # TODO: Replace accented chars with unaccented ones.
    punctuation = [',', '.', ';', ':', '"', "'", '?', '-', '!', '*', '[', ']', '{', '}']

    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        text_cleaner = phi5_plaintext_cleanup
        corpus_stops = latin_stops
        # j/v normalisation is only applied to the Latin corpus.
        jv_replacer = JVReplacer()
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        # The Greek corpus must be filtered with the Greek stop list.
        corpus_stops = greek_stops
        jv_replacer = None
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            "Unknown corpus {!r}; expected 'phi5' or 'tlg'.".format(corpus)
        )

    # A frozenset gives constant-time membership tests in the per-word filter.
    stops = frozenset(corpus_stops) if rm_stops else None
    lemmatizer = LemmaReplacer(language) if lemmatize else None
    if testing:
        filepaths = filepaths[:20]

    punkt = PunktLanguageVars()
    sent_tokenizer = TokenizeSentence(language)
    # Translation table that deletes every punctuation character in one pass.
    strip_punctuation = str.maketrans('', '', ''.join(punctuation))

    for filepath in filepaths:
        with open(filepath) as f:
            text_raw = f.read()
        text_cleaned = text_cleaner(text_raw)

        doc_sentences = []
        for raw_sentence in sent_tokenizer.tokenize_sentences(text_cleaned):
            # Lower-case, strip junk characters, then drop words left empty.
            words = [
                token.lower().translate(strip_punctuation)
                for token in punkt.word_tokenize(raw_sentence)
            ]
            words = [w for w in words if w]
            if stops:
                words = [w for w in words if w not in stops]
            #words = [w for w in words if len(w) > 1]  # rm short words

            if not words:
                # Nothing left to lemmatize or normalise for this sentence.
                continue
            if lemmatize:
                words = lemmatizer.lemmatize(words)
            if words and jv_replacer is not None:
                words = [jv_replacer.replace(w) for w in words]
            if words:
                doc_sentences.append(words)

        if doc_sentences:
            yield doc_sentences

In [None]:
# Imported in this cell (rather than the import cell) because a later cell
# still calls builtins.bytes.
import builtins

# Compiled patterns; neither is referenced by the loop in this cell.
rm_junk = re.compile(r'†|\(|\)|]')

comp_accent = re.compile(b'\xcc\x81')

# U+0301 COMBINING ACUTE ACCENT -- the code point whose UTF-8 form is b'\xcc\x81'.
COMBINING_ACUTE = '\u0301'


def strip_combining_acute(word):
    """Return ``word`` with every U+0301 combining acute accent removed.

    Only the stand-alone combining mark is dropped; precomposed letters such
    as U+00E9 are left untouched.
    """
    return word.replace(COMBINING_ACUTE, '')


new_doc = []
for doc in gen_docs('phi5', lemmatize=False, rm_stops=False, testing=False):
    for sentence in doc:
        print(sentence)
        new_sentence = [strip_combining_acute(word) for word in sentence]
        print(new_sentence)
        # Wait for Enter so each before/after pair can be inspected; note that
        # this blocks an unattended "Run All".
        input()

['edepól', 'paternam', 'quí', 'comest', 'pecúniam', 'quíd', 'ita']
['edepol', 'paternam', 'qui', 'comest', 'pecuniam', 'quid', 'ita']

['quia', 'enim', 'répuerascis', 'fúgitas', 'personás', 'pater']
['quia', 'enim', 'repuerascis', 'fugitas', 'personas', 'pater']

['licétne', 'leno', 'dúo', 'uerbis', 'etiám']
['licetne', 'leno', 'duo', 'uerbis', 'etiam']

['primo', 'et', 'postrémo', 'quod', 'tú', 'mi', 'gnate', 'quaéso', 'ut', 'in', 'pectús', 'tuum', 'demíttas', 'tamquam', 'in', 'físcinam', 'uindémitor']
['primo', 'et', 'postremo', 'quod', 'tu', 'mi', 'gnate', 'quaeso', 'ut', 'in', 'pectus', 'tuum', 'demittas', 'tamquam', 'in', 'fiscinam', 'uindemitor']

['áge', 'nunc', 'quando', 'rhétoricasti', 'sátis', 'responde', 'quód', 'rogo']
['age', 'nunc', 'quando', 'rhetoricasti', 'satis', 'responde', 'quod', 'rogo']

['quód', 'editis', 'nihil', 'ést']
['quod', 'editis', 'nihil', 'est']


In [None]:
            # NOTE(review): this cell is a fragment. The loops that would bind
            # `word` and enclose these lines, and the initialisation of
            # `new_word` / `new_sentence`, are not part of the cell, and
            # `accutes_map` is not defined in the visible notebook, so the cell
            # cannot run on its own. Presumably it was meant to sit inside a
            # loop over gen_docs() output -- confirm before reviving it.
            for char in word:
                #! the accent comes in bytes: b'\xcc\x81'
                # Debug output: show the UTF-8 bytes of every character.
                print(builtins.bytes(char, 'utf-8'))
                if char in accutes_map:
                    # Characters with an entry in accutes_map are replaced by
                    # the mapped value.
                    print(word)
                    new_char = accutes_map[char]
                    new_word += new_char
                elif builtins.bytes(char, 'utf-8') == b'\xcc\x81':
                    # A character encoding to b'\xcc\x81' is only flagged in the
                    # output; nothing is appended, so it is dropped from new_word.
                    print('********')
                    
                    print('********')
                else:
                    new_word += char
            new_sentence.append(new_word)
        print(new_sentence)
        # Blocks until Enter is pressed.
        input()
    # NOTE(review): at this indentation the append is one level further out than
    # the print/input above, so it would only see the last new_sentence built
    # before it -- confirm that is intended.
    new_doc.append(new_sentence)