In [2]:
import datetime as dt
import os

from cltk.tokenize.sentence import TokenizeSentence
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithets
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_of_author

# Try 1, with full diacritics, no lemmatization

## Make clean plaintext version of text

In [2]:
plain_fp = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext_clean')

In [12]:
t0 = dt.datetime.utcnow()

for file in assemble_tlg_author_filepaths():
    with open(file) as fo:
        text = fo.read()
    text = tlg_plaintext_cleanup(text, rm_punctuation=True, rm_periods=False)
    file_name = os.path.split(file)[1]
    new_fp = os.path.join(plain_fp, file_name)
    with open(new_fp, 'w') as fo:
        fo.write(text)

print('... finished in {}'.format(dt.datetime.utcnow() - t0))

... finished in 0:04:56.467069


## Make epithet files

In [3]:
plain_fp = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext_clean')
if not os.path.isdir(plain_fp):
    print('Process first with `tlg_plaintext_cleanup()`.')
    raise

cnn_dir = os.path.expanduser('~/cltk_data/user_data/cnn/cnn_frames')
try:
    os.makedirs(cnn_dir)
except FileExistsError:
    pass

In [4]:
def normalize_epithet(epithet):
    return epithet.replace('/-ae', '').replace(' ', '_').lower()

In [5]:
tokenizer = TokenizeSentence('greek')

In [8]:
t0 = dt.datetime.utcnow()

for file_name in os.listdir(plain_fp):
    plain_fp_file = os.path.join(plain_fp, file_name)
    _id = file_name[3:-4]
    
    epithet = get_epithet_of_author(_id)
    if epithet is None:
        continue
    epi_normal = normalize_epithet(epithet)
    epithet_file = os.path.join(cnn_dir, epi_normal + '.txt')
    
    with open(plain_fp_file) as fo:
        original_text = fo.read()
    sentence_tokens = tokenizer.tokenize_sentences(original_text)[:4]
    sentence_tokens = [sent for sent in sentence_tokens if len(sent) > 25]
    if len(sentence_tokens) < 3:
        continue
    sentence_newlines = '\n'.join(sentence_tokens)
    
    with open(epithet_file, 'a+') as fo:
        fo.write(sentence_newlines)

print('... finished in {}'.format(dt.datetime.utcnow() - t0))

Size of historici: 247445
Size of historici: 249446
Size of historici: 250884
Size of historici: 252505
Size of historici: 253565
Size of historici: 256563
Size of historici: 257091
Size of historici: 258569
Size of historici: 259208
Size of historici: 259936
Size of historici: 260404
Size of historici: 262707
Size of historici: 264187
Size of historici: 266073
Size of historici: 266967
Size of historici: 268197
Size of historici: 269564
Size of historici: 269888
Size of historici: 271721
Size of historici: 273352
Size of historici: 273854
Size of historici: 274692
Size of historici: 275541
Size of historici: 276370
Size of historici: 277500
Size of historici: 280241
Size of historici: 281253
Size of historici: 283493
Size of historici: 285693
Size of historici: 286954
Size of historici: 288285
Size of historici: 289916
Size of historici: 291336
Size of historici: 293259
Size of historici: 295704
Size of historici: 298452
Size of historici: 299705
Size of historici: 301368
Size of hist

## Run CNN

See my version of someone's modified project, tested to classify Historians vs Philosophers: <https://github.com/kylepjohnson/cnn-text-classification-tf>.

Results of `./train.py`:

```
2016-10-16T19:34:33.239756: step 1593, loss 0.0105471, acc 1
2016-10-16T19:34:34.130735: step 1594, loss 0.000387129, acc 1
2016-10-16T19:34:35.050536: step 1595, loss 0.0380518, acc 0.984375
2016-10-16T19:34:36.104368: step 1596, loss 0.00193451, acc 1
2016-10-16T19:34:36.949230: step 1597, loss 0.00110304, acc 1
2016-10-16T19:34:38.058944: step 1598, loss 0.0035336, acc 1
2016-10-16T19:34:39.078174: step 1599, loss 0.000138309, acc 1
2016-10-16T19:34:39.163573: step 1600, loss 1.88347e-05, acc 1

Evaluation:
2016-10-16T19:34:39.803649: step 1600, loss 0.791714, acc 0.609023

Saved model checkpoint to /root/cnn-text-classification-tf/runs/1476645213/checkpoints/model-1600
```

# Try 2, with full diacritics, no lemmatization

Various ways to do this. I'm going to just read the contents of `~/cltk_data/user_data/cnn/cnn_frames` and make a new dir for the contents, only lemmatized and without any periods.

In [15]:
from cltk.stem.lemma import LemmaReplacer
lemmatizer = LemmaReplacer('greek')

In [17]:
dir_orig = os.path.expanduser('~/cltk_data/user_data/cnn/cnn_frames')
dir_new = os.path.expanduser('~/cltk_data/user_data/cnn/cnn_frames_lemmatized')
os.mkdir(dir_new)

In [19]:
for file in os.listdir(dir_orig):
    fp_orig = os.path.join(dir_orig, file)
    with open(fp_orig) as fo:
        text_orig = fo.read()
    text_new_list = lemmatizer.lemmatize(text_orig)
    print(text_new_list[:50])
    
    text_new_str = ' '.join(text_new_list)
    fp_new = os.path.join(dir_new, file)
#     with open(fp_new, 'w') as fo:
#         fo.write(text_new_str)

['ὁ', 'λόγου', 'εἰς', 'εἶδος', 'μεριζομένου', 'δύο', 'τὸ', 'μέν', 'εἰμί', 'κυριολογία', 'τὸ', 'δὲ', 'σχῆμα', 'καὶ', 'τρόπος.', 'κυριολογία', 'μὲν', 'οὖν', 'εἰμί', 'ὁ', 'ὁ', 'κατὰ', 'φύσιν', 'λέξεων', 'τεύχω', 'φράσις', 'τότε', 'γὰρ', 'οἰκείως', 'προφέροιτο', 'ἀνά', 'τις', 'ὅταν', 'μὴ', 'παρατρέπω', 'ὁ', 'κυρίως', 'λεγομένου', 'οἷος', 'πὺξ', 'μὲν', 'ἐνίκησα', 'Κλυτομήδεα', 'Φαίνοπος', 'υἱόν', 'ἀνακαίω', 'δὲ', 'πάλην', 'Πλευρώνιον', 'ὅς']
['κατὰ', 'χειρὸς', 'δ', 'ἤλιθ', 'ὕδωρ', 'ἁπαλὸς', 'παιδίσκος', 'ἐν', 'ἀργυρέαι', 'πρόχωι', 'φορέων', 'ἐπέχευεν', 'εἶτ', 'φέρω', 'στέφανον', 'λεπτός', 'ἀπὸ', 'μυρτίδος', 'εὐ', 'γνήτων', 'κλαδέων', 'δισύναπτον.', 'εἰς', 'δ', 'φέρω', 'διπλόοι', 'παῖς', 'λιπαρῶπα', 'τράπεζαν', 'ἄμμ', 'ἑτέραν', 'δ', 'ἑτέροις', 'ἄλλος', 'δ', 'ἑτέραν', 'μέχρις', 'ἵημι', 'πλήρωσαν', 'οἶκος', 'ταὶ', 'δὲ', 'πρὸς', 'ὑψιλύχνους', 'στίλβω', 'αὐγὰς', 'εὐστέφανοι', 'λεκάναις', 'παροψίσι', 'τ']
['ΕΞΗΓΗΣΙΣ', 'ΕΙΣ', 'ΤΑΣ', 'ΙΔΕΑΣ', 'τις', 'ΕΡΜΟΓΕΝΟΥΣ', 'ΑΠΟ', 'ΦΩΝΗΣ', 'ΙΩΑΝΝΟΥ', 'ΦΙΛ