In [54]:
import datetime as dt
import os

from cltk.tokenize.sentence import TokenizeSentence
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithets
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_of_author

# Make clean plaintext version of text

In [2]:
# Directory (under the user's home) that receives the cleaned plaintext files
# written by the cleanup loop below; `~` is expanded to the home directory.
plain_fp = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext_clean')

In [12]:
# Clean every TLG author file and write the result, under the same file name,
# into `plain_fp`. Punctuation is removed but periods are kept
# (`rm_periods=False`) so that sentence boundaries survive for later steps.

# Timezone-aware replacement for the deprecated `datetime.utcnow()`; the
# difference of two such values is still a plain `timedelta`.
t0 = dt.datetime.now(dt.timezone.utc)

# `open(..., 'w')` does not create missing parent directories, so make sure the
# output directory exists before the first write.
os.makedirs(plain_fp, exist_ok=True)

for author_fp in assemble_tlg_author_filepaths():
    # Be explicit about the encoding: the platform default is not guaranteed
    # to be able to represent Greek text.
    with open(author_fp, encoding='utf-8') as fo:
        text = fo.read()
    text = tlg_plaintext_cleanup(text, rm_punctuation=True, rm_periods=False)
    file_name = os.path.basename(author_fp)
    new_fp = os.path.join(plain_fp, file_name)
    with open(new_fp, 'w', encoding='utf-8') as fo:
        fo.write(text)

print('... finished in {}'.format(dt.datetime.now(dt.timezone.utc) - t0))

... finished in 0:04:56.467069


# Make epithet files

In [51]:
# Input directory: cleaned plaintext, expected to already exist on disk.
plain_fp = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext_clean')
if not os.path.isdir(plain_fp):
    # A bare `raise` outside an `except` block only produces
    # "RuntimeError: No active exception to reraise"; raise a meaningful
    # error that carries the instruction and the missing path instead.
    raise FileNotFoundError(
        'Process first with `tlg_plaintext_cleanup()`. '
        'Missing directory: {}'.format(plain_fp)
    )

# Output directory for the per-epithet text files; created on first use and
# left untouched when it is already there.
cnn_dir = os.path.expanduser('~/cltk_data/user_data/cnn/cnn_frames')
os.makedirs(cnn_dir, exist_ok=True)

In [52]:
def normalize_epithet(epithet):
    """Turn an epithet label into a lowercase, underscore-separated token.

    Every occurrence of the substring '/-ae' is dropped, each space becomes
    an underscore, and the result is lowercased.

    :param epithet: Epithet string to normalize.
    :return: The normalized string.
    """
    without_suffix = ''.join(epithet.split('/-ae'))
    underscored = '_'.join(without_suffix.split(' '))
    return underscored.lower()

In [55]:
# Sentence tokenizer instantiated for Greek; built once here and reused for
# every file in the epithet loop below.
tokenizer = TokenizeSentence('greek')

In [76]:
# Build one text file per epithet in `cnn_dir`, one sentence per line, from a
# small sample of sentences taken from each author's cleaned plaintext.

# Sampling parameters (previously inline literals).
MAX_SENTENCES = 4        # only the first N sentences of a file are considered
MIN_SENTENCE_CHARS = 25  # a sentence must be longer than this to be kept
MIN_SENTENCES = 3        # files yielding fewer kept sentences are skipped

# Timezone-aware replacement for the deprecated `datetime.utcnow()`.
t0 = dt.datetime.now(dt.timezone.utc)

# Sentences are collected per epithet and written once at the end. Appending
# chunk by chunk without a separator glued the last sentence of one author to
# the first sentence of the next, and re-running the cell duplicated data.
sentences_by_epithet = {}

# Sorted so the line order of the output does not depend on directory order.
for file_name in sorted(os.listdir(plain_fp)):
    plain_fp_file = os.path.join(plain_fp, file_name)
    # Drops the first 3 and last 4 characters of the file name.
    # NOTE(review): presumably names look like 'TLG0001.TXT' -> '0001'; confirm.
    _id = file_name[3:-4]

    epithet = get_epithet_of_author(_id)
    if epithet is None:
        continue
    epi_normal = normalize_epithet(epithet)

    # Explicit encoding: the platform default may not handle Greek text.
    with open(plain_fp_file, encoding='utf-8') as fo:
        original_text = fo.read()
    sentence_tokens = tokenizer.tokenize_sentences(original_text)[:MAX_SENTENCES]
    sentence_tokens = [sent for sent in sentence_tokens
                       if len(sent) > MIN_SENTENCE_CHARS]
    if len(sentence_tokens) < MIN_SENTENCES:
        continue
    sentences_by_epithet.setdefault(epi_normal, []).extend(sentence_tokens)

for epi_normal, epithet_sentences in sentences_by_epithet.items():
    epithet_file = os.path.join(cnn_dir, epi_normal + '.txt')
    # 'w' (not 'a') makes the cell idempotent: each run rewrites the file.
    with open(epithet_file, 'w', encoding='utf-8') as fo:
        fo.write('\n'.join(epithet_sentences))

print('... finished in {}'.format(dt.datetime.now(dt.timezone.utc) - t0))

... finished in 0:03:07.561396


# Run CNN

See the modified `cnn-text-classification-tf` project (run via its `train.py` script), tested here on classifying Historians vs. Philosophers.

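Results of `./train.py` (note that training accuracy reaches ~1.0 while evaluation accuracy is only ~0.61, which suggests the model is overfitting):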

```
2016-10-16T19:34:33.239756: step 1593, loss 0.0105471, acc 1
2016-10-16T19:34:34.130735: step 1594, loss 0.000387129, acc 1
2016-10-16T19:34:35.050536: step 1595, loss 0.0380518, acc 0.984375
2016-10-16T19:34:36.104368: step 1596, loss 0.00193451, acc 1
2016-10-16T19:34:36.949230: step 1597, loss 0.00110304, acc 1
2016-10-16T19:34:38.058944: step 1598, loss 0.0035336, acc 1
2016-10-16T19:34:39.078174: step 1599, loss 0.000138309, acc 1
2016-10-16T19:34:39.163573: step 1600, loss 1.88347e-05, acc 1

Evaluation:
2016-10-16T19:34:39.803649: step 1600, loss 0.791714, acc 0.609023

Saved model checkpoint to /root/cnn-text-classification-tf/runs/1476645213/checkpoints/model-1600
```