In [1]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from collections import Counter
from itertools import groupby
from nltk.tokenize.punkt import PunktLanguageVars
from operator import itemgetter
import os

In [14]:
def get_corpus_tokens(corpus):
    assert corpus in ['phi5', 'tlg'], print("Corpus must be either 'phi5' or 'tlg'.")
    if corpus == 'phi5':
        files_list = assemble_phi5_author_filepaths()
    elif corpus == 'tlg':
        files_list = assemble_tlg_author_filepaths()
    j = JVReplacer()
    p = PunktLanguageVars()
    all_tokens = []
    files_list = files_list[:3]  # for testing
    for path in files_list:
        tokens = []
        with open(path) as f:
            raw_text = f.read()
        if corpus == 'phi5':
            text = phi5_plaintext_cleanup(raw_text)
        elif corpus == 'tlg':
            text = tlg_plaintext_cleanup(raw_text)
        chars = [chars for chars in text if chars not in [',', '.', ';', ':', '"', "'", '?', '-', '!', '*', '[', ']', '{', '}', '#', '%', '(', ')', '/', '&', '”']]
        text = ''.join(chars)
        tokens = p.word_tokenize(text.lower())
        tokens = [j.replace(word) for word in tokens]
        all_tokens += tokens
    return all_tokens

# Latin

In [7]:
all_tokens = get_corpus_tokens('phi5')
#print('Total unique tokens:', len(all_tokens))

lemmatizer = LemmaReplacer('latin')
counter = Counter(all_tokens)
mc = counter.most_common(10000)
for pair in mc:
    #print(pair[0] + '\t' + str(pair[1]))
    print(lemmatizer.lemmatize(pair[0])[0] + '\t' + str(pair[1]))

INFO:CLTK:Loading lemmata. This may take a minute.


Total unique tokens: 21870
et	552
in	534
si	320
facio	187
ubi	182
sum1	163
uto	155
cum	152
edo1	147
non	143
benus	131
neo1	130
ad	128
aut	127
qui1	126
p	121
qui1	119
qui1	116
atque	110
is	98
oportet	96
qui1	96
de	96
eo1	92
quis1	81
sic	80
vinum	80
quis1	79
modus	76
ut	71
eo1	70
sum1	68
per	67
postea	67
ito	66
ex	65
eo1	62
neque	61
volo1	61
facio	59
edo1	58
iii	57
hic	56
sum1	56
opus1	55
pro1	53
is	52
omne	52
indo	51
do	49
sero1	47
aqua	46
sum1	45
vel	45
ito	44
eo1	42
oleum	41
queo	40
hic	39
u	37
inter	36
fio	36
addo	35
iiii	33
sed	33
ego	33
dies	32
tum	31
usque	31
x	31
indo	31
terra	30
res	30
oleo1	29
vinum	29
idem	28
loco	28
primus	28
deinde	28
sub	27
sero1	27
ibi	27
bos	27
redeo	26
recingo	26
vinum	26
dum	26
reor	26
sum1	26
aqua	25
ab	25
pono	25
semen	24
tu	24
qui1	24
qui1	23
nam	23
prior	23
facio	23
arbor1	23
sum1	23
vis	23
nitor1	23
is	23
eximo	22
familia	22
ab	22
dolium	22
ego	22
s	21
idem	21
habeo	21
ager	21
terra	21
post	21
facio	21
dolium	21
olea	21
q	21
unde	21
condio	21
sterc

# Greek

In [15]:
all_tokens = get_corpus_tokens('tlg')
#print('Total unique tokens:', len(all_tokens))

lemmatizer = LemmaReplacer('greek')
counter = Counter(all_tokens)
mc = counter.most_common(10000)
for pair in mc:
    #print(pair[0] + '\t' + str(pair[1]))
    print(lemmatizer.lemmatize(pair[0])[0] + '\t' + str(pair[1]))

INFO:CLTK:Loading lemmata. This may take a minute.


καὶ	561
δὲ	307
ὁ	157
ἐσθίειν	93
ἐκ	89
τὸ	70
τὰ	63
ἐν	56
ὁ	49
ἀπέχεσθαι	48
ὁ	42
διὰ	37
τὸν	33
μὴ	33
τὴν	29
τοὺς	27
πάντα	27
ὁ	27
μετρίως	27
ἰχθύων	27
ὁ	24
λαμβάνειν	23
ὅσος	22
ὡς	22
πίνειν	21
ἁρμόζει	21
κρέας	21
τὰς	19
καρύστιος	18
ὁ	18
λαχάνων	18
οἷος	18
χράω1	18
μετὰ	18
ἀπὸ	18
ὀπώρα	17
ἄνω1	16
δι	15
στάχους	14
ἀφροδισιάζειν	13
λουτρὰ	13
γʹ	13
πάντων	12
ξηρός	12
μήτε	12
τε	12
περὶ	11
ἱστορικός	11
χλία	11
μηνὶ	10
ποιέω	10
ηʹ	10
φημί	10
οἰνόω	10
σκοπάω	10
ἠώς	10
γὰρ	10
περιστερόπουλα	10
κρέη	10
ἄστρον	10
οὗτος	9
κατὰ	9
μὲν	9
πάντας	9
χρή	9
κυριεύει	9
προλεχθέντα	9
ψαχνὰ	8
ὀπτά	8
ὑπομνήμασιν	8
τούτῳ	8
μηνός	8
ζέμα	8
πέπερι	8
λαπίνας	8
χρὴ	8
εἰς	8
δίεφθα	8
κιναμώμου	8
ἀρωμάτων	8
ὀσπρίων	8
εἰμί	7
τρυφεροσάρκους	7
περγαμηνὸς	7
οὕτως	7
σὺν	7
ὅλοξ	7
ὁ	7
δ	7
ἐλαίας	7
ζωμοὺς	7
οἶνος	7
κίχλας	7
ὅστις	7
καρυκευτά	7
ὁμοίως	7
ἢ	7
μαίνομαι	7
ἀπέχειν	6
αὐτός	6
κολιάνδρου	6
λευκοὺς	6
ὥσπερ	6
φοίνικας	6
οἰνόω	6
κοδιμέντων	6
χρίσμα	6
ὠιόν	6
πρὸς	6
καρυκεύειν	6
διαλείμματος	6
λευκὰ	6
πεπέρεως	6
κεφάλους	