In [1]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from collections import Counter
from nltk.tokenize.punkt import PunktLanguageVars

In [2]:
# Shared helpers used by the corpus scan in the next cell.
# `j` is applied to every token via j.replace(); judging by the output
# (e.g. 'ouiumque', 'siluasque') it normalizes v to u -- confirm in the CLTK docs.
j = JVReplacer()
# `p` supplies word_tokenize() for splitting each cleaned text into tokens.
p = PunktLanguageVars()
# Paths of the PHI5 author files; each one is opened and read in turn.
phi_list = assemble_phi5_author_filepaths()

In [3]:
# Scan every file in `phi_list` and collect the tokens that end in one of the
# candidate enclitics (-que, -ne, -ue). Matches are kept with duplicates so the
# later cells can count both total occurrences and unique forms.
# The three accumulators are reset here, so re-running this cell is safe.
all_que_tokens = []
all_ne_tokens = []
all_ue_tokens = []

# Characters deleted from each text before tokenizing. The table is built once,
# outside the loop; str.translate drops every character mapped to None.
punctuation_table = str.maketrans('', '', ',.;:"\'?-!*[]{}')

for file in phi_list:
    with open(file) as f:
        r = f.read()
    text = phi5_plaintext_cleanup(r)
    text = text.translate(punctuation_table)
    tokens = p.word_tokenize(text.lower())
    tokens = [j.replace(word) for word in tokens]
    que_tokens = [word for word in tokens if word.endswith('que')]
    ne_tokens = [word for word in tokens if word.endswith('ne')]
    # Every word ending in 'que' also ends in 'ue'; keep those out of the -ue
    # bucket so the two lists do not overlap.
    ue_tokens = [word for word in tokens
                 if word.endswith('ue') and not word.endswith('que')]
    all_que_tokens.extend(que_tokens)
    all_ne_tokens.extend(ne_tokens)
    all_ue_tokens.extend(ue_tokens)

## -que
27960 unique words ending in que

In [4]:
# Size of the -que candidate list: total occurrences, distinct forms, and a
# sample of the first 30 matches in corpus order.
# NOTE(review): 'unqiue' in the label is a typo for 'unique'; it is left as-is
# so the label still matches the saved output of this cell.
que_occurrences = len(all_que_tokens)
que_distinct = len(set(all_que_tokens))
print('Number words:', que_occurrences)
print('Number unqiue words:', que_distinct)
print(all_que_tokens[:30])

Number words: 182368
Number unqiue words: 27960
['namque', 'undique', 'usque', 'namque', 'namque', 'neque', 'limosoque', 'maioresque', 'denique', 'serpyllumque', 'atque', 'atque', 'haedorumque', 'ouiumque', 'atque', 'castaneasque', 'quoque', 'dardaniusque', 'quemque', 'mollique', 'atque', 'uterque', 'namque', 'bisque', 'orpheaque', 'siluasque', 'quocumque', 'namque', 'quoque', 'atque']


In [5]:
# Frequency table for the -que candidates, most frequent first.
# most_common() with no argument returns every entry, so the table cannot be
# silently truncated if the vocabulary grows past a hard-coded cap.
# `counter` and `mc` are reassigned by the -ne and -ue sections further down.
counter = Counter(all_que_tokens)
mc = counter.most_common()

In [6]:
# Print the -que frequency table, one "word<TAB>count" line per entry.
for word, count in mc:
    print(word, count, sep='\t')

atque	20349
neque	14000
quoque	12658
itaque	5090
usque	2667
denique	2216
quisque	1675
namque	1658
quinque	1476
utique	1325
quaeque	1320
idque	1180
plerumque	1140
utrumque	1116
aeque	1078
undique	1056
iamque	869
utraque	839
cumque	717
eaque	713
ideoque	708
cuique	702
plerique	699
cuiusque	697
utroque	679
utriusque	667
uterque	626
ubique	555
eoque	547
quaecumque	532
utrimque	521
utque	501
quemque	489
inque	485
quodque	461
quique	454
idemque	430
plerisque	397
utrique	383
quocumque	373
eamque	371
eumque	370
seque	353
quicumque	348
itemque	346
pleraque	333
isque	330
quodcumque	323
teque	310
perque	300
quaque	295
quacumque	294
utramque	290
meque	268
quamque	250
maximeque	245
que	244
utrisque	235
eademque	232
quandoque	231
eique	226
ubicumque	226
multaque	216
deque	215
magisque	214
eiusque	204
eodemque	201
dumque	190
ibique	189
plerosque	183
utcumque	182
tuque	182
tamque	181
eosque	178
semperque	177
marique	172
ceterisque	168
quidque	164
multoque	156
hodieque	153
omnesque	149
quibusque	148
om

## -ne
4179 unique words ending in ne

In [7]:
# Size of the -ne candidate list: total occurrences, distinct forms, and a
# sample of the first 30 matches in corpus order.
# NOTE(review): 'unqiue' in the label is a typo for 'unique'; it is left as-is
# so the label still matches the saved output of this cell.
ne_occurrences = len(all_ne_tokens)
ne_distinct = len(set(all_ne_tokens))
print('Number words:', ne_occurrences)
print('Number unqiue words:', ne_distinct)
print(all_ne_tokens[:30])

Number words: 94747
Number unqiue words: 4179
['tegmine', 'pone', 'ordine', 'nonne', 'nonne', 'ne', 'bene', 'lanugine', 'ne', 'ne', 'ne', 'uicine', 'bene', 'flumine', 'formidine', 'numine', 'desine', 'omne', 'diuine', 'gramine', 'carmine', 'harundine', 'omne', 'inane', 'omne', 'carmine', 'ne', 'harundine', 'ordine', 'ne']


In [8]:
# Frequency table for the -ne candidates, most frequent first.
# most_common() with no argument returns every entry, so the table cannot be
# silently truncated if the vocabulary grows past a hard-coded cap.
# `counter` and `mc` are shared names: this overwrites the values left by the
# previous section.
counter = Counter(all_ne_tokens)
mc = counter.most_common()

In [9]:
# Print the -ne frequency table, one "word<TAB>count" line per entry.
for word, count in mc:
    print(word, count, sep='\t')

ne	17403
sine	8675
nomine	4131
bene	3753
ratione	2391
sane	2115
omne	1959
sanguine	1405
paene	1314
condicione	1266
plane	1122
ordine	933
actione	890
oratione	887
homine	768
sermone	634
magnitudine	603
nonne	586
agmine	552
mane	531
commune	525
crimine	503
certamine	499
consuetudine	456
fine	441
contione	438
multitudine	434
regione	429
flumine	416
carmine	410
exceptione	398
semine	392
igne	390
religione	375
possessione	365
lumine	364
latine	312
appellatione	291
opinione	289
numine	284
discrimine	276
stipulatione	262
imagine	255
similitudine	254
pone	251
limine	250
impune	223
fulmine	219
inane	218
portione	218
origine	210
insigne	197
contentione	195
amne	194
scipione	191
benigne	190
obsidione	189
formidine	181
quaestione	178
cognomine	177
suspicione	175
ualetudine	171
turbine	168
cogitatione	168
domine	164
uolumine	164
statione	160
carne	158
necne	155
legione	149
occasione	148
cupidine	146
libidine	145
uirgine	143
inpune	143
significatione	137
carthagine	136
cicerone	134
altitudine	134
ad

## -ue
30404 unique words ending in ue (2444 once the 27960 ending in que are excluded)

In [10]:
# Size of the -ue candidate list (words ending in 'que' were excluded when the
# list was built): total occurrences, distinct forms, and a sample of the
# first 30 matches in corpus order.
# NOTE(review): 'unqiue' in the label is a typo for 'unique'; it is left as-is
# so the label still matches the saved output of this cell.
ue_occurrences = len(all_ue_tokens)
ue_distinct = len(set(all_ue_tokens))
print('Number words:', ue_occurrences)
print('Number unqiue words:', ue_distinct)
print(all_ue_tokens[:30])

Number words: 14352
Number unqiue words: 2444
['adsidue', 'ioue', 'suaue', 'faue', 'suaue', 'parue', 'parue', 'siue', 'siue', 'siue', 'siue', 'siue', 'adnue', 'pingue', 'neue', 'siue', 'siue', 'rapidiue', 'graue', 'dilue', 'salue', 'accliue', 'neue', 'neue', 'neue', 'neue', 'aliumue', 'praecipue', 'adsidue', 'adsidue']


In [11]:
# Frequency table for the -ue candidates, most frequent first.
# most_common() with no argument returns every entry, so the table cannot be
# silently truncated if the vocabulary grows past a hard-coded cap.
# `counter` and `mc` are shared names: this overwrites the values left by the
# previous section.
counter = Counter(all_ue_tokens)
mc = counter.most_common()

In [12]:
# Print the -ue frequency table, one "word<TAB>count" line per entry.
for word, count in mc:
    print(word, count, sep='\t')

siue	4851
praecipue	885
neue	503
ioue	422
graue	377
breue	217
leue	215
caue	209
salue	208
naue	188
adsidue	143
niue	114
quidue	100
tenue	91
assidue	79
pingue	78
quiue	74
strenue	71
uiue	69
ciue	67
abusiue	64
suaue	63
boue	59
quoue	54
quaeue	51
ue	50
ungue	47
solue	46
aue	45
aliudue	42
pluresue	42
saeue	39
tempestiue	38
minusue	37
faue	36
promiscue	35
angue	35
aliaue	33
intempestiue	32
ambigue	30
noue	30
daue	30
praue	30
haue	28
sue	28
remoue	27
quisue	26
perspicue	24
exue	23
dominiue	22
congrue	21
indue	21
conclaue	20
strue	20
alioue	20
quodue	20
gradiue	20
procliue	20
pluribusue	19
moue	19
cuiusue	19
doloue	19
adnue	17
incongrue	17
inue	17
agaue	17
oue	16
diue	16
patrue	16
quaue	15
bonorumue	15
festiue	15
dominoue	14
obrue	14
restitue	14
parue	14
usurpatiue	14
neptisue	14
ingenue	14
ripaue	13
eamue	13
dominusue	13
exigue	13
ignaue	13
aliisue	13
tresue	13
quasue	13
filiaue	12
amoue	12
exsangue	12
quosue	12
factumue	11
statue	11
legatumue	11
amboue	11
terue	11
cuiue	11
furtiue	11
decliu