In [1]:
import datetime as dt
import os
import sys

from cltk.corpus.greek.tlg.parse_tlg_indices import get_id_author
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from nltk.tokenize.punkt import PunktLanguageVars
import pandas

In [2]:
p = PunktLanguageVars()

# Lexical diveristy by author

## Original text with diacritics

### Get status about corpus

In [3]:
t0 = dt.datetime.utcnow()

cleaned_dir = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext_clean')
dir_contents = os.listdir(cleaned_dir)

corpus_stats = {}

all_tokens_list = []
for doc_count, file in enumerate(dir_contents):
    file_path = os.path.join(cleaned_dir, file)
    with open(file_path) as fo:
        text = fo.read().lower()
    text = ''.join([char for char in text if char not in ['.']])
    tokens = p.word_tokenize(text)
    all_tokens_list += tokens

print('Total author files:', doc_count)
print('Total words:', len(all_tokens_list))
all_tokens_unique = set(all_tokens_list)
print('Total unique words:', len(all_tokens_unique))

corpus_stats = {'doc_count': doc_count, 
               'total_words': len(all_tokens_list),
               'total_unique_words': len(all_tokens_unique)}

print('... finished in {}'.format(dt.datetime.utcnow() - t0))

Total author files: 1822
Total words: 72057716
Total unique words: 1515193
... finished in 0:03:42.094842


In [4]:
df_corpus = pandas.DataFrame(corpus_stats, index=[0])
print(df_corpus)

   doc_count  total_unique_words  total_words
0       1822             1515193     72057716


### Get stats per author

In [5]:
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_of_author

In [6]:
map_id_author = get_id_author()

In [7]:
t0 = dt.datetime.utcnow()

# Words and unique words per author
map_id_word_counts = {}
for file in dir_contents:
    map_word_counts = {}
    file_path = os.path.join(cleaned_dir, file)
    author_id = file[3:-4]
    author = map_id_author[author_id]
    with open(file_path) as fo:
        text = fo.read().lower()
    text = ''.join([char for char in text if char not in ['.']])
    tokens = p.word_tokenize(text)
    map_word_counts['name'] = author
    map_word_counts['epithet'] = get_epithet_of_author(author_id)
    map_word_counts['word_count_all'] = len(tokens)
    map_word_counts['word_count_unique'] = len(set(tokens))
    try:
        lexical_diversity = len(set(tokens)) / len(tokens)
    except ZeroDivisionError:
        lexical_diversity = 0
    map_word_counts['lexical_diversity'] = lexical_diversity
    
    map_id_word_counts[author_id] = map_word_counts
#     print(author)
#     print('    ', 'Total words:', len(tokens))
#     print('    ', 'Total unique words:', len(set(tokens)))
#     print('    ', 'Lexical diversity:', lexical_diversity)

print('... finished in {}'.format(dt.datetime.utcnow() - t0))

... finished in 0:03:46.526512


In [8]:
df_text_counts = pandas.DataFrame(map_id_word_counts).T

In [9]:
df_text_counts

Unnamed: 0,epithet,lexical_diversity,name,word_count_all,word_count_unique
0001,Epici/-ae,0.34366,Apollonius Rhodius Epic.,39155,13456
0002,Elegiaci,0.398142,Theognis Eleg.,9798,3901
0003,Historici/-ae,0.150405,Thucydides Hist.,150427,22625
0004,Biographi,0.222551,Diogenes Laertius Biogr.,110977,24698
0005,Bucolici,0.439063,Theocritus Bucol.,21719,9536
0006,Tragici,0.19995,Euripides Trag.,184076,36806
0007,Philosophici/-ae,0.110099,Plutarchus Biogr. et Phil.,1034650,113914
0008,Sophistae,0.151358,Athenaeus Soph.,394588,59724
0009,Lyrici/-ae,0.674388,Sappho Lyr.,3756,2533
0010,Oratores,0.139739,Isocrates Orat.,120603,16853


In [10]:
df_text_counts.to_csv(os.path.expanduser('~/cltk_data/user_data/stats_text_counts.csv'))

### Stats by author

In [11]:
from statistics import mean
from statistics import stdev

In [12]:
author_stats = {}
corpus_word_count_all = []
corpus_word_count_unique = []
corpus_word_lexical_diversity = []
for author_id, map_counts in map_id_word_counts.items():
    corpus_word_count_all.append(map_counts['word_count_all'])
    corpus_word_count_unique.append(map_counts['word_count_unique'])
    corpus_word_lexical_diversity.append(map_counts['lexical_diversity'])

author_stats['mean_words_per_author'] = mean(corpus_word_count_all)
author_stats['standard_deviation_of_words_per_author:'] = stdev(corpus_word_count_all)
author_stats['mean_unique_words_per_author'] = mean(corpus_word_count_unique)
author_stats['standard_deviation_of_unique_words_per_author'] = stdev(corpus_word_count_unique)
author_stats['lexical_diversity_per_author'] = mean(corpus_word_lexical_diversity)
author_stats['standard_deviation_of_lexical_diversity_per_author:'] = stdev(corpus_word_lexical_diversity)

print('Mean words per author:', mean(corpus_word_count_all))
print('Standard deviation of words per author:', stdev(corpus_word_count_all))

print('Mean unique words per author:', mean(corpus_word_count_unique))
print('Standard deviation of unique words per author:', stdev(corpus_word_count_unique))

print('Lexical diversity per author:', mean(corpus_word_lexical_diversity))
print('Standard deviation of lexical diversity per author:', stdev(corpus_word_lexical_diversity))

Mean words per author: 39526.99725726824
Standard deviation of words per author: 174923.28976653758
Mean unique words per author: 5435.820076796489
Standard deviation of unique words per author: 14195.290142159112
Lexical diversity per author: 0.5171187962883808
Standard deviation of lexical diversity per author: 0.2732410961564417


In [13]:
df_authors = pandas.DataFrame(author_stats, index=[0])
df_authors

Unnamed: 0,lexical_diversity_per_author,mean_unique_words_per_author,mean_words_per_author,standard_deviation_of_lexical_diversity_per_author:,standard_deviation_of_unique_words_per_author,standard_deviation_of_words_per_author:
0,0.517119,5435.820077,39526.997257,0.273241,14195.290142,174923.289767


In [14]:
df_authors.to_csv(os.path.expanduser('~/cltk_data/user_data/stats_authors.csv'))

### Get stats about epithets

In [15]:
from collections import defaultdict
import datetime as dt
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithets

In [16]:
list_epithets = get_epithets()

In [17]:
t0 = dt.datetime.utcnow()

map_epithet_counts_all = defaultdict(list)
map_epithet_counts_unique = defaultdict(list)
map_epithet_lexical_diversity = defaultdict(list)
for file in dir_contents:
    map_word_counts = defaultdict(list)
    file_path = os.path.join(cleaned_dir, file)
    author_id = file[3:-4]
    author = map_id_author[author_id]
    with open(file_path) as fo:
        text = fo.read().lower()
    text = ''.join([char for char in text if char not in ['.']])
    tokens = p.word_tokenize(text)
    try:
        lexical_diversity = len(set(tokens)) / len(tokens)
    except ZeroDivisionError:
        lexical_diversity = 0
    epithet = get_epithet_of_author(author_id)

    map_epithet_counts_all[epithet].append(len(tokens))
    map_epithet_counts_unique[epithet].append(len(set(tokens)))
    map_epithet_lexical_diversity[epithet].append(lexical_diversity)

print('... finished in {}'.format(dt.datetime.utcnow() - t0))

... finished in 0:03:47.450311


In [27]:
from statistics import StatisticsError

epithet_lexical_diversity_tuples = []
epithet_scores = {}
for epithet, counts in map_epithet_counts_all.items():
    print(epithet)
    print('    Mean of word counts per author:', mean(counts))
    try:
        wc_standard_deviation = stdev(counts)
    except StatisticsError:
        wc_standard_deviation = 0
    print('    Standard deviation of word counts per author:', wc_standard_deviation)
    
    uniques_list = map_epithet_counts_unique[epithet]
    print('    Mean of unique word counts per author:', mean(uniques_list))
    try:
        uniques_standard_deviation = stdev(uniques_list)
    except StatisticsError:
        uniques_standard_deviation = 0
    print('    Standard deviation of unique word counts per author:', uniques_standard_deviation)

    lexical_diversity_list = map_epithet_lexical_diversity[epithet]
    print('    Mean of lexical diversity per author:', mean(lexical_diversity_list))
    try:
        ld_standard_deviation = stdev(lexical_diversity_list)
    except StatisticsError:
        ld_standard_deviation = 0
    print('    Standard deviation of unique word counts per author:', ld_standard_deviation)
    
    epithet_lexical_diversity_tuples.append((epithet, mean(lexical_diversity_list)))

    tmp_scores = {}
    tmp_scores['mean_of_word_counts_ per_author'] = mean(counts)
    tmp_scores['standard_deviation_of_word_counts_per_author'] = wc_standard_deviation
    tmp_scores['mean_of_unique_word_counts_per_author'] = mean(uniques_list)
    tmp_scores['standard_deviation_of_unique_word_counts_per_author'] = uniques_standard_deviation
    tmp_scores['mean_of_lexical_diversity_per_author'] = mean(lexical_diversity_list)
    epithet_scores['standard_deviation_of_unique_word_counts_per_author'] = ld_standard_deviation
    epithet_scores[epithet] = tmp_scores

Theologici
    Mean of word counts per author: 364684.3333333333
    Standard deviation of word counts per author: 516832.31839165394
    Mean of unique word counts per author: 34570.433333333334
    Standard deviation of unique word counts per author: 35694.68958347893
    Mean of lexical diversity per author: 0.20601897214488737
    Standard deviation of unique word counts per author: 0.1272290014365555
Epistolographi
    Mean of word counts per author: 188624.9
    Standard deviation of word counts per author: 277744.39993212384
    Mean of unique word counts per author: 26995.3
    Standard deviation of unique word counts per author: 34568.82496653114
    Mean of lexical diversity per author: 0.25877639639342465
    Standard deviation of unique word counts per author: 0.14630914252355742
Periegetae
    Mean of word counts per author: 28730
    Standard deviation of word counts per author: 71070.64200673862
    Mean of unique word counts per author: 5329.444444444444
    Standard de

In [28]:
# sort epithets by lexical diversity
sorted(epithet_lexical_diversity_tuples, key=lambda x: x[1], reverse=True)

[('Choliambographi', 1.0),
 ('Epigrammatici/-ae', 0.8765955926554143),
 ('Elegiaci', 0.7915192092968645),
 ('Iambici', 0.7662045014476172),
 ('Lyrici/-ae', 0.7522432836596241),
 ('Poetae Didactici', 0.7354838709677419),
 ('Poetae Medici', 0.7121893719806763),
 ('Gnomici', 0.6288932022148066),
 ('Nomographi', 0.6267255659856433),
 ('Poetae', 0.6261052550051428),
 ('Mimographi', 0.6253901209264193),
 ('Tragici', 0.6186934024311059),
 ('Gnostici', 0.614900854930397),
 ('Poetae Philosophi', 0.6119466967099234),
 ('Parodii', 0.6014235088742101),
 ('Historici/-ae', 0.59413542667286),
 ('Epici/-ae', 0.5843152290326596),
 ('Paradoxographi', 0.5526884575900334),
 ('Bucolici', 0.5318337561964496),
 (None, 0.5262500789551204),
 ('Comici', 0.5163583294667708),
 ('Alchemistae', 0.4965830671471897),
 ('Philosophici/-ae', 0.4691200839376471),
 ('Geographi', 0.46230688215853827),
 ('Lexicographi', 0.45092513391608313),
 ('Grammatici', 0.44774563558440766),
 ('Apologetici', 0.4362444959064383),
 ('Rhet

In [29]:
pandas.DataFrame(epithet_lexical_diversity_tuples)

Unnamed: 0,0,1
0,Theologici,0.206019
1,Epistolographi,0.258776
2,Periegetae,0.423661
3,Philosophici/-ae,0.46912
4,Philologi,0.348436
5,Paroemiographi,0.36521
6,Historici/-ae,0.594135
7,Mythographi,0.415507
8,Tragici,0.618693
9,Doxographi,0.243327


In [33]:
df_epithet_scores = pandas.DataFrame(epithet_scores).T
df_epithet_scores

Unnamed: 0,mean_of_lexical_diversity_per_author,mean_of_unique_word_counts_per_author,mean_of_word_counts_ per_author,standard_deviation_of_unique_word_counts_per_author,standard_deviation_of_word_counts_per_author
Theologici,0.206019,34570.433333,364684.333333,35694.689583,516832.318392
Epistolographi,0.258776,26995.3,188624.9,34568.824967,277744.399932
Periegetae,0.423661,5329.444444,28730.0,10078.459953,71070.642007
Philosophici/-ae,0.46912,5651.167382,55756.905579,13429.395203,189062.969154
Mechanici,0.310574,3330.5,24746.375,4597.420333,54659.85651
Paroemiographi,0.36521,8053.0,22930.6,5665.462735,17919.046244
Historici/-ae,0.594135,3573.857567,19748.626113,9097.030604,67484.697427
Mythographi,0.415507,3290.5,9229.666667,2782.491527,9453.68685
Epigrammatici/-ae,0.876596,157.363636,200.090909,252.052252,350.622563
Tragici,0.618693,1114.670588,4161.788235,5230.739016,22768.237306


In [34]:
df_epithet_scores.to_csv(os.path.expanduser('~/cltk_data/user_data/stats_epithet.csv'))