In [1]:
import os
import sys

from cltk.corpus.greek.tlg.parse_tlg_indices import get_id_author
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from nltk.tokenize.punkt import PunktLanguageVars

In [2]:
p = PunktLanguageVars()

# Lexical diveristy by author

## Original text with diacritics

### Get status about corpus

In [3]:
cleaned_dir = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext_clean')
dir_contents = os.listdir(cleaned_dir)

all_tokens_list = []

for doc_count, file in enumerate(dir_contents):
    file_path = os.path.join(cleaned_dir, file)
    with open(file_path) as fo:
        text = fo.read().lower()
    text = ''.join([char for char in text if char not in ['.']])
    tokens = p.word_tokenize(text)
    all_tokens_list += tokens

print('Total author files:', doc_count)
print('Total words:', len(all_tokens_list))
all_tokens_unique = set(all_tokens_list)
print('Total unique words:', len(all_tokens_unique))

Total author files: 1822
Total words: 72057716
Total unique words: 1515193


### Get stats per author

In [7]:
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_of_author

In [4]:
map_id_author = get_id_author()

In [19]:
# Words and unique words per author
map_id_word_counts = {}
for file in dir_contents:
    map_word_counts = {}
    file_path = os.path.join(cleaned_dir, file)
    author_id = file[3:-4]
    author = map_id_author[author_id]
    with open(file_path) as fo:
        text = fo.read().lower()
    text = ''.join([char for char in text if char not in ['.']])
    tokens = p.word_tokenize(text)
    map_word_counts['name'] = author
    map_word_counts['epithet'] = get_epithet_of_author(author_id)
    map_word_counts['word_count_all'] = len(tokens)
    map_word_counts['word_count_unique'] = len(set(tokens))
    try:
        lexical_diversity = len(set(tokens)) / len(tokens)
    except ZeroDivisionError:
        lexical_diversity = 0
    map_word_counts['lexical_diversity'] = lexical_diversity
    
    map_id_word_counts[author_id] = map_word_counts
    print(author)
    print('    ', 'Total words:', len(tokens))
    print('    ', 'Total unique words:', len(set(tokens)))
    print('    ', 'Lexical diversity:', lexical_diversity)

Lepidus Hist.
     Total words: 122
     Total unique words: 98
     Lexical diversity: 0.8032786885245902
Melito Trag.
     Total words: 0
     Total unique words: 0
     Lexical diversity: 0
[Polyidus] Trag.
     Total words: 19
     Total unique words: 18
     Lexical diversity: 0.9473684210526315
Archippus Comic.
     Total words: 613
     Total unique words: 304
     Lexical diversity: 0.49592169657422513
Martyrium Potamiaenae Et Basilidis
     Total words: 381
     Total unique words: 254
     Lexical diversity: 0.6666666666666666
Acta Phileae
     Total words: 881
     Total unique words: 443
     Lexical diversity: 0.5028376844494892
Menecrates Hist.
     Total words: 200
     Total unique words: 143
     Lexical diversity: 0.715
Marinus Phil.
     Total words: 2563
     Total unique words: 880
     Lexical diversity: 0.34334763948497854
Troilus Soph.
     Total words: 3224
     Total unique words: 1259
     Lexical diversity: 0.3905086848635236
Apollinaris Theol.
     Total wo

In [20]:
from statistics import mean
from statistics import stdev

In [21]:
corpus_word_count_all = []
corpus_word_count_unique = []
corpus_word_lexical_diversity = []
for author_id, map_counts in map_id_word_counts.items():
    corpus_word_count_all.append(map_counts['word_count_all'])
    corpus_word_count_unique.append(map_counts['word_count_unique'])
    corpus_word_lexical_diversity.append(map_counts['lexical_diversity'])

print('Mean words per author:', mean(corpus_word_count_all))
print('Standard deviation of words per author:', stdev(corpus_word_count_all))

print('Mean unique words per author:', mean(corpus_word_count_unique))
print('Standard deviation of unique words per author:', stdev(corpus_word_count_unique))

print('Lexical diversity per author:', mean(corpus_word_lexical_diversity))
print('Standard deviation of lexical diversity per author:', stdev(corpus_word_lexical_diversity))

Mean words per author: 39526.99725726824
Standard deviation of words per author: 174923.28976653758
Mean unique words per author: 5435.820076796489
Standard deviation of unique words per author: 14195.290142159112
Lexical diversity per author: 0.5171187962883808
Standard deviation of lexical diversity per author: 0.2732410961564417


### Get stats about epithets

In [34]:
from collections import defaultdict
import datetime as dt
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithets

In [24]:
list_epithets = get_epithets()

In [35]:
t0 = dt.datetime.utcnow()

map_epithet_counts_all = defaultdict(list)
map_epithet_counts_unique = defaultdict(list)
map_epithet_lexical_diversity = defaultdict(list)
for file in dir_contents:
    map_word_counts = defaultdict(list)
    file_path = os.path.join(cleaned_dir, file)
    author_id = file[3:-4]
    author = map_id_author[author_id]
    with open(file_path) as fo:
        text = fo.read().lower()
    text = ''.join([char for char in text if char not in ['.']])
    tokens = p.word_tokenize(text)
    try:
        lexical_diversity = len(set(tokens)) / len(tokens)
    except ZeroDivisionError:
        lexical_diversity = 0
    epithet = get_epithet_of_author(author_id)

    map_epithet_counts_all[epithet].append(len(tokens))
    map_epithet_counts_unique[epithet].append(len(set(tokens)))
    map_epithet_lexical_diversity[epithet].append(lexical_diversity)

print('... finished in {}'.format(dt.datetime.utcnow() - t0))

... finished in 0:03:52.213404


In [42]:
from statistics import StatisticsError

for epithet, counts in map_epithet_counts_all.items():
    print(epithet)
    print('    Mean of word counts per author:', mean(counts))
    try:
        wc_standard_deviation = stdev(counts)
    except StatisticsError:
        wc_standard_deviation = 0
    print('    Standard deviation of word counts per author:', wc_standard_deviation)
    
    uniques_list = map_epithet_counts_unique[epithet]
    print('    Mean of unique word counts per author:', mean(uniques_list))
    try:
        uniques_standard_deviation = stdev(uniques_list)
    except StatisticsError:
        uniques_standard_deviation = 0
    print('    Standard deviation of unique word counts per author:', uniques_standard_deviation)

    lexical_diversity_list = map_epithet_lexical_diversity[epithet]
    print('    Mean of lexical diversity per author:', mean(lexical_diversity_list))
    try:
        ld_standard_deviation = stdev(lexical_diversity_list)
    except StatisticsError:
        ld_standard_deviation = 0
    print('    Standard deviation of unique word counts per author:', ld_standard_deviation)

Onirocritici
    Mean of word counts per author: 38698.5
    Standard deviation of word counts per author: 36553.88505343857
    Mean of unique word counts per author: 7283.5
    Standard deviation of unique word counts per author: 8166.3762159234375
    Mean of lexical diversity per author: 0.1598640506435638
    Standard deviation of unique word counts per author: 0.060021036591121714
Geometri
    Mean of word counts per author: 113808
    Standard deviation of word counts per author: 97741.43902153273
    Mean of unique word counts per author: 4100.75
    Standard deviation of unique word counts per author: 1907.860472711077
    Mean of lexical diversity per author: 0.046396734394107485
    Standard deviation of unique word counts per author: 0.017133951688166348
Poetae
    Mean of word counts per author: 8304.2
    Standard deviation of word counts per author: 23315.700245949603
    Mean of unique word counts per author: 2486.65
    Standard deviation of unique word counts per auth

## Lemmatized text

# Lexical diversity by genre