In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', 50)
import os
import requests
import re
from nltk import FreqDist
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from cltk.stops.grc import STOPS as stopwords
from gensim.corpora import Dictionary
import unicodedata
import json

In [2]:
# find local version of the input dataset
local_paths = !find ~/Projects -name "LAGT_v3-0.parquet"
print(local_paths)

['/Users/vojtechkase/Projects/LAGT/data/large_files/LAGT_v3-0.parquet']


In [3]:
# load LAGT (v2.0) dataset locally or download it directly from Zenodo
try:
    LAGT = pd.read_parquet(local_paths[0])
except:
    pass
    #resp = requests.get("https://zenodo.org/record/7221150/files/LAGT_v2-0.json?download=1")
    # save it for next time
    #LAGT = pd.DataFrame(resp.json())
    #os.mkdir("../data/large_data/")
    #LAGT.to_json("../data/large_data/LIRE_v2-0.json")

# Short demonstration of the LAGT dataset...

In [4]:
# download and display first 5 rows of the dataset
#LAGT = pd.read_json("https://zenodo.org/record/7221150/files/LAGT_v2-0.json?download=1")
LAGT.head(5)

Unnamed: 0,author_id,doc_id,filename,author,title,string,wordcount,source,lemmatized_sentences,lemmata_source,tlg_date,not_before,not_after,date_uncertain,tlg_epithet,provenience,lemmatacount
9,tlg0006,tlg0006.tlg020,tlg0006.tlg020.1st1K-grc1.xml,Euripides,Fragmenta,ποίαν σε φῶμεν γαῖαν ἐκλελοιπότα πόλει ξενοῦσθ...,17708,1Kgr,"[[φημί, γῆ, ἐκλείπω, πόλις, ξενοῦσθαι], [πάτρα...",grecy,5 B.C.,-500.0,-401.0,False,[Tragici],pagan,10277
10,tlg0007,tlg0007.tlg146,tlg0007.tlg146.1st1K-grc1.xml,Plutarch,Παροιμίαι αἷς Ἀλεξανδρεῖς ἐχρῶντο,Οἴκοι τὰ Μιλήσια: ἐπὶ τῶν ὅποι μὴ προςήκει τὴν...,2685,1Kgr,"[[Μιλήσιος], [προςήκω, τρυφή, ἐπιδείκνυμι], [Ἀ...",grecy,A.D. 1-2,1.0,200.0,False,"[Biographi, Philosophici/-ae]",pagan,1488
11,tlg0007,tlg0007.tlg147,tlg0007.tlg147.1st1K-grc1.xml,Plutarch,Ἐκλογὴ περὶ τῶν ἀδυνάτων,Κατὰ πετρῶν σπείρεις. Πλίνθον πλύνεις. Δικτύῳ ...,143,1Kgr,"[[πέτρα, σπείρω], [Πλίνθος, πλύνω, Δίκτυον, ἄν...",grecy,A.D. 1-2,1.0,200.0,False,"[Biographi, Philosophici/-ae]",pagan,125
12,tlg0015,tlg0015.tlg001,tlg0015.tlg001.1st1K-grc1.xml,Herodian,Ab excessu divi Marci,\nΟἱ πλεῖστοι τῶν περὶ συγκομιδὴν ἱστορίας ἀσχ...,46751,1Kgr,"[[πολύς, συγκομιδή, ἱστορία, ἀσχολέω, ἔργον, γ...",glaux,A.D. 2-3,101.0,300.0,False,[Historici/-ae],pagan,25832
14,tlg0018,tlg0018.tlg001,tlg0018.tlg001.1st1K-grc1.xml,Philo Judaeus,De opificio mundi,Τῶν ἄλλων νομοθετῶν οἱ μὲν ἀκαλλώπιστα καὶ γυ...,24591,1Kgr,"[[ἄλλος, νομοθέτης, ἀκαλλώπιστος, γυμνάζω, νομ...",grecy,1 B.C.-A.D. 1,-100.0,100.0,False,[Philosophici/-ae],jewish,9589


In [5]:
len(LAGT)

1630

In [6]:
LAGT["author_id"].nunique()

312

In [7]:

LAGT["wordcount"].sum()

32236367

In [8]:
LAGT[LAGT["author_id"].str.startswith("tlg0031")]

Unnamed: 0,author_id,doc_id,filename,author,title,string,wordcount,source,lemmatized_sentences,lemmata_source,tlg_date,not_before,not_after,date_uncertain,tlg_epithet,provenience,lemmatacount
1525,tlg0031a,tlg0031.tlg001,tlg0031.tlg001.perseus-grc2.xml,,New Testament - Matthew,ΒΙΒΛΟΣ γενέσεως Ἰησοῦ Χριστοῦ υἱοῦ Δαυεὶδ υἱ...,18288,perseus,"[[βίβλος, γένεσις, Ἰησοῦς, Χριστός, υἱός, Δαυί...",morphgnt,A.D. 1,1.0,100.0,False,[],christian,8699
1526,tlg0031b,tlg0031.tlg002,tlg0031.tlg002.perseus-grc2.xml,,New Testament - Mark,\n\n ΑΡΧΗ τοῦ εὐαγγελίου Ἰη...,11275,perseus,"[[ἀρχή, εὐαγγέλιον, Ἰησοῦς, Χριστός], [γράφω, ...",morphgnt,A.D. 1,1.0,100.0,False,[],christian,5308
1527,tlg0031luke,tlg0031.tlg003,tlg0031.tlg003.perseus-grc2.xml,Luke (the evangelist),New Testament - Luke,\n\n ΕΠΕΙΔΗΠΕΡ ΠΟΛΛΟΙ ἐπεχε...,19459,perseus,"[[πολύς, ἐπιχειρέω, ἀνατάσσομαι, διήγησις, πλη...",morphgnt,A.D. 1,1.0,100.0,False,[],christian,9194
1528,tlg0031john,tlg0031.tlg004,tlg0031.tlg004.perseus-grc2.xml,Johnannine literature,New Testament - John,"\n\n ΕΝ ΑΡΧΗ ἦν ὁ λόγος, κα...",15592,perseus,"[[ἀρχή, εἰμί, λόγος, λόγος, εἰμί, θεός, θεός, ...",morphgnt,A.D. 1,1.0,100.0,False,[],christian,6892
1529,tlg0031luke,tlg0031.tlg005,tlg0031.tlg005.perseus-grc2.xml,Luke (the evangelist),New Testament - Acts,\n τὸν μὲν πρῶτον λόγον ἐποιησάμην περὶ πάντων...,18405,perseus,"[[πρῶτος, λόγος, ποιέω, πᾶς, Θεόφιλος, ἄρχω, Ἰ...",morphgnt,A.D. 1,1.0,100.0,False,[],christian,9141
1530,tlg0031paul,tlg0031.tlg006,tlg0031.tlg006.perseus-grc2.xml,Paul of Tarsus,New Testament - Romans,\n\n ΠΑΥΛΟΣ δοῦλος Ἰησοῦ Χρ...,7107,perseus,"[[Παῦλος, δοῦλος, Χριστός, Ἰησοῦς, κλητός, ἀπό...",morphgnt,A.D. 1,1.0,100.0,False,[],christian,3265
1531,tlg0031paul,tlg0031.tlg007,tlg0031.tlg007.perseus-grc2.xml,Paul of Tarsus,New Testament - 1 Corinthians,\n\n ΠΑΥΛΟΣ κλητὸς ἀπόστολο...,6814,perseus,"[[Παῦλος, κλητός, ἀπόστολος, Χριστός, Ἰησοῦς, ...",morphgnt,A.D. 1,1.0,100.0,False,[],christian,3292
1532,tlg0031paul,tlg0031.tlg008,tlg0031.tlg008.perseus-grc2.xml,Paul of Tarsus,New Testament - 2 Corinthians,\n\n ΠΑΥΛΟΣ ἀπόστολος Χριστ...,4470,perseus,"[[Παῦλος, ἀπόστολος, Χριστός, Ἰησοῦς, θέλημα, ...",morphgnt,A.D. 1,1.0,100.0,False,[],christian,1925
1533,tlg0031paul,tlg0031.tlg009,tlg0031.tlg009.perseus-grc2.xml,Paul of Tarsus,New Testament - Galatians,"\n\n ΠΑΥΛΟΣ ἀπόστολος, οὐκ ...",2235,perseus,"[[Παῦλος, ἀπόστολος, ἄνθρωπος, ἄνθρωπος, Ἰησοῦ...",morphgnt,A.D. 1,1.0,100.0,False,[],christian,1058
1534,tlg0031pspa,tlg0031.tlg010,tlg0031.tlg010.perseus-grc2.xml,Pauline literature,New Testament - Ephesians,\n\n ΠΑΥΛΟΣ ἀπόστολος Χριστ...,2421,perseus,"[[Παῦλος, ἀπόστολος, Χριστός, Ἰησοῦς, θέλημα, ...",morphgnt,A.D. 1,1.0,100.0,False,[],christian,1126


In [9]:
# extract a list of all lemmatized words from all lemmatized sentences
wordlist = [w for sent in [sent for work in LAGT["lemmatized_sentences"] for sent in work] for w in sent]
# count all instances of the word θεός
wordlist.count("θεός")

60001

In [10]:
# filter for texts from the fifth and fourth c. BCE
len(LAGT[(LAGT["not_before"].between(-500,-301)) | (LAGT["not_before"].between(-500,-301))])

438

In [11]:
# index for aristotle's nicomachean ethics
i = LAGT[LAGT["doc_id"]=="tlg0086.tlg010"].index[0]
LAGT.loc[i]

author_id                                                         tlg0086
doc_id                                                     tlg0086.tlg010
filename                                  tlg0086.tlg010.perseus-grc1.xml
author                                                          Aristotle
title                                                  Nicomachean Ethics
string                  πᾶσα τέχνη καὶ πᾶσα μέθοδος, ὁμοίως δὲ πρᾶξίς ...
wordcount                                                           56630
source                                                            perseus
lemmatized_sentences    [[πᾶς, τέχνη, πᾶς, μέθοδος, πρᾶξις, προαίρεσις...
lemmata_source                                                      glaux
tlg_date                                                           4 B.C.
not_before                                                         -400.0
not_after                                                          -301.0
date_uncertain                        

# Explore overall vocabulary

In [12]:
sents = [sent for work in LAGT["lemmatized_sentences"] for sent in work]

In [13]:
len(sents)

2135872

In [14]:
min_freq = 10

def get_vocab(docs, min_freq=min_freq):
    words_flat = [item for sublist in docs for item in sublist]
    word_freq_tups = FreqDist(words_flat).most_common()
    vocabulary = [tup[0] for tup in word_freq_tups if tup[1] >= min_freq]
    #vocab_freqs = [len([doc for doc in docs if word in doc]) for word in vocabulary]
    return word_freq_tups, words_flat, vocabulary

In [15]:
word_freqs, words, vocabulary = get_vocab(sents)

In [16]:
pd.DataFrame(word_freqs[:20], columns=["lemma", "count"])

Unnamed: 0,lemma,count
0,οὗτος,329519
1,λέγω,239774
2,εἰμί,228051
3,αὐτός,213843
4,γίγνομαι,157432
5,ἔχω,146904
6,πολύς,121203
7,πᾶς,104573
8,φημί,99973
9,ἄλλος,96950


# Generate ngrams

In [17]:
doc_ids = LAGT["doc_id"].tolist()
len(doc_ids)

1630

In [18]:
#!mkdir ../data/large_data

In [19]:
line = 0
ids_lines = {}

f = open("../data/large_data/corpus_ngrams_bydocid_wide.txt", "w", encoding="utf-8")
for doc_id in doc_ids:
    lagt_subset = LAGT[LAGT["doc_id"]==doc_id]
    sents = [sen for work in lagt_subset["lemmatized_sentences"] for sen in work]
    sents_trigrams = [list(el) for sublist in [[ng for ng in nltk.trigrams(sent)] for sent in sents] for el in sublist]
    sents_fivegrams = [list(el) for sublist in [[ng for ng in nltk.ngrams(sent, n=5)] for sent in sents] for el in sublist]
    ngrams_data = sents + sents_fivegrams # + sents_trigrams
    ngrams_data = [" ".join(ngram) for ngram in ngrams_data]
    f.writelines("\n".join(ngrams_data)+"\n")
    if bool(ngrams_data):
        ids_lines[doc_id] = (line, line+len(ngrams_data))
        line += len(ngrams_data)
    else:
        ids_lines[doc_id] = (line, line+1)
        line += 1

In [20]:
pickle.dump(ids_lines, open("../data/ids_lines_wide.pickle", "wb"))

# Ngram example

In [21]:
# index for aristotle's nicomachean ethics
i = LAGT[LAGT["doc_id"]=="tlg0086.tlg010"].index[0]

In [22]:
i

'1736'

In [23]:
LAGT.loc[i]["string"].split("·")[0]

'πᾶσα τέχνη καὶ πᾶσα μέθοδος, ὁμοίως δὲ πρᾶξίς τε καὶ προαίρεσις, ἀγαθοῦ τινὸς ἐφίεσθαι δοκεῖ'

In [24]:
sent = LAGT.loc[i]["lemmatized_sentences"][0]
print(sent)

['πᾶς' 'τέχνη' 'πᾶς' 'μέθοδος' 'πρᾶξις' 'προαίρεσις' 'ἀγαθός' 'τις'
 'ἐφίημι' 'δοκέω']


In [38]:
trigrams = [list(ng) for ng in nltk.trigrams(sent)]
fivegrams = [list(ng) for ng in nltk.ngrams(sent, n=5)]
print([sent] + trigrams + fivegrams)

[array(['πᾶς', 'τέχνη', 'πᾶς', 'μέθοδος', 'πρᾶξις', 'προαίρεσις', 'ἀγαθός',
       'τις', 'ἐφίημι', 'δοκέω'], dtype=object), ['πᾶς', 'τέχνη', 'πᾶς'], ['τέχνη', 'πᾶς', 'μέθοδος'], ['πᾶς', 'μέθοδος', 'πρᾶξις'], ['μέθοδος', 'πρᾶξις', 'προαίρεσις'], ['πρᾶξις', 'προαίρεσις', 'ἀγαθός'], ['προαίρεσις', 'ἀγαθός', 'τις'], ['ἀγαθός', 'τις', 'ἐφίημι'], ['τις', 'ἐφίημι', 'δοκέω'], ['πᾶς', 'τέχνη', 'πᾶς', 'μέθοδος', 'πρᾶξις'], ['τέχνη', 'πᾶς', 'μέθοδος', 'πρᾶξις', 'προαίρεσις'], ['πᾶς', 'μέθοδος', 'πρᾶξις', 'προαίρεσις', 'ἀγαθός'], ['μέθοδος', 'πρᾶξις', 'προαίρεσις', 'ἀγαθός', 'τις'], ['πρᾶξις', 'προαίρεσις', 'ἀγαθός', 'τις', 'ἐφίημι'], ['προαίρεσις', 'ἀγαθός', 'τις', 'ἐφίημι', 'δοκέω']]


# Exploring subcorpora

In [70]:
periods = {
    "archaic" : (-800, -501),
    "classical" : (-500,-301),
    "hellenistic" : (-300,-1),
    "roman_peak" : (1, 200),
    "roman_late" : (201, 400)
}   

def get_periods(row):
    not_before = row["not_before"]
    not_after = row["not_after"]
    periods_covered = []
    try:
        for period_key in periods.keys():
            period = periods[period_key]
            if period[1] >= not_before and not_after >= period[0]:
                periods_covered.append(period_key)
    except:
        pass
    return periods_covered

In [71]:
LAGT["periods"] = LAGT.apply(get_periods, axis=1)

In [72]:
subcorpora_overview = []
for period_key in periods.keys():
    LAGT_subset = LAGT[LAGT["periods"].apply(lambda x: period_key in x)]
    subcorpora_overview.append(
        {"period" : period_key,
         "works_n" : len(LAGT_subset),
         "words_n" : LAGT_subset["wordcount"].sum()
         }
    )
        

In [73]:
subcorpora_overview_df = pd.DataFrame(subcorpora_overview)
subcorpora_overview_df

Unnamed: 0,period,works_n,words_n
0,archaic,28,335750
1,classical,452,4132361
2,hellenistic,232,4163918
3,roman_peak,727,14199029
4,roman_late,305,9214476


In [74]:
provenience_overview = LAGT.groupby("provenience").agg(
    works_n=('provenience', 'size'),
    words_n=('wordcount', 'sum')
)
provenience_overview

Unnamed: 0_level_0,works_n,words_n
provenience,Unnamed: 1_level_1,Unnamed: 2_level_1
christian,198,7568232
jewish,92,2103563
pagan,1278,20285785


### Overview by century

In [75]:
centuries = [(str(int(n / -100)) + "BC",  n, n+ 99) for n in range(-800, 0, 100)] + [(str(int(n / 100)) + "CE",  n - 99, n) for n in range(100, 700, 100)]
centuries

[('8BC', -800, -701),
 ('7BC', -700, -601),
 ('6BC', -600, -501),
 ('5BC', -500, -401),
 ('4BC', -400, -301),
 ('3BC', -300, -201),
 ('2BC', -200, -101),
 ('1BC', -100, -1),
 ('1CE', 1, 100),
 ('2CE', 101, 200),
 ('3CE', 201, 300),
 ('4CE', 301, 400),
 ('5CE', 401, 500),
 ('6CE', 501, 600)]

In [76]:
def get_sents(row):
    not_before = row["not_before"]
    not_after = row["not_after"]
    centuries_covered = []
    try:
        for cent in centuries:
            if cent[2] >= not_before and not_after >= cent[2]:
                centuries_covered.append(cent[0])
    except:
        pass
    return centuries_covered

In [77]:
LAGT["cents"] = LAGT.apply(get_sents, axis=1)

In [78]:
LAGT_subset = LAGT[LAGT["cents"].apply(lambda x: "8BC" in x)]
LAGT_subset

Unnamed: 0,author_id,doc_id,filename,author,title,string,wordcount,source,lemmatized_sentences,lemmata_source,tlg_date,not_before,not_after,date_uncertain,tlg_epithet,provenience,lemmatacount,periods,cents
1375,tlg0012,tlg0012.tlg001,tlg0012.tlg001.perseus-grc2.xml,Homer,Ἰλιάς,"\nμῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος οὐλομένην,...",111895,perseus,"[[μῆνις, ἀείδω, θεά, Πηληιάδης, Ἀχιλλεύς, οὐλό...",agdt,8 B.C.,-800.0,-701.0,False,[Epici/-ae],pagan,66484,[archaic],[8BC]
1376,tlg0012,tlg0012.tlg002,tlg0012.tlg002.perseus-grc2.xml,Homer,Ὀδύσσεια,"ἄνδρα μοι ἔννεπε, μοῦσα, πολύτροπον, ὃς μάλα π...",87177,perseus,"[[ἀνήρ, ἐνέπω, Μοῦσα, πολύτροπος, πλάζω, Τροία...",agdt,8 B.C.,-800.0,-701.0,False,[Epici/-ae],pagan,50049,[archaic],[8BC]
1498,tlg0020,tlg0020.tlg001,tlg0020.tlg001.perseus-grc2.xml,Hesiod,Theogony (Greek). Machine readable text,"\nΜουσάων Ἑλικωνιάδων ἀρχώμεθʼ ἀείδειν, αἵθʼ Ἑ...",7039,perseus,"[[Μοῦσα, Ἑλικωνιάδες, ἄρχω, ἀείδω, ʽἑλικών, ἔχ...",agdt,8/7 B.C.?,-800.0,-601.0,True,[Epici/-ae],pagan,4581,[archaic],"[8BC, 7BC]"
1499,tlg0020,tlg0020.tlg002,tlg0020.tlg002.perseus-grc2.xml,Hesiod,Works and Days (Greek). Machine readable text,"\nμοῦσαι Πιερίηθεν ἀοιδῇσιν κλείουσαι δεῦτε, Δ...",5856,perseus,"[[Μοῦσα, ἀοιδή, κλείω, Ζεύς, ἐνέπω, σφέτερος, ...",agdt,8/7 B.C.?,-800.0,-601.0,True,[Epici/-ae],pagan,3611,[archaic],"[8BC, 7BC]"
1500,tlg0020,tlg0020.tlg003,tlg0020.tlg003.perseus-grc2.xml,Hesiod,Shield of Heracles (Greek). Machine readable text,\nἢ οἵη προλιποῦσα δόμους καὶ πατρίδα γαῖαν ἤλ...,3298,perseus,"[[οἷος, προλείπω, δόμος, πατρίς, γαῖα, ἔρχομαι...",agdt,8/7 B.C.?,-800.0,-601.0,True,[Epici/-ae],pagan,2081,[archaic],"[8BC, 7BC]"


In [79]:
centuries_overview = []
for cent in centuries:
    LAGT_subset = LAGT[LAGT["cents"].apply(lambda x: cent[0] in x)]
    centuries_overview.append(
        {"period" : cent[0],
         "works_n" : len(LAGT_subset),
         "words_n" : LAGT_subset["wordcount"].sum()
         }
    )
centuries_overview_df = pd.DataFrame(centuries_overview)
centuries_overview_df

Unnamed: 0,period,works_n,words_n
0,8BC,5,215265
1,7BC,6,25509
2,6BC,23,120485
3,5BC,266,2277846
4,4BC,391,3442762
5,3BC,149,2238497
6,2BC,83,1500282
7,1BC,74,1851026
8,1CE,296,4367953
9,2CE,621,11801098


In [82]:
#translated_df = pd.read_json("../data/translated_df.json")
#translated_df

In [83]:
def try_to_get_from_freqdict(word, word_freqs_dict):
    total_N = sum([val for val in word_freqs_dict.values()])
    try:
        count = word_freqs_dict[word]
        freq = count #/ total_N
    except:
        freq = 0
    return freq


min_freq = 5

def get_vocab(docs, min_freq=min_freq):
    words_flat = [item for sublist in docs for item in sublist]
    total_words = len(words_flat)
    wordcounts_tups = FreqDist(words_flat).most_common()
    wordfreqs_tups = [(tup[0], tup[1] / total_words) for tup in wordcounts_tups]
    return wordcounts_tups, wordfreqs_tups

wordcounts_dicts = []
wordfreqs_dicts = []


for sub in periods.keys():
    subset = LAGT[LAGT["periods"].apply(lambda x: sub in x)]
    sents = [sent for work in subset["lemmatized_sentences"] for sent in work]
    wordcounts_tups, wordfreqs_tups = get_vocab(sents)
    wordcounts_dicts.append(dict(wordcounts_tups))
    wordfreqs_dicts.append(dict(wordfreqs_tups))

In [84]:
wordcounts_df = pd.DataFrame(wordcounts_dicts).T
wordcounts_df.columns = periods.keys()
wordcounts_df.head(5)

Unnamed: 0,archaic,classical,hellenistic,roman_peak,roman_late
εἰμί,2975.0,74864.0,37808.0,113388.0,24922.0
πᾶς,1625.0,19939.0,17267.0,42860.0,24363.0
ἀνήρ,1608.0,8559.0,5977.0,13606.0,7555.0
αὐτός,1526.0,42501.0,25916.0,129660.0,38207.0
θεός,1454.0,5594.0,9751.0,29952.0,27200.0


In [85]:
wordfreqs_df = pd.DataFrame(wordfreqs_dicts).T
wordfreqs_df.columns = periods.keys()
wordfreqs_df.head(5)

Unnamed: 0,archaic,classical,hellenistic,roman_peak,roman_late
εἰμί,0.015068,0.036587,0.020051,0.018114,0.007074
πᾶς,0.00823,0.009744,0.009157,0.006847,0.006915
ἀνήρ,0.008144,0.004183,0.00317,0.002174,0.002144
αὐτός,0.007729,0.020771,0.013744,0.020714,0.010844
θεός,0.007364,0.002734,0.005171,0.004785,0.00772


In [86]:
religion_final = ["θεός", "Ζεύς", "εὐσεβής", 'ἱερός']
morality_final = ["ἀγαθός", "ἀρετή", "δίκαιος", "τιμή"]

In [87]:
pd.concat([wordfreqs_df.loc[religion_final], wordfreqs_df.loc[morality_final]])

Unnamed: 0,archaic,classical,hellenistic,roman_peak,roman_late
θεός,0.007364,0.002734,0.005171,0.004785,0.00772
Ζεύς,0.006463,0.001315,0.000492,0.000736,0.000444
εὐσεβής,6.1e-05,5.9e-05,6.6e-05,5.4e-05,0.000125
ἱερός,0.000669,0.00054,0.000795,0.000595,0.000323
ἀγαθός,0.001286,0.004563,0.002314,0.00301,0.003137
ἀρετή,0.000537,0.001141,0.001219,0.000869,0.000934
δίκαιος,0.000405,0.001734,0.00085,0.000767,0.001091
τιμή,0.000664,0.000429,0.000537,0.000455,0.000434


In [114]:
pd.concat([wordcounts_df.loc[religion_final], wordcounts_df.loc[morality_final]])

Unnamed: 0,archaic,classical,hellenistic,roman_peak,roman_late
θεός,1454.0,5594.0,9751.0,29952.0,27200.0
Ζεύς,1276.0,2690.0,928.0,4609.0,1563.0
εὐσεβής,12.0,121.0,124.0,339.0,440.0
ἱερός,132.0,1105.0,1499.0,3722.0,1139.0
ἀγαθός,254.0,9336.0,4363.0,18842.0,11051.0
ἀρετή,106.0,2334.0,2299.0,5442.0,3290.0
δίκαιος,80.0,3548.0,1603.0,4802.0,3844.0
τιμή,131.0,877.0,1012.0,2846.0,1529.0


# Preparing vocabularies

In [95]:
len(wordcounts_df)

507365

In [113]:
wordcounts_df[(wordcounts_df >= 5).all(axis=1)].tail(10)# # .notnull().all(axis=1).sum()

Unnamed: 0,archaic,classical,hellenistic,roman_peak,roman_late
ἀπαλλαγή,5.0,84.0,72.0,210.0,178.0
ἐλευθερόω,5.0,156.0,175.0,311.0,157.0
εὔφημος,5.0,32.0,10.0,73.0,35.0
λύμη,5.0,26.0,22.0,51.0,55.0
χλιδή,5.0,20.0,7.0,32.0,22.0
μεσημβρινός,5.0,18.0,215.0,341.0,51.0
κατεύχομαι,5.0,21.0,9.0,17.0,5.0
φαιδρός,5.0,25.0,20.0,151.0,122.0
ἀποστατέω,5.0,28.0,7.0,20.0,24.0
πρεσβεύω,5.0,179.0,190.0,445.0,255.0


In [132]:
shared_vocabulary = list(wordcounts_df[(wordcounts_df >= 5).all(axis=1)].index) # .notnull().all(axis=1).sum())
len(shared_vocabulary)

3133

In [133]:
with open("../data/shared_vocabulary.pickle", "wb") as f:
    pickle.dump(shared_vocabulary, f)

In [134]:
vocabulary_mostcommon2000 = []
for key in periods.keys():
    vocabulary_mostcommon2000.extend(list(wordcounts_df.sort_values(key, ascending=False).index[:2000]))

In [135]:
len(list(set(vocabulary_mostcommon2000 + shared_vocabulary)))

5110

In [136]:
vocabulary_balanced = list(set(vocabulary_mostcommon2000 + shared_vocabulary))
with open("../data/vocabulary_balanced.pickle", "wb") as f:
    pickle.dump(vocabulary_balanced, f)

In [122]:
wordcounts_df.to_json("../data/wordcounts_df.json")
wordfreqs_df.to_json("../data/wordfreqs_df.json")