In [4]:
import numpy as np
import pandas as pd
import os
import requests
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from cltk.stops.grc import STOPS as stopwords
from gensim.corpora import Dictionary
import unicodedata
import json

In [3]:
# Find a local copy of the input dataset (LAGT_v2-0.json) anywhere under ~/Projects.
# The IPython `!` shell capture binds the lines printed by `find` to `local_paths`;
# the loading cell below reads the first entry.
local_paths = !find ~/Projects -name "LAGT_v2-0.json"
print(local_paths)

['/Users/kasev/Projects/LAGT/data/large_files/LAGT_v2-0.json']


In [84]:
# Create the folder for large generated files. `exist_ok=True` keeps the cell
# re-runnable, and makedirs also creates a missing "../data" parent.
os.makedirs("../data/large_data/", exist_ok=True)

In [5]:
# Load the LAGT (v2.0) dataset from a local copy, or download it from Zenodo.
try:
    LAGT = pd.read_json(local_paths[0])
except (NameError, IndexError, ValueError, OSError):
    # NameError / IndexError: the `find` cell was not run or returned nothing.
    # ValueError / OSError: the reported path could not be read as JSON.
    resp = requests.get("https://zenodo.org/record/7221150/files/LAGT_v2-0.json?download=1")
    resp.raise_for_status()  # fail loudly instead of parsing an HTTP error page
    LAGT = pd.DataFrame(resp.json())
    # Save it for next time, under the file name the local `find` lookup
    # searches for (it was previously written as "LIRE_v2-0.json").
    os.makedirs("../data/large_data/", exist_ok=True)
    LAGT.to_json("../data/large_data/LAGT_v2-0.json")

# Additional cleaning

In [10]:
# Drop single-character tokens from every sentence of every work.
LAGT["lemmatized_sentences"] = LAGT["lemmatized_sentences"].map(
    lambda sentences: [
        [token for token in sentence if len(token) >= 2]
        for sentence in sentences
    ]
)


In [11]:
# Look-up tables derived from the metadata CSV.
LAGT_metadata = pd.read_csv("../data/LAGT_metadata.csv")
provenience_dict = {
    fname: prov
    for fname, prov in zip(LAGT_metadata["filename"], LAGT_metadata["provenience"])
}
author_dict = {
    a_id: a_name
    for a_id, a_name in zip(LAGT_metadata["author_id"], LAGT_metadata["author"])
}
# Strict lookup: a filename that is absent from the metadata raises KeyError.
LAGT["provenience"] = LAGT["filename"].map(provenience_dict.__getitem__)

In [79]:
# works without an average date
LAGT[LAGT["date_avr"].isna()]

Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,tlg_epithet,clean_string,n_sentences,lemmatized_sentences,GLAUx?,subcorpus
753,tlg0092.tlg001.1st1K-grc1.xml,Anonymous,Geographiae expositio compendiaria,4428,tlg0092,tlg0092.tlg001,Incertum,,{},,,[],". Ἡ τῆς ὅλης γῆς περίμετρος σταδίων ,κα μυριάδ...",277,"[[], [ὅλος, γῆ, περίμετρος, στάδιον, μυριάς, π...",False,
845,tlg0527.tlg001.opp-grc2.xml,Septuaginta,Genesis,41465,tlg0527,tlg0527.tlg001,Varia,,{},,jewish,[],ΕΝ ἐποίησεν ὁ θεὸς τὸν οὐρανὸν καὶ τὴν γῆν. ἡ ...,3257,"[[εἰμί, ποιέω, θεός, οὐρανός, γῆ], [γῆ, εἰμί, ...",False,jewish
846,tlg0527.tlg002.opp-grc2.xml,Septuaginta,Exodus,33068,tlg0527,tlg0527.tlg002,Varia,,{},,jewish,[],ταῦτα τὰ ὀνόματα τῶν υἱῶν Ἰσραὴλ τῶν εἰσπεπορε...,2227,"[[ὄνομα, υἱός, Ἰσραήλ, εἰσπεπορευμένων, Αἴγυπτ...",False,jewish
847,tlg0527.tlg003.opp-grc2.xml,Septuaginta,Leviticus,24834,tlg0527,tlg0527.tlg003,Varia,,{},,jewish,[],ἀνεκάλεσεν Μωυσῆν καὶ ἐλάλησεν Κύριος αὐτῷ ἐκ...,1370,"[[ἀνακαλέω, μωυσῆν, λαλέω, κύριος, σκηνή, μαρτ...",False,jewish
848,tlg0527.tlg004.opp-grc2.xml,Septuaginta,Numeri,32418,tlg0527,tlg0527.tlg004,Varia,,{},,jewish,[],ἐλάλησεν κύριος πρὸς Μωυσῆν ἐν τῇ ἐρήμῳ τῇ Σε...,2089,"[[λαλέω, κύριος, μωυσῆν, ἔρημος, σεινά, εἰμί, ...",False,jewish
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1475,tlg5026.tlg007.First1K-grc1.xml,Scholia in Homerum,Scholia in Odysseam,205760,tlg5026,tlg5026.tlg007,Varia,,{},,pagan,[],Α. ΘΕΩΝ ἀγορά. Ἀθηνᾶς παραίνεσις πρὸς Τηλέμαχο...,49661,"[[], [θεός, ἀγορά], [Ἀθήνη, παραίνεσις, τηλεμά...",False,
1476,tlg5034.tlg001d.perseus-grc1.xml,Pindar Scholia,Scholia in Pindarum Isthmian Odes,18142,tlg5034,tlg5034.tlg001,Varia,,{},-4.5,pagan,[],. Ἐτελοῦντο μὲν οἱ παλαιοὶ πάντες ἀγῶνες ἐπί ...,2265,"[[], [τελέω, παλαιός, ἀγών, τετελευτηκόσιν], [...",False,
1477,tlg5037.tlg004.1st1K-grc1.xml,Scholia in Sophoclem,Scholia in Sophoclem (scholia vetera),80736,tlg5037,tlg5037.tlg004,Varia,,{},,pagan,[],πάρεστιν Ὀδυσσεὺς ἐπὶ τὴν σκηνὴν ἀγωνιῶν καὶ ....,12324,"[[πάρειμι, Ὀδυσσεύς, σκηνή, ἀγωνιάω], [πολυπρά...",False,
1478,tlg7000.tlg001.perseus-grc5.xml,,"Greek Anthology, Volume V",20222,tlg7000,tlg7000.tlg001,Varia,,{},,pagan,[],Ἰίαίρβ θεὰ Παφίη· σὴν γὰ̆ ἀεὶ δύναμιν κάλλος τ...,1971,"[[ἰίαίρβ, θεά, Πάφος], [σός, δύναμις, κάλλος, ...",False,


In [12]:
# preview the first five works (head() defaults to five rows)
LAGT.head()

Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,tlg_epithet,clean_string,n_sentences,lemmatized_sentences,GLAUx?
0,tlg0001.tlg001.perseus-grc2.xml,Apollonius Rhodius,Argonautica,38822,tlg0001,tlg0001.tlg001,3 B.C.,-2.5,{'-2.5': 1},-2.5,pagan,Epici/-ae,"ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν μ...",3252,"[[ἄρχω, Φοῖβος, παλαιγενής, κλέος, φώς, μιμνῄσ...",True
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War,150118,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,pagan,Historici/-ae,Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν...,6068,"[[Θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ...",True
2,tlg0004.tlg001.perseus-grc1.xml,Diogenes Laertius,Lives of Eminent Philosophers,110763,tlg0004,tlg0004.tlg001,A.D. 3,2.5,{'2.5': 1},,pagan,Biographi,Τὸ τῆς φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ βαρβάρ...,10245,"[[φιλοσοφία, ἔργον, ἔνιοι, φημί, βάρβαρος, ἄρχ...",False
3,tlg0005.tlg001.perseus-grc1.xml,Theocritus,Idylls,19200,tlg0005,tlg0005.tlg001,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,pagan,Bucolici,"̔Αδύ τι τὸ ψιθύρισμα καὶ ἁ πίτυς αἰπόλε τήνα,...",1982,"[[ἡδύς, τις, ψιθύρισμα, πίτυς, αἰπόλος, ἐκεῖνο...",True
4,tlg0005.tlg002.perseus-grc1.xml,Theocritus,Epigrams,1734,tlg0005,tlg0005.tlg002,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,pagan,Bucolici,τὰ ῥόδα τὰ δροσόεντα καὶ ἁ κατάπυκνος ἐκείνα ἕ...,152,"[[ῥόδον, δροσόεις, κατάπυκνος, ἐκεῖνος, ἕρπυλλ...",True


In [14]:
# flatten the per-work sentence lists into one corpus-wide list of sentences
sents = [
    sentence
    for work_sentences in LAGT["lemmatized_sentences"]
    for sentence in work_sentences
]

In [15]:
# total number of sentences in the flattened list
len(sents)

2803645

In [16]:
min_freq = 50


def get_vocab(docs, min_freq=min_freq):
    """Count token frequencies over a collection of tokenised documents.

    Parameters
    ----------
    docs : iterable of iterables of str
        Tokenised documents (here: lemmatised sentences).
    min_freq : int
        Minimum number of occurrences a token needs to enter the vocabulary.
        The default is taken from the module-level ``min_freq`` when the
        function is defined.

    Returns
    -------
    tuple
        ``(word_freq_tups, words_flat, vocabulary)``: all ``(token, count)``
        pairs ordered from most to least frequent, the flat list of all
        tokens, and the tokens whose count is at least ``min_freq``.
    """
    words_flat = []
    for tokens in docs:
        words_flat.extend(tokens)
    word_freq_tups = nltk.FreqDist(words_flat).most_common()
    vocabulary = [token for token, count in word_freq_tups if count >= min_freq]
    return word_freq_tups, words_flat, vocabulary

In [17]:
# corpus-wide (word, count) pairs, the flat token list, and the words reaching min_freq
word_freqs, words, vocabulary = get_vocab(sents)

In [18]:
# the 100 most frequent words with their counts
word_freqs[:100]

[('εἰμί', 461544),
 ('οὗτος', 293455),
 ('αὐτός', 261844),
 ('λέγω', 242398),
 ('γίγνομαι', 157086),
 ('ἔχω', 149368),
 ('πᾶς', 127741),
 ('πολύς', 121928),
 ('ἄλλος', 103309),
 ('φημί', 102944),
 ('τις', 91552),
 ('ποιέω', 85124),
 ('λόγος', 84616),
 ('θεός', 70419),
 ('μέγας', 56125),
 ('τοιοῦτος', 52980),
 ('ἄνθρωπος', 52604),
 ('πρῶτος', 50764),
 ('ἐκεῖνος', 50063),
 ('οὐδείς', 48157),
 ('ἕτερος', 47424),
 ('σῶμα', 45082),
 ('ἀγαθός', 43140),
 ('λαμβάνω', 42387),
 ('πόλις', 41764),
 ('μόνος', 41748),
 ('ὁράω', 41271),
 ('φύσις', 41145),
 ('ἀρχή', 37518),
 ('δύναμις', 37212),
 ('ἀνήρ', 37038),
 ('δοκέω', 36569),
 ('εἷς', 36407),
 ('ἕκαστος', 35169),
 ('τὶς', 34516),
 ('δύναμαι', 33323),
 ('χρόνος', 32486),
 ('δίδωμι', 32458),
 ('ὑπάρχω', 32163),
 ('γῆ', 32018),
 ('μέρος', 31923),
 ('ψυχή', 31296),
 ('καλέω', 31278),
 ('δέω', 30449),
 ('ἅπας', 29470),
 ('κύριος', 28854),
 ('βασιλεύς', 28755),
 ('δείκνυμι', 27793),
 ('γένος', 26375),
 ('πατήρ', 25693),
 ('εἶδος', 25645),
 ('τόπος', 25

In [19]:
# number of words that occur at least min_freq times
len(vocabulary)

15243

# Prepare subcorpora

In [73]:
def assign_subcorpus(row):
    """Return the subcorpus label for one work, or None if it fits none.

    ``row`` must provide ``"date_avr"`` and ``"provenience"``. The rules are
    checked in this order, and the first match wins:

    * ``date_avr <= -5``                       -> ``"archaic"``
    * ``date_avr < -2``                        -> ``"classical"``
    * provenience "christian", ``date_avr < 3`` -> ``"christian"``
    * provenience "pagan", ``date_avr < 3``     -> ``"roman"``
    * provenience "jewish" (any remaining or missing date) -> ``"jewish"``

    A missing (NaN) date fails every date comparison, so such a work can only
    end up as ``"jewish"`` or ``None``.
    """
    date = row["date_avr"]
    if date <= -5:
        return "archaic"
    if date < -2:
        return "classical"

    provenience = row["provenience"]
    if provenience == "christian" and date < 3:
        return "christian"
    if provenience == "pagan" and date < 3:
        return "roman"
    if provenience == "jewish":
        return "jewish"
    return None

In [74]:
# sanity check: which Python types occur in the date column?
pd.unique(LAGT["date_avr"].apply(type))

array([<class 'float'>], dtype=object)

In [75]:
# label every work (row) with its subcorpus
LAGT["subcorpus"] = LAGT.apply(assign_subcorpus, axis=1)

In [76]:
# number of works per subcorpus as a two-column frame (subcorpus, n_works)
df_size = LAGT.groupby("subcorpus").size().reset_index(name="n_works")
df_size

Unnamed: 0,subcorpus,n_works
0,archaic,26
1,christian,104
2,classical,408
3,jewish,95
4,roman,596


In [77]:
# Per-subcorpus totals. The three columns are selected *before* summing so
# that only they are aggregated: summing the whole frame would also try to add
# up the text, list and dict columns (slow, and dict values cannot be added).
df_sum = LAGT.groupby("subcorpus")[["wordcount", "n_sentences", "GLAUx?"]].sum()
pd.merge(df_size, df_sum.reset_index(), on="subcorpus")

Unnamed: 0,subcorpus,n_works,wordcount,n_sentences,GLAUx?
0,archaic,26,331410,26561,24
1,christian,104,1711991,180658,19
2,classical,408,4104920,319695,370
3,jewish,95,1953311,166050,4
4,roman,596,11481179,932972,409


In [80]:
# inspect the works assigned to the archaic subcorpus
LAGT.loc[LAGT["subcorpus"] == "archaic"]

Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,tlg_epithet,clean_string,n_sentences,lemmatized_sentences,GLAUx?,subcorpus
202,tlg0012.tlg001.perseus-grc2.xml,Homer,Iliad (Greek). Machine readable text,111888,tlg0012,tlg0012.tlg001,8 B.C.,-7.5,{'-7.5': 1},-7.5,pagan,Epici/-ae,"μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος οὐλομένην, ...",8415,"[[μῆνις, ἀείδω, θεά, Πηληϊάδης, Ἀχιλλεύς, ὄλλυ...",True,archaic
203,tlg0012.tlg002.perseus-grc2.xml,Homer,Odyssey (Greek). Machine readable text,87177,tlg0012,tlg0012.tlg002,8 B.C.,-7.5,{'-7.5': 1},-7.5,pagan,Epici/-ae,"ἄνδρα μοι ἔννεπε, μοῦσα, πολύτροπον, ὃς μάλα π...",6729,"[[ἀνήρ, ἐνέπω, Μοῦσα, πολύτροπος, πολύς, πλάζω...",True,archaic
204,tlg0012.tlg003.perseus-grc1.xml,Homer,Ἐπιγράμματα,757,tlg0012,tlg0012.tlg003,8 B.C.,-7.5,{'-7.5': 1},-7.5,pagan,Epici/-ae,"Αἰδεῖσθε ξενίων κεχρημένον ἠδὲ δόμοιο, οἳ πόλι...",72,"[[Ἕλλην, πλῆθος, Ἴλιος, στρατόω, ἑπτά, εἰμί, μ...",True,archaic
205,tlg0013.tlg001.perseus-grc2.xml,Homeric hymn,Hymn 1 To Dionysus,144,tlg0013dyon,tlg0013.tlg001,8-6 B.C.,-6.5,"{'-7.5': 0.33330000000000004, '-6.5': 0.333300...",-7.0,pagan,[],"οἳ μὲν γὰρ Δρακάνῳ σʼ, οἳ δʼ Ἰκάρῳ ἠνεμοέσσῃ ...",13,"[[Δράκανος, Ἴκαρος, ἠνεμόεις, φημί, Νάξος, δῖο...",True,archaic
206,tlg0013.tlg002.perseus-grc2.xml,Homeric hymn,Hymn 2 To Demeter,3373,tlg0013deme,tlg0013.tlg002,8-6 B.C.,-6.0,"{'-7.5': 0.33330000000000004, '-6.5': 0.333300...",-7.0,pagan,[],"Δήμητρʼ ἠύκομον, σεμνὴν θεόν, ἄρχομʼ ἀείδειν,...",256,"[[Δημήτηρ, εὔκομος, σεμνός, θεός, ἄρχω, ἀείδω,...",True,archaic
207,tlg0013.tlg003.perseus-grc2.xml,Homeric hymn,Hymn 3 To Apollo,3848,tlg0013apol,tlg0013.tlg003,8-6 B.C.,-6.0,"{'-7.5': 0.33330000000000004, '-6.5': 0.333300...",-7.0,pagan,[],"μνήσομαι οὐδὲ λάθωμαι Ἀπόλλωνος ἑκάτοιο, ὅντε ...",295,"[[μιμνῄσκω, λανθάνω, Ἀπόλλων, ἕκατος, θεός, δῶ...",True,archaic
208,tlg0013.tlg004.perseus-grc2.xml,Homeric hymn,Hymn 4 To Hermes,4031,tlg0013herm,tlg0013.tlg004,8-6 B.C.,-5.0,"{'-7.5': 0.33330000000000004, '-6.5': 0.333300...",-7.0,pagan,[],"Ἑρμῆν ὕμνει, Μοῦσα, Διὸς καὶ Μαιάδος υἱόν, Κυλ...",332,"[[Ἑρμῆς, ὑμνέω, Μοῦσα, Ζεύς, Μαιάς, υἱός, Κυλλ...",True,archaic
209,tlg0013.tlg005.perseus-grc2.xml,Homeric hymn,Hymn 5 To Aphrodite,2047,tlg0013aphr,tlg0013.tlg005,8-6 B.C.,-6.0,"{'-7.5': 0.33330000000000004, '-6.5': 0.333300...",-7.0,pagan,[],"μοῦσά μοι ἔννεπε ἔργα πολυχρύσου Ἀφροδίτης, Κύ...",148,"[[Μοῦσα, ἐνέπω, ἔργον, πολύχρυσος, Ἀφροδίτη, Κ...",True,archaic
211,tlg0013.tlg007.perseus-grc2.xml,Homeric hymn,Hymn 7 To Dionysus,425,tlg0013dyo2,tlg0013.tlg007,8-6 B.C.,-5.0,"{'-7.5': 0.33330000000000004, '-6.5': 0.333300...",-7.0,pagan,[],"ἀμφὶ Διώνυσον, Σεμέλης ἐρικυδέος υἱόν, μνήσομα...",42,"[[Διόνυσος, Σεμέλη, ἐρικυδής, υἱός, μιμνῄσκω, ...",True,archaic
223,tlg0013.tlg019.perseus-grc2.xml,Homeric hymn,Hymn 19 to Pan,336,tlg0013pan,tlg0013.tlg019,8-6 B.C.,-5.0,"{'-7.5': 0.33330000000000004, '-6.5': 0.333300...",-7.0,pagan,[],"ἀμφί μοι Ἑρμείαο φίλον γόνον ἔννεπε, Μοῦσα, αἰ...",19,"[[Ἑρμῆς, φίλος, γόνος, ἐνέπω, Μοῦσα, αἰγιπόδης...",True,archaic


In [81]:
# doc_ids of all works that were assigned to a subcorpus
doc_ids = LAGT.loc[LAGT["subcorpus"].notna(), "doc_id"].tolist()
len(doc_ids)

1229

# Generate ngrams data

In [86]:
# Write, for every document, its sentences followed by their bigrams and
# trigrams to one text file (one space-joined item per line), and record which
# line range of the file belongs to which doc_id.
line = 0
ids_lines = {}  # doc_id -> (first line, one past the last line), counted from 0

# NOTE(review): `sents` here rebinds the notebook-level `sents` built earlier.
# NOTE(review): if a doc_id occurred more than once in `doc_ids`, its lines
# would be written again and `ids_lines` would keep only the last range.
# The context manager guarantees the file is flushed and closed.
with open("../data/large_data/corpus_ngrams_bydocid.txt", "w", encoding="utf-8") as f:
    for doc_id in doc_ids:
        lagt_subset = LAGT[LAGT["doc_id"] == doc_id]
        sents = [sen for work in lagt_subset["lemmatized_sentences"] for sen in work]
        sents_bigrams = [list(ng) for sent in sents for ng in nltk.bigrams(sent)]
        sents_trigrams = [list(ng) for sent in sents for ng in nltk.trigrams(sent)]
        ngrams_data = [" ".join(ngram) for ngram in sents + sents_bigrams + sents_trigrams]
        f.write("\n".join(ngrams_data) + "\n")
        # A document without any items still produces one (empty) line, so it
        # occupies exactly one line of the file.
        n_lines = max(len(ngrams_data), 1)
        ids_lines[doc_id] = (line, line + n_lines)
        line += n_lines