In [3]:
import numpy as np
import pandas as pd
import os
import requests
import re
from nltk import FreqDist
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from cltk.stops.grc import STOPS as stopwords
from gensim.corpora import Dictionary
import unicodedata
import json

In [4]:
# find local version of the input dataset
local_paths = !find ~/Projects -name "LAGT_v2-0.json"
print(local_paths)

['/Users/kasev/Projects/LAGT/data/large_files/LAGT_v2-0.json']


In [5]:
# load LAGT (v2.0) dataset locally or download it directly from Zenodo
try:
    LAGT = pd.read_json(local_paths[0])
except:
    resp = requests.get("https://zenodo.org/record/7221150/files/LAGT_v2-0.json?download=1")
    # save it for next time
    LAGT = pd.DataFrame(resp.json())
    os.mkdir("../data/large_data/")
    LAGT.to_json("../data/large_data/LIRE_v2-0.json")

# additional cleaning

In [6]:
# remove one-letter words...
LAGT["lemmatized_sentences"] = LAGT["lemmatized_sentences"].apply(lambda x: [[w for w in s if len(w) > 1] for s in x])

In [7]:
LAGT_metadata = pd.read_csv("../data/LAGT_metadata.csv")
provenience_dict = dict(zip(LAGT_metadata["filename"],  LAGT_metadata["provenience"]))
author_dict = dict(zip(LAGT_metadata["author_id"],  LAGT_metadata["author"]))
LAGT["provenience"] = LAGT["filename"].apply(lambda x: provenience_dict[x])

# Explore overall vocabulary

In [8]:
sents = [sent for work in LAGT["lemmatized_sentences"] for sent in work]

In [9]:
len(sents)

2803645

In [10]:
min_freq = 10

def get_vocab(docs, min_freq=min_freq):
    words_flat = [item for sublist in docs for item in sublist]
    word_freq_tups = FreqDist(words_flat).most_common()
    vocabulary = [tup[0] for tup in word_freq_tups if tup[1] >= min_freq]
    #vocab_freqs = [len([doc for doc in docs if word in doc]) for word in vocabulary]
    return word_freq_tups, words_flat, vocabulary

In [11]:
word_freqs, words, vocabulary = get_vocab(sents)

In [12]:
word_freqs[:100]

[('εἰμί', 461544),
 ('οὗτος', 293455),
 ('αὐτός', 261844),
 ('λέγω', 242398),
 ('γίγνομαι', 157086),
 ('ἔχω', 149368),
 ('πᾶς', 127741),
 ('πολύς', 121928),
 ('ἄλλος', 103309),
 ('φημί', 102944),
 ('τις', 91552),
 ('ποιέω', 85124),
 ('λόγος', 84616),
 ('θεός', 70419),
 ('μέγας', 56125),
 ('τοιοῦτος', 52980),
 ('ἄνθρωπος', 52604),
 ('πρῶτος', 50764),
 ('ἐκεῖνος', 50063),
 ('οὐδείς', 48157),
 ('ἕτερος', 47424),
 ('σῶμα', 45082),
 ('ἀγαθός', 43140),
 ('λαμβάνω', 42387),
 ('πόλις', 41764),
 ('μόνος', 41748),
 ('ὁράω', 41271),
 ('φύσις', 41145),
 ('ἀρχή', 37518),
 ('δύναμις', 37212),
 ('ἀνήρ', 37038),
 ('δοκέω', 36569),
 ('εἷς', 36407),
 ('ἕκαστος', 35169),
 ('τὶς', 34516),
 ('δύναμαι', 33323),
 ('χρόνος', 32486),
 ('δίδωμι', 32458),
 ('ὑπάρχω', 32163),
 ('γῆ', 32018),
 ('μέρος', 31923),
 ('ψυχή', 31296),
 ('καλέω', 31278),
 ('δέω', 30449),
 ('ἅπας', 29470),
 ('κύριος', 28854),
 ('βασιλεύς', 28755),
 ('δείκνυμι', 27793),
 ('γένος', 26375),
 ('πατήρ', 25693),
 ('εἶδος', 25645),
 ('τόπος', 25

# Generate ngrams

In [13]:
doc_ids = LAGT["doc_id"].tolist()
len(doc_ids)

1457

In [14]:
line = 0
ids_lines = {}

f = open("../data/large_data/corpus_ngrams_bydocid.txt", "w", encoding="utf-8")
for doc_id in doc_ids:
    lagt_subset = LAGT[LAGT["doc_id"]==doc_id]
    sents = [sen for work in lagt_subset["lemmatized_sentences"] for sen in work]
    sents_bigrams = [list(el) for sublist in [[ng for ng in nltk.bigrams(sent)] for sent in sents] for el in sublist]
    sents_trigrams = [list(el) for sublist in [[ng for ng in nltk.trigrams(sent)] for sent in sents] for el in sublist]
    ngrams_data = sents + sents_bigrams + sents_trigrams
    ngrams_data = [" ".join(ngram) for ngram in ngrams_data]
    f.writelines("\n".join(ngrams_data)+"\n")
    if bool(ngrams_data):
        ids_lines[doc_id] = (line, line+len(ngrams_data))
        line += len(ngrams_data)
    else:
        ids_lines[doc_id] = (line, line+1)
        line += 1

In [15]:
pickle.dump(ids_lines, open("../data/ids_lines.pickle", "wb"))

In [17]:
line = 0
ids_lines = {}

f = open("../data/large_data/corpus_ngrams_bydocid_wide.txt", "w", encoding="utf-8")
for doc_id in doc_ids:
    lagt_subset = LAGT[LAGT["doc_id"]==doc_id]
    sents = [sen for work in lagt_subset["lemmatized_sentences"] for sen in work]
    sents_trigrams = [list(el) for sublist in [[ng for ng in nltk.trigrams(sent)] for sent in sents] for el in sublist]
    sents_fivegrams = [list(el) for sublist in [[ng for ng in nltk.ngrams(sent, n=5)] for sent in sents] for el in sublist]
    ngrams_data = sents + sents_trigrams + sents_fivegrams
    ngrams_data = [" ".join(ngram) for ngram in ngrams_data]
    f.writelines("\n".join(ngrams_data)+"\n")
    if bool(ngrams_data):
        ids_lines[doc_id] = (line, line+len(ngrams_data))
        line += len(ngrams_data)
    else:
        ids_lines[doc_id] = (line, line+1)
        line += 1

In [19]:
pickle.dump(ids_lines, open("../data/ids_lines_wide.pickle", "wb"))

# Exploring subcorpora

In [19]:
def assign_subcorpus(row):
    if row["date_avr"] <= -5:
        subcorpus = "archaic"
    elif row["date_avr"] < -2:
        subcorpus = "classical"
    elif (row["provenience"]=="christian") & (row["date_avr"]<3):
        subcorpus = "christian"
    elif (row["provenience"]=="pagan") & (row["date_avr"]<3):
        subcorpus = "roman"
    elif (row["provenience"]=="jewish"):
        subcorpus = "jewish"
    else:
        subcorpus = None
    return subcorpus

In [20]:
LAGT["subcorpus"] = LAGT.apply(lambda row: assign_subcorpus(row), axis=1)

In [21]:
df_size = LAGT.groupby("subcorpus").size().reset_index()
df_size.rename(columns={0:"n_works"}, inplace=True)
df_size

Unnamed: 0,subcorpus,n_works
0,archaic,26
1,christian,104
2,classical,408
3,jewish,95
4,roman,596


In [22]:
n_authors = LAGT.groupby("subcorpus")["author_id"].unique().apply(len).tolist()
n_authors

[13, 30, 44, 6, 97]

In [23]:
df_sum = LAGT.groupby("subcorpus").sum()[["wordcount", "n_sentences", "GLAUx?"]]
subcorpora_overview = pd.merge(df_size, df_sum.reset_index())
subcorpora_overview["n_authors"] = n_authors
subcorpora_overview

Unnamed: 0,subcorpus,n_works,wordcount,n_sentences,GLAUx?,n_authors
0,archaic,26,331410,26561,24,13
1,christian,104,1711991,180658,19,30
2,classical,408,4104920,319695,370,44
3,jewish,95,1953311,166050,4,6
4,roman,596,11481179,932972,409,97


In [24]:
cols = ['subcorpus', 'n_authors', 'n_works', 'wordcount', 'n_sentences', 'GLAUx?']

In [25]:
subcorpora_overview = subcorpora_overview[cols].set_index("subcorpus")
subcorpora_overview

Unnamed: 0_level_0,n_authors,n_works,wordcount,n_sentences,GLAUx?
subcorpus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
archaic,13,26,331410,26561,24
christian,30,104,1711991,180658,19
classical,44,408,4104920,319695,370
jewish,6,95,1953311,166050,4
roman,97,596,11481179,932972,409


In [26]:
subcorpora_overview.index

Index(['archaic', 'christian', 'classical', 'jewish', 'roman'], dtype='object', name='subcorpus')

In [27]:
LAGT_metadata = LAGT[['author_id', 'doc_id', 'GLAUx?', 'filename', 'subcorpus', 'author', 'title',
                      'date_avr', 'provenience', 'tlg_epithet', 'n_sentences', 'wordcount']]

In [28]:
LAGT_metadata.to_json("../data/large_data/LAGT_metadata.json")

# Preparing vocabularies

In [29]:
word_freqs, words, vocabulary = get_vocab(sents)

In [30]:
subcorpora = ['archaic', 'classical', 'roman', 'christian', 'jewish']

In [31]:
len(words)

14746005

In [32]:
len(vocabulary)

52252

In [33]:
subcorpora_vocabs = {}
for sub in subcorpora:
    subset = LAGT[LAGT["subcorpus"]==sub]
    sents = [sent for work in subset["lemmatized_sentences"] for sent in work]
    word_freqs, words, vocabulary = get_vocab(sents)
    vocabulary = [tup[0] for tup in word_freqs][:5000]
    subcorpora_vocabs[sub] = {
        "word_freqs" : word_freqs,
        "words" : words,
        "vocabulary" : vocabulary
    }

In [34]:
with open("../data/large_data/subcorpora_vocabs.pickle", "wb") as f:
    pickle.dump(subcorpora_vocabs, f)

In [35]:
subcorpora_vocabs["archaic"]["word_freqs"]

[('εἰμί', 3428),
 ('πᾶς', 1723),
 ('ἀνήρ', 1634),
 ('ἔρχομαι', 1623),
 ('λέγω', 1506),
 ('πολύς', 1499),
 ('θεός', 1429),
 ('ἔχω', 1412),
 ('Ζεύς', 1322),
 ('φημί', 1227),
 ('μέγας', 1171),
 ('ναῦς', 1129),
 ('φίλος', 1005),
 ('ὁράω', 936),
 ('ὅδε', 915),
 ('χείρ', 912),
 ('θυμός', 899),
 ('υἱός', 756),
 ('κακός', 753),
 ('γίγνομαι', 745),
 ('Ἀχαιός', 740),
 ('πατήρ', 729),
 ('Ὀδυσσεύς', 725),
 ('δίδωμι', 675),
 ('Τρώς', 649),
 ('παῖς', 626),
 ('γῆ', 618),
 ('αὐτός', 606),
 ('οὗτος', 605),
 ('ἄνθρωπος', 589),
 ('ἐμός', 586),
 ('φέρω', 580),
 ('ἵππος', 575),
 ('ἵστημι', 573),
 ('τίθημι', 569),
 ('ἀγαθός', 569),
 ('φρήν', 559),
 ('βάλλω', 554),
 ('ἔπος', 548),
 ('οἶδα', 546),
 ('αἱρέω', 533),
 ('μῦθος', 502),
 ('πρῶτος', 492),
 ('πόλις', 492),
 ('βαίνω', 484),
 ('ἐθέλω', 478),
 ('καλός', 474),
 ('ἄγω', 465),
 ('Ἕκτωρ', 456),
 ('ἑταῖρος', 452),
 ('γυνή', 439),
 ('τις', 439),
 ('δῖος', 434),
 ('ἔργον', 434),
 ('πούς', 416),
 ('Ἀχιλλεύς', 395),
 ('ἀθάνατος', 389),
 ('δῶμα', 387),
 ('μήτηρ',

In [36]:
religion_final = ["θεός", "Ζεύς", "εὐσεβής", 'ἱερός']
morality_final = ["ἀγαθός", "ἀρετή", "δίκαιος", "τιμή"]
keywords = religion_final + morality_final

In [38]:
subcorpus_keywords = []
for sub in subcorpora:
    for keyword in keywords:
        if keyword in subcorpora_vocabs[sub]["words"]:
            word_freqs = subcorpora_vocabs[sub]["word_freqs"]
            wordcount = sum([t[1] for t in word_freqs])
            for tup in enumerate(word_freqs):
                if tup[1][0] == keyword:
                    rank = tup[0]
                    rank_ratio = np.round(rank / len(word_freqs), 5)
                    count = tup[1][1]
                    freq = np.round(count / wordcount, 5)
                    subcorpus_keywords.append([sub, keyword, rank, rank_ratio, count, freq])
                    break
        else:
            subcorpus_keywords.append([sub, keyword, None, None, None, None])

In [39]:
sub = "archaic"
word_freqs = subcorpora_vocabs[sub]["word_freqs"]
len(word_freqs)

18311

In [40]:
keyword = "εὐσεβής"
for tup in enumerate(word_freqs):
    if tup[1][0] == keyword:
        print(tup)

(2545, ('εὐσεβής', 11))


In [41]:
np.round(2545 / 18311, 5)

0.13899

In [42]:
subcorpus_keywords_df = pd.DataFrame(subcorpus_keywords, columns=["subcorpus", "keyword", "rank", "rank_ratio", "count", "freq"])
subcorpus_keywords_df

Unnamed: 0,subcorpus,keyword,rank,rank_ratio,count,freq
0,archaic,θεός,6,0.00033,1429,0.0073
1,archaic,Ζεύς,8,0.00044,1322,0.00676
2,archaic,εὐσεβής,2545,0.13899,11,6e-05
3,archaic,ἱερός,205,0.0112,155,0.00079
4,archaic,ἀγαθός,35,0.00191,569,0.00291
5,archaic,ἀρετή,306,0.01671,107,0.00055
6,archaic,δίκαιος,418,0.02283,79,0.0004
7,archaic,τιμή,247,0.01349,129,0.00066
8,classical,θεός,37,0.00051,5050,0.00245
9,classical,Ζεύς,89,0.00124,2693,0.00131


In [91]:
len(subcorpora_vocabs["archaic"]["vocabulary"])

5000

In [96]:
shared_vocabulary = list((set(subcorpora_vocabs["archaic"]["vocabulary"])
    & set(subcorpora_vocabs["classical"]["vocabulary"])
    & set(subcorpora_vocabs["roman"]["vocabulary"])
    & set(subcorpora_vocabs["jewish"]["vocabulary"])
    & set(subcorpora_vocabs["christian"]["vocabulary"])))

In [98]:
len(shared_vocabulary)

1483

In [99]:
shared_vocabulary[:10]

['ἀπολύω',
 'κρέας',
 'φιάλη',
 'ἀνάπτω',
 'θαρσέω',
 'νεφέλη',
 'Ἑλένη',
 'ἀποστερέω',
 'στάσις',
 'ἐπιδίδωμι']

In [102]:
with open("../data/shared_vocabulary.pickle", "wb") as f:
    pickle.dump(shared_vocabulary, f)