In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', 50)
from nltk import FreqDist
import nltk
import pickle
import google_conf
import matplotlib.pyplot as plt
import os

In [None]:
# global parameters for plots
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and newer
# releases no longer ship the old names, so use whichever spelling is installed.
_plot_style = "seaborn-v0_8-white" if "seaborn-v0_8-white" in plt.style.available else "seaborn-white"
plt.style.use(_plot_style)
plt.rcParams['figure.dpi'] = 300
plt.rcParams["font.family"] = "sans-serif"
plt.rcParams['font.size'] = 8

def save_figure(fig, fig_name, figures_dir="../figures/"):
    """Save a figure in two formats (TIFF and PNG) under ``figures_dir``.

    Parameters
    ----------
    fig : object exposing ``savefig(path)`` (e.g. a matplotlib Figure)
    fig_name : str
        File name without extension.
    figures_dir : str, optional
        Target directory; it is created when it does not exist yet.
    """
    os.makedirs(figures_dir, exist_ok=True)
    for extension in (".tiff", ".png"):
        fig.savefig(os.path.join(figures_dir, fig_name + extension))

In [None]:
# Point to your Google service account key and to the Google Sheet that should receive the data.
# If you don't have access to the linked sheet, skip this step.
theos_data = google_conf.setup(
    sheet_url="https://docs.google.com/spreadsheets/d/19gfECdrg5rDiU2PhNYL1nKifeKzemCfYtYrAHQX5vXE/edit?usp=sharing",
    service_account_path="../../../ServiceAccountsKey.json",
)

In [None]:
# load LAGT (v3.0) dataset locally or download it directly from Zenodo
LAGT_FILENAME = "LAGT_v3-0.parquet"
LAGT_URL = "https://zenodo.org/records/10684841/files/LAGT_v3-0.parquet?download=1"
LAGT_CACHE_DIR = "../data/large_data/"

# recursive search below ~/notebooks (pure-Python replacement for `find ~/notebooks -name ...`)
local_paths = [
    os.path.join(dirpath, LAGT_FILENAME)
    for dirpath, _dirnames, filenames in os.walk(os.path.expanduser("~/notebooks"))
    if LAGT_FILENAME in filenames
]
print(local_paths)

try:
    # IndexError when no local copy was found; read errors when the copy is unusable
    LAGT = pd.read_parquet(local_paths[0])
except Exception as err:
    # fall back to Zenodo and cache the download for later runs
    print("No usable local copy (" + repr(err) + "); downloading from Zenodo")
    LAGT = pd.read_parquet(LAGT_URL)
    os.makedirs(LAGT_CACHE_DIR, exist_ok=True)  # tolerate an already existing directory
    LAGT.to_parquet(os.path.join(LAGT_CACHE_DIR, LAGT_FILENAME))

# Short demonstration of the LAGT dataset...

In [None]:
# preview the first five rows of the dataset
LAGT.head(5)

In [None]:
# number of rows in the dataset (rows are counted as works in the overview tables further down)
len(LAGT)

In [None]:
# number of distinct author identifiers
LAGT["author_id"].nunique()

In [None]:

# sum of the "wordcount" column over all rows
LAGT["wordcount"].sum()

In [None]:
# rows whose author_id starts with "tlg0031"
LAGT[LAGT["author_id"].str.startswith("tlg0031")]

In [None]:
# flatten works -> sentences -> lemmata into one list holding every lemma of the corpus
wordlist = [
    lemma
    for work in LAGT["lemmatized_sentences"]
    for sentence in work
    for lemma in sentence
]
# number of occurrences of the lemma θεός
wordlist.count("θεός")

In [None]:
# filter for texts from the fifth and fourth c. BCE:
# either end of the dating interval has to fall between 500 and 301 BCE
# (the original tested "not_before" twice, which made the second condition a no-op)
in_classical_range = LAGT["not_before"].between(-500, -301) | LAGT["not_after"].between(-500, -301)
len(LAGT[in_classical_range])

In [None]:
# row label of Aristotle's Nicomachean Ethics (doc_id "tlg0086.tlg010"), then show that row
is_nicomachean_ethics = LAGT["doc_id"] == "tlg0086.tlg010"
i = LAGT[is_nicomachean_ethics].index[0]
LAGT.loc[i]

# Explore overall vocabulary

In [None]:
# flatten the corpus into one list of lemmatized sentences (one entry per sentence)
sents = [sent for work in LAGT["lemmatized_sentences"] for sent in work]

In [None]:
# total number of sentences in the corpus
len(sents)

In [None]:
min_freq = 10

def get_vocab(docs, min_freq=min_freq):
    """Collect frequency statistics for a collection of tokenized documents.

    Parameters
    ----------
    docs : iterable of token sequences
    min_freq : int
        Minimum count a word needs to enter the vocabulary. The default is the
        value of the module-level ``min_freq`` at definition time.

    Returns
    -------
    tuple
        ``(word_freq_tups, words_flat, vocabulary)``: (word, count) tuples as
        returned by ``FreqDist.most_common()``, the flat token list, and the
        words whose count is at least ``min_freq``.

    NOTE: a later cell of this notebook defines another ``get_vocab`` with a
    two-value return; once that cell has run, this version is shadowed.
    """
    words_flat = []
    for doc in docs:
        words_flat.extend(doc)
    word_freq_tups = FreqDist(words_flat).most_common()
    vocabulary = [word for word, count in word_freq_tups if count >= min_freq]
    return word_freq_tups, words_flat, vocabulary

In [None]:
# (word, count) tuples, flat token list and min_freq-filtered vocabulary for the whole corpus
word_freqs, words, vocabulary = get_vocab(sents)

In [None]:
# the first 20 (lemma, count) tuples as a table
pd.DataFrame(word_freqs[:20], columns=["lemma", "count"])

# Generate ngrams

In [None]:
# all document identifiers, in dataframe row order
doc_ids = LAGT["doc_id"].tolist()
len(doc_ids)

In [None]:
#!mkdir ../data/large_data

In [None]:
# Write one n-gram per line (full sentences plus their 5-grams) for every document and
# record, per doc_id, the half-open line range (start, end) it occupies in the output file.
ngrams_path = "../data/large_data/corpus_ngrams_bydocid_wide.txt"
os.makedirs(os.path.dirname(ngrams_path), exist_ok=True)  # the directory is not created anywhere else

line = 0
ids_lines = {}

# the context manager guarantees the file is flushed and closed, even on error
with open(ngrams_path, "w", encoding="utf-8") as f:
    for doc_id in doc_ids:
        lagt_subset = LAGT[LAGT["doc_id"]==doc_id]
        sents = [sen for work in lagt_subset["lemmatized_sentences"] for sen in work]
        # trigrams are deliberately left out of the export (they were computed but never written)
        sents_fivegrams = [list(ng) for sent in sents for ng in nltk.ngrams(sent, n=5)]
        ngrams_data = [" ".join(ngram) for ngram in sents + sents_fivegrams]
        f.write("\n".join(ngrams_data) + "\n")
        # a document without any n-gram still produces one (empty) line in the file
        n_lines = len(ngrams_data) if ngrams_data else 1
        ids_lines[doc_id] = (line, line + n_lines)
        line += n_lines

In [None]:
# persist the doc_id -> line-range mapping; the context manager closes the file handle
with open("../data/ids_lines_wide.pickle", "wb") as f:
    pickle.dump(ids_lines, f)

# Ngram example

In [None]:
# row label of Aristotle's Nicomachean Ethics (doc_id "tlg0086.tlg010");
# same lookup as in the dataset demonstration near the top of the notebook
i = LAGT[LAGT["doc_id"]=="tlg0086.tlg010"].index[0]

In [None]:
# show the row label found above
i

In [None]:
# text of the selected work up to the first "·" character
LAGT.loc[i]["string"].split("·")[0]

In [None]:
# first lemmatized sentence of the selected work, printed as a plain list of lemmata
sent = LAGT.loc[i]["lemmatized_sentences"][0]
print(list(sent))

In [None]:
# n-grams of the example sentence: the sentence itself, then all trigrams, then all 5-grams
trigrams = list(map(list, nltk.trigrams(sent)))
fivegrams = list(map(list, nltk.ngrams(sent, n=5)))
ngrams_example = [list(sent)]
ngrams_example.extend(trigrams)
ngrams_example.extend(fivegrams)
print(ngrams_example)

In [None]:
# number of entries generated from the single example sentence
len(ngrams_example)

# Exploring subcorpora

In [None]:
# historical periods as inclusive (first_year, last_year) ranges; negative years are BCE
periods = {
    "archaic" : (-800, -501),
    "classical" : (-500,-301),
    "hellenistic" : (-300,-1),
    "roman_peak" : (1, 200),
    "roman_late" : (201, 400)
}

def get_periods(row, period_ranges=periods):
    """Return the keys of all periods overlapped by the row's dating interval.

    Parameters
    ----------
    row : mapping with "not_before" and "not_after" entries
        Earliest and latest possible year of the work.
    period_ranges : dict, optional
        ``{key: (first_year, last_year)}``. The default is bound at definition
        time, so the function keeps working even if the global name
        ``periods`` is rebound to something else later in the notebook.

    Returns
    -------
    list
        Period keys in dictionary order; empty when a date bound cannot be
        compared with a year (e.g. it is None).
    """
    not_before = row["not_before"]
    not_after = row["not_after"]
    try:
        # two closed intervals overlap when each one starts before the other one ends
        return [
            period_key
            for period_key, (first_year, last_year) in period_ranges.items()
            if last_year >= not_before and not_after >= first_year
        ]
    except TypeError:
        # missing date bound: no period can be assigned
        return []

In [None]:
# store, for every row, the list of period keys returned by get_periods
LAGT["periods"] = LAGT.apply(get_periods, axis=1)

# Subcorpora IDs




In [None]:
# doc_ids of every subcorpus, keyed "<provenience>_<period>" (plus an undivided "jewish" one).
# NOTE: the insertion order of the keys matters, because subcorpora_readable_labels
# is matched to them by position further down.
subcorpora_ids_dict = {}

# a separate name is used here so that the `periods` dictionary read by get_periods is not overwritten
period_keys = ["archaic", "classical", "hellenistic", "roman_peak", "roman_late"]
subcorpora_spec = [("pagan", per) for per in period_keys]
subcorpora_spec += [("christian", per) for per in ["roman_peak", "roman_late"]]

for prov, per in subcorpora_spec:
    LAGT_subset = LAGT[(LAGT["periods"].apply(lambda x: per in x)) & (LAGT["provenience"]==prov)]
    subcorpora_ids_dict[prov + "_" + per] = list(LAGT_subset["doc_id"])

# jewish texts form a single subcorpus, not split by period
LAGT_subset = LAGT[(LAGT["provenience"]=="jewish")]
subcorpora_ids_dict["jewish"] = list(LAGT_subset["doc_id"])

subcorpora_ids_dict.keys()

In [None]:
# sanity check: print the subcorpora containing "The Passion of Saints Perpetua and Felicity"
# (tlg2016.tlg001), whose metadata was corrected recently
for key, member_ids in subcorpora_ids_dict.items():
    if "tlg2016.tlg001" in member_ids:
        print(key)

In [None]:
# persist the subcorpus -> doc_ids mapping
subcorpora_pickle_path = "../data/subcorpora_ids_dict.pickle"
with open(subcorpora_pickle_path, "wb") as f:
    pickle.dump(subcorpora_ids_dict, f)

In [None]:
# read the pickled subcorpus -> doc_ids mapping back from disk
with open("../data/subcorpora_ids_dict.pickle", "rb") as f:
    subcorpora_ids_dict = pickle.load(f)

In [None]:
# human-readable labels, listed in the same order as the keys of subcorpora_ids_dict
subcorpora_readable_labels = [
    "Pagan Archaic",
    "Pagan Classical",
    "Pagan Hellenistic",  # was misspelled "Hellensitic"
    "Pagan Roman (1-2 CE)",
    "Pagan Roman (3-6 CE)",
    "Christian (1-2 CE)",
    "Christian (3-6 CE)",
    "Jewish",
]

In [None]:
# one summary record (works, tokens, lemmata) per subcorpus
subcorpora_overview = []
for period_key, member_ids in subcorpora_ids_dict.items():
    LAGT_subset = LAGT[LAGT["doc_id"].isin(member_ids)]
    record = {
        "subcorpus" : period_key,
        "works (N)" : len(LAGT_subset),
        "tokens (N)" : LAGT_subset["wordcount"].sum(),
        "lemmata (N)" : LAGT_subset["lemmatacount"].sum(),
    }
    subcorpora_overview.append(record)

subcorpora_overview_df = pd.DataFrame(subcorpora_overview)
# replace the technical keys by the readable labels (matched by position)
subcorpora_overview_df["subcorpus"] = subcorpora_readable_labels
subcorpora_overview_df

In [None]:
# TO-DO: numbers to english string format: 1000000 -> 1,000,000

In [None]:
# optional upload of the overview table to the Google Sheet (currently disabled):
#google_conf.set_with_dataframe(theos_data.add_worksheet("subcorpora_overview_labels", 1,1), subcorpora_overview_df)
# export the labelled overview table to CSV, without the index column
subcorpora_overview_df.to_csv("../data/subcorpora_overview_df_labels.csv", index=False)

### Overview by century

In [None]:
# one (label, first_year, last_year) tuple per century: "8BCE" ... "1BCE", then "1CE" ... "6CE"
bce_centuries = [(str(-first_year // 100) + "BCE", first_year, first_year + 99) for first_year in range(-800, 0, 100)]
ce_centuries = [(str(last_year // 100) + "CE", last_year - 99, last_year) for last_year in range(100, 700, 100)]
centuries = bce_centuries + ce_centuries
centuries

In [None]:
def get_sents(row, century_ranges=None):
    """Return the labels of all centuries overlapped by the row's dating interval.

    Parameters
    ----------
    row : mapping with "not_before" and "not_after" entries
        Earliest and latest possible year of the work.
    century_ranges : list of (label, first_year, last_year), optional
        Defaults to the module-level ``centuries`` list.

    Returns
    -------
    list
        Century labels in the order of ``century_ranges``; empty when a date
        bound cannot be compared with a year (e.g. it is None).
    """
    if century_ranges is None:
        century_ranges = centuries
    not_before = row["not_before"]
    not_after = row["not_after"]
    try:
        # Interval overlap: the century must end after the work's earliest year AND
        # start before its latest year. (The previous version compared not_after with
        # the century's END, which dropped every work dated inside a single century.)
        return [
            label
            for label, first_year, last_year in century_ranges
            if last_year >= not_before and not_after >= first_year
        ]
    except TypeError:
        # missing date bound: no century can be assigned
        return []

In [None]:
# store, for every row, the list of century labels returned by get_sents
LAGT["cents"] = LAGT.apply(get_sents, axis=1)

In [None]:
# rows whose "cents" list contains the label "8BCE"
LAGT_subset = LAGT[LAGT["cents"].apply(lambda x: "8BCE" in x)]
LAGT_subset

In [None]:
# share of rows assigned to more than one century
LAGT["cents"].apply(lambda x: len(x) > 1).sum() / len(LAGT)

In [None]:
# rows whose author field contains "Septuagint" (rows without an author are excluded)
LAGT[LAGT["author"].str.contains("Septuagint", na=False)]


In [None]:
# rows whose author field contains "Plato" (rows without an author are excluded)
LAGT[LAGT["author"].str.contains("Plato", na=False)]

In [None]:
# works, tokens and lemmata per century (all proveniences together)
centuries_overview = []
for cent in centuries:
    cent_label = cent[0]
    LAGT_subset = LAGT[LAGT["cents"].apply(lambda labels: cent_label in labels)]
    centuries_overview.append(dict(
        period=cent_label,
        works_n=len(LAGT_subset),
        tokens_n=LAGT_subset["wordcount"].sum(),
        lemmata_n=LAGT_subset["lemmatacount"].sum(),
    ))
centuries_overview_df = pd.DataFrame(centuries_overview)
centuries_overview_df

In [None]:
# tokens and lemmata per century, split by provenience
# (replaces the centuries_overview_df built in the previous cell)
centuries_overview = []
for cent in centuries:
    in_century = LAGT["cents"].apply(lambda labels: cent[0] in labels)
    century_data = {"cent": cent}
    for provenience in ("pagan", "christian", "jewish"):
        LAGT_subset = LAGT[in_century & (LAGT["provenience"] == provenience)]
        century_data[provenience + "_tokens_n"] = LAGT_subset["wordcount"].sum()
        century_data[provenience + "_lemmata_n"] = LAGT_subset["lemmatacount"].sum()
    centuries_overview.append(century_data)
centuries_overview_df = pd.DataFrame(centuries_overview)
centuries_overview_df

In [None]:
# extract the century label (first element of each tuple) into its own column
centuries_overview_df["cent_label"] = centuries_overview_df["cent"].apply(lambda x: x[0])

In [None]:
# send the per-century overview to a worksheet named "centuries_overview" of the linked Google Sheet
# NOTE(review): presumably add_worksheet fails when the worksheet already exists — confirm against google_conf
google_conf.set_with_dataframe(theos_data.add_worksheet("centuries_overview", 1,1), centuries_overview_df)

In [None]:
# bar colors, in the order the token columns are plotted
colors = [
    "darkblue",
    "darkgreen",
    "darkred",
]
# background colors of the shaded periods, in the order of the period keys
periods_colors = ['lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightpink']

In [None]:
# x-axis extent of every shaded period in the per-century bar chart; the boundaries
# sit halfway between neighbouring bar positions so that adjacent spans do not overlap
periods_dict = {
    "archaic": {"startdate": -0.5, "enddate": 2.5},
    "classical": {"startdate": 2.5, "enddate": 4.5},
    "hellenistic": {"startdate": 4.5, "enddate": 7.5},
    "roman_peak": {"startdate": 7.5, "enddate": 9.5},
    "roman_late": {"startdate": 9.5, "enddate": 13.5},
}

In [None]:
# display labels of the shaded periods, in the same order as the keys of periods_dict
periods_labels = ["Archaic", "Classical", "Hellenistic", "Roman\n(1-2 CE)", "Roman\n(3-6 CE)"]

In [None]:
# Stacked bar chart of token counts per century and provenience, with the historical
# periods shaded in the background. Uses the `plt` alias imported at the top of the notebook.

# Create the figure and axis
fig, ax = plt.subplots(figsize=(3.5, 2.5), tight_layout=True)

# Set y-ticks and labels (one tick every 2 million tokens, labelled "0M", "2M", ...)
ax.set_yticks(range(0, 20000000, 2000000))
ax.set_yticklabels([str(n) + "M" for n in range(0, 20, 2)])
ax.set_ylim(0, 13000000)
ax.set_xlim(-1, 14)

# Highlight periods with axvspan and annotate each
for n, (period, color) in enumerate(zip(periods_dict.keys(), periods_colors)):
    span_start = periods_dict[period]["startdate"]
    span_end = periods_dict[period]["enddate"]
    ax.axvspan(span_start, span_end, color=color, alpha=0.3)
    midpoint = (span_start + span_end) / 2  # horizontal centre of the span, used for the label
    ax.annotate(periods_labels[n], (midpoint, ax.get_ylim()[1] * 0.95), xytext=(0, 10),
                textcoords='offset points', ha='center', va='bottom', rotation=90)

# Plot the dataframe with stacked bars on the prepared axis
centuries_overview_df[["pagan_tokens_n", "jewish_tokens_n", "christian_tokens_n"]].plot(
    kind='bar', color=colors, stacked=True, ax=ax)

# Set x-ticks and labels: one tick per plotted century (derived from the data, not hard-coded)
ax.set_xticks(range(len(centuries_overview_df)))
ax.set_xticklabels(centuries_overview_df["cent_label"])

# Set custom legend labels for the bar plot
# (the automatic labels are discarded; `_` is avoided because IPython uses it for the last output)
handles, _auto_labels = ax.get_legend_handles_labels()
ax.legend(handles, ["Pagan tokens", "Jewish tokens", "Christian tokens"], loc='best')

# Show plot
plt.show()


In [None]:
# write the figure created above to disk via save_figure
save_figure(fig, "centuries_overview")

# Building vocabulary data

In [None]:
def try_to_get_from_freqdict(word, word_freqs_dict):
    """Return the value stored for ``word`` in ``word_freqs_dict``, or 0 if there is none.

    Despite the name, the stored value is returned as is (no normalisation by
    the total is applied).

    Parameters
    ----------
    word : hashable
        Key to look up.
    word_freqs_dict : mapping
        Word -> count mapping.

    Returns
    -------
    The stored value, or 0 when the word is absent (or cannot be used as a key).
    """
    try:
        return word_freqs_dict[word]
    except (KeyError, TypeError):
        # unknown or unhashable word: treat it as not attested
        return 0


min_freq = 5

def get_vocab(docs, min_freq=min_freq):
    """Return absolute and relative word frequencies for tokenized documents.

    Parameters
    ----------
    docs : iterable of token sequences
    min_freq : int
        Accepted for signature compatibility; it is not used in the body.

    Returns
    -------
    tuple
        ``(wordcounts_tups, wordfreqs_tups)``: (word, count) tuples as returned
        by ``FreqDist.most_common()`` and, in the same order, (word, count /
        total number of tokens) tuples.

    NOTE: this shadows the three-value ``get_vocab`` defined earlier in the
    notebook (section "Explore overall vocabulary").
    """
    words_flat = []
    for doc in docs:
        words_flat.extend(doc)
    wordcounts_tups = FreqDist(words_flat).most_common()
    total_words = len(words_flat)
    wordfreqs_tups = [(word, count / total_words) for word, count in wordcounts_tups]
    return wordcounts_tups, wordfreqs_tups

# one {lemma: count} and one {lemma: relative frequency} dictionary per subcorpus,
# collected in the key order of subcorpora_ids_dict
wordcounts_dicts = []
wordfreqs_dicts = []

for sub, member_ids in subcorpora_ids_dict.items():
    subset = LAGT[LAGT["doc_id"].isin(member_ids)]
    # all lemmatized sentences of the subcorpus
    sents = [sent for work in subset["lemmatized_sentences"] for sent in work]
    wordcounts_tups, wordfreqs_tups = get_vocab(sents)
    wordcounts_dicts.append(dict(wordcounts_tups))
    wordfreqs_dicts.append(dict(wordfreqs_tups))

In [None]:
# lemma x subcorpus table of absolute counts; lemmata missing from a subcorpus count as 0
wordcounts_df = (
    pd.DataFrame(wordcounts_dicts)
    .T
    .set_axis(list(subcorpora_ids_dict.keys()), axis=1)
    .fillna(0)
    .astype(int)
)
wordcounts_df.head(5)

In [None]:
# lemma x subcorpus table of relative frequencies
# (no fillna here: lemmata missing from a subcorpus stay NaN)
wordfreqs_df = pd.DataFrame(wordfreqs_dicts).T
wordfreqs_df.columns = subcorpora_ids_dict.keys()
wordfreqs_df.head(5)

In [None]:
# mean relative frequency across the subcorpora; only the subcorpus columns are averaged,
# so re-running the cell never feeds the derived "freq_avg" column back into the mean
wordfreqs_df["freq_avg"] = wordfreqs_df[list(subcorpora_ids_dict.keys())].mean(axis=1)

In [None]:
# religion-related and morality-related lemmata looked up in the tables below
religion_final = ["θεός", "Ζεύς", "εὐσεβής", 'ἱερός']
morality_final = ["ἀγαθός", "ἀρετή", "δίκαιος", "τιμή"]

In [None]:
# relative frequencies of the selected lemmata (religion rows first, then morality rows)
pd.concat([wordfreqs_df.loc[religion_final], wordfreqs_df.loc[morality_final]])

In [None]:
# absolute counts of the selected lemmata (religion rows first, then morality rows)
pd.concat([wordcounts_df.loc[religion_final], wordcounts_df.loc[morality_final]])

In [None]:
# number of distinct lemmata in the counts table
len(wordcounts_df)

In [None]:
# last ten lemmata that occur at least 5 times in every subcorpus
wordcounts_df[(wordcounts_df >= 5).all(axis=1)].tail(10)# # .notnull().all(axis=1).sum()

In [None]:
# lemmata attested at least 5 times in every single subcorpus
frequent_everywhere = (wordcounts_df >= 5).all(axis=1)
filtered_df = wordcounts_df[frequent_everywhere]
shared_vocabulary = filtered_df.index.tolist()
len(shared_vocabulary)

In [None]:
# Total count per lemma across the subcorpora. `assign` builds a new frame instead of writing
# into a filtered slice of wordcounts_df, and "total" is left out of the sum, so re-running
# the cell does not add the previous total on top.
count_columns = [col for col in filtered_df.columns if col != "total"]
filtered_df = filtered_df.assign(total=filtered_df[count_columns].sum(axis=1))
filtered_df.sort_values("total", ascending=False).head(100)

In [120]:
# read the "morality_mft" worksheet of the linked Google Sheet into a dataframe and display it
morality_mft = google_conf.get_as_dataframe(theos_data.worksheet("morality_mft"))
morality_mft

Unnamed: 0,term,translation,moral_foundation
0,ἀγαθός,"good, virtuous","['Care/Harm', 'Fairness/Cheating']"
1,εὐεργετέω,"to do good, benefit",['Care/Harm']
2,εὐεργεσία,"benefaction, good deed",['Care/Harm']
3,εὐεργέτης,benefactor,['Care/Harm']
4,ἔλεος,"pity, mercy",['Care/Harm']
5,οἰκτίρω,"to pity, show compassion",['Care/Harm']
6,ἀγάπη,"love, charity",['Care/Harm']
7,φιλάνθρωπος,"benevolent, humanitarian",['Care/Harm']
8,δίκαιος,"just, righteous",['Fairness/Cheating']
9,δικαιοσύνη,"justice, righteousness",['Fairness/Cheating']


In [None]:
# Additional morality- and religion-related lemmata. dict.fromkeys drops repeated entries
# while keeping the first-occurrence order, so the lookups below list every lemma once.
morality_extension = list(dict.fromkeys(["δικαιοσύνη", "δικαιόω", "ἀδικέω", "ἄδικος", "σωφροσύνη", "φίλος", "ἁμαρτία", "πονηρός", "κακός", "τιμάω", "εὐεργετέω", "εὐεργεσία", "εὐεργέτης", "σπουδαῖος", "ἐπαινετός","βέλτιστος", "βελτίων", "μεγαλοψυχία", "μεγαλόψυχος", "τιμάω", "εὔνοια", "καλός", "ἀλήθεια"]))
religion_extension = list(dict.fromkeys(["εὐχή", "θεά", "θεῖος", "εὐσέβεια", "εὐσεβέω", "εὐλογέω", "εὐλογητός", "θεοφιλής", "ἀσεβέω", "ἀσεβής", "εὐσέβεια", "θύω", "δαίμων", "δαιμόνιον", "τελετή", "θεῖος", "ἱερόν", "εὔχομαι", "ναός", "καθιερόω", "ἱερή"]))

In [None]:
# absolute counts of the additional morality and religion lemmata
wordcounts_df.loc[morality_extension + religion_extension]

In [None]:
# relative frequencies of the additional morality and religion lemmata
wordfreqs_df.loc[morality_extension + religion_extension]

In [None]:
# send the shared vocabulary, sorted by total count, to a worksheet named "shared_vocabulary"
# NOTE(review): presumably add_worksheet fails when the worksheet already exists — confirm against google_conf
google_conf.set_with_dataframe(theos_data.add_worksheet("shared_vocabulary", 1,1), filtered_df.sort_values("total", ascending=False).reset_index())

In [None]:
# mean relative frequency across the subcorpora; only the subcorpus columns are averaged,
# so an already existing "freq_avg" column is never fed back into the mean
wordfreqs_df["freq_avg"] = wordfreqs_df[list(subcorpora_ids_dict.keys())].mean(axis=1)

In [None]:
# send the relative frequencies of the shared vocabulary, sorted by average frequency,
# to a worksheet named "shared_vocabulary_freqs"
google_conf.set_with_dataframe(theos_data.add_worksheet("shared_vocabulary_freqs", 1,1), wordfreqs_df.loc[shared_vocabulary].sort_values("freq_avg", ascending=False).reset_index())

In [None]:
# the 2000 most frequent lemmata of every subcorpus, concatenated (duplicates are kept here)
vocabulary_mostcommon2000 = [
    lemma
    for key in subcorpora_ids_dict.keys()
    for lemma in wordcounts_df.sort_values(key, ascending=False).index[:2000]
]

In [None]:
# number of distinct lemmata among the per-subcorpus top-2000 lists
len(list(set(vocabulary_mostcommon2000)))

In [None]:
# number of distinct lemmata once the shared vocabulary is added
len(list(set(vocabulary_mostcommon2000 + shared_vocabulary)))

In [None]:
# Union of the per-subcorpus top-2000 lists and the shared vocabulary, without duplicates.
# dict.fromkeys keeps first-occurrence order, so the list (which is pickled below) is the
# same on every run; list(set(...)) ordered the strings differently in each kernel session.
vocabulary_balanced = list(dict.fromkeys(vocabulary_mostcommon2000 + shared_vocabulary))

In [None]:
# relative frequencies of the first 100 lemmata of the balanced vocabulary
wordfreqs_df.loc[vocabulary_balanced].head(100)

In [None]:
# persist the balanced vocabulary list
with open("../data/vocabulary_balanced.pickle", "wb") as f:
    pickle.dump(vocabulary_balanced, f)

In [None]:
# first five rows of the counts table
wordcounts_df.head(5)

In [None]:
# per subcorpus: tokens covered by the balanced vocabulary, all tokens, and their ratio
balanced_counts = wordcounts_df.loc[vocabulary_balanced]
for key in subcorpora_ids_dict.keys():
    cover_n = balanced_counts[key].sum()
    total_n = wordcounts_df[key].sum()
    proportion = cover_n / total_n
    print(key, cover_n, total_n, proportion)

In [None]:
# NOTE: unfinished draft expression, disabled because the dangling `&` is a SyntaxError that
# stops "Run All". Presumably it was meant to filter the balanced vocabulary, which the
# following cells already do — confirm before removing.
# wordcounts_df[(wordcounts_df.loc[vocabulary_balanced]) & ]

In [None]:
# number of lemmata attested at least once in every subcorpus
(wordcounts_df > 0).all(axis=1).sum()

In [None]:
# balanced-vocabulary lemmata attested at least once in every subcorpus
filtered_df = wordcounts_df.loc[vocabulary_balanced]
attested_everywhere = (filtered_df > 0).all(axis=1)
len(filtered_df[attested_everywhere])

In [None]:
# balanced-vocabulary lemmata occurring at least 5 times in every subcorpus
filtered_df = wordcounts_df.loc[vocabulary_balanced]
at_least_five = (filtered_df >= 5).all(axis=1)
len(filtered_df[at_least_five])

In [None]:
# keep only the balanced-vocabulary lemmata occurring at least 10 times in every subcorpus
balanced_counts_all = wordcounts_df.loc[vocabulary_balanced]
at_least_ten = (balanced_counts_all >= 10).all(axis=1)
filtered_df = balanced_counts_all[at_least_ten]
len(filtered_df)

In [None]:
# redefine the shared vocabulary as the lemmata kept by the filter in the previous cell
# (this replaces the earlier list built with the threshold of 5)
shared_vocabulary = list(filtered_df.index)

In [None]:
# persist the shared vocabulary list
with open("../data/shared_vocabulary.pickle", "wb") as f:
    pickle.dump(shared_vocabulary, f)

In [None]:
# export the counts table and the relative-frequency table as JSON
wordcounts_df.to_json("../data/large_data/wordcounts_df.json")
wordfreqs_df.to_json("../data/large_data/wordfreqs_df.json")

In [None]:
# reload the counts table from the JSON file written in the previous cell
wordcounts_df = pd.read_json("../data/large_data/wordcounts_df.json")

In [None]:
# save the dataframe, including the "periods" and "cents" columns added in this notebook
LAGT.to_parquet("../data/large_data/LAGT_theos.parquet")