In [1]:
import math
import numpy as np
import pandas as pd
import pickle
import nltk

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.patches as patches
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora
import sddk

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

In [2]:
# Optional: set up connections for exporting data to Google Sheets / sciencedata.dk.
# (Feel free to skip this cell if you do not need to export anything.)

s = sddk.cloudSession("sciencedata.dk")
# Establish the connection with Google Sheets: read the service-account key
# through the sciencedata session and build scoped credentials from it.
file_data = s.read_file("https://sciencedata.dk/files/ServiceAccountsKey.json", "dict") # or load it from a local storage: json.load(open("../../ServiceAccountsKey.json", "r"))
credentials = service_account.Credentials.from_service_account_info(file_data)
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
# Handle to the target spreadsheet (not used in the cells visible here).
paul_results = gc.open_by_url("https://docs.google.com/spreadsheets/d/1h4M-gK9TPIfeTV528tUuPBfZF1wtcNCA10yIlJYqGTE/edit?usp=sharing")

endpoint variable has been configured to: https://sciencedata.dk/files/


In [3]:
# Load the LAGTec dataset from a local JSON file into a DataFrame.
LAGTec = pd.read_json("../data/large_files/LAGTec.json")

In [4]:
# Number of distinct values in the `author` column (missing values, if any, count as one).
LAGTec["author"].nunique(dropna=False)

49

In [5]:
# Number of rows in the dataset.
len(LAGTec)

148

In [6]:
# Sum of the `wordcount` column over all rows.
LAGTec["wordcount"].sum()

3566823

In [7]:
# Total number of lemmata: add up the sentence lengths within each row, then across rows.
LAGTec["lemmatized_sentences"].map(lambda sentences: sum(len(sentence) for sentence in sentences)).sum()

1506066

In [22]:
# Display the DataFrame (the rendering below is truncated to its first and last rows).
LAGTec

Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,tlg_epithet,clean_string,n_sentences,lemmatized_sentences,sentences_paul,sentences_paul_N,sentences_apostle,sentences_apostle_N,paul_N
385,tlg0031.tlg001.perseus-grc2.xml,Gospel of Matthew,Gospel of Matthew,18288,tlg0031a,tlg0031.tlg001,A.D. 1,0.5,{'0.5': 1},0.5,christian,[],ΒΙΒΛΟΣ γενέσεως Ἰησοῦ Χριστοῦ υἱοῦ Δαυεὶδ υἱο...,1276,"[[βίβλος, γένεσις, Ἰησοῦς, Χριστός, υἱός, Δαυί...",[],0,"[[ἀπόστολος, ὄνομα, εἰμί]]",1,0
386,tlg0031.tlg002.perseus-grc2.xml,Gospel of Mark,Gospel of Mark,11274,tlg0031b,tlg0031.tlg002,A.D. 1,0.5,{'0.5': 1},0.5,christian,[],ΑΡΧΗ τοῦ εὐαγγελίου Ἰησοῦ Χριστοῦ . Καθὼς γέγ...,790,"[[ἀρχή, εὐαγγέλιον, Ἰησοῦς, Χριστός], [γράφω, ...",[],0,"[[ποιέω, ἀπόστολος, ὀνομάζω, ἀποστέλλω, κηρύσσ...",2,0
387,tlg0031.tlg003.perseus-grc2.xml,Luke-Acts,Gospel of Luke,19458,tlg0031luke,tlg0031.tlg003,A.D. 1,0.5,{'0.5': 1},0.5,christian,[],ΕΠΕΙΔΗΠΕΡ ΠΟΛΛΟΙ ἐπεχείρησαν ἀνατάξασθαι διήγ...,1274,"[[πολύς, ἐπιχειρέω, ἀνατάσσομαι, διήγησις, πλη...",[],0,"[[γίγνομαι, ἡμέρα, προσφωνέω, μαθητής, ἐκλεξάμ...",6,0
388,tlg0031.tlg004.perseus-grc2.xml,Johnannine literature (New Testament),Gospel of John,15590,tlg0031john,tlg0031.tlg004,A.D. 1,0.5,{'0.5': 1},0.5,christian,[],"ΕΝ ΑΡΧΗ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θ...",1164,"[[εἰμί, ἀρχή, εἰμί, λόγος, λόγος, εἰμί, θεός, ...",[],0,"[[λέγω, εἰμί, δοῦλος, μέγας, κύριος, ἀπόστολος...",1,0
389,tlg0031.tlg005.perseus-grc2.xml,Luke-Acts,Acts,18406,tlg0031luke,tlg0031.tlg005,A.D. 1,0.5,{'0.5': 1},0.5,christian,[],"τὸν μὲν πρῶτον λόγον ἐποιησάμην περὶ πάντων, ...",960,"[[πρῶτος, λόγος, ποιέω, πᾶς, Θεόφιλος, ἄρχω, Ἰ...","[[διέρχομαι, ὅλος, νῆσος, Πάφος, εὑρίσκω, ἀνήρ...",125,"[[πρῶτος, λόγος, ποιέω, πᾶς, Θεόφιλος, ἄρχω, Ἰ...",26,128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1417,tlg4081.tlg001.perseus-grc1.xml,Colluthus,Rape of Helen,2336,tlg4081,tlg4081.tlg001,A.D. 5,4.5,{'4.5': 1},,christian,Epici/-ae,"νύμφαι Τρωιάδες, ποταμοῦ Ξάνθοιο γενέθλη, αἳ π...",209,"[[νύμφη, Τρῳάς, ποταμός, Ξάνθος, γενέθλη, πλόκ...",[],0,[],0,0
1418,tlg4084.tlg001.opp-grc1.xml,Zosimus,Historia Nova,62236,tlg4084,tlg4084.tlg001,A.D. 5?,4.5,{'4.5': 1},4.5,christian,Historici/-ae,"Πολυβίῳ τῷ Μεγαλοπολίτῃ, μνήμῃ παραδοῦναι τὰ ...",2265,"[[πολυβίῳ, μεγαλοπολίτῃ, μνήμη, παραδίδωμι, ἀξ...",[],0,[],0,0
1421,tlg4089.tlg003.opp-grc1.xml,Theodoret,Historia ecclesiastica,108815,tlg4089,tlg4089.tlg003,A.D. 4-5,4.5,"{'3.5': 0.5, '4.5': 0.5}",4.5,christian,Theologici,Τάδε ἔνεστιν ἐν τῷ πρώτῳ τόμῳ τῆς Θεοδωρήτου ἐ...,13146,"[[ἔνειμι, πρῶτος, τόμῳ, θεοδωρήτου, ἐκκλησιαστ...","[[προτρεπόμενον, Παῦλος, οὗτος, βιάζω, Πέτρος,...",31,"[[εὐσεβίος, παλαιστῖνος, ἱερόν, ἀπόστολος, ἱστ...",23,33
1422,tlg4089.tlg004.1st1K-grc1.xml,Theodoret,Historia Religiosa,44518,tlg4089,tlg4089.tlg004,A.D. 4-5,4.5,"{'3.5': 0.5, '4.5': 0.5}",4.5,christian,Theologici,"Τῶν ἀρίστων ἀνδρῶν, καὶ τῆς ἀρετῆς ἀθλητῶν, κ...",2948,"[[ἄριστος, ἀνήρ, ἀρετή, ἀθλητής, καλός, ὁράω, ...","[[τοιοῦτος, παντευχία, φάλαγξ, στρατηγός, πρόμ...",9,"[[ῥηθησομένων, ἀπιστέω, εὔδηλος, μωσέως, Ἰησοῦ...",13,9


In [23]:
# Aggregate the dataset by `date_avr`.
# `numeric_only=True` is spelled out on purpose: pandas >= 2.0 no longer drops
# non-numeric columns silently, so a bare `.sum()` would also try to "sum" the
# string / list / dict columns of LAGTec.
works_by_date = LAGTec.groupby("date_avr")
LAGTec_by_date = works_by_date.sum(numeric_only=True).reset_index()

# Per-date counts come from grouped aggregations (one pass over the data)
# and are aligned to the summary rows through the `date_avr` key.
# `dropna=False` keeps the count equal to `len(unique())`.
LAGTec_by_date["authors_N"] = LAGTec_by_date["date_avr"].map(works_by_date["author"].nunique(dropna=False))

LAGTec_by_date["works_N"] = LAGTec_by_date["date_avr"].map(works_by_date.size())

# Occurrences of "paul" relative to the total word count of each date group.
LAGTec_by_date["paul_freq"] = LAGTec_by_date["paul_N"] / LAGTec_by_date["wordcount"]

LAGTec_by_date[["date_avr", "authors_N", "wordcount", "n_sentences", "paul_N", "paul_freq", "sentences_paul_N", "sentences_apostle_N"]]

Unnamed: 0,date_avr,authors_N,wordcount,n_sentences,paul_N,paul_freq,sentences_paul_N,sentences_apostle_N
0,0.5,13,151723,10108,160,0.001055,156,84
1,1.5,13,322401,41076,18,5.6e-05,18,92
2,2.5,4,1237867,129474,497,0.000401,485,644
3,3.5,9,1347705,117599,213,0.000158,205,690
4,4.5,10,507127,45061,184,0.000363,181,146


In [24]:
# Split the corpus on `date_avr` (below 3 vs. above 3) and flatten each half
# from a list of works into one list of lemmatized sentences.
sents_1to3 = [
    sentence
    for work_sentences in LAGTec.loc[LAGTec["date_avr"] < 3, "lemmatized_sentences"]
    for sentence in work_sentences
]
sents_4to5 = [
    sentence
    for work_sentences in LAGTec.loc[LAGTec["date_avr"] > 3, "lemmatized_sentences"]
    for sentence in work_sentences
]

In [25]:
# Number of sentences in the earlier sub-corpus (date_avr < 3).
len(sents_1to3)

180658

In [26]:
# Peek at the first three sentences; each sentence is a list of lemmata.
print(sents_1to3[:3])

[['βίβλος', 'γένεσις', 'Ἰησοῦς', 'Χριστός', 'υἱός', 'Δαυίδ', 'υἱός', 'Ἀβραάμ'], ['Ἀβραάμ', 'γεννάω', 'Ἰσαάκ', 'Ἰσαάκ', 'γεννάω', 'Ἰακώβ', 'Ἰακώβ', 'γεννάω', 'Ἰούδας', 'ἀδελφός', 'Ἰούδας', 'γεννάω', 'Φάρες', 'Ζάρα', 'Θαμάρ', 'Φάρες', 'γεννάω', 'Ἑσρώμ', 'Ἑσρώμ', 'γεννάω', 'Ἀράμ', 'Ἀράμ', 'γεννάω', 'Ἀμιναδάβ', 'Ἀμιναδάβ', 'γεννάω', 'Ναασσών', 'Ναασσών', 'γεννάω', 'Σαλμών', 'Σαλμών', 'γεννάω', 'βοῦς', 'Ῥαχάβ', 'βοῦς', 'γεννάω', 'Ἰωβήδ', 'Ῥούθ', 'Ἰωβήδ', 'γεννάω', 'Ἰεσσαί', 'Ἰεσσαί', 'γεννάω', 'Δαυίδ', 'βασιλεύς'], ['Δαυίδ', 'γεννάω', 'Σολομών', 'Οὐρίας', 'Σολομών', 'γεννάω', 'Ῥοβοάμ', 'Ῥοβοάμ', 'γεννάω', 'Ἀβιά', 'Ἀβιά', 'γεννάω', 'Ἀσάφ', 'Ἀσάφ', 'γεννάω', 'Ἰωσαφάτ', 'Ἰωσαφάτ', 'γεννάω', 'Ἰωράμ', 'Ἰωράμ', 'γεννάω', 'Ὀζίας', 'Ὀζίας', 'γεννάω', 'Ἰωαθάμ', 'Ἰωαθάμ', 'γεννάω', 'ἄχας', 'ἄχας', 'γεννάω', 'Ἑζεκίας', 'Ἑζεκίας', 'γεννάω', 'Μανασσῆς', 'Μανασσῆς', 'γεννάω', 'Ἀμώς', 'Ἀμώς', 'γεννάω', 'Ἰωσίας', 'Ἰωσίας', 'γεννάω', 'Ἰεχονίας', 'ἀδελφός', 'μετοικεσία', 'Βαβυλών']]


In [27]:
# Minimum frequency threshold. NOTE(review): not referenced by the visible cells,
# which rely on the functions' own `min_freq=5` defaults — confirm it is still needed.
min_freq = 5

def get_vocab(docs, min_freq=5):
    """Flatten tokenized documents and list the tokens occurring at least `min_freq` times.

    Parameters
    ----------
    docs : iterable of iterables of str
        Tokenized documents (in this notebook: lemmatized sentences).
    min_freq : int, default 5
        Minimum total number of occurrences for a token to enter the vocabulary.

    Returns
    -------
    words_flat : list
        Every token of every document, in the original order.
    vocabulary : list
        Tokens with frequency >= `min_freq`, ordered from most to least frequent
        (ties keep first-seen order).
    """
    # Local stdlib import: plain token counting does not need nltk. nltk.FreqDist
    # is a collections.Counter subclass, so `most_common()` yields the same order.
    from collections import Counter

    words_flat = [word for doc in docs for word in doc]
    vocabulary = [word for word, freq in Counter(words_flat).most_common() if freq >= min_freq]
    return words_flat, vocabulary

In [28]:
# Flattened token lists and frequency-ordered vocabularies (default min_freq=5) for both sub-corpora.
words_flat_1to3, vocabulary_1to3 = get_vocab(sents_1to3)
words_flat_4to5, vocabulary_4to5 = get_vocab(sents_4to5)

In [29]:
# Check whether the two target lemmata passed the frequency threshold in each sub-corpus.
print("Παῦλος" in vocabulary_1to3)
print("Παῦλος" in vocabulary_4to5)
print("ἀπόστολος" in vocabulary_1to3)
print("ἀπόστολος" in vocabulary_4to5)

True
True
True
True


In [30]:
# Inspect the last 100 tokens of the flattened earlier sub-corpus.
words_flat_1to3[-100:]

['ὄμμα',
 'μένω',
 'ἀπειλή',
 'σκώληξ',
 'σῶμα',
 'ἀπουσία',
 'ἐπιστρεφόμενον',
 'ἐκβράσαν',
 'σῶμα',
 'ἐπιστρεφής',
 'οὗτος',
 'ἐκφεύγω',
 'θεός',
 'εἰμί',
 'διδαχθείς',
 'ἔχω',
 'ἀθάνατος',
 'σῶμα',
 'ἄφθαρτ',
 'ψυχή',
 'βασιλεία',
 'οὐρανός',
 'ἀπόληψις',
 'γῆ',
 'βίος',
 'ἐπουράνιος',
 'βασιλεύς',
 'ἐπιγνούς',
 'εἰμί',
 'ὁμιλητής',
 'θεός',
 'συγκληρονόμος',
 'Χριστός',
 'ἐπιθυμία',
 'πάθος',
 'νόσος',
 'δουλούμενος',
 'γίγνομαι',
 'θεός',
 'ὑπομένω',
 'πάθος',
 'ἄνθρωπος',
 'εἰμί',
 'δίδωμι',
 'ἄνθρωπος',
 'εἷς',
 'ὅσος',
 'παρακολουθέω',
 'θεός',
 'παρέχω',
 'ἐπαγγέλλομαι',
 'θεός',
 'ἐθεοποιήθης',
 'ἀθάνατος',
 'γεννηθείς',
 'τουτέστι',
 'γιγνώσκω',
 'ἐπιγιγνώσκω',
 'πεποιηκότα',
 'θεός',
 'ἐπιγιγνώσκω',
 'ἐπιγνωσθῆναι',
 'συμβαίνω',
 'καλουμένῳ',
 'φιλεχθρήσητε',
 'ἄνθρωπος',
 'παλινδρομεῖν',
 'διστάσητε',
 'Χριστός',
 'πᾶς',
 'θεός',
 'ἁμαρτία',
 'ἄνθρωπος',
 'ἀποπλύνω',
 'προστάσσω',
 'νέος',
 'παλαιός',
 'ἄνθρωπος',
 'ἀποτελῶν',
 'εἰκών',
 'καλέω',
 'ἀρχή',
 'τύπος',
 'ἐπιδε

In [31]:
# Total number of tokens in each flattened sub-corpus.
print(len(words_flat_1to3))
print(len(words_flat_4to5))

695239
810827


In [32]:
def get_cooc(docs, vocabulary=None, min_freq=5):
    """Build a sentence-level co-occurrence matrix over `vocabulary`.

    Cell (i, j) is the share of documents in which vocabulary terms i and j both
    occur (repeated occurrences inside one document count once); the diagonal
    therefore holds each term's document frequency divided by `len(docs)`.

    The matrix is built directly from the token lists. The tokens are NOT joined
    into a string and re-tokenized, so single-character lemmata and lemmata
    containing non-word characters are counted like any other token.

    Parameters
    ----------
    docs : sequence of iterables of str
        Tokenized documents (in this notebook: lemmatized sentences).
    vocabulary : sequence of str, optional
        Terms defining the rows/columns, in order. When omitted it is derived
        with `get_vocab(docs, min_freq)`.
    min_freq : int, default 5
        Frequency threshold, only used when `vocabulary` is omitted.

    Returns
    -------
    cooc : scipy sparse matrix of shape (len(vocabulary), len(vocabulary))
    vocabulary : the vocabulary actually used

    Raises
    ------
    ValueError
        If `docs` is empty or `vocabulary` contains duplicate terms.
    """
    from scipy import sparse  # local import; scipy is not imported at file level

    # `is None` instead of `== None`: the latter is element-wise for array-likes.
    if vocabulary is None:
        _, vocabulary = get_vocab(docs, min_freq)

    n_docs = len(docs)
    if n_docs == 0:
        raise ValueError("docs must contain at least one document")

    column_of = {term: col for col, term in enumerate(vocabulary)}
    if len(column_of) != len(vocabulary):
        raise ValueError("vocabulary contains duplicate terms")

    # Binary document-term matrix: one entry per (document, distinct in-vocabulary term).
    row_idx, col_idx = [], []
    for row, doc in enumerate(docs):
        for term in set(doc):
            col = column_of.get(term)
            if col is not None:
                row_idx.append(row)
                col_idx.append(col)
    bow = sparse.csr_matrix(
        (np.ones(len(row_idx), dtype=np.int64), (row_idx, col_idx)),
        shape=(n_docs, len(vocabulary)),
    )

    # `@` is an unambiguous matrix product for both sparse matrices and sparse arrays.
    cooc = (bow.T @ bow) / n_docs
    return cooc, vocabulary

In [33]:
# Co-occurrence matrices restricted to the first 2000 (i.e. most frequent) vocabulary entries.
# Note: this rebinds `vocabulary_1to3` / `vocabulary_4to5` to the truncated lists.
cooc_1to3, vocabulary_1to3 = get_cooc(sents_1to3, vocabulary=vocabulary_1to3[:2000])
cooc_4to5, vocabulary_4to5 = get_cooc(sents_4to5, vocabulary=vocabulary_4to5[:2000])

In [34]:
# Shape check: one row and one column per vocabulary entry.
cooc_4to5.todense().shape

(2000, 2000)

In [35]:
def normalize_ppmi3_matrix(pmi_matrix_df):
    """Min-max rescale a PMI matrix, mapping undefined (NaN) cells to 0.

    Missing cells are filled with a floor of twice the smallest observed value,
    and the matrix is then rescaled so the floor maps to 0 and the largest
    observed value maps to 1.

    NOTE: the floor only lies below every observed value when the minimum is
    negative, which is what the `2 * min` construction presupposes.

    The input frame is left untouched; a new DataFrame is returned.

    Parameters
    ----------
    pmi_matrix_df : pandas.DataFrame
        Numeric matrix, NaN where the PMI is undefined.

    Returns
    -------
    pandas.DataFrame
        Rescaled matrix with the same index and columns.
    """
    minval = pmi_matrix_df.min().min()
    maxval = pmi_matrix_df.max().max()
    floor = minval * 2
    # Non-mutating fill: the caller's DataFrame keeps its NaNs.
    filled = pmi_matrix_df.fillna(floor)
    return (filled - floor) / (maxval - floor)

In [49]:
def get_ppmi_df(cooc, vocabulary, normalize=True, exp=1):
    """Turn a co-occurrence matrix into a (optionally normalized) PMI DataFrame.

    For every pair (i, j) the value is ``log(cooc[i, j] ** exp / (cooc[i, i] * cooc[j, j]))``,
    i.e. the diagonal of `cooc` is used as each term's marginal. Cells whose
    ratio is not positive (no co-occurrence, or a zero marginal) are undefined.
    Values are not clipped at zero.

    Parameters
    ----------
    cooc : square scipy sparse matrix (or array-like)
        Co-occurrence matrix as produced by `get_cooc`.
    vocabulary : sequence of str
        Row/column labels, in the order of the matrix.
    normalize : bool, default True
        If true, rescale with `normalize_ppmi3_matrix` (undefined cells become 0)
        and set the diagonal to 1. Otherwise return raw log values with NaN for
        undefined cells.
    exp : number, default 1
        Exponent applied to the joint term.

    Returns
    -------
    pandas.DataFrame
        Matrix indexed and labelled by `vocabulary`.
    """
    # One dense conversion instead of per-cell sparse lookups inside Python loops.
    dense = cooc.toarray() if hasattr(cooc, "toarray") else np.asarray(cooc)
    dense = np.asarray(dense, dtype=float)

    marginals = dense.diagonal()
    denominator = np.outer(marginals, marginals)
    numerator = np.power(dense, exp)
    ratio = np.divide(numerator, denominator, out=np.zeros_like(numerator), where=denominator != 0)

    # log only where defined; everything else stays NaN.
    pmi = np.full_like(ratio, np.nan)
    np.log(ratio, out=pmi, where=ratio > 0)

    pmi_matrix_df = pd.DataFrame(pmi, columns=vocabulary, index=vocabulary)
    if normalize:
        pmi_matrix_df = normalize_ppmi3_matrix(pmi_matrix_df)
        # Write the diagonal on an explicit copy and rebuild the frame:
        # `DataFrame.to_numpy()` is not guaranteed to be a writable view
        # (read-only under Copy-on-Write, a copy for multi-block frames).
        values = pmi_matrix_df.to_numpy(dtype=float, copy=True)
        np.fill_diagonal(values, 1)
        pmi_matrix_df = pd.DataFrame(values, columns=vocabulary, index=vocabulary)
    return pmi_matrix_df

In [50]:
def svd_reduction(cooc_matrix, n_components=300, random_state=1, n_iter=100):
    """Project the rows of `cooc_matrix` onto `n_components` dimensions with truncated SVD.

    The keyword arguments are forwarded unchanged to sklearn's `TruncatedSVD`;
    the return value is whatever its `fit_transform` produces for `cooc_matrix`.
    """
    reducer = TruncatedSVD(
        n_components=n_components,
        n_iter=n_iter,
        random_state=random_state,
    )
    return reducer.fit_transform(cooc_matrix)

In [51]:
def from_docs_to_embeddings(docs, vocabulary=None, min_freq=5, n_components=300, random_state=1, n_iter=100):
    """Run the full pipeline: co-occurrence -> normalized PMI -> SVD vectors -> cosine similarities.

    Parameters
    ----------
    docs : sequence of iterables of str
        Tokenized documents (in this notebook: lemmatized sentences).
    vocabulary : sequence of str, optional
        Terms to embed; derived from `docs` via `get_cooc` when omitted.
    min_freq : int, default 5
        Frequency threshold used only when `vocabulary` is omitted.
    n_components, random_state, n_iter :
        Forwarded to `svd_reduction`. The defaults (300, 1, 100) are the values
        that were previously hard-coded here, so existing calls are unaffected.

    Returns
    -------
    list
        ``[cooc, vocabulary, pmi_matrix, word_vectors_df, pmi_svd_cos]``:
        the co-occurrence matrix, the vocabulary used, the normalized PMI
        DataFrame, the word vectors (one row per term) and the term-by-term
        cosine-similarity DataFrame of those vectors.
    """
    cooc, vocabulary = get_cooc(docs, vocabulary=vocabulary, min_freq=min_freq)
    pmi_matrix = get_ppmi_df(cooc, vocabulary)
    word_vectors_array = svd_reduction(
        pmi_matrix,
        n_components=n_components,
        random_state=random_state,
        n_iter=n_iter,
    )
    word_vectors_df = pd.DataFrame(word_vectors_array, index=vocabulary)
    pmi_svd_cos = pd.DataFrame(cosine_similarity(word_vectors_array), columns=vocabulary, index=vocabulary)
    return [cooc, vocabulary, pmi_matrix, word_vectors_df, pmi_svd_cos]

In [52]:
%%time
# Full pipeline for both sub-corpora on the first 2000 vocabulary entries.
# Each result is the list [cooc, vocabulary, pmi_matrix, word_vectors_df, pmi_svd_cos].
data_1to3 = from_docs_to_embeddings(sents_1to3, vocabulary=vocabulary_1to3[:2000])
data_4to5 = from_docs_to_embeddings(sents_4to5, vocabulary=vocabulary_4to5[:2000])

CPU times: user 17min 13s, sys: 59.3 s, total: 18min 12s
Wall time: 4min 31s


In [54]:
# Last element of the result list: cosine similarities between the SVD word vectors.
data_1to3[-1]

Unnamed: 0,εἰμί,λέγω,θεός,οὗτος,γίγνομαι,λόγος,πᾶς,ἄνθρωπος,αὐτός,ἔχω,...,χολή,συγγένεια,ἐπιφαίνω,χρίω,λίμνη,ἄνοια,σιγάω,ἀναλίσκω,ὀκτώ,κόρος
εἰμί,1.000000,0.998141,0.998323,0.999159,0.998005,0.996254,0.997681,0.994673,0.997512,0.996940,...,0.369405,0.400348,0.437838,0.391623,0.384709,0.391891,0.374818,0.378586,0.349940,0.391205
λέγω,0.998141,1.000000,0.997137,0.998396,0.996780,0.995034,0.996017,0.993321,0.996687,0.995547,...,0.375992,0.398502,0.440093,0.394344,0.383892,0.387610,0.381950,0.378519,0.349337,0.384777
θεός,0.998323,0.997137,1.000000,0.998059,0.997308,0.995369,0.997929,0.994898,0.996347,0.995621,...,0.365405,0.400487,0.444337,0.393022,0.381799,0.392488,0.376571,0.380320,0.344683,0.388224
οὗτος,0.999159,0.998396,0.998059,1.000000,0.997624,0.996842,0.997424,0.994684,0.997466,0.996798,...,0.369606,0.400222,0.438676,0.390455,0.382191,0.392558,0.375043,0.376054,0.348919,0.391052
γίγνομαι,0.998005,0.996780,0.997308,0.997624,1.000000,0.994969,0.996563,0.993824,0.995595,0.995159,...,0.369035,0.402807,0.442673,0.394125,0.390936,0.392797,0.379658,0.382931,0.355364,0.387002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ἄνοια,0.391891,0.387610,0.392488,0.392558,0.392797,0.395945,0.399042,0.398590,0.392535,0.389610,...,0.360666,0.394274,0.401683,0.322940,0.366715,1.000000,0.349005,0.322519,0.339740,0.519520
σιγάω,0.374818,0.381950,0.376571,0.375043,0.379658,0.384002,0.380710,0.381704,0.380521,0.374030,...,0.350373,0.432803,0.456298,0.380891,0.376296,0.349005,1.000000,0.392811,0.372819,0.292413
ἀναλίσκω,0.378586,0.378519,0.380320,0.376054,0.382931,0.379643,0.384724,0.387030,0.379469,0.379048,...,0.413345,0.373501,0.344293,0.361220,0.295441,0.322519,0.392811,1.000000,0.269675,0.315951
ὀκτώ,0.349940,0.349337,0.344683,0.348919,0.355364,0.348627,0.348518,0.345561,0.348724,0.344550,...,0.309067,0.423328,0.291289,0.352115,0.311331,0.339740,0.372819,0.269675,1.000000,0.277260


In [55]:
# Persist both result lists. The `with` blocks make sure each file handle is
# flushed and closed (a bare `open(...)` passed to `pickle.dump` is never closed explicitly).
with open("../data/large_files/embeddings_ppmi_1to3.pkl", "wb") as pkl_file:
    pickle.dump(data_1to3, pkl_file, pickle.HIGHEST_PROTOCOL)
with open("../data/large_files/embeddings_ppmi_4to5.pkl", "wb") as pkl_file:
    pickle.dump(data_4to5, pkl_file, pickle.HIGHEST_PROTOCOL)