In [74]:
import pandas as pd
import nltk
import sddk
import numpy as np
import math

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

In [80]:
# Optional: connections used only for exporting results to
# Google Sheets / sciencedata.dk (feel free to skip this cell).

s = sddk.cloudSession("sciencedata.dk")
# Establish the connection with Google Sheets. The service-account key is read
# from sciencedata.dk; alternatively load it from local storage with
# json.load(open("../../ServiceAccountsKey.json", "r")).
file_data = s.read_file(
    "https://sciencedata.dk/files/ServiceAccountsKey.json",
    "dict",
)
credentials = service_account.Credentials.from_service_account_info(file_data)
gc = gspread.Client(
    auth=credentials.with_scopes(
        [
            "https://spreadsheets.google.com/feeds",
            "https://www.googleapis.com/auth/drive",
        ]
    )
)
paul_results = gc.open_by_url(
    "https://docs.google.com/spreadsheets/d/1h4M-gK9TPIfeTV528tUuPBfZF1wtcNCA10yIlJYqGTE/edit?usp=sharing"
)

endpoint variable has been configured to: https://sciencedata.dk/files/


In [59]:
# Load the LAGTec dataset from its JSON file (path is relative to this notebook).
LAGTec = pd.read_json(path_or_buf="../data/large_files/LAGTec.json")

In [60]:
# List every distinct value found in the "author" column.
print(pd.unique(LAGTec["author"]).tolist())

['Gospel of Matthew', 'Gospel of Mark', 'Luke-Acts', 'Johnannine literature (New Testament)', 'Paul of Tarsus', 'Pauline literature', 'Hebrews (New Testament)', 'James (New Testament)', '1 Peter (New Testament)', '2 Peter (New Testament)', 'Jude (New Testament)', 'Revelation (New Testament)', 'Acta Joannis (recensio)', 'Clement of Alexandria', 'Maximus of Tyre', 'Justin Martyr', 'Pseudo-Justin Martyr', 'Athenagoras', 'Barnabas', 'Clement of Rome', 'Didache', 'Dionysius of Corinth', 'The Shepherd of Hermas', 'Ignatius of Antioch', 'Irenaeus', 'Martyrdom of Polycarp', 'Polycarp', 'Seniores Apud Irenaeum', 'Theophilus of Antioch', 'Tatianus', 'Eusebius', 'Epiphanius', 'Gregory of Nazianzus', 'Pappus Alexandrinus', 'Athanasius of Alexandria', 'Basil of Caesarea', 'Origen', 'Sozomen', 'Socrates of Constantinople', 'Philostorgius', 'Hippolytus of Rome', 'Mark the Deacon', 'Marcianus of Heraclea', 'Marinus', 'Colluthus', 'Zosimus', 'Theodoret', 'Cyril of Alexandria']


# Sentences as a whole

In [61]:
# Flatten each row's "sentences_paul" (iterated as a sequence of sentences,
# each a sequence of words) into one flat word list per row.
LAGTec["sentences_paul_wordlist"] = LAGTec["sentences_paul"].apply(
    lambda sentences: [word for sentence in sentences for word in sentence]
)

In [62]:
# Pool the per-row word lists into a single list covering the whole table.
paul_wordlist = [
    word
    for row_words in LAGTec["sentences_paul_wordlist"]
    for word in row_words
]

In [63]:
# (word, count) pairs ordered from most to least frequent; preview the top 50.
paul_wordlist_freqs = (
    nltk.FreqDist(paul_wordlist)
    .most_common()
)
paul_wordlist_freqs[0:50]

[('Παῦλος', 1072),
 ('λέγω', 380),
 ('εἰμί', 215),
 ('θεός', 198),
 ('φημί', 167),
 ('οὗτος', 134),
 ('γίγνομαι', 130),
 ('Χριστός', 128),
 ('ἀπόστολος', 123),
 ('λόγος', 103),
 ('Ἰησοῦς', 103),
 ('πᾶς', 90),
 ('ἄνθρωπος', 79),
 ('ἅγιος', 74),
 ('πολύς', 68),
 ('ἐκκλησία', 64),
 ('κύριος', 63),
 ('ἔχω', 61),
 ('πνεῦμα', 56),
 ('αὐτός', 56),
 ('Ἰουδαῖος', 55),
 ('ἐπίσκοπος', 54),
 ('Πέτρος', 52),
 ('γράφω', 47),
 ('ποιέω', 44),
 ('ἄλλος', 43),
 ('ἀκούω', 39),
 ('μόνος', 39),
 ('υἱός', 38),
 ('τὶς', 38),
 ('ἐπιστολή', 38),
 ('σοφία', 38),
 ('πατήρ', 37),
 ('διδάσκω', 35),
 ('πίστις', 34),
 ('ὁράω', 32),
 ('ἀδελφός', 32),
 ('νόμος', 32),
 ('ἀνήρ', 31),
 ('οἶδα', 31),
 ('εὐαγγέλιον', 31),
 ('σῶμα', 31),
 ('ἔθνος', 30),
 ('Κορίνθιος', 30),
 ('πρῶτος', 29),
 ('πόλις', 29),
 ('ἡμέρα', 29),
 ('ὄνομα', 28),
 ('μέλλω', 28),
 ('δύναμις', 28)]

In [64]:
# Same frequency list as a two-column table; show its first 100 rows.
paul_wordlist_freqs_df = pd.DataFrame(
    data=paul_wordlist_freqs,
    columns=["word", "count"],
)
paul_wordlist_freqs_df.head(100)

Unnamed: 0,word,count
0,Παῦλος,1072
1,λέγω,380
2,εἰμί,215
3,θεός,198
4,φημί,167
...,...,...
95,παρίστημι,16
96,λοιπός,16
97,νοέω,16
98,Ἰωάν(ν)ης,16


In [65]:
# How many distinct words occur at least 5 times?
int((paul_wordlist_freqs_df["count"] >= 5).sum())

395

In [66]:
#set_with_dataframe(paul_results.worksheet("paul_sentences_wordfreqs"), paul_wordlist_freqs_df[paul_wordlist_freqs_df["count"]>=5])

# Analysis by authors

In [67]:
# Load the metadata table (used below for the author_id -> author / date_avr lookups).
LAGT_metadata = pd.read_csv(filepath_or_buffer="../data/LAGT_metadata.csv")

In [68]:
# Keep every row except those whose author_id is "tlg0031paul".
df = LAGTec[LAGTec["author_id"] != "tlg0031paul"]

# Per-author totals, ordered by the number of Paul sentences (largest first).
# The five count columns are selected *before* aggregating, so .sum() never
# has to process the list/text columns of the table (which is wasteful and,
# depending on the pandas version, may concatenate them or raise).
df_authors = (
    df.groupby("author_id")[
        ["wordcount", "n_sentences", "paul_N", "sentences_paul_N", "sentences_apostle_N"]
    ]
    .sum()
    .sort_values("sentences_paul_N", ascending=False)
)

# Lookup table: author_id -> author name, taken from the metadata.
author_dict = dict(zip(LAGT_metadata["author_id"], LAGT_metadata["author"]))
def get_author(x):
    """Return the author name stored in ``author_dict`` for id ``x``.

    Returns None when ``x`` is not a key of ``author_dict`` (or cannot be used
    as a dictionary key at all). Any other problem -- in particular
    ``author_dict`` not being defined because cells were run out of order --
    is raised instead of being silently turned into None.
    """
    try:
        return author_dict[x]
    except (KeyError, TypeError):
        # KeyError: unknown id; TypeError: unhashable value.
        return None

# Lookup table: author_id -> "date_avr" value, taken from the metadata.
date_dict = {
    author_id: date_avr
    for author_id, date_avr in zip(LAGT_metadata["author_id"], LAGT_metadata["date_avr"])
}

def get_list_of_works(author_id):
    """Return the "title" values of all LAGTec rows with the given author_id."""
    is_author = LAGTec["author_id"] == author_id
    return LAGTec.loc[is_author, "title"].tolist()

def get_flat_sentences(author_id):
    """Return all sentences of one author as a single list.

    Collects the "lemmatized_sentences" cell of every LAGTec row with the
    given author_id and concatenates their items, in row order.
    """
    is_author = LAGTec["author_id"] == author_id
    sents_flat = []
    for work in LAGTec.loc[is_author, "lemmatized_sentences"]:
        sents_flat.extend(work)
    return sents_flat

def get_flat_words(author_id, sents_col):
    """Return every word of one author from the sentence column ``sents_col``.

    Each cell of ``sents_col`` is iterated as a sequence of sentences and each
    sentence as a sequence of words; the words of all LAGTec rows with the
    given author_id are returned as one flat list, in order.
    """
    is_author = LAGTec["author_id"] == author_id
    words_flat = []
    for work in LAGTec.loc[is_author, sents_col]:
        for sentence in work:
            words_flat.extend(sentence)
    return words_flat


# Move the "author_id" group keys from the index back into a regular column.
df_authors = df_authors.reset_index()

# The author name goes in directly as the first column
# (None when the id is not present in author_dict).
df_authors.insert(0, "author", df_authors["author_id"].apply(get_author))
df_authors["works"] = df_authors["author_id"].apply(get_list_of_works)
df_authors["works_N"] = df_authors["works"].apply(len)
# .get() instead of [...]: an id missing from the metadata yields None here,
# consistent with get_author, instead of aborting the cell with a KeyError.
df_authors["date_avr"] = df_authors["author_id"].apply(lambda x: date_dict.get(x))

df_authors["lemmatized_sentences"] = df_authors["author_id"].apply(get_flat_sentences)
# All words from the author's "sentences_paul", minus the lemma "Παῦλος" itself.
df_authors["words_paul"] = df_authors["author_id"].apply(
    lambda x: [w for w in get_flat_words(x, "sentences_paul") if w != "Παῦλος"]
)

# Share of the author's sentences that are Paul sentences.
df_authors["paul_proportion"] = df_authors["sentences_paul_N"] / df_authors["n_sentences"]
df_authors

Unnamed: 0,author,author_id,wordcount,n_sentences,paul_N,sentences_paul_N,sentences_apostle_N,works,works_N,date_avr,lemmatized_sentences,words_paul,paul_proportion
0,Origen,tlg2042,931431,110127,471,459,526,"[Contra Celsum, Commentarii in Evangelium Joan...",44,2.0,"[[σωτήρ, κύριος, Ἰησοῦς, Χριστός, ψευδομαρτυρο...","[μυρίος, καταλέγω, εἰωθότα, χωρίζω, ἀγάπη, Χρι...",0.004168
1,Luke-Acts,tlg0031luke,37864,2234,128,125,32,"[Gospel of Luke, Acts]",2,0.5,"[[πολύς, ἐπιχειρέω, ἀνατάσσομαι, διήγησις, πλη...","[διέρχομαι, ὅλος, νῆσος, Πάφος, εὑρίσκω, ἀνήρ,...",0.055953
2,Epiphanius,tlg2021,325038,33079,108,102,357,"[Ancoratus, Panarion (Adversus haereses), Anac...",3,3.5,"[[θεῖος, μέγας, πατήρ, ἐπιφάνιος, ἐλευθεροπόλε...","[συνᾴδω, λόγος, λέγω, ναός, θεός, πνεῦμα, θεός...",0.003084
3,Socrates of Constantinople,tlg2057,103901,8743,77,76,64,[Historia Ecclesiastica],1,4.0,"[[βιβλίον, προοίμιον], [εὐσέβιος, παμφίλου, ὅλ...","[λέγω, πείθω, γίγνομαι, γράφοντος, δἰ, ὅς, πᾶς...",0.008693
4,Sozomen,tlg2048,112966,7150,63,63,30,[Historia Ecclesiastica],1,4.5,"[[Φᾶσις, αὐτοκράτωρ, ἐπιμελής, χρῆμα, γίγνομαι...","[μέγας, ἀντωνίου, ἅγιος, ἁπλόω, γίγνομαι, εὐδό...",0.008811
5,Athanasius of Alexandria,tlg2035,94010,10789,49,49,90,[De decretis Nicaenae synodi (Chapters 41 and ...,5,3.5,"[[ἀρτέομαι, δίνη, συναγομένων], [θεός, θεός, ε...","[λέγω, ὑπερυψόω, καθίζω, δεξιᾷʼ, λεκτέος, Ἀθήν...",0.004542
6,Theodoret,tlg4089,153333,16094,42,40,36,"[Historia ecclesiastica , Historia Religiosa]",2,4.0,"[[ἔνειμι, πρῶτος, τόμῳ, θεοδωρήτου, ἐκκλησιαστ...","[προτρεπόμενον, οὗτος, βιάζω, Πέτρος, μηδείς, ...",0.002485
7,Basil of Caesarea,tlg2040,144429,11257,25,23,36,"[To Young Men, On How They Might Dervice Profi...",2,3.5,"[[ἀπειρηκότα, λεγομένηςλεγομένης], [τύχη, ἐπήρ...","[οἷος, φέρω, λέγω, μόνος, ἁρπάζω, τρίτος, οὐρα...",0.002043
8,Clement of Alexandria,tlg0555,235754,14753,18,18,101,"[Protrepticus, Paedagogus, Stromata (Books I-V...",7,2.0,"[[ἀμφίων, Θηβαῖος, Ἄριοι, μηθυμναῖος, ἄμφω, εἰ...","[θεοσέβεια, πᾶς, ὠφέλιμοσ, ἐπαγγελία, ἔχω, ζωή...",0.00122
9,Eusebius,tlg2018,620546,35220,17,17,199,"[Praeperatio Evangelica, Historia ecclesiastic...",10,3.5,"[[χριστιανισον, εἰμί, ἠγούμενος, οἶδα, παρίστη...","[πρῶτος, πᾶς, ἱερός, ἀπόστολος, ἀπατηλὰς, σοφι...",0.000483


In [69]:
def get_tf(lemmata):
    """Return (lemma, relative frequency) pairs, most frequent lemma first.

    The relative frequency is the lemma's count divided by ``len(lemmata)``,
    rounded to 5 decimals. An empty input gives an empty list.
    """
    total = len(lemmata)
    return [
        (lemma, np.round(count / total, 5))
        for lemma, count in nltk.FreqDist(lemmata).most_common()
    ]

# Term frequencies of the words found in each author's Paul sentences.
df_authors["paul_sents_TF"] = (
    df_authors["words_paul"]
    .apply(get_tf)
)

In [70]:
# Authors with more than 10 in "paul_N", ordered by "date_avr".
df_authors.loc[df_authors["paul_N"].gt(10)].sort_values(by="date_avr")

Unnamed: 0,author,author_id,wordcount,n_sentences,paul_N,sentences_paul_N,sentences_apostle_N,works,works_N,date_avr,lemmatized_sentences,words_paul,paul_proportion,paul_sents_TF
1,Luke-Acts,tlg0031luke,37864,2234,128,125,32,"[Gospel of Luke, Acts]",2,0.5,"[[πολύς, ἐπιχειρέω, ἀνατάσσομαι, διήγησις, πλη...","[διέρχομαι, ὅλος, νῆσος, Πάφος, εὑρίσκω, ἀνήρ,...",0.055953,"[(λέγω, 0.03648), (Ἰουδαῖος, 0.01858), (ἀνήρ, ..."
11,Irenaeus,tlg1447,24651,4525,12,12,17,[Libros quinque adversus haereses],1,1.5,"[[ἀλήθεια, παραπεμπόμενοί], [ἐπεισάγουσι, λόγο...","[φανερός, λέγω, ὅδε, αἰών, ὀνομάζω, τάξις, τηρ...",0.002652,"[(λέγω, 0.20339), (αἰών, 0.05085), (ἀπόστολος,..."
0,Origen,tlg2042,931431,110127,471,459,526,"[Contra Celsum, Commentarii in Evangelium Joan...",44,2.0,"[[σωτήρ, κύριος, Ἰησοῦς, Χριστός, ψευδομαρτυρο...","[μυρίος, καταλέγω, εἰωθότα, χωρίζω, ἀγάπη, Χρι...",0.004168,"[(λέγω, 0.04001), (φημί, 0.02774), (εἰμί, 0.02..."
8,Clement of Alexandria,tlg0555,235754,14753,18,18,101,"[Protrepticus, Paedagogus, Stromata (Books I-V...",7,2.0,"[[ἀμφίων, Θηβαῖος, Ἄριοι, μηθυμναῖος, ἄμφω, εἰ...","[θεοσέβεια, πᾶς, ὠφέλιμοσ, ἐπαγγελία, ἔχω, ζωή...",0.00122,"[(θεός, 0.05208), (Χριστός, 0.02604), (φημί, 0..."
2,Epiphanius,tlg2021,325038,33079,108,102,357,"[Ancoratus, Panarion (Adversus haereses), Anac...",3,3.5,"[[θεῖος, μέγας, πατήρ, ἐπιφάνιος, ἐλευθεροπόλε...","[συνᾴδω, λόγος, λέγω, ναός, θεός, πνεῦμα, θεός...",0.003084,"[(λέγω, 0.04825), (ἀπόστολος, 0.02632), (εἰμί,..."
5,Athanasius of Alexandria,tlg2035,94010,10789,49,49,90,[De decretis Nicaenae synodi (Chapters 41 and ...,5,3.5,"[[ἀρτέομαι, δίνη, συναγομένων], [θεός, θεός, ε...","[λέγω, ὑπερυψόω, καθίζω, δεξιᾷʼ, λεκτέος, Ἀθήν...",0.004542,"[(λέγω, 0.05488), (εἰμί, 0.03963), (θεός, 0.03..."
7,Basil of Caesarea,tlg2040,144429,11257,25,23,36,"[To Young Men, On How They Might Dervice Profi...",2,3.5,"[[ἀπειρηκότα, λεγομένηςλεγομένης], [τύχη, ἐπήρ...","[οἷος, φέρω, λέγω, μόνος, ἁρπάζω, τρίτος, οὐρα...",0.002043,"[(λέγω, 0.0393), (θεός, 0.0262), (κύριος, 0.02..."
9,Eusebius,tlg2018,620546,35220,17,17,199,"[Praeperatio Evangelica, Historia ecclesiastic...",10,3.5,"[[χριστιανισον, εἰμί, ἠγούμενος, οἶδα, παρίστη...","[πρῶτος, πᾶς, ἱερός, ἀπόστολος, ἀπατηλὰς, σοφι...",0.000483,"[(λέγω, 0.03521), (εἰμί, 0.03521), (θεός, 0.02..."
10,Gregory of Nazianzus,tlg2022,77297,20005,14,14,2,[Christus patiens [Dub.] (fort. auctore Consta...,7,3.5,"[[στίχος, γρηγορίου, θεολόγου, σωτήριον, πάθος...","[ἐπιτιμάτω, ὀνειδίζω, ἀπαρίθμησις, χάρισμα, φη...",0.0007,"[(λέγω, 0.06024), (θεός, 0.04819), (ἀκούω, 0.0..."
3,Socrates of Constantinople,tlg2057,103901,8743,77,76,64,[Historia Ecclesiastica],1,4.0,"[[βιβλίον, προοίμιον], [εὐσέβιος, παμφίλου, ὅλ...","[λέγω, πείθω, γίγνομαι, γράφοντος, δἰ, ὅς, πᾶς...",0.008693,"[(ἐπίσκοπος, 0.0243), (γίγνομαι, 0.02066), (ἐκ..."


In [75]:
def _idf(n_sentences, doc_freq):
    """Natural-log inverse document frequency, ``log(n_sentences / doc_freq)``.

    Degenerate inputs no longer raise ZeroDivisionError:
    * a term found in no sentence (``doc_freq == 0``) is treated as if it
      occurred in exactly one, i.e. it receives the maximum IDF ``log(n_sentences)``;
    * with no sentences at all the IDF is 0.0.
    """
    if n_sentences == 0:
        return 0.0
    return math.log(n_sentences / max(doc_freq, 1))


def term_idf(term, all_sentences):
    """Return the IDF of ``term`` over ``all_sentences``.

    Parameters
    ----------
    term : the lemma to look up.
    all_sentences : sized collection of sentences; a sentence counts towards
        the document frequency when ``term in sentence`` is true.

    Returns
    -------
    float : ``log(len(all_sentences) / number of sentences containing term)``,
        with the fallbacks described in ``_idf`` for unseen terms and an
        empty ``all_sentences``.
    """
    doc_freq = sum(1 for sent in all_sentences if term in sent)
    return _idf(len(all_sentences), doc_freq)


def get_tfidf(key_term_lemmata, all_sentences):
    """Return (lemma, tf-idf) pairs for ``key_term_lemmata``, highest score first.

    tf is the lemma's count divided by ``len(key_term_lemmata)``; idf is
    computed over ``all_sentences`` exactly as in ``term_idf``. Scores are
    rounded to 5 decimals; lemmas with equal scores keep their frequency order.

    Sentences are expected to be collections of hashable lemmas (e.g. lists of
    strings): document frequencies for all lemmas are gathered in one pass
    over ``all_sentences`` instead of rescanning every sentence per lemma.
    """
    from collections import Counter  # local import keeps this cell self-contained

    # Each sentence contributes at most 1 per lemma, hence set(sent).
    doc_freqs = Counter(lemma for sent in all_sentences for lemma in set(sent))
    n_sentences = len(all_sentences)
    n_terms = len(key_term_lemmata)

    tfidf_list = [
        (lemma, np.round(count / n_terms * _idf(n_sentences, doc_freqs[lemma]), 5))
        for lemma, count in nltk.FreqDist(key_term_lemmata).most_common()
    ]
    # Stable sort: ties stay in most_common() order.
    tfidf_list.sort(key=lambda tup: tup[1], reverse=True)
    return tfidf_list

# TF-IDF of the Paul-sentence words, with IDF taken over all sentences of the same author.
df_authors["paul_sents_TFIDF"] = df_authors.apply(
    lambda row: get_tfidf(row["words_paul"], row["lemmatized_sentences"]),
    axis=1,
)

In [78]:
# Comma-separated strings holding the first 10 terms of each ranked list.
df_authors["paul_sents_TF_10"] = df_authors["paul_sents_TF"].apply(
    lambda pairs: ", ".join(pair[0] for pair in pairs[:10])
)
df_authors["paul_sents_TFIDF_10"] = df_authors["paul_sents_TFIDF"].apply(
    lambda pairs: ", ".join(pair[0] for pair in pairs[:10])
)

In [76]:
# Show the per-author table.
# NOTE(review): the saved output of this cell has no *_10 columns and its
# execution count (76) is lower than the preceding cell's (78), so it was
# presumably produced before that cell ran -- re-run in order to refresh it.
df_authors

Unnamed: 0,author,author_id,wordcount,n_sentences,paul_N,sentences_paul_N,sentences_apostle_N,works,works_N,date_avr,lemmatized_sentences,words_paul,paul_proportion,paul_sents_TF,paul_sents_TFIDF
0,Origen,tlg2042,931431,110127,471,459,526,"[Contra Celsum, Commentarii in Evangelium Joan...",44,2.0,"[[σωτήρ, κύριος, Ἰησοῦς, Χριστός, ψευδομαρτυρο...","[μυρίος, καταλέγω, εἰωθότα, χωρίζω, ἀγάπη, Χρι...",0.004168,"[(λέγω, 0.04001), (φημί, 0.02774), (εἰμί, 0.02...","[(φημί, 0.08098), (λέγω, 0.07512), (εἰμί, 0.04..."
1,Luke-Acts,tlg0031luke,37864,2234,128,125,32,"[Gospel of Luke, Acts]",2,0.5,"[[πολύς, ἐπιχειρέω, ἀνατάσσομαι, διήγησις, πλη...","[διέρχομαι, ὅλος, νῆσος, Πάφος, εὑρίσκω, ἀνήρ,...",0.055953,"[(λέγω, 0.03648), (Ἰουδαῖος, 0.01858), (ἀνήρ, ...","[(Ἰουδαῖος, 0.06232), (λέγω, 0.04318), (ἀνήρ, ..."
2,Epiphanius,tlg2021,325038,33079,108,102,357,"[Ancoratus, Panarion (Adversus haereses), Anac...",3,3.5,"[[θεῖος, μέγας, πατήρ, ἐπιφάνιος, ἐλευθεροπόλε...","[συνᾴδω, λόγος, λέγω, ναός, θεός, πνεῦμα, θεός...",0.003084,"[(λέγω, 0.04825), (ἀπόστολος, 0.02632), (εἰμί,...","[(ἀπόστολος, 0.10171), (λέγω, 0.08825), (Πέτρο..."
3,Socrates of Constantinople,tlg2057,103901,8743,77,76,64,[Historia Ecclesiastica],1,4.0,"[[βιβλίον, προοίμιον], [εὐσέβιος, παμφίλου, ὅλ...","[λέγω, πείθω, γίγνομαι, γράφοντος, δἰ, ὅς, πᾶς...",0.008693,"[(ἐπίσκοπος, 0.0243), (γίγνομαι, 0.02066), (ἐκ...","[(ἐπίσκοπος, 0.06045), (ἑσπέριος, 0.0481), (ἐκ..."
4,Sozomen,tlg2048,112966,7150,63,63,30,[Historia Ecclesiastica],1,4.5,"[[Φᾶσις, αὐτοκράτωρ, ἐπιμελής, χρῆμα, γίγνομαι...","[μέγας, ἀντωνίου, ἅγιος, ἁπλόω, γίγνομαι, εὐδό...",0.008811,"[(ἐκκλησία, 0.02191), (γίγνομαι, 0.01878), (ἐπ...","[(ἀπόστολος, 0.05994), (κωνσταντινουπόλεως, 0...."
5,Athanasius of Alexandria,tlg2035,94010,10789,49,49,90,[De decretis Nicaenae synodi (Chapters 41 and ...,5,3.5,"[[ἀρτέομαι, δίνη, συναγομένων], [θεός, θεός, ε...","[λέγω, ὑπερυψόω, καθίζω, δεξιᾷʼ, λεκτέος, Ἀθήν...",0.004542,"[(λέγω, 0.05488), (εἰμί, 0.03963), (θεός, 0.03...","[(λέγω, 0.08583), (γράφω, 0.07972), (θεός, 0.0..."
6,Theodoret,tlg4089,153333,16094,42,40,36,"[Historia ecclesiastica , Historia Religiosa]",2,4.0,"[[ἔνειμι, πρῶτος, τόμῳ, θεοδωρήτου, ἐκκλησιαστ...","[προτρεπόμενον, οὗτος, βιάζω, Πέτρος, μηδείς, ...",0.002485,"[(οὗτος, 0.025), (ἐπίσκοπος, 0.0225), (λέγω, 0...","[(ἐπίσκοπος, 0.08085), (ζευγματέων, 0.06365), ..."
7,Basil of Caesarea,tlg2040,144429,11257,25,23,36,"[To Young Men, On How They Might Dervice Profi...",2,3.5,"[[ἀπειρηκότα, λεγομένηςλεγομένης], [τύχη, ἐπήρ...","[οἷος, φέρω, λέγω, μόνος, ἁρπάζω, τρίτος, οὐρα...",0.002043,"[(λέγω, 0.0393), (θεός, 0.0262), (κύριος, 0.02...","[(Τιμόθεος, 0.11817), (λέγω, 0.11344), (Σιλουα..."
8,Clement of Alexandria,tlg0555,235754,14753,18,18,101,"[Protrepticus, Paedagogus, Stromata (Books I-V...",7,2.0,"[[ἀμφίων, Θηβαῖος, Ἄριοι, μηθυμναῖος, ἄμφω, εἰ...","[θεοσέβεια, πᾶς, ὠφέλιμοσ, ἐπαγγελία, ἔχω, ζωή...",0.00122,"[(θεός, 0.05208), (Χριστός, 0.02604), (φημί, 0...","[(θεός, 0.11349), (Χριστός, 0.0992), (νόμος, 0..."
9,Eusebius,tlg2018,620546,35220,17,17,199,"[Praeperatio Evangelica, Historia ecclesiastic...",10,3.5,"[[χριστιανισον, εἰμί, ἠγούμενος, οἶδα, παρίστη...","[πρῶτος, πᾶς, ἱερός, ἀπόστολος, ἀπατηλὰς, σοφι...",0.000483,"[(λέγω, 0.03521), (εἰμί, 0.03521), (θεός, 0.02...","[(σοφία, 0.08125), (δόξα, 0.07693), (λέγω, 0.0..."


In [81]:
# Columns to export, in the order they should appear in the sheet.
columns = [
    "author",
    "author_id",
    "works_N",
    "works",
    "wordcount",
    "n_sentences",
    "date_avr",
    "paul_N",
    "sentences_apostle_N",
    "sentences_paul_N",
    "paul_proportion",
    "paul_sents_TF_10",
    "paul_sents_TFIDF_10",
]

# Write the selected columns to the "christian_authors_overview" worksheet.
set_with_dataframe(
    paul_results.worksheet("christian_authors_overview"),
    df_authors.loc[:, columns],
)