In [25]:
import os
import json
import pandas as pd
import numpy as np
import json
import logging ### to monitor the code
from bs4 import BeautifulSoup
import xml.etree.cElementTree as ET
import pickle
import math
import random
import sys
import csv
import unicodedata
import requests
from urllib.request import urlopen 
import io
import getpass
import re
from collections import defaultdict
from itertools import islice # to iterate through dicts

import nltk
from nltk.collocations import *

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

import sddk

In [26]:
conf = sddk.configure("SDAM_root", "648597@au.dk")

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/SDAM_root/


In [27]:
# to access gsheet, you need Google Service Account key json file
# I have mine located in my personal space on sciencedata.dk, so I read it from there:

# (1) read the file and parse its content
file_data = conf[0].get("https://sciencedata.dk/files/ServiceAccountsKey.json").json()
# (2) transform the content into crendentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) specify your usage of the credentials
scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the constrained credentials for authentication of gspread package
gc = gspread.Client(auth=scoped_credentials)

ECCE_AGT_overview = gc.open_by_url("https://docs.google.com/spreadsheets/d/1KPpPaeX215HR_fVrakvJp8aB6oZDhHFTcBw0MKLw6as/edit?usp=sharing")

# Import the data

In [28]:
AGT = sddk.read_file("SDAM_data/AGT/AGT_20201020.json", "df", conf)
AGT.head(5)

Unnamed: 0,filename,author,title,string,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,wordcount,lemmata,lemmata_wordcount,lemmatized_sentences
0,tlg0001.tlg001.perseus-grc2.xml,Apollonius Rhodius,Argonautica; Argonautica,"ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν μ...",tlg0001,tlg0001.tlg001,3 B.C.,-2.5,{'-2.5': 1},-2.5,pagan,38822,"[σέο, φοῖβος, παλαιγενής, κλέος, φάος, μιμνήσκ...",24237,"[[σέο, φοῖβος, παλαιγενής, κλέος, φάος, μιμνήσ..."
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War; Historiae in two volumes,Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν ...,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,pagan,150160,"[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελο...",68166,"[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ..."
2,tlg0004.tlg001.perseus-grc1.xml,Diogenes Laertius,Lives of Eminent Philosophers; Lives of Eminen...,ΒΙΩΝ ΚΑΙ ΓΝΩΜΩΝ ΤΩΝ ΕΝ ΦΙΛΟΣΟΦΙΑΙ ΕΥΔΟΚΙΜΗΣΑΝΤ...,tlg0004,tlg0004.tlg001,A.D. 3,2.5,{'2.5': 1},,,111053,"[βία, ἀκή, γνώμη, ἕνος, φιλοσοφία, εὐδοκιμέω, ...",55580,"[[βία, ἀκή, γνώμη, ἕνος, φιλοσοφία, εὐδοκιμέω,..."
3,tlg0005.tlg001.perseus-grc1.xml,Theocritus,Idylls ; Idylls,Θύρσις ἢ ᾠδή Θύρσις ̔Αδύ τι τὸ ψιθύρισμα καὶ ἁ...,tlg0005,tlg0005.tlg001,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,19514,"[θύρσις, ἀοιδή, θύρσις, αδύ, ψιθύρισμα, πίτυς,...",12478,"[[θύρσις, ἀοιδή, θύρσις, αδύ, ψιθύρισμα, πίτυς..."
4,tlg0005.tlg002.perseus-grc1.xml,Theocritus,Epigrams ; Idylls,α παλ ϝι.336 τὰ ῥόδα τὰ δροσόεντα καὶ ἁ κατάπυ...,tlg0005,tlg0005.tlg002,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,1807,"[παλ, ϝι, ῥοδῆ, δροσόεντα, κατάπυκνος, ἐκεῖνος...",1137,"[[παλ, ϝι], [ῥοδῆ, δροσόεντα, κατάπυκνος, ἐκεῖ..."


# δικ* over the whole corpus

In [35]:
# whole corpus as a list of lemmata
AGT_lemmata_list = np.concatenate(AGT["lemmata"].tolist())
len(AGT_lemmata_list)

14114036

In [37]:
dik_words = [word for word in AGT_lemmata_list if ("δικ" in word) or ("δίκ" in word)]
nltk.FreqDist(dik_words).most_common()

[('δίκαιος', 14828),
 ('δίκη', 6621),
 ('δικαιοσύνη', 4434),
 ('ἀδικέω', 4127),
 ('ἄδικος', 4000),
 ('δικαστής', 2450),
 ('ἀδικία', 2105),
 ('δικαστήριον', 1339),
 ('ἀδίκημα', 994),
 ('δικάζω', 932),
 ('δικαιόω', 544),
 ('Ἰνδικός', 525),
 ('παιδικός', 476),
 ('δίκτυον', 439),
 ('δικαίωμα', 330),
 ('Μηδικός', 306),
 ('δίκαιόν', 298),
 ('ἀντίδικος', 269),
 ('δίκαιός', 250),
 ('καταδικάζω', 246),
 ('δικτάτωρ', 236),
 ('δικαιόπολις', 223),
 ('καταδίκη', 215),
 ('ἐκδίκησις', 208),
 ('μοναδικός', 174),
 ('δικανικός', 169),
 ('εἰδικόν', 162),
 ('ἔνδικος', 153),
 ('πρόδικος', 146),
 ('ἐκδικέω', 116),
 ('δικαίωσις', 115),
 ('δικαιοπραγέω', 112),
 ('δικαστικός', 91),
 ('δίκαιʼ', 82),
 ('ἰνδικην', 77),
 ('δικαιώματά', 76),
 ('εἰδικώτατα', 76),
 ('ἔκδικος', 74),
 ('εἰδικώτατον', 72),
 ('ὑπόδικος', 70),
 ('ἰνδικοῦ', 69),
 ('ἰδικῶς', 68),
 ('δικαίωϲ', 65),
 ('περιοδικῶν', 62),
 ('ἰσχιαδικοῖς', 61),
 ('περδίκκας', 60),
 ('δικτάμνου', 60),
 ('δίκελλα', 59),
 ('δίκαιά', 59),
 ('εἰδικωτάτων', 58),
 ('πε

# Subcorpora

In [34]:
mask_pair_dict = {
    "AGT_christian" : {"column": "provenience", "value" : "christian"},
    "AGT_pagan" : {"column": "provenience", "value" : "pagan"},
    "AGT_philo" : {"column" : "author_id", "value" : "tlg0018"},
    "AGT_josephus" : {"column" : "author_id", "value" : "tlg0526"},
    "AGT_septuagint" : {"column" : "author_id", "value" : "tlg0527"},
    }

In [None]:
AGT_josephus = AGT[AGT["author_id"]=="tlg0018"]

In [33]:
AGT[AGT["author_id"]=="tlg0526"][["author", "title", "wordcount"]]

Unnamed: 0,author,title,wordcount
829,Flavius Josephus,Antiquitates Judaicae; Machine readable text; ...,312038
830,Flavius Josephus,Josephi vita; Machine readable text; Flavii Io...,15783
831,Flavius Josephus,Contra Apionem; Machine readable text; Flavii ...,22742
832,Flavius Josephus,De bello Judaico libri vii; Machine readable t...,125289


In [29]:
AGT_philo = AGT[AGT["author_id"]=="tlg0018"]
AGT_philo.head(5)

Unnamed: 0,filename,author,title,string,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,wordcount,lemmata,lemmata_wordcount,lemmatized_sentences
310,tlg0018.tlg001.opp-grc1.xml,Philo Judaeus,De opificio mundi; Philonis Alexandri Opera Qu...,ΦIΛΩΝΟΣ ΠΕΡΙ ΤΗΣ ΚΑTΑ ΜΩΥΣΕΑ ΚΟΣΜΟΠΟΙΙΑΣ I p. ...,tlg0018,tlg0018.tlg001,1 B.C.-A.D. 1,0.0,"{'-0.5': 0.5, '0.5': 0.5}",0.5,jewish,25373,"[φλωνος, καα, μωυσεα, κοσμοποιιας, νομοθέτης, ...",10201,"[[φλωνος, καα, μωυσεα, κοσμοποιιας], [νομοθέτη..."
311,tlg0018.tlg002.opp-grc1.xml,Philo Judaeus,Legum allegoriarum libri i‑iii; Philonis Alexa...,ΝΟΜΩΝ ΙΕΡΩΝ ΑΛΛΗΓΟΡΙΑΣ ΤΩΝ ΜΕΤΑ ΤΗΝ ΕΞΑΗΜΕΡΟΝ ...,tlg0018,tlg0018.tlg002,1 B.C.-A.D. 1,0.0,"{'-0.5': 0.5, '0.5': 0.5}",0.5,jewish,44765,"[νομή, ἱερή, αλληγοριας, ἄτη, εξαημερον, πρότε...",18468,"[[νομή, ἱερή, αλληγοριας, ἄτη, εξαημερον, πρότ..."
312,tlg0018.tlg003.opp-grc1.xml,Philo Judaeus,De Cherubim; Philonis Alexandri Opera Quae Sup...,ΠΕΡΙ ΤΩΝ ΧΕΡΟΥΒΙΜ ΚΑΙ ΤΗΣ ΦΛΟΓΙΝΗΣ ΡΟΜΦΑΙΑΣ KA...,tlg0018,tlg0018.tlg003,1 B.C.-A.D. 1,0.0,"{'-0.5': 0.5, '0.5': 0.5}",0.5,jewish,13060,"[χερουβιμ, ἀκή, φλογινης, ῥομφαία, κτισθεντος,...",5427,"[[χερουβιμ, ἀκή, φλογινης, ῥομφαία, κτισθεντος..."
313,tlg0018.tlg004.opp-grc1.xml,Philo Judaeus,De sacrificiis Abelis et Caini; Philonis Alexa...,ΠΕΡΙ ΓΕΝΕΣΕΩΣ ΑΒΕΛ ΚΑΙ ΩΝ ΑΥΤΟΣ TE ΚΑΙ ΑΔΕΛΦΟΣ...,tlg0018,tlg0018.tlg004,1 B.C.-A.D. 1,0.0,"{'-0.5': 0.5, '0.5': 0.5}",0.5,jewish,23976,"[γένεσις, αβελ, ἀκή, ὠνή, ἀκή, ἀδελφός, καινός...",7681,"[[γένεσις, αβελ, ἀκή, ὠνή, ἀκή, ἀδελφός, καινό..."
314,tlg0018.tlg005.opp-grc1.xml,Philo Judaeus,Quod deterius potiori insidiari soleat; Philon...,ΠΕΡΙ ΤΟΥ TO XEIPON ΤΩΙ KPEITTONI ΦIΛEIN EΠITΙΘ...,tlg0018,tlg0018.tlg005,1 B.C.-A.D. 1,0.0,"{'-0.5': 0.5, '0.5': 0.5}",0.5,jewish,16085,"[φλ, πιθσθαι, εἶπον, κάιν, ἀδελφός, διέρχομαι,...",6908,"[[φλ, πιθσθαι, εἶπον, κάιν, ἀδελφός, διέρχομαι..."


In [7]:
AGTc = AGT[AGT["provenience"]=="christian"]
AGTc.head()

Unnamed: 0,filename,author,title,string,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,wordcount
379,tlg0031.tlg001.perseus-grc2.xml,,New Testament - Matthew; Machine readable text...,ΚΑΤΑ ΜΑΘΘΑΙΟΝ ΒΙΒΛΟΣ γενέσεως Ἰησοῦ Χριστοῦ υἱ...,tlg0031a,tlg0031.tlg001,A.D. 1,0.5,{'0.5': 1},0.5,christian,18289
380,tlg0031.tlg002.perseus-grc2.xml,,New Testament - Mark; Machine readable text; T...,ΚΑΤΑ ΜΑΡΚΟΝ ΑΡΧΗ τοῦ εὐαγγελίου Ἰησοῦ Χριστοῦ ...,tlg0031b,tlg0031.tlg002,A.D. 1,0.5,{'0.5': 1},0.5,christian,11277
381,tlg0031.tlg003.perseus-grc2.xml,Luke (the evangelist),New Testament - Luke; Machine readable text; T...,ΚΑΤΑ ΛΟΥΚΑΝ ΕΠΕΙΔΗΠΕΡ ΠΟΛΛΟΙ ἐπεχείρησαν ἀνατά...,tlg0031luke,tlg0031.tlg003,A.D. 1,0.5,{'0.5': 1},0.5,christian,19460
382,tlg0031.tlg004.perseus-grc2.xml,Johnannine literature,New Testament - John; Machine readable text; T...,"ΚΑΤΑ ΙΩΑΝΗΝ ΕΝ ΑΡΧΗ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν...",tlg0031john,tlg0031.tlg004,A.D. 1,0.5,{'0.5': 1},0.5,christian,15592
383,tlg0031.tlg005.perseus-grc2.xml,Luke (the evangelist),New Testament - Acts; Machine readable text; T...,πράξεις ἀποστόλων τὸν μὲν πρῶτον λόγον ἐποιησά...,tlg0031luke,tlg0031.tlg005,A.D. 1,0.5,{'0.5': 1},0.5,christian,18408


In [8]:
len(AGTc)

147

In [13]:
AGTc.sort_values("date_avr", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [21]:
set_with_dataframe(ECCE_AGT_overview.add_worksheet("AGTc_works", cols=1, rows=1), AGTc.drop("string", axis=1)) 

In [14]:
len(AGTc["author_id"].unique().tolist())

48

In [15]:
AGTc["author"].unique().tolist()

[None,
 'Clemens Romanus',
 'Johnannine literature',
 'Paul of Tarsus',
 'Luke (the evangelist)',
 'Polycarp',
 'Ignatius Antiochenus',
 'Barnabas',
 'Athenagoras',
 'Justin Martyr',
 'Justinus Martyr',
 'Maximus of Tyre',
 'Anonymus',
 'Dionysius Corinthius',
 'Hermas, 2nd cent.',
 'Tatianus',
 'Theophilus Antiochenus',
 'Irenaeus',
 'Origen',
 'Origenes',
 'Clement of Alexandria',
 'Hippolytus',
 'Anonymous',
 'Gregorius Nazianzenus',
 'Basil, Saint, Bishop of Caesarea',
 'Pseudo-Justinus Martyr',
 'Eusebius',
 'Eusebius of Caesarea',
 'Epiphanius',
 'pseudo-Menander',
 'Athanasius of Alexandria',
 'Athanasius',
 'Theodoretus',
 'Philostorgius',
 'Theodoret, Bishop of Cyrus',
 'Socrates, Scholasticus',
 'Philip Edward Pusey',
 'Sozomenus, Salaminius Hermias',
 'Zosimus',
 'Evagrius, Scholasticus',
 'Joannes Philoponus']

# Dik* in raw string

In [24]:
AGTc.iloc[0]["string"].count(" δικ")

16