In [1]:
### PREREQUISTIES
### (many used only in one notebook...)

import os
import pandas as pd
import numpy as np
import logging ### to monitor the code
from bs4 import BeautifulSoup
import pickle
import math
import random
import sys
import csv
import unicodedata
import requests
import re

import nltk
from nltk.collocations import *

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import xml.etree.cElementTree as ET
from urllib.request import urlopen

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
#from google.colab import auth
#from oauth2client.client import GoogleCredentials
from google.oauth2 import service_account # based on google-auth library
import sddk

In [2]:
# gensim parts
from gensim import corpora
from gensim import models

### lsa alternative
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [3]:
#!pip install anda
#from anda import gr ### the import takes substantial time, since it import a +600MB file containing ancient Greek dictionary

In [4]:
### not neccessary for reading the data, just for exporting them to sciencedata.dk
conf = sddk.configure("SDAM_root", "648597@au.dk")

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/SDAM_root/


In [5]:
# to access gsheet, you need Google Service Account key json file
# I have mine located in my personal space on sciencedata.dk, so I read it from there:

# (1) read the file and parse its content
file_data = conf[0].get("https://sciencedata.dk/files/ServiceAccountsKey.json").json()
# (2) transform the content into crendentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) specify your usage of the credentials
scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the constrained credentials for authentication of gspread package
gc = gspread.Client(auth=scoped_credentials)

ECCE_AGT_overview = gc.open_by_url("https://docs.google.com/spreadsheets/d/1KPpPaeX215HR_fVrakvJp8aB6oZDhHFTcBw0MKLw6as/edit?usp=sharing")

In [6]:
publicfolder = "31b393e2afe1ee96ce81869c7efe18cb"
c_hippocraticum = sddk.read_file("c_hippocraticum_enriched.json", "df", publicfolder)

reading file located in a public folder


# Import and preprocess corpus

In [61]:
AGT = sddk.read_file("SDAM_data/AGT/AGT_20201109.json", "df", conf)
AGT.head(5)

Unnamed: 0,filename,author,title,string,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,sentences,lemmata,lemmata_wordcount,lemmatized_sentences,n_sentences
0,tlg0001.tlg001.perseus-grc2.xml,Apollonius Rhodius,Argonautica,"ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν μ...",38822,tlg0001,tlg0001.tlg001,3 B.C.,-2.5,{'-2.5': 1},-2.5,pagan,"[ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν ...","[σέο, φοῖβος, παλαιγενής, κλέος, φάος, μιμνήσκ...",24237,"[[σέο, φοῖβος, παλαιγενής, κλέος, φάος, μιμνήσ...",3233
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War,\nΘουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶ...,150126,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,pagan,[Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν...,"[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελο...",68154,"[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ...",6068
2,tlg0004.tlg001.perseus-grc1.xml,Diogenes Laertius,Lives of Eminent Philosophers,Τὸ τῆς φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ βαρβάρ...,110773,tlg0004,tlg0004.tlg001,A.D. 3,2.5,{'2.5': 1},,,[Τὸ τῆς φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ βαρβά...,"[φιλοσοφία, ἔργον, ἔνιοί, φάσις, βάρβαρος, ἄρχ...",55323,"[[φιλοσοφία, ἔργον, ἔνιοί, φάσις, βάρβαρος, ἄρ...",10244
3,tlg0005.tlg001.perseus-grc1.xml,Theocritus,Idylls,\n̔Αδύ τι τὸ ψιθύρισμα καὶ ἁ πίτυς αἰπόλε τήνα...,19200,tlg0005,tlg0005.tlg001,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,"[̔Αδύ τι τὸ ψιθύρισμα καὶ ἁ πίτυς αἰπόλε τήνα,...","[αδύ, ψιθύρισμα, πίτυς, αἰπόλος, ἐκεῖνος, πότη...",12177,"[[αδύ, ψιθύρισμα, πίτυς, αἰπόλος, ἐκεῖνος, πότ...",1733
4,tlg0005.tlg002.perseus-grc1.xml,Theocritus,Epigrams,τὰ ῥόδα τὰ δροσόεντα καὶ ἁ κατάπυκνος ἐκείνα ἕ...,1734,tlg0005,tlg0005.tlg002,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,[τὰ ῥόδα τὰ δροσόεντα καὶ ἁ κατάπυκνος ἐκείνα ...,"[ῥοδῆ, δροσόεντα, κατάπυκνος, ἐκεῖνος, ἕρπυλλο...",1089,"[[ῥοδῆ, δροσόεντα, κατάπυκνος, ἐκεῖνος, ἕρπυλλ...",137


In [63]:
subcorpora_dict = {# main subcorpora
    "Archaic (8-6 BCE)" : AGT[(AGT["provenience"]=="pagan") & (AGT["date_avr"] <= -5.5)],
    "Classical (5-4 BCE)" : AGT[(AGT["provenience"]=="pagan") & (AGT["date_avr"].between(-5, -3))],
    "Non-Christian (1-4 CE)" : AGT[(AGT["date_avr"].between(0, 4)) & (AGT["provenience"]=="pagan")],
    "Septuagint" : AGT[AGT["author_id"]=="tlg0527"],
    "Christian (1-4 CE)" : AGT[(AGT["date_avr"] < 4) & (AGT["provenience"]=="christian")],
    "Christian (1-1/2 CE)" : AGT[(AGT["date_avr"] < 1.5) & (AGT["provenience"]=="christian")],
    "Christian (2-3/4 CE)" : AGT[(AGT["date_avr"].between(1.5, 3)) & (AGT["provenience"]=="christian")],
    "Christian (4-4/5 CE)" : AGT[(AGT["date_avr"].between(3.5, 4)) & (AGT["provenience"]=="christian")]}

subcorpora_dict_other = {
    "Pagan (8th c. BCE - 4th c. CE)" : AGT[(AGT["provenience"]=="pagan") & (AGT["date_avr"] < 4)],
    "Pagan (1st - 4th c. CE)" : AGT[(AGT["date_avr"].between(0, 4)) & (AGT["provenience"]=="pagan")],
    "Jewish" : AGT[AGT["author_id"].isin(["tlg0527", "tlg0018", "tlg0526"])],
    "Septuagint" : AGT[AGT["author_id"]=="tlg0527"],
    "Philo of Alexandria" : AGT[AGT["author_id"]=="tlg0018"],
    "Titus Flavius Josephus" : AGT[AGT["author_id"]=="tlg0526"],
    "Paul of Tarsus" : AGT[AGT["author_id"]=="tlg0031paul"],
    "New Testament" : AGT[AGT["author_id"].str.startswith("tlg0031")],
    "Christian (1st - 4th c. CE)" : AGT[(AGT["date_avr"] < 4) & (AGT["provenience"]=="christian")]}    


In [65]:
def get_flat_sentences(series):
    sentences_list = [sent for doc in series.tolist() for sent in doc]
    return sentences_list

In [66]:
subselections = []
for key in subcorpora_dict.keys():
    subselection = subcorpora_dict[key]
    subselection_dict = {}
    wordcount = subselection["wordcount"].sum()
    doccount = len(subselection)
    sents = get_flat_sentences(subselection["lemmatized_sentences"])
    sents_len = len(sents)
    subselection_dict.update({"label" : key, "wordcount" : wordcount, "doccount" : doccount, "n_sentences": sents_len,"sentences" : sents})
    subselections.append(subselection_dict)
subselections_df = pd.DataFrame(subselections)
subselections_df

Unnamed: 0,label,wordcount,doccount,n_sentences,sentences
0,Archaic (8-6 BCE),264986,11,20444,"[[μῆνις, ἀείδω, θεά, πηληϊάδεω, Ἀχιλλεύς, οὐλό..."
1,Classical (5-4 BCE),3439210,338,256926,"[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ..."
2,Non-Christian (1-4 CE),9172125,411,718144,"[[γεωγραφία, σόσσιε, σενεκίων, ἱστορικός, γνῶσ..."
3,Septuagint,872013,55,53330,"[[ἕνος, ποιέω, θεός, οὐρανός, γῆ], [γῆ, ἀόρατο..."
4,Christian (1-4 CE),3366042,131,286782,"[[βίβλος, γένεσις, Ἰησοῦς, χριστός, υἱός, δαυε..."
5,Christian (1-1/2 CE),171389,38,11959,"[[βίβλος, γένεσις, Ἰησοῦς, χριστός, υἱός, δαυε..."
6,Christian (2-3/4 CE),1792275,65,165245,"[[ἀμφίων, Θηβαῖος, ἄριοι, μηθυμναῖος, ἄμφω, ᾠδ..."
7,Christian (4-4/5 CE),1746266,33,141538,"[[ἐπειδη, ὅρος, κράτιστος, διόγνητε, ὑπερεσπου..."


In [67]:
AGT_docs = get_flat_sentences(AGT["lemmatized_sentences"])

In [68]:
print(AGT_docs[:10])

[['σέο', 'φοῖβος', 'παλαιγενής', 'κλέος', 'φάος', 'μιμνήσκω', 'οἳ', 'πόντος', 'στόμα', 'πέτρα', 'Κυάνεαι', 'βασιλεύς', 'ἐφημοσύνη', 'πελίαο', 'χρυσεῖον', 'κῶας', 'εὔζυγος', 'ἐλαύνω', 'ἀργός'], ['τοῖος', 'πελιός', 'φάτις', 'ἐκλύω', 'μοῖρα', 'μένος', 'στυγερός', 'τοῦδʼ', 'ἀνήρ', 'ὅντινʼ', 'εἶδον', 'οἰοπέδιλον', 'ὑπʼ', 'ἐνεσία', 'δαμάζω'], ['δηρός', 'τεός', 'βάξις', 'Ἰησοῦς', 'χειμέριος', 'ῥεῖθρον', 'κιὼν', 'πόσις', 'Ἄναυρος', 'ἐξεσάωσεν', 'ὑπʼ', 'ἰλύς', 'καταλιμπάνω', 'πέδιλον', 'προχόη'], ['ἱκνέομαι', 'πελιός', 'ἀντιβολέω', 'εἰλαπίνη', 'ἣν', 'πατήρ', 'Ποσειδεών', 'ῥέζω', 'θεός', 'Ἥρα', 'Πελασγιῶται', 'ἀλεγίζω'], ['τόνγʼ', 'ἐσιδὼν', 'φράζω', 'ἆθλον', 'ἔντυε', 'ναυτιλία', 'πολυκηδής', 'ὄφρʼ', 'πόντος', 'ἀλλοδαπός', 'μετʼ', 'ἀνήρ', 'νόστος', 'ὄλλυμι'], ['ναῦς', 'ἐπικλείω', 'ἀοιδός', 'ἀργός', 'Ἀθηναῖος', 'κάμνω', 'ὑποθημοσύνη'], ['γενεά', 'ὄνομα', 'μυθέομαι', 'ἥρως', 'δολιχός', 'πόρος', 'ἅλς', 'ὅσος', 'τʼ', 'ἐρέσσω'], ['Μοῦσα', 'ὑποφήτωρ', 'ἀοιδή'], ['πρῶτά', 'Ὀρφεύς', 'μιμνήσκω', 'ῥαίνω', 

In [69]:
#create gensim dictionary for our list of sentences
dictionary = corpora.Dictionary(AGT_docs)

In [70]:
# as such, it is organized by ids
dict(list(dictionary.items())[:10])

{0: 'Κυάνεαι',
 1: 'βασιλεύς',
 2: 'εὔζυγος',
 3: 'κλέος',
 4: 'κῶας',
 5: 'μιμνήσκω',
 6: 'οἳ',
 7: 'πέτρα',
 8: 'παλαιγενής',
 9: 'πελίαο'}

In [71]:
dict(list(dictionary.token2id.items())[:10])

{'Κυάνεαι': 0,
 'βασιλεύς': 1,
 'εὔζυγος': 2,
 'κλέος': 3,
 'κῶας': 4,
 'μιμνήσκω': 5,
 'οἳ': 6,
 'πέτρα': 7,
 'παλαιγενής': 8,
 'πελίαο': 9}

In [72]:
# look at one sentence
AGT_docs[10000]

['βία', 'τευτάμου', 'πριηνεύς', 'προκεκριμένος', 'Σάτυρος']

In [73]:
# doc2bow = document to (term, tf) tuples, i.e. bag-of-words
dictionary.doc2bow(AGT_docs[10000])

[(410, 1), (13345, 1), (13493, 1), (13494, 1), (13495, 1)]

In [74]:
# document to id method
dictionary.doc2idx(AGT_docs[10000])

[410, 13495, 13493, 13494, 13345]

In [75]:
sent = docs[20]
sent

['ἐξέρχομαι', 'ποιμανεῖ', 'λαός', 'ἁμός', 'ἰσραήλ']

In [76]:
# doc2bow = document to (term, tf) tuples, i.e. bag-of-words
dictionary.doc2bow(sent)

[(815, 1), (1826, 1), (4018, 1), (110110, 1), (149859, 1)]

# LSA with  sklearn

In [77]:
%%time
# we can use our gensim dictionary

# as a corpus, we cannot use the Gensim default BoW model,

# we just need the words replaced by values

corpus_idx = [dictionary.doc2idx(sent) for sent in AGT_docs]

CPU times: user 8.15 s, sys: 123 ms, total: 8.28 s
Wall time: 8.27 s


In [78]:
len(corpus_idx)

2868368

In [79]:
subselections_df

Unnamed: 0,label,wordcount,doccount,n_sentences,sentences
0,Archaic (8-6 BCE),264986,11,20444,"[[μῆνις, ἀείδω, θεά, πηληϊάδεω, Ἀχιλλεύς, οὐλό..."
1,Classical (5-4 BCE),3439210,338,256926,"[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ..."
2,Non-Christian (1-4 CE),9172125,411,718144,"[[γεωγραφία, σόσσιε, σενεκίων, ἱστορικός, γνῶσ..."
3,Septuagint,872013,55,53330,"[[ἕνος, ποιέω, θεός, οὐρανός, γῆ], [γῆ, ἀόρατο..."
4,Christian (1-4 CE),3366042,131,286782,"[[βίβλος, γένεσις, Ἰησοῦς, χριστός, υἱός, δαυε..."
5,Christian (1-1/2 CE),171389,38,11959,"[[βίβλος, γένεσις, Ἰησοῦς, χριστός, υἱός, δαυε..."
6,Christian (2-3/4 CE),1792275,65,165245,"[[ἀμφίων, Θηβαῖος, ἄριοι, μηθυμναῖος, ἄμφω, ᾠδ..."
7,Christian (4-4/5 CE),1746266,33,141538,"[[ἐπειδη, ὅρος, κράτιστος, διόγνητε, ὑπερεσπου..."


In [80]:
subselections_df["idx"] = subselections_df["sentences"].apply(lambda sentences: [dictionary.doc2idx(sent) for sent in sentences])

In [81]:
subselections_df

Unnamed: 0,label,wordcount,doccount,n_sentences,sentences,idx
0,Archaic (8-6 BCE),264986,11,20444,"[[μῆνις, ἀείδω, θεά, πηληϊάδεω, Ἀχιλλεύς, οὐλό...","[[2546, 1285, 677, 94436, 1401, 1833, 1193, 10..."
1,Classical (5-4 BCE),3439210,338,256926,"[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ...","[[5986, 74, 5989, 511, 5984, 5992, 5988, 1200,..."
2,Non-Christian (1-4 CE),9172125,411,718144,"[[γεωγραφία, σόσσιε, σενεκίων, ἱστορικός, γνῶσ...","[[31555, 31557, 31556, 13882, 7064, 3876, 5998..."
3,Septuagint,872013,55,53330,"[[ἕνος, ποιέω, θεός, οὐρανός, γῆ], [γῆ, ἀόρατο...","[[3252, 2532, 51, 1284, 3001], [3001, 19377, 1..."
4,Christian (1-4 CE),3366042,131,286782,"[[βίβλος, γένεσις, Ἰησοῦς, χριστός, υἱός, δαυε...","[[17098, 12666, 46, 25204, 198, 149806, 198, 1..."
5,Christian (1-1/2 CE),171389,38,11959,"[[βίβλος, γένεσις, Ἰησοῦς, χριστός, υἱός, δαυε...","[[17098, 12666, 46, 25204, 198, 149806, 198, 1..."
6,Christian (2-3/4 CE),1792275,65,165245,"[[ἀμφίων, Θηβαῖος, ἄριοι, μηθυμναῖος, ἄμφω, ᾠδ...","[[564, 6581, 54073, 82234, 143, 395034, 1251, ..."
7,Christian (4-4/5 CE),1746266,33,141538,"[[ἐπειδη, ὅρος, κράτιστος, διόγνητε, ὑπερεσπου...","[[3363, 1979, 6032, 148282, 451714, 109366, 13..."


In [89]:
christian_docs = subselections_df.at[4, "sentences"]

In [106]:
def lsa_model(docs):
    vocabulary =  list(set([word for doc in docs for word in doc]))
    vec =  TfidfVectorizer(vocabulary=vocabulary) ### initiaze the model
    X = vec.fit_transform([" ".join(doc) for doc in docs]) ### run the model
    Xc = (X.T * X)
    svd = TruncatedSVD(n_components=50, n_iter=5, random_state=42)
    svd.fit(Xc)
    #term2doc = pd.DataFrame(Xc.toarray(), columns=vec.get_feature_names(), index=vec.get_feature_names())
    lsa_model_data = pd.DataFrame(svd.components_, columns=vec.get_feature_names())
    return lsa_model_data, Xc#term2doc

In [107]:
model, term2doc = lsa_model(christian_docs)

In [108]:
len(model.columns)

117515

In [102]:
def get_most_similar(model_df, target_term, number):
  all_similar = []
  for term in model_df.columns:
    similarity = (term, cosine_similarity([model_df[target_term],  model_df[term]])[0][1])
    all_similar.append(similarity)
  return sorted(all_similar, key=lambda number: number[1], reverse=True)[1:number]

In [103]:
words = ["δίκαιος", "δίκη", "δικαιοσύνη"]

In [109]:
neighbours = {}  
for word in words:
    neighbours[word] = get_most_similar(model, word, 10)

In [110]:
pd.DataFrame(neighbours)

Unnamed: 0,δίκαιος,δίκη,δικαιοσύνη
0,"(ιειμένην, 0.9834371832818561)","(μετελεύσονται, 0.9384522490347709)","(καθυπηρετεῖ, 0.9710013738851778)"
1,"(εὐστοχίου, 0.9830720889621722)","(ἐπικουροί, 0.9384522490347709)","(ἐνδύσονται, 0.9706819000158255)"
2,"(λογιωτάτου, 0.9830720889621722)","(ἐριννύες, 0.9384522490347709)","(σιοί, 0.9706819000158255)"
3,"(λελύπηται, 0.9830720889621722)","(καταπεπηγμένων, 0.9344969432505092)","(περικτᾶται, 0.9699396541878471)"
4,"(τηρούσης, 0.9827599613491105)","(ἐξέτισας, 0.9182233983592916)","(ἡδυπαθείᾳ, 0.9699396541878471)"
5,"(μιμείσθω, 0.9826270670854331)","(γλωσσαργία, 0.8999555833389017)","(ἅνευ, 0.9695269024725653)"
6,"(ἰὼδ, 0.9822301096106603)","(κρυσταλλοειδεῖ, 0.8997426356399815)","(χρωννύντες, 0.9695269024725653)"
7,"(ἐπαράντων, 0.9820947343036778)","(διώκτας, 0.8970726088109655)","(πρυπορεύσεται, 0.9662484878532644)"
8,"(πειλθαρχεῖν, 0.9820947343036778)","(προςώποις, 0.8861867911971005)","(ἐπιφαινέσθω, 0.9660042345965665)"
