In [3]:
### PREREQUISTIES
### (many used only in one notebook...)

import os
import pandas as pd
import numpy as np
import logging ### to monitor the code
from bs4 import BeautifulSoup
import pickle
import math
import random
import sys
import csv
import unicodedata
import requests
import re
import networkx as nx

import matplotlib.pyplot as plt
import seaborn as sns

!pip install sddk
import sddk

Collecting sddk
  Downloading https://files.pythonhosted.org/packages/f9/18/6661220dac22b68b120cac3ce8168a7b5d88a9c84ee524a29d62200ff970/sddk-2.7-py3-none-any.whl
Installing collected packages: sddk
Successfully installed sddk-2.7


In [4]:
# gensim parts
from gensim import corpora
from gensim import models

### lsa alternative
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [5]:
### not neccessary for reading the data, just for exporting them to sciencedata.dk
conf = sddk.configure("SDAM_root", "648597@au.dk")

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ··········
connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/SDAM_root/


In [6]:
AGT = sddk.read_file("SDAM_data/AGT/AGT_20201110.json", "df", conf)
AGT.head(5)

Unnamed: 0,filename,author,title,string,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,sentences,lemmata,lemmata_wordcount,lemmatized_sentences,n_sentences
0,tlg0001.tlg001.perseus-grc2.xml,Apollonius Rhodius,Argonautica,"ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν μ...",38822,tlg0001,tlg0001.tlg001,3 B.C.,-2.5,{'-2.5': 1},-2.5,pagan,"[ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν ...","[Φοῖβος, παλαιγενής, κλέος, φάος, μιμνήσκω, πό...",23803,"[[Φοῖβος, παλαιγενής, κλέος, φάος, μιμνήσκω, π...",3233
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War,\nΘουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶ...,150126,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,pagan,[Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν...,"[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελο...",66680,"[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ...",6068
2,tlg0004.tlg001.perseus-grc1.xml,Diogenes Laertius,Lives of Eminent Philosophers,Τὸ τῆς φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ βαρβάρ...,110773,tlg0004,tlg0004.tlg001,A.D. 3,2.5,{'2.5': 1},,,[Τὸ τῆς φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ βαρβά...,"[φιλοσοφία, ἔργον, ἔνιοί, φάσις, βαρβαρόομαι, ...",53989,"[[φιλοσοφία, ἔργον, ἔνιοί, φάσις, βαρβαρόομαι,...",10244
3,tlg0005.tlg001.perseus-grc1.xml,Theocritus,Idylls,\n̔Αδύ τι τὸ ψιθύρισμα καὶ ἁ πίτυς αἰπόλε τήνα...,19200,tlg0005,tlg0005.tlg001,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,"[̔Αδύ τι τὸ ψιθύρισμα καὶ ἁ πίτυς αἰπόλε τήνα,...","[αδύ, ψιθύρισμα, πίτυς, αἰπόλος, ἐκεῖνος, πότη...",12000,"[[αδύ, ψιθύρισμα, πίτυς, αἰπόλος, ἐκεῖνος, πότ...",1733
4,tlg0005.tlg002.perseus-grc1.xml,Theocritus,Epigrams,τὰ ῥόδα τὰ δροσόεντα καὶ ἁ κατάπυκνος ἐκείνα ἕ...,1734,tlg0005,tlg0005.tlg002,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,[τὰ ῥόδα τὰ δροσόεντα καὶ ἁ κατάπυκνος ἐκείνα ...,"[ῥοδῆ, δροσόεντα, κατάπυκνος, ἐκεῖνος, ἕρπυλλο...",1071,"[[ῥοδῆ, δροσόεντα, κατάπυκνος, ἐκεῖνος, ἕρπυλλ...",137


In [7]:
def get_flat_sentences(series):
    sentences_list = [sent for doc in series.tolist() for sent in doc]
    return sentences_list

In [8]:
subcorpora_dict = {
    "Pagan (8th c. BCE - 4th c. CE)" : AGT[(AGT["provenience"]=="pagan") & (AGT["date_avr"] < 4)],
    "Pagan (1st - 4th c. CE)" : AGT[(AGT["date_avr"].between(0, 4)) & (AGT["provenience"]=="pagan")],
    "Septuagint" : AGT[AGT["author_id"]=="tlg0527"],
    "Philo of Alexandria" : AGT[AGT["author_id"]=="tlg0018"],
    "Titus Flavius Josephus" : AGT[AGT["author_id"]=="tlg0526"],
    "Paul of Tarsus" : AGT[AGT["author_id"]=="tlg0031paul"],
    "New Testament" : AGT[AGT["author_id"].str.startswith("tlg0031")],
    "Christian (1st - 4th c. CE)" : AGT[(AGT["date_avr"] < 4) & (AGT["provenience"]=="christian")]}

In [9]:
subselections = []
for key in subcorpora_dict.keys():
    subselection = subcorpora_dict[key]
    subselection_dict = {}
    wordcount = subselection["wordcount"].sum()
    doccount = len(subselection)
    sents = get_flat_sentences(subselection["lemmatized_sentences"])
    sents_len = len(sents)
    subselection_dict.update({"label" : key, "wordcount" : wordcount, "doccount" : doccount, "n_sentences": sents_len,"sentences" : sents})
    subselections.append(subselection_dict)
subselections_df = pd.DataFrame(subselections)
subselections_df

Unnamed: 0,label,wordcount,doccount,n_sentences,sentences
0,Pagan (8th c. BCE - 4th c. CE),13474895,774,1033208,"[[Φοῖβος, παλαιγενής, κλέος, φάος, μιμνήσκω, π..."
1,Pagan (1st - 4th c. CE),9172125,411,718144,"[[γεωγραφία, σόσσιε, σενεκίων, ἱστορικός, γιγν..."
2,Septuagint,872013,55,53330,"[[εν, ποιέω, θεός, οὐρανός, γῆ], [γῆ, ἀόρατος,..."
3,Philo of Alexandria,738272,31,82506,"[[νομοθέτης, ἀκαλλώπιστος, γυμνάζω, Πάρος, δίκ..."
4,Titus Flavius Josephus,469581,4,19698,"[[ἱστορία, συγγράφω, εἷς, αὐτην, ὁράω, σπουδάζ..."
5,Paul of Tarsus,24066,7,1630,"[[παυλος, δοῦλος, ἰησοῦ, χριστοῦ, κλητός, ἀπόσ..."
6,New Testament,137788,27,8691,"[[βιβλος, γένεσις, ἰησοῦ, χριστοῦ, υἱός, δαυεί..."
7,Christian (1st - 4th c. CE),3366042,131,286782,"[[βιβλος, γένεσις, ἰησοῦ, χριστοῦ, υἱός, δαυεί..."


# LSA with  sklearn

In [10]:
subselections_df

Unnamed: 0,label,wordcount,doccount,n_sentences,sentences
0,Pagan (8th c. BCE - 4th c. CE),13474895,774,1033208,"[[Φοῖβος, παλαιγενής, κλέος, φάος, μιμνήσκω, π..."
1,Pagan (1st - 4th c. CE),9172125,411,718144,"[[γεωγραφία, σόσσιε, σενεκίων, ἱστορικός, γιγν..."
2,Septuagint,872013,55,53330,"[[εν, ποιέω, θεός, οὐρανός, γῆ], [γῆ, ἀόρατος,..."
3,Philo of Alexandria,738272,31,82506,"[[νομοθέτης, ἀκαλλώπιστος, γυμνάζω, Πάρος, δίκ..."
4,Titus Flavius Josephus,469581,4,19698,"[[ἱστορία, συγγράφω, εἷς, αὐτην, ὁράω, σπουδάζ..."
5,Paul of Tarsus,24066,7,1630,"[[παυλος, δοῦλος, ἰησοῦ, χριστοῦ, κλητός, ἀπόσ..."
6,New Testament,137788,27,8691,"[[βιβλος, γένεσις, ἰησοῦ, χριστοῦ, υἱός, δαυεί..."
7,Christian (1st - 4th c. CE),3366042,131,286782,"[[βιβλος, γένεσις, ἰησοῦ, χριστοῦ, υἱός, δαυεί..."


In [11]:
# let's extract first and first/second century texts only
docs = subselections_df.at[5, "sentences"]

In [12]:
len(docs)

1630

In [14]:
vocabulary =  list(set([word for sent in docs for word in sent]))
bow = CountVectorizer(vocabulary=vocabulary)
bow_term2doc = bow.fit_transform([" ".join(doc) for doc in docs]) ### run the model
term2term_bow = (bow_term2doc.T * bow_term2doc)

In [15]:
term2term_bow

<2352x2352 sparse matrix of type '<class 'numpy.longlong'>'
	with 71585 stored elements in Compressed Sparse Column format>

In [19]:
G = nx.from_numpy_matrix(term2term_bow.todense()) # from_pandas_adjacency()

In [23]:
vocab_dict = dict(zip(range(len(vocabulary)), vocabulary))

In [25]:
nx.relabel_nodes(G, vocab_dict)

<networkx.classes.graph.Graph at 0x7fcb6de53080>

In [21]:
G.number_of_nodes()

2352

In [None]:
bow = CountVectorizer(vocabulary=vocabulary)
bow_term2doc = bow.fit_transform([" ".join(doc) for doc in docs]) ### run the model

term2term_bow = (bow_term2doc.T * bow_term2doc)


tfidf =  TfidfVectorizer(vocabulary=vocabulary)
tfidf_term2doc = tfidf.fit_transform([" ".join(doc) for doc in docs]) ### run the model

svd = TruncatedSVD(n_components=150, n_iter=5, random_state=42)
svd_bow = svd.fit(bow_term2doc)

svd = TruncatedSVD(n_components=150, n_iter=5, random_state=42)
svd_tfidf = svd.fit(tfidf_term2doc)
    #term2doc = pd.DataFrame(Xc.toarray(), columns=vec.get_feature_names(), index=vec.get_feature_names())
lsa_bow = pd.DataFrame(svd_bow.components_, columns=vocabulary)
lsa_tfidf = pd.DataFrame(svd_tfidf.components_, columns=vocabulary)

In [None]:
words = ["δίκαιος", "δίκη", "δικαιοσύνη"]

In [None]:
term2term_bow_df = pd.DataFrame(term2term_bow.todense(), columns=vocabulary, index=vocabulary)
term2term_bow_df

Unnamed: 0,πρᾶγμα,ἡσσάομαι,θλίβω,θλίψιν,διδαχῆς,ψάλλω,ἐλάχιστόν,διατάσσω,ποῦ,θερίζω,...,εἴδωλόν,φοβέω,θαυμάζω,ὀδυρμός,εὐαγγέλιον,ὑπακοην,ἀντικειμένωνἥτις,ἔνδειξις,ὑποτάσσω,συγχαίρω
πρᾶγμα,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ἡσσάομαι,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
θλίβω,0,0,11,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
θλίψιν,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
διδαχῆς,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ὑπακοην,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,4,0,0,0,0
ἀντικειμένωνἥτις,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,1,1,0,0
ἔνδειξις,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,1,6,0,0
ὑποτάσσω,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,12,0


In [None]:
sorted_df = term2term_bow_df.sort_values("δικαιοσύνη", ascending=False)[["δικαιοσύνη"]]
print(sorted_df.head(20))

            δικαιοσύνη
δικαιοσύνη          62
πίστις              25
θεάομαι             18
νόμος               16
λογίζομαι           11
χριστοῦ             10
ἁμαρτία             10
ἰησοῦ               10
θεός                 9
πᾶς                  9
πιστεύω              7
περιτομή             7
ἀκροβυστία           6
πατήρ                6
ἀβραάμ               6
ἀνομία               5
βασιλεύω             5
δοῦλος               5
κύριος               5
ἁμάρτημα             5


In [None]:
sorted_df = term2term_bow_df.sort_values("δίκαιος", ascending=False)[["δίκαιος"]]
print(sorted_df.head(20))

            δίκαιος
δίκαιος          14
πίστις            8
ὅσος              6
θεός              6
νόμος             5
πᾶς               4
θεάομαι           4
ἕνος              3
δικαιοσύνη        3
ἁμός              3
ζάω               3
δικαιόω           2
ἀλλ               2
ποιέω             2
πολύς             2
ἅγιος             2
καθίστημι         2
ἔχω               2
γράφω             2
εἶμι              2


In [None]:
lsa_model_data

Unnamed: 0,πρᾶγμα,ἡσσάομαι,θλίβω,θλίψιν,διδαχῆς,ψάλλω,ἐλάχιστόν,διατάσσω,ποῦ,θερίζω,...,εἴδωλόν,φοβέω,θαυμάζω,ὀδυρμός,εὐαγγέλιον,ὑπακοην,ἀντικειμένωνἥτις,ἔνδειξις,ὑποτάσσω,συγχαίρω
0,0.003708,0.000052,0.018253,0.000661,0.000759,0.000499,0.000129,0.001352,0.000011,0.001212,...,-6.639306e-16,0.006197,0.000482,0.000551,0.067805,0.012405,0.001671,0.005543,0.014762,0.001498
1,-0.001711,-0.000088,0.019553,-0.001290,-0.000133,-0.000006,-0.000068,-0.001622,0.000023,0.000156,...,-8.387424e-15,-0.003687,0.000134,-0.000948,-0.018718,-0.002153,0.000014,0.001900,-0.017855,-0.003475
2,-0.001938,0.000137,-0.002422,0.002927,0.000159,0.000678,0.000282,-0.000731,0.000011,0.001485,...,2.157429e-14,0.002290,0.000561,0.002798,0.067655,0.000908,0.002165,0.003588,-0.019197,-0.003370
3,-0.000222,-0.000004,-0.019747,-0.001763,0.000364,0.001681,-0.000111,0.000080,-0.000030,0.004362,...,-1.263571e-13,-0.012455,0.000969,-0.001761,-0.010202,0.017871,0.004209,0.011782,0.010564,-0.000816
4,-0.000502,-0.000153,0.002632,-0.000557,0.003583,0.000096,-0.000091,-0.001782,-0.000024,0.000647,...,3.878321e-13,0.000985,-0.000747,-0.000634,-0.038459,-0.009959,-0.001097,0.004457,-0.003471,-0.000706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,-0.010291,0.003474,-0.049004,-0.001600,-0.011729,0.012179,-0.004205,-0.002103,0.003453,-0.016910,...,-8.785520e-07,0.014278,-0.002597,-0.005485,-0.006336,0.026199,-0.013394,-0.005216,0.001714,-0.009270
146,-0.028570,0.007097,0.005474,-0.006091,0.003053,0.004164,0.001158,0.001073,-0.001872,-0.020473,...,-7.591400e-08,-0.000330,0.004361,0.010873,-0.017895,0.000099,-0.006938,-0.027157,0.002984,-0.024311
147,-0.005424,-0.004964,0.012694,-0.006861,-0.005116,0.001409,0.001862,0.009463,0.000358,0.006379,...,-5.241422e-07,0.028197,0.001656,-0.004227,-0.005124,-0.052964,0.022026,0.048630,0.046711,0.013198
148,0.021749,-0.000103,0.003712,-0.005903,-0.006872,-0.016247,0.007232,0.009815,-0.003137,-0.054888,...,-5.550533e-07,0.046293,0.007101,0.002207,-0.021809,-0.019193,-0.008599,0.001844,-0.025452,-0.006316


In [None]:
doc2term_df = pd.DataFrame(bow_doc2term.toarray(), columns=vocabulary)
doc2term_df.head()

Unnamed: 0,πρᾶγμα,ἡσσάομαι,θλίβω,θλίψιν,διδαχῆς,ψάλλω,ἐλάχιστόν,διατάσσω,ποῦ,θερίζω,...,εἴδωλόν,φοβέω,θαυμάζω,ὀδυρμός,εὐαγγέλιον,ὑπακοην,ἀντικειμένωνἥτις,ἔνδειξις,ὑποτάσσω,συγχαίρω
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
def lsa_model(docs):
    vocabulary =  list(set([word for doc in docs for word in doc]))
    tfidf =  TfidfVectorizer(vocabulary=vocabulary) ### initiaze the model
    X = tfidf.fit_transform([" ".join(doc) for doc in docs]) ### run the model
    Xc = (X.T * X)
    svd = TruncatedSVD(n_components=50, n_iter=5, random_state=42)
    svd.fit(Xc)
    #term2doc = pd.DataFrame(Xc.toarray(), columns=vec.get_feature_names(), index=vec.get_feature_names())
    lsa_model_data = pd.DataFrame(svd.components_, columns=vec.get_feature_names())
    return lsa_model_data, Xc#term2doc

In [None]:
model, term2doc = lsa_model(christian_docs)

In [None]:
len(model.columns)

117515

In [None]:
def get_most_similar(model_df, target_term, number):
  all_similar = []
  for term in model_df.columns:
    similarity = (term, cosine_similarity([model_df[target_term],  model_df[term]])[0][1])
    all_similar.append(similarity)
  return sorted(all_similar, key=lambda number: number[1], reverse=True)[1:number]

In [None]:
words = ["δίκαιος", "δίκη", "δικαιοσύνη"]

In [None]:
neighbours = {}  
for word in words:
    neighbours[word] = get_most_similar(model, word, 10)

In [None]:
pd.DataFrame(neighbours)

Unnamed: 0,δίκαιος,δίκη,δικαιοσύνη
0,"(ιειμένην, 0.9834371832818561)","(μετελεύσονται, 0.9384522490347709)","(καθυπηρετεῖ, 0.9710013738851778)"
1,"(εὐστοχίου, 0.9830720889621722)","(ἐπικουροί, 0.9384522490347709)","(ἐνδύσονται, 0.9706819000158255)"
2,"(λογιωτάτου, 0.9830720889621722)","(ἐριννύες, 0.9384522490347709)","(σιοί, 0.9706819000158255)"
3,"(λελύπηται, 0.9830720889621722)","(καταπεπηγμένων, 0.9344969432505092)","(περικτᾶται, 0.9699396541878471)"
4,"(τηρούσης, 0.9827599613491105)","(ἐξέτισας, 0.9182233983592916)","(ἡδυπαθείᾳ, 0.9699396541878471)"
5,"(μιμείσθω, 0.9826270670854331)","(γλωσσαργία, 0.8999555833389017)","(ἅνευ, 0.9695269024725653)"
6,"(ἰὼδ, 0.9822301096106603)","(κρυσταλλοειδεῖ, 0.8997426356399815)","(χρωννύντες, 0.9695269024725653)"
7,"(ἐπαράντων, 0.9820947343036778)","(διώκτας, 0.8970726088109655)","(πρυπορεύσεται, 0.9662484878532644)"
8,"(πειλθαρχεῖν, 0.9820947343036778)","(προςώποις, 0.8861867911971005)","(ἐπιφαινέσθω, 0.9660042345965665)"
