In [54]:
### PREREQUISTIES
### (many used only in one notebook...)

import os
import pandas as pd
import numpy as np
import logging ### to monitor the code
from bs4 import BeautifulSoup
import pickle
import math
import random
import sys
import csv
import unicodedata
import requests
import re

import nltk
from nltk.collocations import *

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import xml.etree.cElementTree as ET
from urllib.request import urlopen

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
#from google.colab import auth
#from oauth2client.client import GoogleCredentials
from google.oauth2 import service_account # based on google-auth library
import sddk

In [55]:
# gensim parts
from gensim import corpora
from gensim import models

### lsa alternative
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [56]:
#!pip install anda
#from anda import gr ### the import takes substantial time, since it import a +600MB file containing ancient Greek dictionary

In [57]:
### not neccessary for reading the data, just for exporting them to sciencedata.dk
conf = sddk.configure("SDAM_root", "648597@au.dk")

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/SDAM_root/


In [58]:
# to access gsheet, you need Google Service Account key json file
# I have mine located in my personal space on sciencedata.dk, so I read it from there:

# (1) read the file and parse its content
file_data = conf[0].get("https://sciencedata.dk/files/ServiceAccountsKey.json").json()
# (2) transform the content into crendentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) specify your usage of the credentials
scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the constrained credentials for authentication of gspread package
gc = gspread.Client(auth=scoped_credentials)

ECCE_AGT_overview = gc.open_by_url("https://docs.google.com/spreadsheets/d/1KPpPaeX215HR_fVrakvJp8aB6oZDhHFTcBw0MKLw6as/edit?usp=sharing")

# Import and preprocess corpus

In [60]:
AGT = sddk.read_file("SDAM_data/AGT/AGT_20201110.json", "df", conf)
AGT.head(5)

Unnamed: 0,filename,author,title,string,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,sentences,lemmata,lemmata_wordcount,lemmatized_sentences,n_sentences
0,tlg0001.tlg001.perseus-grc2.xml,Apollonius Rhodius,Argonautica,"ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν μ...",38822,tlg0001,tlg0001.tlg001,3 B.C.,-2.5,{'-2.5': 1},-2.5,pagan,"[ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν ...","[Φοῖβος, παλαιγενής, κλέος, φάος, μιμνήσκω, πό...",23803,"[[Φοῖβος, παλαιγενής, κλέος, φάος, μιμνήσκω, π...",3233
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War,\nΘουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶ...,150126,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,pagan,[Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν...,"[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελο...",66680,"[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ...",6068
2,tlg0004.tlg001.perseus-grc1.xml,Diogenes Laertius,Lives of Eminent Philosophers,Τὸ τῆς φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ βαρβάρ...,110773,tlg0004,tlg0004.tlg001,A.D. 3,2.5,{'2.5': 1},,,[Τὸ τῆς φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ βαρβά...,"[φιλοσοφία, ἔργον, ἔνιοί, φάσις, βαρβαρόομαι, ...",53989,"[[φιλοσοφία, ἔργον, ἔνιοί, φάσις, βαρβαρόομαι,...",10244
3,tlg0005.tlg001.perseus-grc1.xml,Theocritus,Idylls,\n̔Αδύ τι τὸ ψιθύρισμα καὶ ἁ πίτυς αἰπόλε τήνα...,19200,tlg0005,tlg0005.tlg001,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,"[̔Αδύ τι τὸ ψιθύρισμα καὶ ἁ πίτυς αἰπόλε τήνα,...","[αδύ, ψιθύρισμα, πίτυς, αἰπόλος, ἐκεῖνος, πότη...",12000,"[[αδύ, ψιθύρισμα, πίτυς, αἰπόλος, ἐκεῖνος, πότ...",1733
4,tlg0005.tlg002.perseus-grc1.xml,Theocritus,Epigrams,τὰ ῥόδα τὰ δροσόεντα καὶ ἁ κατάπυκνος ἐκείνα ἕ...,1734,tlg0005,tlg0005.tlg002,4-3 B.C.,-3.0,"{'-3.5': 0.5, '-2.5': 0.5}",,,[τὰ ῥόδα τὰ δροσόεντα καὶ ἁ κατάπυκνος ἐκείνα ...,"[ῥοδῆ, δροσόεντα, κατάπυκνος, ἐκεῖνος, ἕρπυλλο...",1071,"[[ῥοδῆ, δροσόεντα, κατάπυκνος, ἐκεῖνος, ἕρπυλλ...",137


In [61]:
subcorpora_dict = {# main subcorpora
    "Archaic (8-6 BCE)" : AGT[(AGT["provenience"]=="pagan") & (AGT["date_avr"] <= -5.5)],
    "Classical (5-4 BCE)" : AGT[(AGT["provenience"]=="pagan") & (AGT["date_avr"].between(-5, -3))],
    "Non-Christian (1-4 CE)" : AGT[(AGT["date_avr"].between(0, 4)) & (AGT["provenience"]=="pagan")],
    "Septuagint" : AGT[AGT["author_id"]=="tlg0527"],
    "Christian (1-4 CE)" : AGT[(AGT["date_avr"] < 4) & (AGT["provenience"]=="christian")],
    "Christian (1-1/2 CE)" : AGT[(AGT["date_avr"] < 1.5) & (AGT["provenience"]=="christian")],
    "Christian (2-3/4 CE)" : AGT[(AGT["date_avr"].between(1.5, 3)) & (AGT["provenience"]=="christian")],
    "Christian (4-4/5 CE)" : AGT[(AGT["date_avr"].between(3.5, 4)) & (AGT["provenience"]=="christian")]}

subcorpora_dict_other = {
    "Pagan (8th c. BCE - 4th c. CE)" : AGT[(AGT["provenience"]=="pagan") & (AGT["date_avr"] < 4)],
    "Pagan (1st - 4th c. CE)" : AGT[(AGT["date_avr"].between(0, 4)) & (AGT["provenience"]=="pagan")],
    "Jewish" : AGT[AGT["author_id"].isin(["tlg0527", "tlg0018", "tlg0526"])],
    "Septuagint" : AGT[AGT["author_id"]=="tlg0527"],
    "Philo of Alexandria" : AGT[AGT["author_id"]=="tlg0018"],
    "Titus Flavius Josephus" : AGT[AGT["author_id"]=="tlg0526"],
    "Paul of Tarsus" : AGT[AGT["author_id"]=="tlg0031paul"],
    "New Testament" : AGT[AGT["author_id"].str.startswith("tlg0031")],
    "Christian (1st - 4th c. CE)" : AGT[(AGT["date_avr"] < 4) & (AGT["provenience"]=="christian")]}    


In [62]:
def get_flat_sentences(series):
    sentences_list = [sent for doc in series.tolist() for sent in doc]
    return sentences_list

In [63]:
subselections = []
for key in subcorpora_dict.keys():
    subselection = subcorpora_dict[key]
    subselection_dict = {}
    wordcount = subselection["wordcount"].sum()
    doccount = len(subselection)
    sents = get_flat_sentences(subselection["lemmatized_sentences"])
    sents_len = len(sents)
    subselection_dict.update({"label" : key, "wordcount" : wordcount, "doccount" : doccount, "n_sentences": sents_len,"sentences" : sents})
    subselections.append(subselection_dict)
subselections_df = pd.DataFrame(subselections)
subselections_df

Unnamed: 0,label,wordcount,doccount,n_sentences,sentences
0,Archaic (8-6 BCE),264986,11,20444,"[[μῆνις, ἀείδω, θεά, Πηληιάδης, Ἀχιλλεύς, οὐλό..."
1,Classical (5-4 BCE),3439210,338,256926,"[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ..."
2,Non-Christian (1-4 CE),9172125,411,718144,"[[γεωγραφία, σόσσιε, σενεκίων, ἱστορικός, γιγν..."
3,Septuagint,872013,55,53330,"[[εν, ποιέω, θεός, οὐρανός, γῆ], [γῆ, ἀόρατος,..."
4,Christian (1-4 CE),3366042,131,286782,"[[βιβλος, γένεσις, ἰησοῦ, χριστοῦ, υἱός, δαυεί..."
5,Christian (1-1/2 CE),171389,38,11959,"[[βιβλος, γένεσις, ἰησοῦ, χριστοῦ, υἱός, δαυεί..."
6,Christian (2-3/4 CE),1792275,65,165245,"[[ἀμφίων, θηβαῖος, ἀρίων, μηθυμναῖος, ἄμφω, ᾠδ..."
7,Christian (4-4/5 CE),1746266,33,141538,"[[ἐπειδη, ὁράω, κράτιστος, διόγνητε, ὑπερεσπου..."


In [64]:
AGT_docs = get_flat_sentences(AGT["lemmatized_sentences"])

In [65]:
print(AGT_docs[:10])

[['Φοῖβος', 'παλαιγενής', 'κλέος', 'φάος', 'μιμνήσκω', 'πόντοιο', 'στόμα', 'πέτρα', 'κυανέας', 'βασιλεύς', 'ἐφημοσύνη', 'πελίαο', 'χρυσεῖον', 'κῶας', 'εὔζυγος', 'ἐλαύνω', 'ἀργώ'], ['τοῖος', 'πελίης', 'φάτις', 'ἐκλύω', 'μοῖρα', 'μένος', 'στυγερός', 'τοῦδ', 'ἀνήρ', 'ὅντιν', 'εἶδον', 'οἰοπέδιλον', 'ὑπ', 'ἐνεσία', 'δαμάζω'], ['δηρός', 'τεός', 'βάξις', 'ἰήσων', 'χειμερίζω', 'ῥεῖθρον', 'κίων', 'πόσις', 'ἀναύρου', 'ἐκσῴζω', 'ὑπ', 'ἰλύς', 'καταλιμπάνω', 'πέδιλον', 'προχόη'], ['ἱκνέομαι', 'πελίην', 'ἀντιβολέω', 'εἰλαπινάζω', 'πατήρ', 'Ποσειδεών', 'ῥέζω', 'θεός', 'Ἥρα', 'πελασγίδος', 'ἀλεγίζω'], ['τόνγ', 'φράζω', 'ἆθλον', 'ἔντυε', 'ναυτιλία', 'πολυκηδής', 'ὄφρ', 'πόντος', 'ἀλλοδαπός', 'ἔμετος', 'ἀνήρ', 'νόστος', 'ὄλλυμι'], ['ναῦς', 'ἐπικλείω', 'ἀοιδός', 'ἄργον', 'Ἀθήνη', 'κάμνω', 'ὑποθημοσύνη'], ['γενεά', 'ὀνομάζω', 'μυθέομαι', 'ἥρως', 'δολιχός', 'πόρος', 'ἅλς', 'ὅσος', 'ἐρέσσω'], ['Μοῦσα', 'ὑποφήτωρ', 'ἀοιδή'], ['πρῶτά', 'ὀρφῆος', 'μιμνήσκω', 'ῥαίνω', 'ποτάομαι', 'αὐτη', 'Καλλιόπη', 'θρήικι', '

# Exploring gensim dictionary

In [66]:
#create gensim dictionary for our list of sentences
dictionary = corpora.Dictionary(AGT_docs)

In [67]:
# as such, it is organized by ids
dict(list(dictionary.items())[:10])

{0: 'Φοῖβος',
 1: 'βασιλεύς',
 2: 'εὔζυγος',
 3: 'κλέος',
 4: 'κυανέας',
 5: 'κῶας',
 6: 'μιμνήσκω',
 7: 'πέτρα',
 8: 'παλαιγενής',
 9: 'πελίαο'}

In [68]:
dict(list(dictionary.token2id.items())[:10])

{'Φοῖβος': 0,
 'βασιλεύς': 1,
 'εὔζυγος': 2,
 'κλέος': 3,
 'κυανέας': 4,
 'κῶας': 5,
 'μιμνήσκω': 6,
 'πέτρα': 7,
 'παλαιγενής': 8,
 'πελίαο': 9}

In [69]:
# look at one sentence
AGT_docs[10000]

['Βίας', 'τευτάμου', 'πριηνεύς', 'προκεκριμένος', 'σατύρου']

In [70]:
# doc2bow = document to (term, tf) tuples, i.e. bag-of-words
dictionary.doc2bow(AGT_docs[10000])

[(399, 1), (14001, 1), (14002, 1), (14003, 1), (14004, 1)]

In [71]:
# document to id method
dictionary.doc2idx(AGT_docs[10000])

[399, 14004, 14001, 14002, 14003]

In [72]:
sent = docs[20]
sent

['ἐξέρχομαι', 'ποιμανεῖ', 'λαός', 'ἁμός', 'ἰσραήλ']

In [73]:
# doc2bow = document to (term, tf) tuples, i.e. bag-of-words
dictionary.doc2bow(sent)

[(824, 1), (1827, 1), (4059, 1), (113264, 1), (153257, 1)]

# LSA with  sklearn

In [74]:
subselections_df

Unnamed: 0,label,wordcount,doccount,n_sentences,sentences
0,Archaic (8-6 BCE),264986,11,20444,"[[μῆνις, ἀείδω, θεά, Πηληιάδης, Ἀχιλλεύς, οὐλό..."
1,Classical (5-4 BCE),3439210,338,256926,"[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ..."
2,Non-Christian (1-4 CE),9172125,411,718144,"[[γεωγραφία, σόσσιε, σενεκίων, ἱστορικός, γιγν..."
3,Septuagint,872013,55,53330,"[[εν, ποιέω, θεός, οὐρανός, γῆ], [γῆ, ἀόρατος,..."
4,Christian (1-4 CE),3366042,131,286782,"[[βιβλος, γένεσις, ἰησοῦ, χριστοῦ, υἱός, δαυεί..."
5,Christian (1-1/2 CE),171389,38,11959,"[[βιβλος, γένεσις, ἰησοῦ, χριστοῦ, υἱός, δαυεί..."
6,Christian (2-3/4 CE),1792275,65,165245,"[[ἀμφίων, θηβαῖος, ἀρίων, μηθυμναῖος, ἄμφω, ᾠδ..."
7,Christian (4-4/5 CE),1746266,33,141538,"[[ἐπειδη, ὁράω, κράτιστος, διόγνητε, ὑπερεσπου..."


In [75]:
# let's extract first and first/second century texts only
docs = subselections_df.at[5, "sentences"]

In [76]:
len(docs)

11959

In [77]:
[word for sent in docs for word in sent].count("ἀβρααμ")

1

In [78]:
# a smaller dictionary for our subcorpus
dictionary = corpora.Dictionary(docs)

In [None]:
morpheus_dict[χριστοῦ

In [79]:
freq_dists = []
for dict_id in dictionary.keys():
    freq_dists.append((dictionary[dict_id], dictionary.cfs[dict_id]))
freq_dists = sorted(freq_dists, key=lambda tup: tup[1], reverse=True)
freq_dists

[('πᾶς', 1356),
 ('ἁμός', 1345),
 ('λέγω', 971),
 ('θεάομαι', 927),
 ('εἶπον', 873),
 ('ἡμός', 860),
 ('θεός', 782),
 ('ποιέω', 602),
 ('σεύω', 599),
 ('ἄνθρωπος', 582),
 ('γίγνομαι', 568),
 ('κύριος', 566),
 ('ἔχω', 532),
 ('ἰησοῦς', 492),
 ('πατήρ', 480),
 ('εἶμι', 478),
 ('ἰησοῦ', 451),
 ('ἔρχομαι', 448),
 ('ἡμέρα', 429),
 ('πνεῦμα', 428),
 ('υἱός', 426),
 ('εἶδον', 404),
 ('πολύς', 388),
 ('λόγος', 374),
 ('δίδωμι', 367),
 ('ἀδελφός', 357),
 ('γῆ', 311),
 ('ἐκεῖνος', 295),
 ('οὐρανός', 293),
 ('πίστις', 288),
 ('ἀλλ', 285),
 ('χριστοῦ', 280),
 ('μέγας', 279),
 ('δοκέω', 275),
 ('μαθητής', 271),
 ('ἀκούω', 269),
 ('οἶδα', 269),
 ('λαλέω', 260),
 ('χριστός', 254),
 ('ἐμέω', 252),
 ('ἵημι', 244),
 ('ἅγιος', 241),
 ('γιγνώσκω', 233),
 ('δύναμαι', 216),
 ('καρδία', 211),
 ('ἔργον', 208),
 ('ἐθέλω', 207),
 ('πιστεύω', 206),
 ('γυνή', 202),
 ('χάρις', 200),
 ('νόμος', 198),
 ('γράφω', 193),
 ('λαμβάνω', 190),
 ('ἔθνος', 188),
 ('ἀγαπάω', 185),
 ('ἄατος', 182),
 ('βασίλεια', 182),
 ('εὑρίσ

In [49]:
freq_dists = []
for dict_id in dictionary.keys():
    freq_dists.append((dictionary[dict_id], dictionary.cfs[dict_id]))
freq_dists = sorted(freq_dists, key=lambda tup: tup[1], reverse=True)
freq_dists

[('θεός', 1697),
 ('πᾶς', 1537),
 ('ἁμός', 1348),
 ('Ἰησοῦς', 1104),
 ('λέγω', 994),
 ('κύριος', 942),
 ('εἶπον', 909),
 ('ἡμός', 860),
 ('χριστός', 738),
 ('ὑμῖν', 716),
 ('γίγνομαι', 614),
 ('ἄνθρωπος', 588),
 ('εἶμι', 547),
 ('ἔχω', 536),
 ('ποιέω', 532),
 ('πατήρ', 498),
 ('ἔρχομαι', 467),
 ('ἡμέρα', 436),
 ('πνεῦμα', 428),
 ('υἱός', 416),
 ('λόγος', 374),
 ('δίδωμι', 368),
 ('ἀδελφός', 341),
 ('πολύς', 336),
 ('γῆ', 312),
 ('πίστις', 306),
 ('ἐκεῖνος', 295),
 ('ἀκούω', 294),
 ('οὐρανός', 289),
 ('μέγας', 284),
 ('εἶδον', 283),
 ('ἅγιος', 273),
 ('μαθητής', 271),
 ('ὑμεῖς', 261),
 ('ἀγάπη', 249),
 ('ἵημι', 243),
 ('λαλέω', 242),
 ('αὐτοὺς', 235),
 ('Ἰουδαῖος', 234),
 ('ὄνομα', 230),
 ('σάρξ', 226),
 ('κόσμος', 222),
 ('οἶδα', 221),
 ('ἔργον', 218),
 ('ἐθέλω', 212),
 ('καρδία', 211),
 ('πιστεύω', 211),
 ('δόξα', 209),
 ('γυνή', 208),
 ('ἀνήρ', 205),
 ('ἰδοὺ', 204),
 ('γράφω', 204),
 ('χάρις', 204),
 ('γιγνώσκω', 203),
 ('λαμβάνω', 201),
 ('νόμος', 198),
 ('καθὼς', 190),
 ('ὃν', 189)

In [37]:
dictionary.cfs[6]

1104

In [35]:
list(dictionary.cfs)

[0,
 1,
 6,
 4,
 3,
 2,
 5,
 17,
 9,
 28,
 23,
 24,
 26,
 18,
 27,
 16,
 10,
 11,
 21,
 22,
 20,
 19,
 12,
 13,
 14,
 15,
 8,
 30,
 29,
 31,
 25,
 7,
 38,
 36,
 37,
 54,
 39,
 41,
 49,
 48,
 52,
 53,
 47,
 43,
 44,
 45,
 33,
 34,
 42,
 40,
 50,
 51,
 46,
 35,
 32,
 72,
 60,
 61,
 55,
 62,
 63,
 69,
 64,
 65,
 58,
 59,
 67,
 70,
 71,
 68,
 56,
 73,
 66,
 57,
 76,
 74,
 77,
 75,
 78,
 82,
 86,
 84,
 79,
 81,
 80,
 83,
 85,
 91,
 89,
 93,
 87,
 90,
 88,
 92,
 103,
 102,
 96,
 95,
 104,
 100,
 101,
 98,
 97,
 94,
 99,
 108,
 105,
 110,
 107,
 106,
 109,
 118,
 111,
 113,
 116,
 114,
 112,
 117,
 115,
 121,
 120,
 122,
 119,
 126,
 124,
 125,
 123,
 127,
 128,
 130,
 136,
 133,
 134,
 132,
 131,
 135,
 129,
 137,
 139,
 140,
 138,
 147,
 149,
 146,
 142,
 145,
 148,
 141,
 144,
 143,
 150,
 151,
 152,
 157,
 154,
 155,
 153,
 156,
 159,
 158,
 160,
 164,
 163,
 161,
 165,
 162,
 167,
 166,
 168,
 172,
 170,
 174,
 169,
 173,
 171,
 177,
 178,
 175,
 176,
 182,
 180,
 183,
 179,
 185,
 181,


In [18]:
[value for value in dictionary.values()]

['βίβλος',
 'γένεσις',
 'δαυείδ',
 'υἱός',
 'χριστός',
 'ἀβρααμ',
 'Ἰησοῦς',
 'βασιλεύς',
 'βοές',
 'γεννάω',
 'ζαρά',
 'θάμαρ',
 'ναασσών',
 'ναασσὼν',
 'σαλμών',
 'σαλμὼν',
 'φαρές',
 'ἀβραάμ',
 'ἀδελφοὺς',
 'ἀμιναδάβ',
 'ἀράμ',
 'ἑσρώμ',
 'ἑσρὼμ',
 'ἰακώβ',
 'ἰακὼβ',
 'ἰεσσαί',
 'ἰούδαν',
 'ἰούδας',
 'ἰσαάκ',
 'ἰωβηδ',
 'ῥαχάβ',
 'ῥούθ',
 'βαβυλῶνος',
 'μανασσῆ',
 'μανασσῆς',
 'μετοικεσία',
 'οὔριος',
 'σολομὼν',
 'σολομῶνα',
 'ἀβιά',
 'ἀμὼς',
 'ἀσάφ',
 'ἁμός',
 'ἄχας',
 'ἑζεκίαν',
 'ἑζεκίας',
 'ἰεχονίαν',
 'ἰωαθάμ',
 'ἰωράμ',
 'ἰωσαφάτ',
 'ἰωσείαν',
 'ἰωσείας',
 'ὀζείαν',
 'ὀζείας',
 'ῥοβοάμ',
 'ζοροβάβελ',
 'μαθθάν',
 'μαρίας',
 'σαδώκ',
 'σαδὼκ',
 'σαλαθιήλ',
 'σαλαθιηλ',
 'ἀβιούδ',
 'ἀβιοὺδ',
 'ἀζώρ',
 'ἀζὼρ',
 'ἀνήρ',
 'ἀχείμ',
 'ἐλεάζαρ',
 'ἐλιακείμ',
 'ἐλιούδ',
 'ἐλιοὺδ',
 'ἰεχονίας',
 'ἰωσηφ',
 'γενεά',
 'δεκατέσσαρες',
 'πᾶς',
 'ἑός',
 'δεῖ',
 'αὐτοὺς',
 'γαστήρ',
 'εὑρίσκω',
 'μήτηρ',
 'πνεῦμα',
 'συνέρχομαι',
 'ἅγιος',
 'ἰωσήφ',
 'αὐτην',
 'βούλομαι',
 'δίκαιος',
 'δειγμα

In [21]:
vocabulary =  list(set([word for doc in christian_docs for word in christian_docs]))

MemoryError: 

In [None]:


bow = CountVectorizer(vocabulary=vocabulary)
bow_doc2term = bow.fit_transform([" ".join(doc) for doc in docs]) ### run the model

tfidf =  TfidfVectorizer(vocabulary=vocabulary)
tfidf_doc2term = tfidf.fit_transform([" ".join(doc) for doc in docs]) ### run the model

#Xc = (X.T * X)
#svd = TruncatedSVD(n_components=50, n_iter=5, random_state=42)
#svd.fit(Xc)
    #term2doc = pd.DataFrame(Xc.toarray(), columns=vec.get_feature_names(), index=vec.get_feature_names())
#lsa_model_data = pd.DataFrame(svd.components_, columns=vec.get_feature_names())

In [None]:
bow_doc2term.todense()

In [106]:
def lsa_model(docs):
    vocabulary =  list(set([word for doc in docs for word in doc]))
    tfidf =  TfidfVectorizer(vocabulary=vocabulary) ### initiaze the model
    X = tfidf.fit_transform([" ".join(doc) for doc in docs]) ### run the model
    Xc = (X.T * X)
    svd = TruncatedSVD(n_components=50, n_iter=5, random_state=42)
    svd.fit(Xc)
    #term2doc = pd.DataFrame(Xc.toarray(), columns=vec.get_feature_names(), index=vec.get_feature_names())
    lsa_model_data = pd.DataFrame(svd.components_, columns=vec.get_feature_names())
    return lsa_model_data, Xc#term2doc

In [107]:
model, term2doc = lsa_model(christian_docs)

In [108]:
len(model.columns)

117515

In [102]:
def get_most_similar(model_df, target_term, number):
  all_similar = []
  for term in model_df.columns:
    similarity = (term, cosine_similarity([model_df[target_term],  model_df[term]])[0][1])
    all_similar.append(similarity)
  return sorted(all_similar, key=lambda number: number[1], reverse=True)[1:number]

In [103]:
words = ["δίκαιος", "δίκη", "δικαιοσύνη"]

In [109]:
neighbours = {}  
for word in words:
    neighbours[word] = get_most_similar(model, word, 10)

In [110]:
pd.DataFrame(neighbours)

Unnamed: 0,δίκαιος,δίκη,δικαιοσύνη
0,"(ιειμένην, 0.9834371832818561)","(μετελεύσονται, 0.9384522490347709)","(καθυπηρετεῖ, 0.9710013738851778)"
1,"(εὐστοχίου, 0.9830720889621722)","(ἐπικουροί, 0.9384522490347709)","(ἐνδύσονται, 0.9706819000158255)"
2,"(λογιωτάτου, 0.9830720889621722)","(ἐριννύες, 0.9384522490347709)","(σιοί, 0.9706819000158255)"
3,"(λελύπηται, 0.9830720889621722)","(καταπεπηγμένων, 0.9344969432505092)","(περικτᾶται, 0.9699396541878471)"
4,"(τηρούσης, 0.9827599613491105)","(ἐξέτισας, 0.9182233983592916)","(ἡδυπαθείᾳ, 0.9699396541878471)"
5,"(μιμείσθω, 0.9826270670854331)","(γλωσσαργία, 0.8999555833389017)","(ἅνευ, 0.9695269024725653)"
6,"(ἰὼδ, 0.9822301096106603)","(κρυσταλλοειδεῖ, 0.8997426356399815)","(χρωννύντες, 0.9695269024725653)"
7,"(ἐπαράντων, 0.9820947343036778)","(διώκτας, 0.8970726088109655)","(πρυπορεύσεται, 0.9662484878532644)"
8,"(πειλθαρχεῖν, 0.9820947343036778)","(προςώποις, 0.8861867911971005)","(ἐπιφαινέσθω, 0.9660042345965665)"
