In [1]:
import numpy as np
import pandas as pd
import requests
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from cltk.stops.grc import STOPS as stopwords
from gensim.corpora import Dictionary
import unicodedata
import json

In [2]:
# to communicate with google spreadsheet...
import gspread
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

file_data = json.load(open("../../ServiceAccountsKey.json", "r"))
# or load it from a local storage: json.load(open("../../ServiceAccountsKey.json", "r"))
credentials = service_account.Credentials.from_service_account_info(file_data)
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
PIPA_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1rV4t0_UV_wcx--UAHVwkqB8Wa_5n9mnpV05yGG1OHqk/edit?usp=sharing")

# Load the main dataset of ancient Greek texts

In [4]:
cgl = pd.read_json("../data/large_data/cgl.json")
cgl.head()

Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,...,count_λυπέω,count_λυπηρός,count_λύπη,count_ἄλγος,count_ἄλγημα,count_ἀλγέω,count_ὀδύνη,count_ὀδυνάω,count_πονέω,count_πόνος
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War,150118,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,10,11,4,0,0,2,0,0,10,21
6,tlg0006.tlg001.perseus-grc2.xml,Euripides,Cyclops,4141,tlg0006,tlg0006.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,1,0,0,0,0,0,0,0,0,7
7,tlg0006.tlg004.perseus-grc2.xml,Euripides,Ἡρακλεῖδαι,6272,tlg0006,tlg0006.tlg004,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,0,0,2,0,0,0,0,0,1,10
8,tlg0006.tlg005.perseus-grc2.xml,Euripides,Ἱππόλυτος,8257,tlg0006,tlg0006.tlg005,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,0,1,5,4,0,0,3,1,1,7
9,tlg0006.tlg006.perseus-grc2.xml,Euripides,Ἀνδρομάχη,7397,tlg0006,tlg0006.tlg006,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,0,0,4,0,0,5,0,0,1,4


In [5]:
cgl["lemmata"].apply(lambda x: x.count("συζάω")).sum()

62

In [6]:
sents = [sen for work in cgl["lemmatized_sentences_repl"] for sen in work]

In [7]:
len(sents)

243579

In [8]:
min_freq = 50

def get_vocab(docs, min_freq=min_freq):
    words_flat = [item for sublist in docs for item in sublist]
    word_freq_tups = nltk.FreqDist(words_flat).most_common()
    vocabulary = [tup[0] for tup in word_freq_tups if tup[1] >= min_freq]
    #vocab_freqs = [len([doc for doc in docs if word in doc]) for word in vocabulary]
    return word_freq_tups, words_flat, vocabulary # , vocab_freqs

In [9]:
word_freqs, words, vocabulary = get_vocab(sents)

In [10]:
word_freqs[:10]

[('εἰμί', 79567),
 ('οὗτος', 60113),
 ('αὐτός', 42056),
 ('λέγω', 29114),
 ('τις', 27416),
 ('ἔχω', 22584),
 ('γίγνομαι', 21737),
 ('πολύς', 20652),
 ('πᾶς', 19062),
 ('ἄλλος', 18361)]

In [11]:
len(words)

1822082

In [12]:
len(vocabulary)

3476

In [13]:
print(vocabulary[:20])

['εἰμί', 'οὗτος', 'αὐτός', 'λέγω', 'τις', 'ἔχω', 'γίγνομαι', 'πολύς', 'πᾶς', 'ἄλλος', 'ποιέω', 'φημί', 'οὐδείς', 'τίς', 'δέω', 'λόγος', 'τοιοῦτος', 'πόλις', 'ἀγαθός', 'δοκέω']


In [14]:
def normalize_encoding(string):
    return unicodedata.normalize("NFC", string)

In [15]:
stopwords = [normalize_encoding(w) for w in stopwords] + ["εἰμί", "οὗτος", "αὐτός", "αὐτός", "τις", "τίς"]

In [16]:
vocabulary = [w for w in vocabulary if w not in stopwords]
vocabulary[:10]

['λέγω',
 'ἔχω',
 'γίγνομαι',
 'πολύς',
 'πᾶς',
 'ποιέω',
 'φημί',
 'δέω',
 'λόγος',
 'πόλις']

In [17]:
len(vocabulary)

3462

In [18]:
# continuous bigrams & trigrams, within sentences only
sents_bigrams = [list(el) for sublist in [[ng for ng in nltk.bigrams(sent)] for sent in sents] for el in sublist]

sents_trigrams = [list(el) for sublist in [[ng for ng in nltk.trigrams(sent)] for sent in sents] for el in sublist]

In [19]:
ngrams_data = sents + sents_bigrams + sents_trigrams
len(ngrams_data)

3220729

In [20]:
author_ids = list(set(cgl["author_id"]))
author_ids[:5]

['tlg0610', 'tlg0058', 'tlg0591', 'tlg0027', 'tlg0006']

In [21]:
with open("../data/large_data/corpus_ngrams.txt", "w", encoding="utf-8") as f:

    f.writelines("\n".join([" ".join(ngram) for ngram in ngrams_data]))

In [22]:
pickle.dump(vocabulary, open("../data/vocabulary.pickle", "wb"))

In [23]:
pickle.dump(word_freqs, open("../data/word_freqs_all.pickle", "wb"))

In [24]:
word_freqs_vocabulary = pd.DataFrame([tup for tup in word_freqs if tup[0] in vocabulary], columns=["word", "count"])

In [25]:
word_freqs_vocabulary.to_csv("../data/word_freqs_vocabulary.csv", index=False)

# Ngrams by doc ids

In [21]:
doc_ids = list(set(cgl["doc_id"]))
len(doc_ids)

371

In [26]:
cgl_subset = cgl[cgl["doc_id"]=="tlg0086.tlg010"]
sents = [[sen for work in cgl_subset["lemmatized_sentences_repl"] for sen in work][0]]
sents_bigrams = [list(el) for sublist in [[ng for ng in nltk.bigrams(sent)] for sent in sents] for el in sublist]
sents_trigrams = [list(el) for sublist in [[ng for ng in nltk.trigrams(sent)] for sent in sents] for el in sublist]
ngrams_data = sents + sents_bigrams + sents_trigrams

In [34]:
" ".join(sents[0])

'πᾶς τέχνη πᾶς μέθοδος πρᾶξις προαίρεσις ἀγαθός τις ἐφίημι δοκέω'

In [36]:
[" ".join(bigram) for bigram in sents_bigrams]

['πᾶς τέχνη',
 'τέχνη πᾶς',
 'πᾶς μέθοδος',
 'μέθοδος πρᾶξις',
 'πρᾶξις προαίρεσις',
 'προαίρεσις ἀγαθός',
 'ἀγαθός τις',
 'τις ἐφίημι',
 'ἐφίημι δοκέω']

In [29]:
sents_trigrams

[['πᾶς', 'τέχνη', 'πᾶς'],
 ['τέχνη', 'πᾶς', 'μέθοδος'],
 ['πᾶς', 'μέθοδος', 'πρᾶξις'],
 ['μέθοδος', 'πρᾶξις', 'προαίρεσις'],
 ['πρᾶξις', 'προαίρεσις', 'ἀγαθός'],
 ['προαίρεσις', 'ἀγαθός', 'τις'],
 ['ἀγαθός', 'τις', 'ἐφίημι'],
 ['τις', 'ἐφίημι', 'δοκέω']]

In [32]:
cgl_subset["clean_string"].tolist()[0][:200]

'πᾶσα τέχνη καὶ πᾶσα μέθοδος, ὁμοίως δὲ πρᾶξίς τε καὶ προαίρεσις, ἀγαθοῦ τινὸς ἐφίεσθαι δοκεῖ· διὸ καλῶς ἀπεφήναντο τἀγαθόν, οὗ πάντʼ ἐφίεται. διαφορὰ δέ τις φαίνεται τῶν τελῶν· τὰ μὲν γάρ εἰσιν ἐνέργε'

In [27]:
line = 0
ids_lines = {}

f = open("../data/large_data/corpus_ngrams_bydocid.txt", "w", encoding="utf-8")
for doc_id in doc_ids:
    cgl_subset = cgl[cgl["doc_id"]==doc_id]
    sents = [sen for work in cgl_subset["lemmatized_sentences_repl"] for sen in work]
    sents_bigrams = [list(el) for sublist in [[ng for ng in nltk.bigrams(sent)] for sent in sents] for el in sublist]
    sents_trigrams = [list(el) for sublist in [[ng for ng in nltk.trigrams(sent)] for sent in sents] for el in sublist]
    ngrams_data = sents + sents_bigrams + sents_trigrams
    ngrams_data = [" ".join(ngram) for ngram in ngrams_data]
    f.writelines("\n".join(ngrams_data)+"\n")
    if bool(ngrams_data):
        ids_lines[doc_id] = (line, line+len(ngrams_data))
        line += len(ngrams_data)
    else:
        ids_lines[doc_id] = (line, line+1)
        line += 1

In [28]:
ids_lines

{'tlg0086.tlg002': (0, 17412),
 'tlg0627.tlg017': (17412, 29695),
 'tlg0086.tlg022': (29695, 51922),
 'tlg0610.perseu': (51922, 53930),
 'tlg0006.tlg004': (53930, 59767),
 'tlg0086.tlg016': (59767, 61837),
 'tlg0014.tlg012': (61837, 63147),
 'tlg0010.tlg005': (63147, 66440),
 'tlg0026.tlg002': (66440, 78039),
 'tlg0086.tlg020': (78039, 79424),
 'tlg0066.tlg001': (79424, 81919),
 'tlg0059.tlg025': (81919, 89009),
 'tlg0059.tlg016': (89009, 91108),
 'tlg0017.tlg011': (91108, 94542),
 'tlg0199.tlg001': (94542, 98962),
 'tlg0086.tlg024': (98962, 100855),
 'tlg0014.tlg053': (100855, 102777),
 'tlg0006.tlg015': (102777, 113986),
 'tlg0093.tlg010': (113986, 125710),
 'tlg0032.tlg008': (125710, 131206),
 'tlg1210.tlg002': (131206, 139970),
 'tlg0006.tlg019': (139970, 146011),
 'tlg0017.tlg012': (146011, 146917),
 'tlg0093.ogl001': (146917, 151286),
 'tlg0014.tlg039': (151286, 153903),
 'tlg0030.tlg001': (153903, 155191),
 'tlg0627.tlg015': (155191, 157309),
 'tlg0085.tlg005': (157309, 166777),

In [29]:
pickle.dump(ids_lines, open("../data/ids_lines.pickle", "wb"))

# Testing that the indexing works

In [30]:
with open("../data/large_data/corpus_ngrams_bydocid.txt", "r") as f:
    lines = f.readlines()
    doc_vals = ids_lines["tlg0086.tlg014"]
    for line in lines[doc_vals[0]:doc_vals[1]][:10]:
        if bool(line.split()):
            print(line.split())

['ζῷον', 'μόριον', 'εἰμί', 'ἀσύνθετος', 'διαιρέω', 'ὁμοιομερής', 'σάρξ', 'σάρξ', 'σύνθετος', 'ἀνομοιομερής', 'χείρ', 'χείρ', 'διαιρέω', 'πρόσωπον', 'πρόσωπον']
['τοιοῦτος', 'ἔνιοι', 'μέρος', 'μέλος', 'καλέω']
['τοιοῦτος', 'εἰμί', 'μέρος', 'ὅλος', 'εἰμί', 'ἕτερος', 'μέρος', 'ἔχω', 'κεφαλή', 'σκέλος', 'χείρ', 'ὅλος', 'βραχίων', 'θώραξ']
['οὗτος', 'αὐτός', 'εἰμί', 'μέρος', 'ὅλος', 'εἰμί', 'αὐτός', 'ἕτερος', 'μόριον']
['πᾶς', 'ἀνομοιομερής', 'σύγκειμαι', 'ὁμοιομερής', 'χείρ', 'σάρξ', 'νεῦρον', 'ὀστέον']
['ἔχω', 'ζῷον', 'ἔνιοι', 'πᾶς', 'μόριος', 'αὐτός', 'ἔνιοι', 'ἕτερος']
['αὐτός', 'εἶδος', 'μόριον', 'εἰμί', 'ἄνθρωπος', 'ῥίς', 'ὀφθαλμός', 'ἄνθρωπος', 'ῥίς', 'ὀφθαλμός', 'σάρξ', 'σάρξ', 'ὀστέον', 'ὀστέον']
['αὐτός', 'τρόπος', 'ἵππος', 'ἄλλος', 'ζῷον', 'εἶδος', 'αὐτός', 'λέγω']
['ὅλος', 'ἔχω', 'ὅλος', 'μόριος', 'ἔχω', 'ἕκαστος', 'ἕκαστος']
['αὐτός', 'εἰμί', 'διαφέρω', 'ὑπεροχή', 'ἔλλειψις', 'γένος', 'εἰμί', 'αὐτός']


In [31]:
cgl[cgl["doc_id"]=="tlg0086.tlg014"]["lemmatized_sentences"]

694    [[ζῷον, μόριον, εἰμί, ἀσύνθετος, διαιρέω, ὁμοι...
Name: lemmatized_sentences, dtype: object

In [32]:
c_aristotelicum_ids = cgl[cgl["subcorpus"]=="c_aristotelicum"]["doc_id"].tolist()
c_aristotelicum_ids[:5]

['tlg0086.tlg001',
 'tlg0086.tlg002',
 'tlg0086.tlg005',
 'tlg0086.tlg006',
 'tlg0086.tlg008']

In [33]:
ids_list = c_aristotelicum_ids[:5]

In [34]:
class NgramCorpus:
    def __iter__(self):
        with open("../data/large_data/corpus_ngrams_bydocid.txt", "r") as f:
            lines = f.readlines()
            for phi_id in ids_list:
                for line in lines[ids_lines[phi_id][0] : ids_lines[phi_id][1]]:
                    if bool(line.split()):
                        yield line.split()

In [35]:
corpus = NgramCorpus()
[doc for doc in corpus][:5]

[['πρῶτος',
  'λέγω',
  'τίς',
  'τίς',
  'εἰμί',
  'σκέψις',
  'ἀπόδειξις',
  'ἐπιστήμη',
  'ἀποδεικτικός'],
 ['διορίζω',
  'τίς',
  'εἰμί',
  'πρότασις',
  'τίς',
  'ὅρος',
  'τίς',
  'συλλογισμός',
  'ποῖος',
  'τέλειος',
  'ποῖος',
  'ἀτελής',
  'οὗτος',
  'τίς',
  'ὅλος',
  'εἰμί',
  'εἰμί',
  'ὅδε',
  'ὅδε',
  'τίς',
  'λέγω',
  'πᾶς',
  'μηδείς',
  'κατηγορέω'],
 ['πρότασις', 'εἰμί', 'λόγος', 'καταφατικός', 'ἀποφατικός', 'τις', 'τις'],
 ['οὗτος', 'μέρος', 'ἀδιόριστος'],
 ['λέγω',
  'πᾶς',
  'μηδείς',
  'ὑπάρχω',
  'μέρος',
  'τις',
  'τις',
  'πᾶς',
  'ὑπάρχω',
  'ἀδιόριστος',
  'ὑπάρχω',
  'ὑπάρχω',
  'μέρος',
  'ἐναντίος',
  'εἰμί',
  'αὐτός',
  'ἐπιστήμη',
  'ἡδονή',
  'εἰμί',
  'ἀγαθός']]

In [36]:
dct = Dictionary([vocabulary])

In [37]:
test_doc = [doc for doc in corpus][0]
print(test_doc)

['πρῶτος', 'λέγω', 'τίς', 'τίς', 'εἰμί', 'σκέψις', 'ἀπόδειξις', 'ἐπιστήμη', 'ἀποδεικτικός']


In [38]:
dct.doc2bow(test_doc)

[(992, 1), (1663, 1), (1742, 1), (2530, 1), (2948, 1)]

In [39]:
class NgramCorpus(object):
    def __init__(self, ids_list, ids_lines, fname, bow=False, dct=None):
        self.ids_list = ids_list
        self.fname = fname
        self.bow  = bow
        self.dct = dct
        self.ids_lines = ids_lines
        self.len = 0
    def __len__(self):
        [el for el in self]
        return self.len
    def __iter__(self):
        with open(self.fname, "r") as f:
            lines = f.readlines()
            for id in self.ids_list:
                for line in lines[self.ids_lines[id][0] : self.ids_lines[id][1]]:
                    if bool(line.split()):
                        self.len += 1
                        if (self.bow) & (self.dct != None):
                            yield self.dct.doc2bow(line.split())
                        else:
                            yield line.split()

In [40]:
corpus = NgramCorpus(c_aristotelicum_ids[:4], ids_lines, "../data/large_data/corpus_ngrams_bydocid.txt", bow=True, dct=dct)

In [41]:
len(corpus)

91966

In [42]:
[doc for doc in corpus][0]

[(992, 1), (1663, 1), (1742, 1), (2530, 1), (2948, 1)]