In [129]:
import numpy as np
import pandas as pd
import requests
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from cltk.stops.grc import STOPS as stopwords
from gensim.corpora import Dictionary
import unicodedata

In [2]:
# to communicate with google spreadsheet...
import gspread
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe
from google.oauth2 import service_account # based on google-auth library
import sddk

s = sddk.cloudSession("sciencedata.dk")
# establish connection with gogglesheets...
file_data = s.read_file("https://sciencedata.dk/files/ServiceAccountsKey.json", "dict") # or load it from a local storage: json.load(open("../../ServiceAccountsKey.json", "r"))
credentials = service_account.Credentials.from_service_account_info(file_data)
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
PIPA_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1rV4t0_UV_wcx--UAHVwkqB8Wa_5n9mnpV05yGG1OHqk/edit?usp=sharing")

endpoint variable has been configured to: https://sciencedata.dk/files/


# Load the main dataset of ancient Greek texts

In [80]:
cgl = pd.read_json("../data/large_data/cgl.json")
cgl.head()

Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,...,count_λυπέω,count_λυπηρός,count_λύπη,count_ἄλγος,count_ἄλγημα,count_ἀλγέω,count_ὀδύνη,count_ὀδυνάω,count_πονέω,count_πόνος
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War,150118,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,10,11,4,0,0,2,0,0,10,21
6,tlg0006.tlg001.perseus-grc2.xml,Euripides,Cyclops,4141,tlg0006,tlg0006.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,1,0,0,0,0,0,0,0,0,7
7,tlg0006.tlg004.perseus-grc2.xml,Euripides,Ἡρακλεῖδαι,6272,tlg0006,tlg0006.tlg004,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,0,0,2,0,0,0,0,0,1,10
8,tlg0006.tlg005.perseus-grc2.xml,Euripides,Ἱππόλυτος,8257,tlg0006,tlg0006.tlg005,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,0,1,5,4,0,0,3,1,1,7
9,tlg0006.tlg006.perseus-grc2.xml,Euripides,Ἀνδρομάχη,7397,tlg0006,tlg0006.tlg006,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,0,0,4,0,0,5,0,0,1,4


In [81]:
sents = [sen for work in cgl["lemmatized_sentences_repl"] for sen in work]

In [82]:
len(sents)

243579

In [83]:
min_freq = 50

def get_vocab(docs, min_freq=min_freq):
    words_flat = [item for sublist in docs for item in sublist]
    word_freq_tups = nltk.FreqDist(words_flat).most_common()
    vocabulary = [tup[0] for tup in word_freq_tups if tup[1] >= min_freq]
    #vocab_freqs = [len([doc for doc in docs if word in doc]) for word in vocabulary]
    return word_freq_tups, words_flat, vocabulary # , vocab_freqs

In [84]:
word_freqs, words, vocabulary = get_vocab(sents)

In [85]:
word_freqs[:10]

[('εἰμί', 79567),
 ('οὗτος', 60113),
 ('αὐτός', 42056),
 ('λέγω', 29114),
 ('τις', 27416),
 ('ἔχω', 22584),
 ('γίγνομαι', 21737),
 ('πολύς', 20652),
 ('πᾶς', 19062),
 ('ἄλλος', 18361)]

In [86]:
len(words)

1828293

In [87]:
len(vocabulary)

3494

In [88]:
print(vocabulary[:20])

['εἰμί', 'οὗτος', 'αὐτός', 'λέγω', 'τις', 'ἔχω', 'γίγνομαι', 'πολύς', 'πᾶς', 'ἄλλος', 'ποιέω', 'φημί', 'οὐδείς', 'τίς', 'δέω', 'λόγος', 'τοιοῦτος', 'πόλις', 'ἀγαθός', 'δοκέω']


In [89]:
def normalize_encoding(string):
    return unicodedata.normalize("NFC", string)

In [90]:
stopwords = [normalize_encoding(w) for w in stopwords] + ["εἰμί", "οὗτος", "αὐτός", "αὐτός", "τις", "τίς"]

In [91]:
vocabulary = [w for w in vocabulary if w not in stopwords]
vocabulary[:10]

['λέγω',
 'ἔχω',
 'γίγνομαι',
 'πολύς',
 'πᾶς',
 'ποιέω',
 'φημί',
 'δέω',
 'λόγος',
 'πόλις']

In [92]:
len(vocabulary)

3480

In [93]:
# continuous bigrams & trigrams, within sentences only
sents_bigrams = [list(el) for sublist in [[ng for ng in nltk.bigrams(sent)] for sent in sents] for el in sublist]

sents_trigrams = [list(el) for sublist in [[ng for ng in nltk.trigrams(sent)] for sent in sents] for el in sublist]

In [94]:
ngrams_data = sents + sents_bigrams + sents_trigrams
len(ngrams_data)

3232119

In [95]:
author_ids = list(set(cgl["author_id"]))
author_ids[:5]

['tlg0030', 'tlg0088', 'tlg0535', 'tlg0591', 'tlg0017']

In [50]:
with open("../data/large_data/corpus_ngrams.txt", "w", encoding="utf-8") as f:

    f.writelines("\n".join([" ".join(ngram) for ngram in ngrams_data]))

In [63]:
pickle.dump(vocabulary, open("../data/vocabulary.pickle", "wb"))

In [59]:
pickle.dump(word_freqs, open("../data/word_freqs_all.pickle", "wb"))

In [66]:
word_freqs_vocabulary = pd.DataFrame([tup for tup in word_freqs if tup[0] in vocabulary], columns=["word", "count"])

In [68]:
word_freqs_vocabulary.to_csv("../data/word_freqs_vocabulary.csv", index=False)

# Ngrams by doc ids

In [97]:
doc_ids = list(set(cgl["doc_id"]))
len(doc_ids)

371

In [100]:
line = 0
ids_lines = {}

f = open("../data/large_data/corpus_ngrams_bydocid.txt", "w", encoding="utf-8")
for doc_id in doc_ids:
    cgl_subset = cgl[cgl["doc_id"]==doc_id]
    sents = [sen for work in cgl_subset["lemmatized_sentences_repl"] for sen in work]
    sents_bigrams = [list(el) for sublist in [[ng for ng in nltk.bigrams(sent)] for sent in sents] for el in sublist]
    sents_trigrams = [list(el) for sublist in [[ng for ng in nltk.trigrams(sent)] for sent in sents] for el in sublist]
    ngrams_data = sents + sents_bigrams + sents_trigrams
    ngrams_data = [" ".join(ngram) for ngram in ngrams_data]
    f.writelines("\n".join(ngrams_data)+"\n")
    if bool(ngrams_data):
        ids_lines[doc_id] = (line, line+len(ngrams_data))
        line += len(ngrams_data)
    else:
        ids_lines[doc_id] = (line, line+1)
        line += 1

In [101]:
ids_lines

{'tlg0535.tlg001': (0, 1875),
 'tlg0010.tlg029': (1875, 2627),
 'tlg0030.tlg001': (2627, 3915),
 'tlg0086.tlg042': (3915, 6500),
 'tlg0627.tlg008': (6500, 8188),
 'tlg0540.tlg025': (8188, 10146),
 'tlg0019.tlg011': (10146, 16941),
 'tlg0086.tlg038': (16941, 52767),
 'tlg0014.tlg031': (52767, 53692),
 'tlg0006.tlg001': (53692, 57846),
 'tlg0003.tlg001': (57846, 201269),
 'tlg0627.tlg046': (201269, 202606),
 'tlg0064.tlg001': (202606, 203310),
 'tlg0014.tlg013': (203310, 205534),
 'tlg0540.tlg013': (205534, 210253),
 'tlg0006.tlg004': (210253, 216090),
 'tlg0014.tlg001': (216090, 217824),
 'tlg0027.tlg001': (217824, 227155),
 'tlg0627.tlg022': (227155, 228566),
 'tlg0059.tlg035': (228566, 234866),
 'tlg0627.tlg001': (234866, 239997),
 'tlg0014.tlg022': (239997, 245344),
 'tlg0540.tlg034': (245344, 245877),
 'tlg0086.tlg025': (245877, 312936),
 'tlg0540.tlg002': (312936, 317018),
 'tlg0026.tlg004': (317018, 320763),
 'tlg0086.tlg013': (320763, 334768),
 'tlg0014.tlg004': (334768, 337883),

In [143]:
pickle.dump(ids_lines, open("../data/ids_lines.pickle", "wb"))

# Testing that the indexing works

In [104]:
with open("../data/large_data/corpus_ngrams_bydocid.txt", "r") as f:
    lines = f.readlines()
    doc_vals = ids_lines["tlg0086.tlg014"]
    for line in lines[doc_vals[0]:doc_vals[1]][:10]:
        if bool(line.split()):
            print(line.split())

['ζῷον', 'μόριον', 'εἰμί', 'ἀσύνθετος', 'διαιρέω', 'ὁμοιομερής', 'σάρξ', 'σάρξ', 'σύνθετος', 'ἀνομοιομερής', 'χείρ', 'χείρ', 'διαιρέω', 'πρόσωπον', 'πρόσωπον']
['τοιοῦτος', 'ἔνιοι', 'μέρος', 'μέλος', 'καλέω']
['τοιοῦτος', 'εἰμί', 'μέρος', 'ὅλος', 'εἰμί', 'ἕτερος', 'μέρος', 'ἔχω', 'κεφαλή', 'σκέλος', 'χείρ', 'ὅλος', 'βραχίων', 'θώραξ']
['οὗτος', 'αὐτός', 'εἰμί', 'μέρος', 'ὅλος', 'εἰμί', 'αὐτός', 'ἕτερος', 'μόριον']
['πᾶς', 'ἀνομοιομερής', 'σύγκειμαι', 'ὁμοιομερής', 'χείρ', 'σάρξ', 'νεῦρον', 'ὀστέον']
['ἔχω', 'ζῷον', 'ἔνιοι', 'πᾶς', 'μόριος', 'αὐτός', 'ἔνιοι', 'ἕτερος']
['αὐτός', 'εἶδος', 'μόριον', 'εἰμί', 'ἄνθρωπος', 'ῥίς', 'ὀφθαλμός', 'ἄνθρωπος', 'ῥίς', 'ὀφθαλμός', 'σάρξ', 'σάρξ', 'ὀστέον', 'ὀστέον']
['αὐτός', 'τρόπος', 'ἵππος', 'ἄλλος', 'ζῷον', 'εἶδος', 'αὐτός', 'λέγω']
['ὅλος', 'ἔχω', 'ὅλος', 'μόριος', 'ἔχω', 'ἕκαστος', 'ἕκαστος']
['αὐτός', 'εἰμί', 'διαφέρω', 'ὑπεροχή', 'ἔλλειψις', 'γένος', 'εἰμί', 'αὐτός']


In [106]:
cgl[cgl["doc_id"]=="tlg0086.tlg014"]["lemmatized_sentences"]

694    [[ζῷον, μόριον, εἰμί, ἀσύνθετος, διαιρέω, ὁμοι...
Name: lemmatized_sentences, dtype: object

In [107]:
c_aristotelicum_ids = cgl[cgl["subcorpus"]=="c_aristotelicum"]["doc_id"].tolist()
c_aristotelicum_ids[:5]

['tlg0086.tlg001',
 'tlg0086.tlg002',
 'tlg0086.tlg005',
 'tlg0086.tlg006',
 'tlg0086.tlg008']

In [108]:
ids_list = c_aristotelicum_ids[:5]

In [109]:
class NgramCorpus:
    def __iter__(self):
        with open("../data/large_data/corpus_ngrams_bydocid.txt", "r") as f:
            lines = f.readlines()
            for phi_id in ids_list:
                for line in lines[ids_lines[phi_id][0] : ids_lines[phi_id][1]]:
                    if bool(line.split()):
                        yield line.split()

In [110]:
corpus = NgramCorpus()
[doc for doc in corpus][:5]

[['πρῶτος',
  'λέγω',
  'τίς',
  'τίς',
  'εἰμί',
  'σκέψις',
  'ἀπόδειξις',
  'ἐπιστήμη',
  'ἀποδεικτικός'],
 ['διορίζω',
  'τίς',
  'εἰμί',
  'πρότασις',
  'τίς',
  'ὅρος',
  'τίς',
  'συλλογισμός',
  'ποῖος',
  'τέλειος',
  'ποῖος',
  'ἀτελής',
  'οὗτος',
  'τίς',
  'ὅλος',
  'εἰμί',
  'εἰμί',
  'ὅδε',
  'ὅδε',
  'τίς',
  'λέγω',
  'πᾶς',
  'μηδείς',
  'κατηγορέω'],
 ['πρότασις', 'εἰμί', 'λόγος', 'καταφατικός', 'ἀποφατικός', 'τις', 'τις'],
 ['οὗτος', 'μέρος', 'ἀδιόριστος'],
 ['λέγω',
  'πᾶς',
  'μηδείς',
  'ὑπάρχω',
  'μέρος',
  'τις',
  'τις',
  'πᾶς',
  'ὑπάρχω',
  'ἀδιόριστος',
  'ὑπάρχω',
  'ὑπάρχω',
  'μέρος',
  'ἐναντίος',
  'εἰμί',
  'αὐτός',
  'ἐπιστήμη',
  'ἡδονή',
  'εἰμί',
  'ἀγαθός']]

In [130]:
dct = Dictionary([vocabulary])

In [134]:
test_doc = [doc for doc in corpus][0]
print(test_doc)

['πρῶτος', 'λέγω', 'τίς', 'τίς', 'εἰμί', 'σκέψις', 'ἀπόδειξις', 'ἐπιστήμη', 'ἀποδεικτικός']


In [135]:
dct.doc2bow(test_doc)

[(1008, 1), (1680, 1), (1759, 1), (2547, 1), (2965, 1)]

In [163]:
class NgramCorpus(object):
    def __init__(self, ids_list, ids_lines, fname, bow=False, dct=None):
        self.ids_list = ids_list
        self.fname = fname
        self.bow  = bow
        self.dct = dct
        self.ids_lines = ids_lines
        self.len = 0
    def __len__(self):
        [el for el in self]
        return self.len
    def __iter__(self):
        with open(self.fname, "r") as f:
            lines = f.readlines()
            for id in self.ids_list:
                for line in lines[self.ids_lines[id][0] : self.ids_lines[id][1]]:
                    if bool(line.split()):
                        self.len += 1
                        if (self.bow) & (self.dct != None):
                            yield self.dct.doc2bow(line.split())
                        else:
                            yield line.split()

In [164]:
corpus = NgramCorpus(c_aristotelicum_ids[:4], ids_lines, "../data/large_data/corpus_ngrams_bydocid.txt", bow=True, dct=dct)

In [165]:
len(corpus)

99529

In [166]:
[doc for doc in corpus][0]

[(1008, 1), (1680, 1), (1759, 1), (2547, 1), (2965, 1)]