In [33]:
import numpy as np
import pandas as pd
import requests
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from cltk.stops.grc import STOPS as stopwords
import unicodedata

In [2]:
# Libraries for communicating with Google Sheets
import gspread
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe
from google.oauth2 import service_account # based on google-auth library
import sddk

# Open an sddk cloud session for sciencedata.dk.
s = sddk.cloudSession("sciencedata.dk")
# Establish the connection with Google Sheets.
# The service-account key is read through the sciencedata.dk session as a dict.
file_data = s.read_file("https://sciencedata.dk/files/ServiceAccountsKey.json", "dict") # alternatively, load it from local storage: json.load(open("../../ServiceAccountsKey.json", "r"))
# Build Google credentials from the key and scope them to the Sheets feed and Drive.
credentials = service_account.Credentials.from_service_account_info(file_data)
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
# Open the PIPA spreadsheet by URL (NOTE(review): PIPA_data is not used in the cells visible here).
PIPA_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1rV4t0_UV_wcx--UAHVwkqB8Wa_5n9mnpV05yGG1OHqk/edit?usp=sharing")

endpoint variable has been configured to: https://sciencedata.dk/files/


# Load the main dataset of ancient Greek texts

In [3]:
# Read the corpus table from JSON (path is relative to the notebook's working directory).
cgl = pd.read_json("../data/large_data/cgl.json")
# Preview the first rows.
cgl.head()

Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,...,count_λυπέω,count_λυπηρός,count_λύπη,count_ἄλγος,count_ἄλγημα,count_ἀλγέω,count_ὀδύνη,count_ὀδυνάω,count_πονέω,count_πόνος
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War,150118,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,10,11,4,0,0,2,0,0,10,21
6,tlg0006.tlg001.perseus-grc2.xml,Euripides,Cyclops,4141,tlg0006,tlg0006.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,1,0,0,0,0,0,0,0,0,7
7,tlg0006.tlg004.perseus-grc2.xml,Euripides,Ἡρακλεῖδαι,6272,tlg0006,tlg0006.tlg004,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,0,0,2,0,0,0,0,0,1,10
8,tlg0006.tlg005.perseus-grc2.xml,Euripides,Ἱππόλυτος,8257,tlg0006,tlg0006.tlg005,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,0,1,5,4,0,0,3,1,1,7
9,tlg0006.tlg006.perseus-grc2.xml,Euripides,Ἀνδρομάχη,7397,tlg0006,tlg0006.tlg006,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,0,0,4,0,0,5,0,0,1,4


In [4]:
# Pool the sentences of every work into one flat list (one entry per sentence).
sents = [
    sentence
    for work_sentences in cgl["lemmatized_sentences_repl"]
    for sentence in work_sentences
]

In [5]:
# Number of sentences pooled from all works.
len(sents)

243579

In [52]:
min_freq = 50

def get_vocab(docs, min_freq=min_freq):
    """Count word frequencies over tokenized documents and derive a vocabulary.

    Parameters
    ----------
    docs : iterable of iterables of str
        Tokenized documents (here: sentences given as lists of words).
    min_freq : int
        Minimum number of occurrences a word needs to enter the vocabulary.
        The default is the module-level ``min_freq`` as it was when this
        function was defined.

    Returns
    -------
    tuple
        ``(word_freq_tups, words_flat, vocabulary)``: all ``(word, count)``
        pairs as returned by ``most_common()``, the flat list of every
        token in ``docs``, and the words with ``count >= min_freq`` in the
        same order as ``word_freq_tups``.
    """
    # Flatten the documents into a single token list.
    words_flat = []
    for doc in docs:
        words_flat.extend(doc)

    word_freq_tups = nltk.FreqDist(words_flat).most_common()

    # Keep only the words that are frequent enough.
    vocabulary = [word for word, count in word_freq_tups if count >= min_freq]
    return word_freq_tups, words_flat, vocabulary

In [53]:
# Build the frequency table, the flat token list and the min_freq-filtered vocabulary from all sentences.
word_freqs, words, vocabulary = get_vocab(sents)

In [54]:
# Ten most frequent words with their counts (stopwords not yet removed).
word_freqs[:10]

[('εἰμί', 79567),
 ('οὗτος', 60113),
 ('αὐτός', 42056),
 ('λέγω', 29114),
 ('τις', 27416),
 ('ἔχω', 22584),
 ('γίγνομαι', 21737),
 ('πολύς', 20652),
 ('πᾶς', 19062),
 ('ἄλλος', 18361)]

In [40]:
# Total number of tokens across all sentences.
len(words)

1828293

In [41]:
# Number of distinct words occurring at least min_freq times.
len(vocabulary)

3494

In [42]:
# Peek at the first 20 vocabulary entries (the vocabulary is ordered by descending frequency).
print(vocabulary[:20])

['εἰμί', 'οὗτος', 'αὐτός', 'λέγω', 'τις', 'ἔχω', 'γίγνομαι', 'πολύς', 'πᾶς', 'ἄλλος', 'ποιέω', 'φημί', 'οὐδείς', 'τίς', 'δέω', 'λόγος', 'τοιοῦτος', 'πόλις', 'ἀγαθός', 'δοκέω']


In [43]:
def normalize_encoding(string):
    """Return ``string`` converted to Unicode normalization form NFC.

    NFC composes base characters and combining marks into their canonical
    precomposed code points, so visually identical strings that were
    encoded differently compare equal afterwards.
    """
    nfc_string = unicodedata.normalize("NFC", string)
    return nfc_string

In [44]:
# Extra high-frequency lemmata to treat as stopwords on top of the imported list.
# NOTE(review): "αὐτός" is listed twice, exactly as in the original cell; the two
# entries may differ in Unicode encoding, so both are kept — confirm.
extra_stops = ["εἰμί", "οὗτος", "αὐτός", "αὐτός", "τις", "τίς"]

# `stopwords` is rebuilt from itself, so this cell can run more than once.
# dict.fromkeys() removes duplicates while keeping first-seen order, which makes
# a re-run produce the same list instead of appending the extras again.
stopwords = list(dict.fromkeys([normalize_encoding(w) for w in stopwords] + extra_stops))

In [61]:
# Drop every stopword from the vocabulary, keeping the frequency ordering.
vocabulary = [lemma for lemma in vocabulary if lemma not in stopwords]
# Show the ten most frequent remaining entries.
vocabulary[:10]

['λέγω',
 'ἔχω',
 'γίγνομαι',
 'πολύς',
 'πᾶς',
 'ποιέω',
 'φημί',
 'δέω',
 'λόγος',
 'πόλις']

In [62]:
# Vocabulary size after removing stopwords.
len(vocabulary)

3480

In [48]:
# Contiguous bigrams and trigrams, built sentence by sentence so that
# no n-gram crosses a sentence boundary. Each n-gram is stored as a list.
sents_bigrams = [list(pair) for sent in sents for pair in nltk.bigrams(sent)]

sents_trigrams = [list(triple) for sent in sents for triple in nltk.trigrams(sent)]

In [49]:
# One combined list: the full sentences first, then all bigrams, then all trigrams.
ngrams_data = [*sents, *sents_bigrams, *sents_trigrams]
len(ngrams_data)

3232119

In [50]:
# Write one entry per line, tokens separated by single spaces (no trailing newline).
# The joined text is a single string, so it is written with write(): writelines()
# expects an iterable of strings and would iterate over the text character by character.
with open("../data/large_data/corpus_ngrams.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(" ".join(ngram) for ngram in ngrams_data))

In [63]:
# Persist the vocabulary; the context manager guarantees the file is flushed and closed.
with open("../data/vocabulary.pickle", "wb") as f:
    pickle.dump(vocabulary, f)

In [59]:
# Persist the full (word, count) table; the context manager guarantees the file is flushed and closed.
with open("../data/word_freqs_all.pickle", "wb") as f:
    pickle.dump(word_freqs, f)

In [66]:
# Frequency table restricted to the vocabulary words.
# Membership is tested against a set: looking each word up in the `vocabulary`
# list would rescan the list for every entry of `word_freqs`.
vocabulary_set = set(vocabulary)
word_freqs_vocabulary = pd.DataFrame(
    [tup for tup in word_freqs if tup[0] in vocabulary_set],
    columns=["word", "count"],
)

In [68]:
# Save the vocabulary frequency table as CSV, without the DataFrame index column.
word_freqs_vocabulary.to_csv("../data/word_freqs_vocabulary.csv", index=False)