In [40]:
import numpy as np
import pandas as pd
import requests
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [23]:
# to communicate with google spreadsheet...
import gspread
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe
from google.oauth2 import service_account # based on google-auth library
import sddk

s = sddk.cloudSession("sciencedata.dk")
# establish connection with gogglesheets...
file_data = s.read_file("https://sciencedata.dk/files/ServiceAccountsKey.json", "dict") # or load it from a local storage: json.load(open("../../ServiceAccountsKey.json", "r"))
credentials = service_account.Credentials.from_service_account_info(file_data)
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
PIPA_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1rV4t0_UV_wcx--UAHVwkqB8Wa_5n9mnpV05yGG1OHqk/edit?usp=sharing")

endpoint variable has been configured to: https://sciencedata.dk/files/


# Load the main dataset of ancient Greek texts

In [2]:
cgl = pd.read_json("../data/large_data/cgl.json")
cgl.head()

Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,...,lemmata,lemmata_wordcount,subcorpus,lemmata_repl,lemmatized_sentences_repl,count_πόνο*,count_ὀδύν*,count_ἄλγ*,count_λύπ*,conc_lype
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War,150118,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελο...",71863,,"[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελο...","[[θουκυδίδης, Ἀθηναῖος, συγγράφω, πόλεμος, Πελ...",31,0,6,25,"[[μέγας, κινδυνεύοντας, δέχομαι, ἀείμνηστος, μ..."
6,tlg0006.tlg001.perseus-grc2.xml,Euripides,Cyclops,4141,tlg0006,tlg0006.tlg001,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[Βρόμιος, ἔχω, πόνος, χὥτʼ, ἥβη, ἐμός, εὐσθενέ...",2535,,"[Βρόμιος, ἔχω, πόνο*, χὥτʼ, ἥβη, ἐμός, εὐσθενέ...","[[Βρόμιος, ἔχω, πόνο*, χὥτʼ, ἥβη, ἐμός, εὐσθεν...",7,0,0,1,"[[βοτόν, οὔτις, θύω, θεός, μέγας, γαστήρ, δαίμ..."
7,tlg0006.tlg004.perseus-grc2.xml,Euripides,Ἡρακλεῖδαι,6272,tlg0006,tlg0006.tlg004,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[ποτός, εἰμί, οὗτος, δεδογμένον, δίκαιος, φύω,...",3545,,"[ποτός, εἰμί, οὗτος, δεδογμένον, δίκαιος, φύω,...","[[ποτός, εἰμί, οὗτος, δεδογμένον], [δίκαιος, φ...",11,0,1,2,"[[εἰμί, πολύς, χαίρω, δυσφημέω, ἅζομαι, θέα, σ..."
8,tlg0006.tlg005.perseus-grc2.xml,Euripides,Ἱππόλυτος,8257,tlg0006,tlg0006.tlg005,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[πολύς, βροτός, ἀνώνυμος, θεά, καλέω, Κύπρις, ...",4898,,"[πολύς, βροτός, ἀνώνυμος, θεά, καλέω, Κύπρις, ...","[[πολύς, βροτός, ἀνώνυμος, θεά, καλέω, Κύπρις,...",8,3,8,8,"[[κοιτάζω, λέχος, σός, ναυβάτης, τὶς, πλέω, κρ..."
9,tlg0006.tlg006.perseus-grc2.xml,Euripides,Ἀνδρομάχη,7397,tlg0006,tlg0006.tlg006,5 B.C.,-4.5,{'-4.5': 1},-4.5,...,"[Ἀσιανός, γῆ, σχῆμα, θηβαία, πόλις, ἑδνόω, πολ...",4420,,"[Ἀσιανός, γῆ, σχῆμα, θηβαία, πόλις, ἑδνόω, πολ...","[[Ἀσιανός, γῆ, σχῆμα, θηβαία, πόλις, ἑδνόω, πο...",5,0,7,4,"[[Τροία, πράσσω, μηδείς, ὅδε, αὐχέω, πράσσω, δ..."


In [3]:
sents = [sen for work in cgl["lemmatized_sentences_repl"] for sen in work]

In [5]:
len(sents)

256084

In [6]:
min_freq = 5

def get_vocab(docs, min_freq=5):
    words_flat = [item for sublist in docs for item in sublist]
    word_freq_tups = nltk.FreqDist(words_flat).most_common()
    vocabulary = [tup[0] for tup in word_freq_tups if tup[1] >= min_freq]
    #vocab_freqs = [len([doc for doc in docs if word in doc]) for word in vocabulary]
    return words_flat, vocabulary # , vocab_freqs

In [10]:
words, vocabulary = get_vocab(sents)

In [14]:
len(words)

1680907

In [13]:
len(vocabaulary)

17703

In [15]:
# continuous bigrams & trigrams, crossing sentence divisions
bigrams = [list(ngram) for ngram in nltk.bigrams(words)]
trigrams = [list(ngram) for ngram in nltk.trigrams(words)]

In [17]:
# continuous bigrams & trigrams, within sentences only
sents_bigrams = [list(el) for sublist in [[ng for ng in nltk.bigrams(sent)] for sent in sents] for el in sublist]

sents_trigrams = [list(el) for sublist in [[ng for ng in nltk.trigrams(sent)] for sent in sents] for el in sublist]

In [38]:
def get_cooc(docs, vocabulary=None, min_freq=5):
    if vocabulary==None:
        lemmata, vocabulary = get_vocab(docs, min_freq)
    vec_bow = CountVectorizer(vocabulary=vocabulary, lowercase=False)
    bow = vec_bow.fit_transform([" ".join(list(set(sentence))) for sentence in docs])
    cooc  = bow.T * bow
    cooc = cooc / len(docs)
    return cooc, vocabulary

def normalize_ppmi3_matrix(pmi_matrix_df):
    minval, maxval = pmi_matrix_df.min().min(), pmi_matrix_df.max().max()
    diff = abs(maxval - minval)
    minval_doubled = minval - diff
    pmi_matrix_df.fillna(minval_doubled, inplace=True)
    pmi_matrix_norm_df = (pmi_matrix_df - minval_doubled) / (maxval - minval_doubled)
    return pmi_matrix_norm_df

def get_ppmi_df(cooc, vocabulary, normalize=True, exp=2):
    pmi_rows_list = []
    for i in range(cooc.shape[1]):
        ab = np.array([row_el for row_el in list(cooc[i].toarray()[0])], dtype=float)
        ab_exp = np.power(ab, exp)
        axb = np.array([cooc[row_el[0], row_el[0]] * cooc[i, i] for row_el in enumerate(list(cooc[i].toarray()[0]))], dtype=float)
        pmi_row = np.divide(ab_exp, axb, out=np.zeros_like(ab_exp), where=axb!=0)
        pmi_row = [np.log(n) if n>0 else None for n in pmi_row]
        pmi_rows_list.append(pmi_row)
    pmi_matrix_df = pd.DataFrame(pmi_rows_list, columns=vocabulary, index=vocabulary)
    if normalize == True:
        pmi_matrix_df = normalize_ppmi3_matrix(pmi_matrix_df)
        np.fill_diagonal(pmi_matrix_df.to_numpy(), 1)
    return pmi_matrix_df

def svd_reduction(cooc_matrix, n_components=150, random_state=1, n_iter=10):
    svd = TruncatedSVD(n_components=n_components, random_state=random_state, n_iter=n_iter)
    svd_matrix = svd.fit_transform(cooc_matrix)
    return svd_matrix

def from_docs_to_embeddings(docs, vocabulary=None, min_freq=5):
    cooc, vocabulary = get_cooc(docs, vocabulary=vocabulary, min_freq=min_freq)
    pmi_matrix = get_ppmi_df(cooc, vocabulary)
    word_vectors_array = svd_reduction(pmi_matrix, n_components=150, random_state=1, n_iter=10)
    word_vectors_df = pd.DataFrame(word_vectors_array, index=vocabulary)
    pmi_svd_cos = pd.DataFrame(cosine_similarity(word_vectors_array), columns=vocabulary, index=vocabulary)
    return [cooc, vocabulary, pmi_matrix, word_vectors_df, pmi_svd_cos]

In [39]:
%%time
data = from_docs_to_embeddings(sents + sents_bigrams + sents_trigrams , vocabulary=vocabulary[:5000])

CPU times: user 8min 36s, sys: 9.43 s, total: 8min 46s
Wall time: 8min 8s


In [47]:
data[0]

<5000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 2862754 stored elements in Compressed Sparse Column format>

In [48]:
data[3]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
εἰμί,40.921943,23.975110,6.083791,1.724597,0.787708,1.395476,10.085121,-3.512090,-2.466157,-0.479203,...,0.135525,0.116194,0.038779,0.028518,0.048852,0.063003,-0.006917,-0.285075,0.083432,0.050213
οὗτος,40.370998,23.444483,5.179671,1.611451,0.341264,1.735556,9.830402,-3.353345,-2.345492,-0.181414,...,0.139695,0.078060,0.127660,0.069392,0.060385,0.249957,0.070054,-0.194425,-0.004014,0.040241
λέγω,39.001855,22.120811,3.711950,0.918392,1.452316,0.077037,9.391213,-2.759150,-2.398183,0.214704,...,-0.031307,0.068023,0.000057,0.069156,0.111072,0.038494,0.002585,-0.019338,0.269553,-0.130885
ἔχω,39.804302,23.024422,6.199601,2.275942,0.625015,1.243543,9.147309,-3.485462,-2.581793,-0.275819,...,0.024101,0.063533,0.134114,0.179058,0.043486,0.113336,-0.036198,-0.342098,0.209940,-0.073624
γίγνομαι,39.347879,22.489298,5.427730,1.608740,-0.299176,1.453300,8.521668,-3.282159,-2.755167,-0.361946,...,0.017706,0.150147,-0.100498,0.024932,-0.241511,0.180704,-0.044182,-0.225451,-0.009623,0.082624
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
κύαθος,2.306894,-1.252014,1.967848,0.639823,-0.830738,1.400087,0.623265,-0.022984,1.364765,-1.015133,...,0.029445,0.035054,-0.064084,-0.022086,-0.026919,-0.029758,-0.059165,0.081717,0.070341,0.065586
σκυτεύς,2.426349,-1.921372,0.401636,-0.442950,-0.050278,0.079601,1.183014,-0.539222,0.169868,0.727299,...,-0.379041,0.069706,0.277146,0.071542,-0.010878,0.112946,-0.114023,0.315502,-0.062303,-0.036280
παραμίγνυμι,2.460930,-1.073599,2.207248,0.563545,-0.814028,1.938490,0.646300,-0.572682,2.623897,-1.114314,...,-0.005971,-0.170639,-0.140238,0.136425,0.311922,0.270307,-0.047148,-0.198601,-0.085457,0.311602
ὀφθαλμία,1.968229,-1.475017,1.460325,-0.047004,-0.329035,0.401820,0.145133,-0.578576,-0.902634,0.095910,...,-0.229633,-0.162759,-0.336879,-0.035314,0.135483,-0.145982,0.128570,-0.032265,-0.125454,-0.178760


In [45]:
pickle.dump(data, open("../data/large_data/cgl_embeddings.pkl", "wb"), pickle.HIGHEST_PROTOCOL)