In [228]:
import math
import pickle
from collections import Counter

import numpy as np
import pandas as pd
import nltk

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.patches as patches
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy import spatial

from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora
import sddk

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

In [229]:
# for exporting data to Google Sheets / sciencedata.dk
# (feel free to skip; nothing in the embedding pipeline below reads `s`, `gc` or `paul_results`)

# open a sciencedata.dk session through the sddk client
s = sddk.cloudSession("sciencedata.dk")
# establish connection with Google Sheets...
# the service-account key is fetched from sciencedata.dk as a dict instead of being stored in the notebook
# NOTE(review): the local-file alternative mentioned on the next line needs `import json`
file_data = s.read_file("https://sciencedata.dk/files/ServiceAccountsKey.json", "dict") # or load it from a local storage: json.load(open("../../ServiceAccountsKey.json", "r"))
credentials = service_account.Credentials.from_service_account_info(file_data)
# authorise a gspread client with the spreadsheet-feed and Drive scopes
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
# spreadsheet handle used for exporting results
paul_results = gc.open_by_url("https://docs.google.com/spreadsheets/d/1h4M-gK9TPIfeTV528tUuPBfZF1wtcNCA10yIlJYqGTE/edit?usp=sharing")

endpoint variable has been configured to: https://sciencedata.dk/files/


In [230]:
# load the corpus table (one row per work) from a local JSON file
LAGTec = pd.read_json("../data/large_files/LAGTec.json")

In [231]:
# number of distinct values in the "author" column
len(LAGTec["author"].unique().tolist())

49

In [232]:
# number of rows in the corpus table
len(LAGTec)

148

In [233]:
# total of the "wordcount" column over all rows
LAGTec["wordcount"].sum()

3566823

In [234]:
# total number of tokens kept in "lemmatized_sentences" (every token of every sentence of every row)
LAGTec["lemmatized_sentences"].apply(lambda x: len([w for s in x for w in s])).sum()

1498327

In [235]:
# display the corpus table (pandas truncates the rendering)
LAGTec

Unnamed: 0,filename,author,title,wordcount,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,tlg_epithet,clean_string,n_sentences,lemmatized_sentences,sentences_paul,sentences_paul_N,sentences_apostle,paul_N,sentences_apostle_N
385,tlg0031.tlg001.perseus-grc2.xml,Gospel of Matthew,Gospel of Matthew,18288,tlg0031a,tlg0031.tlg001,A.D. 1,0.5,{'0.5': 1},0.5,christian,[],ΒΙΒΛΟΣ γενέσεως Ἰησοῦ Χριστοῦ υἱοῦ Δαυεὶδ υἱο...,1276,"[[βίβλος, γένεσις, Ἰησοῦς, Χριστός, υἱός, Δαυί...",[],0,"[[ἀπόστολος, ὄνομα, εἰμί]]",0,1
386,tlg0031.tlg002.perseus-grc2.xml,Gospel of Mark,Gospel of Mark,11274,tlg0031b,tlg0031.tlg002,A.D. 1,0.5,{'0.5': 1},0.5,christian,[],ΑΡΧΗ τοῦ εὐαγγελίου Ἰησοῦ Χριστοῦ . Καθὼς γέγ...,790,"[[ἀρχή, εὐαγγέλιον, Ἰησοῦς, Χριστός], [γράφω, ...",[],0,"[[ποιέω, ἀπόστολος, ὀνομάζω, ἀποστέλλω, κηρύσσ...",0,2
387,tlg0031.tlg003.perseus-grc2.xml,Luke-Acts,Gospel of Luke,19458,tlg0031luke,tlg0031.tlg003,A.D. 1,0.5,{'0.5': 1},0.5,christian,[],ΕΠΕΙΔΗΠΕΡ ΠΟΛΛΟΙ ἐπεχείρησαν ἀνατάξασθαι διήγ...,1274,"[[πολύς, ἐπιχειρέω, ἀνατάσσομαι, διήγησις, πλη...",[],0,"[[γίγνομαι, ἡμέρα, προσφωνέω, μαθητής, ἐκλεξάμ...",0,6
388,tlg0031.tlg004.perseus-grc2.xml,Johnannine literature (New Testament),Gospel of John,15590,tlg0031john,tlg0031.tlg004,A.D. 1,0.5,{'0.5': 1},0.5,christian,[],"ΕΝ ΑΡΧΗ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θ...",1164,"[[εἰμί, ἀρχή, εἰμί, λόγος, λόγος, εἰμί, θεός, ...",[],0,"[[λέγω, εἰμί, δοῦλος, μέγας, κύριος, ἀπόστολος...",0,1
389,tlg0031.tlg005.perseus-grc2.xml,Luke-Acts,Acts,18406,tlg0031luke,tlg0031.tlg005,A.D. 1,0.5,{'0.5': 1},0.5,christian,[],"τὸν μὲν πρῶτον λόγον ἐποιησάμην περὶ πάντων, ...",960,"[[πρῶτος, λόγος, ποιέω, πᾶς, Θεόφιλος, ἄρχω, Ἰ...","[[διέρχομαι, ὅλος, νῆσος, Πάφος, εὑρίσκω, ἀνήρ...",125,"[[πρῶτος, λόγος, ποιέω, πᾶς, Θεόφιλος, ἄρχω, Ἰ...",128,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1417,tlg4081.tlg001.perseus-grc1.xml,Colluthus,Rape of Helen,2336,tlg4081,tlg4081.tlg001,A.D. 5,4.5,{'4.5': 1},,christian,Epici/-ae,"νύμφαι Τρωιάδες, ποταμοῦ Ξάνθοιο γενέθλη, αἳ π...",209,"[[νύμφη, Τρῳάς, ποταμός, Ξάνθος, γενέθλη, πλόκ...",[],0,[],0,0
1418,tlg4084.tlg001.opp-grc1.xml,Zosimus,Historia Nova,62236,tlg4084,tlg4084.tlg001,A.D. 5?,4.5,{'4.5': 1},4.5,christian,Historici/-ae,"Πολυβίῳ τῷ Μεγαλοπολίτῃ, μνήμῃ παραδοῦναι τὰ ...",2265,"[[πολυβίῳ, μεγαλοπολίτῃ, μνήμη, παραδίδωμι, ἀξ...",[],0,[],0,0
1421,tlg4089.tlg003.opp-grc1.xml,Theodoret,Historia ecclesiastica,108815,tlg4089,tlg4089.tlg003,A.D. 4-5,4.5,"{'3.5': 0.5, '4.5': 0.5}",4.5,christian,Theologici,Τάδε ἔνεστιν ἐν τῷ πρώτῳ τόμῳ τῆς Θεοδωρήτου ἐ...,13146,"[[ἔνειμι, πρῶτος, τόμῳ, θεοδωρήτου, ἐκκλησιαστ...","[[προτρεπόμενον, Παῦλος, οὗτος, βιάζω, Πέτρος,...",31,"[[εὐσεβίος, παλαιστῖνος, ἱερόν, ἀπόστολος, ἱστ...",33,23
1422,tlg4089.tlg004.1st1K-grc1.xml,Theodoret,Historia Religiosa,44518,tlg4089,tlg4089.tlg004,A.D. 4-5,4.5,"{'3.5': 0.5, '4.5': 0.5}",4.5,christian,Theologici,"Τῶν ἀρίστων ἀνδρῶν, καὶ τῆς ἀρετῆς ἀθλητῶν, κ...",2948,"[[ἄριστος, ἀνήρ, ἀρετή, ἀθλητής, καλός, ὁράω, ...","[[τοιοῦτος, παντευχία, φάλαγξ, στρατηγός, πρόμ...",9,"[[ῥηθησομένων, ἀπιστέω, εὔδηλος, μωσέως, Ἰησοῦ...",9,13


In [236]:
# Per-`date_avr` totals of the numeric columns.
# numeric_only=True is spelled out because pandas >= 2.0 no longer drops
# non-numeric columns from a grouped sum: without it the text, list and dict
# columns of LAGTec would be "summed" too (concatenated, or a TypeError for
# columns whose values do not support +).
LAGTec_by_date = LAGTec.groupby("date_avr").sum(numeric_only=True).reset_index()

# number of distinct authors per date value
LAGTec_by_date["authors_N"] = LAGTec_by_date["date_avr"].apply(lambda x: len(LAGTec[LAGTec["date_avr"]==x]["author"].unique()))

# number of rows (works) per date value
LAGTec_by_date["works_N"] = LAGTec_by_date["date_avr"].apply(lambda x: (LAGTec["date_avr"]==x).sum())

# summed "paul_N" relative to the summed word count of the same date value
LAGTec_by_date["paul_freq"] = LAGTec_by_date["paul_N"] / LAGTec_by_date["wordcount"]

LAGTec_by_date[["date_avr", "authors_N", "wordcount", "n_sentences", "paul_N", "paul_freq", "sentences_paul_N", "sentences_apostle_N"]]

Unnamed: 0,date_avr,authors_N,wordcount,n_sentences,paul_N,paul_freq,sentences_paul_N,sentences_apostle_N
0,0.5,13,151723,10108,160,0.001055,156,84
1,1.5,13,322401,41076,18,5.6e-05,18,92
2,2.5,4,1237867,129474,497,0.000401,485,644
3,3.5,9,1347705,117599,213,0.000158,205,690
4,4.5,10,507127,45061,184,0.000363,181,146


In [237]:
# Flat lists of lemmatized sentences, split at 3 on the `date_avr` scale:
# rows with date_avr < 3 go to the first list, rows with date_avr > 3 to the second.
sents_1to3 = [sen for work in LAGTec.loc[LAGTec["date_avr"] < 3, "lemmatized_sentences"] for sen in work]
sents_4to5 = [sen for work in LAGTec.loc[LAGTec["date_avr"] > 3, "lemmatized_sentences"] for sen in work]

In [238]:
# number of sentences in the earlier subcorpus
len(sents_1to3)

180658

In [239]:
# peek at the first three sentences (each one is a list of lemmata)
print(sents_1to3[:3])

[['βίβλος', 'γένεσις', 'Ἰησοῦς', 'Χριστός', 'υἱός', 'Δαυίδ', 'υἱός', 'Ἀβραάμ'], ['Ἀβραάμ', 'γεννάω', 'Ἰσαάκ', 'Ἰσαάκ', 'γεννάω', 'Ἰακώβ', 'Ἰακώβ', 'γεννάω', 'Ἰούδας', 'ἀδελφός', 'Ἰούδας', 'γεννάω', 'Φάρες', 'Ζάρα', 'Θαμάρ', 'Φάρες', 'γεννάω', 'Ἑσρώμ', 'Ἑσρώμ', 'γεννάω', 'Ἀράμ', 'Ἀράμ', 'γεννάω', 'Ἀμιναδάβ', 'Ἀμιναδάβ', 'γεννάω', 'Ναασσών', 'Ναασσών', 'γεννάω', 'Σαλμών', 'Σαλμών', 'γεννάω', 'βοῦς', 'Ῥαχάβ', 'βοῦς', 'γεννάω', 'Ἰωβήδ', 'Ῥούθ', 'Ἰωβήδ', 'γεννάω', 'Ἰεσσαί', 'Ἰεσσαί', 'γεννάω', 'Δαυίδ', 'βασιλεύς'], ['Δαυίδ', 'γεννάω', 'Σολομών', 'Οὐρίας', 'Σολομών', 'γεννάω', 'Ῥοβοάμ', 'Ῥοβοάμ', 'γεννάω', 'Ἀβιά', 'Ἀβιά', 'γεννάω', 'Ἀσάφ', 'Ἀσάφ', 'γεννάω', 'Ἰωσαφάτ', 'Ἰωσαφάτ', 'γεννάω', 'Ἰωράμ', 'Ἰωράμ', 'γεννάω', 'Ὀζίας', 'Ὀζίας', 'γεννάω', 'Ἰωαθάμ', 'Ἰωαθάμ', 'γεννάω', 'ἄχας', 'ἄχας', 'γεννάω', 'Ἑζεκίας', 'Ἑζεκίας', 'γεννάω', 'Μανασσῆς', 'Μανασσῆς', 'γεννάω', 'Ἀμώς', 'Ἀμώς', 'γεννάω', 'Ἰωσίας', 'Ἰωσίας', 'γεννάω', 'Ἰεχονίας', 'ἀδελφός', 'μετοικεσία', 'Βαβυλών']]


In [240]:
min_freq = 5

def get_vocab(docs, min_freq=5):
    """Flatten tokenised documents and list the tokens frequent enough to keep.

    Parameters
    ----------
    docs : iterable of iterables of str
        Tokenised documents (in this notebook: lists of lemmata).
    min_freq : int, default 5
        Minimum number of occurrences a token needs to enter the vocabulary.

    Returns
    -------
    words_flat : list of str
        Every token of every document, in the order encountered.
    vocabulary : list of str
        Tokens occurring at least `min_freq` times, most frequent first;
        equally frequent tokens keep their order of first appearance.
    """
    words_flat = [word for doc in docs for word in doc]
    # collections.Counter is sufficient here: nltk.FreqDist is a Counter
    # subclass and inherits most_common() from it, so the ordering is the same.
    ranked = Counter(words_flat).most_common()
    vocabulary = [word for word, count in ranked if count >= min_freq]
    return words_flat, vocabulary

In [241]:
# flat token lists and frequency-ranked vocabularies for both subcorpora
# (get_vocab is called with its default min_freq=5; the module-level `min_freq` is not passed)
words_flat_1to3, vocabulary_1to3 = get_vocab(sents_1to3)
words_flat_4to5, vocabulary_4to5 = get_vocab(sents_4to5)

In [242]:
# sanity check: both target lemmata passed the frequency threshold in both subcorpora
print("Παῦλος" in vocabulary_1to3)
print("Παῦλος" in vocabulary_4to5)
print("ἀπόστολος" in vocabulary_1to3)
print("ἀπόστολος" in vocabulary_4to5)

True
True
True
True


In [243]:
# inspect the last 100 tokens of the earlier subcorpus
words_flat_1to3[-100:]

['ὄμμα',
 'μένω',
 'ἀπειλή',
 'σκώληξ',
 'σῶμα',
 'ἀπουσία',
 'ἐπιστρεφόμενον',
 'ἐκβράσαν',
 'σῶμα',
 'ἐπιστρεφής',
 'οὗτος',
 'ἐκφεύγω',
 'θεός',
 'εἰμί',
 'διδαχθείς',
 'ἔχω',
 'ἀθάνατος',
 'σῶμα',
 'ἄφθαρτ',
 'ψυχή',
 'βασιλεία',
 'οὐρανός',
 'ἀπόληψις',
 'γῆ',
 'βίος',
 'ἐπουράνιος',
 'βασιλεύς',
 'ἐπιγνούς',
 'εἰμί',
 'ὁμιλητής',
 'θεός',
 'συγκληρονόμος',
 'Χριστός',
 'ἐπιθυμία',
 'πάθος',
 'νόσος',
 'δουλούμενος',
 'γίγνομαι',
 'θεός',
 'ὑπομένω',
 'πάθος',
 'ἄνθρωπος',
 'εἰμί',
 'δίδωμι',
 'ἄνθρωπος',
 'εἷς',
 'ὅσος',
 'παρακολουθέω',
 'θεός',
 'παρέχω',
 'ἐπαγγέλλομαι',
 'θεός',
 'ἐθεοποιήθης',
 'ἀθάνατος',
 'γεννηθείς',
 'τουτέστι',
 'γιγνώσκω',
 'ἐπιγιγνώσκω',
 'πεποιηκότα',
 'θεός',
 'ἐπιγιγνώσκω',
 'ἐπιγνωσθῆναι',
 'συμβαίνω',
 'καλουμένῳ',
 'φιλεχθρήσητε',
 'ἄνθρωπος',
 'παλινδρομεῖν',
 'διστάσητε',
 'Χριστός',
 'πᾶς',
 'θεός',
 'ἁμαρτία',
 'ἄνθρωπος',
 'ἀποπλύνω',
 'προστάσσω',
 'νέος',
 'παλαιός',
 'ἄνθρωπος',
 'ἀποτελῶν',
 'εἰκών',
 'καλέω',
 'ἀρχή',
 'τύπος',
 'ἐπιδε

In [244]:
# token counts of the two subcorpora
print(len(words_flat_1to3))
print(len(words_flat_4to5))

693604
804723


In [245]:
# contiguous bigrams & trigrams over the flattened token streams
# (the streams ignore sentence divisions, so n-grams may span two sentences)
bigrams_1to3 = list(map(list, nltk.bigrams(words_flat_1to3)))
bigrams_4to5 = list(map(list, nltk.bigrams(words_flat_4to5)))

trigrams_1to3 = list(map(list, nltk.trigrams(words_flat_1to3)))
trigrams_4to5 = list(map(list, nltk.trigrams(words_flat_4to5)))

In [246]:
# contiguous bigrams & trigrams built sentence by sentence,
# so no n-gram spans a sentence division

sents_bigrams_1to3 = [list(ng) for sent in sents_1to3 for ng in nltk.bigrams(sent)]
sents_bigrams_4to5 = [list(ng) for sent in sents_4to5 for ng in nltk.bigrams(sent)]

sents_trigrams_1to3 = [list(ng) for sent in sents_1to3 for ng in nltk.trigrams(sent)]
sents_trigrams_4to5 = [list(ng) for sent in sents_4to5 for ng in nltk.trigrams(sent)]

In [247]:
def get_cooc(docs, vocabulary=None, min_freq=5):
    """Document-level co-occurrence shares for the vocabulary terms.

    Every document is reduced to the set of its tokens, so a term counts at
    most once per document. Entry (i, j) of the result is the share of
    documents containing both term i and term j; the diagonal entry (i, i)
    is the share of documents containing term i.

    Parameters
    ----------
    docs : list of lists of str
        Tokenised documents (sentences, n-grams, ...).
    vocabulary : list of str, optional
        Terms to keep, in matrix order. When omitted it is derived from
        `docs` with `get_vocab(docs, min_freq)`.
    min_freq : int, default 5
        Frequency threshold, only used when `vocabulary` is omitted.

    Returns
    -------
    cooc : scipy sparse matrix of shape (len(vocabulary), len(vocabulary))
    vocabulary : list of str
        The vocabulary actually used (the one passed in, or the derived one).

    Raises
    ------
    ValueError
        If `docs` is empty (the shares would be a division by zero).
    """
    if len(docs) == 0:
        raise ValueError("docs must contain at least one document")
    # `is None`, not `== None`: an array-like vocabulary would compare
    # element-wise and make the `if` ambiguous.
    if vocabulary is None:
        _, vocabulary = get_vocab(docs, min_freq)
    # The documents are already tokenised, so they are re-split on whitespace
    # only. The default token_pattern would drop one-character lemmata and
    # split tokens at every non-word character, leaving vocabulary entries
    # that can never be matched.
    vec_bow = CountVectorizer(vocabulary=vocabulary, lowercase=False, token_pattern=r"(?u)\S+")
    bow = vec_bow.fit_transform([" ".join(set(sentence)) for sentence in docs])
    # `@` is a matrix product for both sparse matrices and sparse arrays;
    # `*` would be element-wise on the latter.
    cooc = (bow.T @ bow) / len(docs)
    return cooc, vocabulary

In [248]:
# co-occurrence matrices over sentences plus within-sentence bigrams and trigrams,
# restricted to the first 2000 entries of each frequency-ranked vocabulary
# NOTE: vocabulary_1to3 / vocabulary_4to5 are rebound here to the 2000-term list that get_cooc returns
cooc_1to3, vocabulary_1to3 = get_cooc(sents_1to3 + sents_bigrams_1to3 + sents_trigrams_1to3, vocabulary=vocabulary_1to3[:2000])
cooc_4to5, vocabulary_4to5 = get_cooc(sents_4to5 + sents_bigrams_4to5 + sents_trigrams_4to5, vocabulary=vocabulary_4to5[:2000])

In [249]:
# shape check: one row and one column per vocabulary term
cooc_4to5.todense().shape

(2000, 2000)

In [250]:
# dense preview of the later-period co-occurrence matrix (rows/columns follow vocabulary order)
pd.DataFrame(cooc_4to5.todense())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0.051753,0.002478,0.002941,0.002899,0.002043,0.001316,0.001782,1.944769e-03,0.000853,0.001017,...,0.000007,0.000007,0.000009,0.000006,0.000008,0.000009,0.000003,0.000006,0.000007,0.000010
1,0.002478,0.037633,0.002254,0.001486,0.001196,0.001482,0.001197,1.032084e-03,0.001139,0.001009,...,0.000005,0.000009,0.000008,0.000005,0.000004,0.000001,0.000006,0.000020,0.000007,0.000004
2,0.002941,0.002254,0.035753,0.001970,0.001334,0.001110,0.001106,9.653018e-04,0.000860,0.000654,...,0.000002,0.000003,0.000006,0.000006,0.000011,0.000001,0.000003,0.000005,0.000004,0.000001
3,0.002899,0.001486,0.001970,0.035827,0.001158,0.001047,0.001493,2.068889e-03,0.000650,0.000535,...,0.000000,0.000007,0.000005,0.000004,0.000005,0.000005,0.000002,0.000012,0.000003,0.000007
4,0.002043,0.001196,0.001334,0.001158,0.027645,0.000940,0.000894,8.364599e-04,0.000716,0.000523,...,0.000005,0.000005,0.000003,0.000004,0.000011,0.000004,0.000000,0.000010,0.000003,0.000003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.000009,0.000001,0.000001,0.000005,0.000004,0.000007,0.000003,2.023693e-06,0.000004,0.000001,...,0.000000,0.000000,0.000001,0.000000,0.000000,0.000172,0.000000,0.000000,0.000000,0.000000
1996,0.000003,0.000006,0.000003,0.000002,0.000000,0.000002,0.000009,6.745645e-07,0.000004,0.000002,...,0.000000,0.000000,0.000001,0.000000,0.000000,0.000000,0.000175,0.000000,0.000000,0.000000
1997,0.000006,0.000020,0.000005,0.000012,0.000010,0.000007,0.000008,3.372822e-06,0.000003,0.000006,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000179,0.000000,0.000000
1998,0.000007,0.000007,0.000004,0.000003,0.000003,0.000005,0.000000,2.698258e-06,0.000006,0.000006,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000171,0.000000


In [251]:
# NOTE(review): scratch check of abs(); the value is not assigned to anything - candidate for removal
abs(1-2)

1

In [252]:
def normalize_ppmi3_matrix(pmi_matrix_df):
    """Rescale a PMI matrix to [0, 1], reserving 0 for missing scores.

    With ``lo`` and ``hi`` the smallest and largest observed scores, missing
    cells are first set to ``lo - (hi - lo)``, i.e. as far below the minimum
    as the maximum is above it. The filled matrix is then min-max scaled, so
    the largest score maps to 1, the smallest observed score to 0.5 and
    missing cells to 0.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    pmi_matrix_df : pandas.DataFrame
        Scores, with NaN where no score is defined.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as the input, values in [0, 1]. If the
        observed scores have no spread (all equal, or none observed), observed
        cells are returned as 1.0 and missing cells as 0.0 instead of
        dividing by zero.
    """
    minval = pmi_matrix_df.min().min()
    maxval = pmi_matrix_df.max().max()
    spread = maxval - minval  # NaN when the frame holds no observed value
    if not spread > 0:
        # Degenerate input: the scaling below would be 0/0.
        return pmi_matrix_df.notna().astype(float)
    floor = minval - spread
    # fillna() without inplace=True, so the caller's frame is not modified
    return (pmi_matrix_df.fillna(floor) - floor) / (maxval - floor)

In [253]:
def get_ppmi_df(cooc, vocabulary, normalize=True, exp=2):
    """Turn a co-occurrence matrix into a labelled PMI-style score matrix.

    For every pair of terms (a, b) the score is
    ``log(cooc[a, b] ** exp / (cooc[a, a] * cooc[b, b]))``: the diagonal of
    `cooc` supplies the marginals. Pairs whose ratio is not positive (the
    terms never co-occur, or a marginal is zero) have no defined logarithm
    and are stored as NaN. Negative scores are kept as they are.

    Parameters
    ----------
    cooc : scipy sparse matrix or 2-D array-like, square
        Co-occurrence shares, rows/columns in `vocabulary` order.
    vocabulary : list of str
        Labels for both the index and the columns of the result.
    normalize : bool, default True
        If true, rescale with `normalize_ppmi3_matrix` and set the diagonal
        to 1.
    exp : number, default 2
        Exponent applied to the joint share before dividing.

    Returns
    -------
    pandas.DataFrame of shape (len(vocabulary), len(vocabulary))

    Raises
    ------
    ValueError
        If `cooc` is not a square 2-D matrix.
    """
    joint = cooc.toarray() if hasattr(cooc, "toarray") else cooc
    joint = np.asarray(joint, dtype=float)
    if joint.ndim != 2 or joint.shape[0] != joint.shape[1]:
        raise ValueError("cooc must be a square matrix")

    # All pairs at once: the marginals are read from the diagonal a single
    # time instead of once per cell through sparse scalar indexing.
    marginals = joint.diagonal()
    denom = np.outer(marginals, marginals)
    numer = np.power(joint, exp)
    ratio = np.divide(numer, denom, out=np.zeros_like(numer), where=denom != 0)
    pmi = np.full(ratio.shape, np.nan)
    np.log(ratio, out=pmi, where=ratio > 0)

    if normalize:
        scaled = normalize_ppmi3_matrix(pd.DataFrame(pmi, columns=vocabulary, index=vocabulary))
        # Set the diagonal on an explicit copy: writing through
        # DataFrame.to_numpy() only works when pandas happens to return a
        # writable view of its data, which is not guaranteed.
        pmi = scaled.to_numpy(dtype=float, copy=True)
        np.fill_diagonal(pmi, 1)
    return pd.DataFrame(pmi, columns=vocabulary, index=vocabulary)

In [254]:
# test: normalized PMI matrix for the earlier subcorpus (exponent 2), displayed for inspection
ppmi_matrix = get_ppmi_df(cooc_1to3, vocabulary_1to3, exp=2)
ppmi_matrix

Unnamed: 0,εἰμί,λέγω,θεός,οὗτος,γίγνομαι,λόγος,πᾶς,ἄνθρωπος,αὐτός,ἔχω,...,ἡσυχία,λαμπρότης,προνοέω,πανοῦργος,εἰλικρινής,ἀπέραντος,γήινος,φορά,ἔρομαι,ἀπαλλαξείω
εἰμί,1.000000,0.841726,0.852706,0.839353,0.813065,0.826493,0.828477,0.824406,0.826109,0.798337,...,0.689735,0.658195,0.664509,0.694085,0.654930,0.718115,0.634958,0.667352,0.579889,0.672080
λέγω,0.841726,1.000000,0.836828,0.851705,0.811609,0.817048,0.811355,0.806890,0.811251,0.804505,...,0.617892,0.638510,0.628448,0.697480,0.669748,0.697325,0.627975,0.659324,0.606365,0.601516
θεός,0.852706,0.836828,1.000000,0.817288,0.825953,0.848359,0.825664,0.831713,0.802677,0.802690,...,0.640372,0.672413,0.719462,0.535605,0.682876,0.574906,0.575217,0.670147,0.668301,0.577836
οὗτος,0.839353,0.851705,0.817288,1.000000,0.814568,0.814458,0.805414,0.799429,0.790974,0.803763,...,0.551642,0.628092,0.660263,0.611403,0.647907,0.650704,0.611559,0.657681,0.692985,0.697646
γίγνομαι,0.813065,0.811609,0.825953,0.814568,1.000000,0.815570,0.812350,0.823752,0.795756,0.768458,...,0.556836,0.633286,0.656682,0.664828,0.613645,0.664673,0.715492,0.591564,0.653298,0.681908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ἀπέραντος,0.718115,0.697325,0.574906,0.650704,0.664673,0.594190,0.619277,0.670634,0.664665,0.673144,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
γήινος,0.634958,0.627975,0.575217,0.611559,0.715492,0.665812,0.667819,0.622714,0.000000,0.602145,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.701664,0.000000,0.000000
φορά,0.667352,0.659324,0.670147,0.657681,0.591564,0.678008,0.665710,0.620605,0.600331,0.662572,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.701664,1.000000,0.000000,0.000000
ἔρομαι,0.579889,0.606365,0.668301,0.692985,0.653298,0.000000,0.000000,0.567646,0.695524,0.632692,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000


In [255]:
# scores of the first vocabulary term against all terms, as a plain list
ppmi_matrix.iloc[0].tolist()

[1.0,
 0.8417259347902348,
 0.8527064171708297,
 0.8393525199736201,
 0.8130654985445351,
 0.8264932484085157,
 0.8284765916530872,
 0.8244058473787145,
 0.8261088618438038,
 0.7983369610810938,
 0.8351656507158969,
 0.8039144636261258,
 0.7906642889500162,
 0.8040370659577757,
 0.8008991498471497,
 0.788098176006272,
 0.8114132676723685,
 0.8147423291113063,
 0.7940725729168044,
 0.7846359744460222,
 0.7905814534980671,
 0.8123283204054113,
 0.7872898111546219,
 0.7997746752061847,
 0.7837119476887718,
 0.7966395780806336,
 0.7721317599388023,
 0.7690495743734247,
 0.7996291678765748,
 0.7543331673117533,
 0.7844520812972746,
 0.8052786564531657,
 0.8146428340224805,
 0.7773320687268881,
 0.7624446944143198,
 0.7780829103601266,
 0.7861191275106968,
 0.7802573760055215,
 0.7819100708081018,
 0.7929088873639389,
 0.8007409940803454,
 0.7683264735716682,
 0.7785972361199373,
 0.7771794007549592,
 0.7923990850238581,
 0.7855423699128902,
 0.7621844377016366,
 0.767075442251016,
 0.762235

In [256]:
def svd_reduction(cooc_matrix, n_components=150, random_state=1, n_iter=10):
    """Project the rows of `cooc_matrix` onto `n_components` dimensions with truncated SVD.

    `random_state` and `n_iter` are handed to scikit-learn's TruncatedSVD
    unchanged. Returns whatever `fit_transform` produces for the input: one
    reduced vector per input row.
    """
    reducer = TruncatedSVD(
        n_iter=n_iter,
        random_state=random_state,
        n_components=n_components,
    )
    return reducer.fit_transform(cooc_matrix)

In [257]:
# SVD-reduced vectors of the test PMI matrix, one row per vocabulary term (default integer index)
svd_matrix = pd.DataFrame(svd_reduction(ppmi_matrix))

In [258]:
# test: cosine similarity of two fixed rows (hard-coded indices, not randomly drawn),
# first on the raw PMI rows, then on their SVD-reduced vectors
rows = [0,98]
print(1 - spatial.distance.cosine(ppmi_matrix.iloc[rows[0]].tolist(), ppmi_matrix.iloc[rows[1]].tolist()))
print(1 - spatial.distance.cosine(svd_matrix.iloc[rows[0]].tolist(), svd_matrix.iloc[rows[1]].tolist()))

0.8041770380588669
0.8960039757859282


In [259]:
# test: same comparison for the first row and a row near the end of the
# frequency-ranked vocabulary (hard-coded indices, not randomly drawn)
rows = [0,1902]
print(1 - spatial.distance.cosine(ppmi_matrix.iloc[rows[0]].tolist(), ppmi_matrix.iloc[rows[1]].tolist()))
print(1 - spatial.distance.cosine(svd_matrix.iloc[rows[0]].tolist(), svd_matrix.iloc[rows[1]].tolist()))

0.32822187615061504
0.49072821378686116


In [260]:
# test: same comparison for two rows near the end of the frequency-ranked
# vocabulary (hard-coded indices, not randomly drawn)
rows = [1900,1902]
print(1 - spatial.distance.cosine(ppmi_matrix.iloc[rows[0]].tolist(), ppmi_matrix.iloc[rows[1]].tolist()))
print(1 - spatial.distance.cosine(svd_matrix.iloc[rows[0]].tolist(), svd_matrix.iloc[rows[1]].tolist()))

0.19737694418771112
0.4725889256130317


In [261]:
# scores of the second vocabulary term against all terms, as a labelled Series
ppmi_matrix.iloc[1]

εἰμί          0.841726
λέγω          1.000000
θεός          0.836828
οὗτος         0.851705
γίγνομαι      0.811609
                ...   
ἀπέραντος     0.697325
γήινος        0.627975
φορά          0.659324
ἔρομαι        0.606365
ἀπαλλαξείω    0.601516
Name: λέγω, Length: 2000, dtype: float64

In [262]:
def from_docs_to_embeddings(docs, vocabulary=None, min_freq=5, n_components=150, random_state=1, n_iter=10):
    """Run the whole pipeline: co-occurrence -> PMI -> SVD vectors -> cosine similarities.

    Parameters
    ----------
    docs : list of lists of str
        Tokenised documents, passed to `get_cooc`.
    vocabulary : list of str, optional
        Terms to embed; derived from `docs` by `get_cooc` when omitted.
    min_freq : int, default 5
        Frequency threshold used only when `vocabulary` is omitted.
    n_components : int, default 150
        Dimensionality of the word vectors, passed to `svd_reduction`.
        Lower it for vocabularies too small for 150 components.
    random_state : int, default 1
        Seed passed to `svd_reduction`.
    n_iter : int, default 10
        Iteration count passed to `svd_reduction`.

    Returns
    -------
    list
        ``[cooc, vocabulary, pmi_matrix, word_vectors_df, pmi_svd_cos]``:
        the co-occurrence matrix, the vocabulary used, the PMI DataFrame
        from `get_ppmi_df` (default settings), the reduced vectors indexed
        by term, and the term-by-term cosine similarities of those vectors.
    """
    cooc, vocabulary = get_cooc(docs, vocabulary=vocabulary, min_freq=min_freq)
    pmi_matrix = get_ppmi_df(cooc, vocabulary)
    word_vectors_array = svd_reduction(pmi_matrix, n_components=n_components, random_state=random_state, n_iter=n_iter)
    word_vectors_df = pd.DataFrame(word_vectors_array, index=vocabulary)
    pmi_svd_cos = pd.DataFrame(cosine_similarity(word_vectors_array), columns=vocabulary, index=vocabulary)
    return [cooc, vocabulary, pmi_matrix, word_vectors_df, pmi_svd_cos]

In [263]:
%%time
# full pipeline for both subcorpora on the first 2000 vocabulary terms
# NOTE(review): the input here is sentences + `bigrams_*` (built on the flattened token stream, so they
# cross sentence divisions) and no trigrams, whereas the test cell above used
# sents + sents_bigrams + sents_trigrams - confirm which input is intended
data_1to3 = from_docs_to_embeddings(sents_1to3 + bigrams_1to3, vocabulary=vocabulary_1to3[:2000])
data_4to5 = from_docs_to_embeddings(sents_4to5 + bigrams_4to5, vocabulary=vocabulary_4to5[:2000])

CPU times: user 3min 32s, sys: 6.95 s, total: 3min 39s
Wall time: 2min 31s


In [265]:
# last element of the result list: term-by-term cosine similarities of the SVD vectors
data_1to3[-1]

Unnamed: 0,εἰμί,λέγω,θεός,οὗτος,γίγνομαι,λόγος,πᾶς,ἄνθρωπος,αὐτός,ἔχω,...,ἡσυχία,λαμπρότης,προνοέω,πανοῦργος,εἰλικρινής,ἀπέραντος,γήινος,φορά,ἔρομαι,ἀπαλλαξείω
εἰμί,1.000000,0.999288,0.999294,0.999722,0.999305,0.998164,0.999014,0.997279,0.998943,0.998773,...,0.474521,0.470391,0.445691,0.477666,0.532048,0.440152,0.459184,0.509297,0.421453,0.496696
λέγω,0.999288,1.000000,0.998873,0.999411,0.998778,0.997648,0.998308,0.996880,0.998803,0.998188,...,0.477698,0.467932,0.445569,0.479577,0.530469,0.436899,0.458371,0.504757,0.420709,0.494920
θεός,0.999294,0.998873,1.000000,0.999209,0.998947,0.997723,0.999193,0.997571,0.998519,0.998238,...,0.476814,0.472447,0.448881,0.477274,0.534937,0.440680,0.459301,0.507311,0.419764,0.498160
οὗτος,0.999722,0.999411,0.999209,1.000000,0.999138,0.998462,0.998884,0.997453,0.999089,0.998700,...,0.476563,0.469958,0.446959,0.479029,0.533680,0.436753,0.457072,0.508451,0.425188,0.497860
γίγνομαι,0.999305,0.998778,0.998947,0.999138,1.000000,0.997912,0.998775,0.997369,0.998252,0.998238,...,0.475307,0.471920,0.450354,0.478051,0.534432,0.445533,0.459473,0.514109,0.421363,0.503012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ἀπέραντος,0.440152,0.436899,0.440680,0.436753,0.445533,0.443051,0.448155,0.444374,0.443021,0.448384,...,0.572712,0.557937,0.556021,0.505647,0.583232,1.000000,0.538857,0.552904,0.438155,0.541146
γήινος,0.459184,0.458371,0.459301,0.457072,0.459473,0.463842,0.461795,0.467277,0.455580,0.466384,...,0.515431,0.565640,0.591590,0.536741,0.565233,0.538857,1.000000,0.534928,0.486651,0.569768
φορά,0.509297,0.504757,0.507311,0.508451,0.514109,0.515548,0.515383,0.522376,0.514686,0.512192,...,0.483606,0.517621,0.559529,0.546601,0.590235,0.552904,0.534928,1.000000,0.528138,0.619039
ἔρομαι,0.421453,0.420709,0.419764,0.425188,0.421363,0.429373,0.423847,0.430688,0.429064,0.429997,...,0.580804,0.431847,0.580482,0.556738,0.516872,0.438155,0.486651,0.528138,1.000000,0.638222


In [266]:
# Persist both result lists. `with` closes each file as soon as it is written;
# the bare open(...) calls used before left the handles open.
# NOTE(review): the file names say "sents+bgs+tgs", but data_1to3 / data_4to5 are built in this
# notebook from sentences + bigrams only (no trigrams) - confirm that the label matches the content.
with open("../data/large_files/embeddings_sents+bgs+tgs_ppmi2_1to3.pkl", "wb") as pkl_file:
    pickle.dump(data_1to3, pkl_file, pickle.HIGHEST_PROTOCOL)
with open("../data/large_files/embeddings_sents+bgs+tgs_ppmi2_4to5.pkl", "wb") as pkl_file:
    pickle.dump(data_4to5, pkl_file, pickle.HIGHEST_PROTOCOL)