In [25]:
import pandas as pd
import geopandas as gpd
import re
import nltk
from gensim.models.fasttext import FastText
from gensim.models import Word2Vec
import pickle

In [2]:
GIST = gpd.read_parquet("../data/large_files/GIST_geocontexts.parquet")

In [3]:
GIST = GIST[GIST["random_dates"].notnull()]
len(GIST)

106898

In [4]:
GIST_lemmata_merged = [[l for l in inscr_lem["data"]] for inscr_lem in GIST["lemmata"].tolist()]
GIST_lemmata_merged = [l for i in GIST_lemmata_merged for l in i]
len(GIST_lemmata_merged)

1648240

In [5]:
freqs_tups = nltk.FreqDist(GIST_lemmata_merged).most_common()
freqs_tups[:100]

[('δῆμος', 14979),
 ('θεός', 13267),
 ('πόλις', 12561),
 ('ἄλλος', 11026),
 ('ἔτος', 9281),
 ('βουλή', 7875),
 ('ἔχω', 6723),
 ('υἱός', 6261),
 ('εἰμί', 6229),
 ('αὐτός', 6069),
 ('ἄρχω', 5996),
 ('μείς', 5943),
 ('δοκέω', 5934),
 ('δίδωμι', 5796),
 ('ποιέω', 5397),
 ('πᾶς', 5332),
 ('χαίρω', 5308),
 ('ἀγαθός', 5307),
 ('οὗτος', 5272),
 ('γίγνομαι', 5045),
 ('ἀνήρ', 4832),
 ('Ἀπόλλων', 4815),
 ('δραχμή', 4526),
 ('ἀνατίθημι', 4472),
 ('ἱερόν', 4217),
 ('βασιλεύς', 4086),
 ('γυνή', 4058),
 ('χάρις', 4042),
 ('Ζεύς', 4009),
 ('ἱερεύς', 3930),
 ('et', 3844),
 ('σεβαστός', 3752),
 ('μέγας', 3492),
 ('αὐτοκράτωρ', 3482),
 ('ἄρχων', 3389),
 ('νόμος', 3369),
 ('Καῖσαρ', 3332),
 ('εὐεργέτης', 3327),
 ('λέγω', 3288),
 ('ἴδιος', 3224),
 ('ἱερός', 3178),
 ('τιμή', 3097),
 ('Διονύσιος', 3087),
 ('ψήφισμα', 3044),
 ('ὁλκή', 3038),
 ('φιάλη', 2893),
 ('Ἀπολλώνιος', 2875),
 ('μήτηρ', 2843),
 ('τὶς', 2838),
 ('πατήρ', 2782),
 ('Ἀθηναῖος', 2757),
 ('χρύσεος', 2757),
 ('στήλη', 2627),
 ('στέφανος', 2591

In [7]:
match_pattern = "(κατα|ὑπό|ἐκ|ἀ|ἄ|ἀντί|^)δ[ι|ί]κ(α|ά|ο|ό|ί|έ|η)(.+)?"
dik_words = [word for word in GIST_lemmata_merged if re.search(match_pattern, word)]
dik_words_df = pd.DataFrame(nltk.FreqDist(dik_words).most_common(), columns=["lemma", "count"])
dik_words_df.head(5)

Unnamed: 0,lemma,count
0,δίκη,1153
1,δίκαιος,880
2,δικαστής,606
3,δικαιοσύνη,419
4,δικαστήριον,221


In [8]:
dik_words_df[:100]

Unnamed: 0,lemma,count
0,δίκη,1153
1,δίκαιος,880
2,δικαστής,606
3,δικαιοσύνη,419
4,δικαστήριον,221
...,...,...
95,δικαστάνς,3
96,δικαιοπραγήσαντα,3
97,δικάδδεν,3
98,δικαδδέτο,3


In [9]:
def get_ngrams(lemmata_data):
    lemmata_list = lemmata_data["data"]
    ngrams = []
    try:
        bigrams = list(nltk.ngrams(lemmata_list, 2))
        trigrams = list(nltk.ngrams(lemmata_list, 3, pad_right=True, pad_left=True))
        fivegrams = list(nltk.ngrams(lemmata_list, 5, pad_right=True, pad_left=True))
        ngrams_merged = bigrams + trigrams + fivegrams
        ngrams_merged = [[i for i in ngram if i is not None] for ngram in ngrams_merged]
        ngrams_merged = [ngram for ngram in ngrams_merged if (len(ngram) > 1)]
    except:
        ngrams_merged = [[]]
    return ngrams_merged


In [10]:
get_ngrams({"data" : ["word1", "word2", "word3", "word4"]})

[['word1', 'word2'],
 ['word2', 'word3'],
 ['word3', 'word4'],
 ['word1', 'word2'],
 ['word1', 'word2', 'word3'],
 ['word2', 'word3', 'word4'],
 ['word3', 'word4'],
 ['word1', 'word2'],
 ['word1', 'word2', 'word3'],
 ['word1', 'word2', 'word3', 'word4'],
 ['word1', 'word2', 'word3', 'word4'],
 ['word2', 'word3', 'word4'],
 ['word3', 'word4']]

In [11]:
%%time
GIST["ngrams"] = GIST["lemmata"].apply(get_ngrams)

CPU times: user 4.36 s, sys: 222 ms, total: 4.58 s
Wall time: 4.81 s


In [12]:
GIST.head(5)

Unnamed: 0,PHI_ID,reference,metadata,lines,text_raw,text_iphi,clean_text_conservative,clean_text_interpretive_word,lemmata,raw_date,...,TMgeo_name,geometry,name,polis_geo,polis_dist,polis?,duration,random_dates,greece?,ngrams
0,1,IG I³\n1,Att. — Ath.: Akr. — stoich. 35 — c. 510-500 a....,12.0,ἔδοχσεν το͂ι δέμοι· τ̣[ὸς ἐ Σ]αλαμ̣[ῖνι κλερόχ...,εδοχσεν τοι δεμοι τ[ος ε σ]αλαμ[ινι κλεροχ]ος ...,ἔδοχσεν το͂ι δέμοι ταλαμος οἰκε͂ν ἐᾶ Σαλαμῖνι ...,ἔδοχσεν το͂ι δέμοι τὸς ἐ Σαλαμῖνι κλερόχος οἰκ...,"{'data': ['ἔδοχσεν', 'δέμοι', 'Σαλαμίς', 'κλερ...",c. 510-500 a.,...,Athenai,POINT (23.72399 37.97275),361-Athenai,"[23.7278, 37.9778]",653.43,True,13.0,"[-510, -504, -510, -507, -506, -500, -508, -50...",True,"[[ἔδοχσεν, δέμοι], [δέμοι, Σαλαμίς], [Σαλαμίς,..."
1,4,IG I³\n4,Att. — stoich. 38 — 485/4 a.,56.0,[․․․․․․․․․․․․․․․․․․38․․․․․․․․․․․․․․․․․․]\n[․․․...,[--------------------------------------] [----...,δέ τις αν ἒ φρορὰν μ ντέκοντα δχμὰς τ ας ℎες π...,ἐὰν δέ τις αν ἒ φρορὰν μὲ πεντέκοντα δραχμὰς τ...,"{'data': ['τὶς', 'φρορὰν', 'πεντέκοντα', 'δραχ...",485/4 a.,...,Athenai,POINT (23.72399 37.97275),361-Athenai,"[23.7278, 37.9778]",653.43,True,2.0,"[-485, -485, -485, -484, -485, -484, -485, -48...",True,"[[τὶς, φρορὰν], [φρορὰν, πεντέκοντα], [πεντέκο..."
2,5,IG I³\n5,Att. — c. 500 a.,6.0,[ἔδοχσε]ν [⋮ τε͂ι βολε͂ι] ⋮ καὶ [τ]ο͂ι δέμοι ⋮...,[εδοχσε]ν [ τει βολει] και [τ]οι δεμοι οτε παρ...,ν καὶ ο͂ι δέμοι ℎότε Παραιβάτες λεια θν τὸς ℎι...,ἔδοχσεν τε͂ι βολε͂ι καὶ το͂ι δέμοι ℎότε Παραιβ...,"{'data': ['ἔδοχσεν', 'τει', 'βολει', 'δέμοι', ...",c. 500 a.,...,Eleusis,POINT (23.54150 38.04414),362-Eleusis,"[23.5423, 38.0394]",530.31,True,11.0,"[-498, -502, -504, -503, -496, -504, -496, -49...",True,"[[ἔδοχσεν, τει], [τει, βολει], [βολει, δέμοι],..."
3,6,IG I³\n6,Att. — stoich. 23/11 — ante 460 a.,160.0,— — — — — — — — — — — — —\n[․․․․․․15․․․․․․․] δ...,------------- [---------------] δραχμεισ[ι ---...,δραχμε͂ιστες τὸς ιο μενος δεμο το͂ν πόλεο οκε͂...,δραχμε͂ισι τες τὸς ιο μενος δεμο το͂ν πόλεον δ...,"{'data': ['δραχμεισι', 'τες', 'μένος', 'δεμο',...",ante 460 a.,...,Eleusis,POINT (23.54150 38.04414),362-Eleusis,"[23.5423, 38.0394]",530.31,True,,"[-461, -461, -461, -461, -461, -461, -461, -46...",True,"[[δραχμεισι, τες], [τες, μένος], [μένος, δεμο]..."
4,11,IG I³\n11,Att. — stoich. 48 — ante med. s. V a.,22.0,[χσυμμαχία καὶ ℎόρ]κ̣ο̣[ς] Ἀ̣[θ]ε̣ν̣α̣[ίον κα]...,[χσυμμαχια και ορ]κο[ς] α[θ]ενα[ιον κα]ι εγεστ...,κο Ἀεναὶ Ἐγεσταί ο͂ι ἐπρυτάνευε οΙ ον ἐ͂ρχε Ἀ...,χσυμμαχία καὶ ℎόρκος Ἀθεναίον καὶ Ἐγεσταίον ἔδ...,"{'data': ['χσυμμαχία', 'ὅρκος', 'ἀθεναίον', 'ἐ...",ante med. s. V a.,...,Athenai,POINT (23.72399 37.97275),361-Athenai,"[23.7278, 37.9778]",653.43,True,,"[-456, -456, -456, -456, -456, -456, -456, -45...",True,"[[χσυμμαχία, ὅρκος], [ὅρκος, ἀθεναίον], [ἀθενα..."


In [23]:
%%time
f = open("../data/large_files/corpus_ngrams.txt", "w", encoding="utf-8")
line = 0
ids_lines = {}
for n in range(len(GIST)):
    id = str(GIST.iloc[n]["PHI_ID"])
    try:
        ngrams = [" ".join(ngram) for ngram in GIST.iloc[n]["ngrams"]]
        f.writelines("\n".join(ngrams)+"\n")
        ids_lines[id] = (line, line+len(ngrams))
        line += len(ngrams)
    except:
        print(id)

CPU times: user 17 s, sys: 733 ms, total: 17.7 s
Wall time: 18.8 s


In [26]:
pickle.dump(ids_lines, open("../data/ids_lines.pickle", "wb"))

In [27]:
class NgramCorpus:
    def __iter__(self):
        with open("../data/large_files/corpus_ngrams.txt", "r") as f:
            lines = f.readlines()
            for phi_id in ids_list:
                for line in lines[ids_lines[phi_id][0] : ids_lines[phi_id][1]]:
                    yield line.split()

In [28]:
ids_list = ["1","4","15","17"]

In [29]:
corpus = NgramCorpus()

In [30]:
len([doc for doc in corpus])

904

In [31]:
ids_list = GIST[GIST["random_dates"].apply(lambda x: -500 < x[0] < -401)]["PHI_ID"].astype("str").tolist()
len(ids_list)

6604

In [32]:
corpus = NgramCorpus()

In [33]:
corpus_len = len([doc for doc in corpus])
corpus_len

133348

In [34]:
N_tokens = sum([el[1] for el in freqs_tups])
N_tokens

1648240

In [35]:
N_types = len(freqs_tups)
N_types

214400

In [36]:
N_types_covered = 5000
N_tokens_covered = sum([el[1] for el in freqs_tups[:N_types_covered]])
N_tokens_covered / N_tokens

0.658017036353929

In [40]:
model = Word2Vec(vector_size=150, window=3, negative=5, ns_exponent=1, sg=0, epochs=10, workers=8)

In [41]:
model.build_vocab_from_freq(word_freq=dict(freqs_tups[:N_types_covered]))

In [42]:
%%time
model.train(corpus, total_examples=corpus_len, epochs=model.epochs)

CPU times: user 11.4 s, sys: 1.93 s, total: 13.3 s
Wall time: 13.3 s


(2138767, 4125460)

In [43]:
model.wv.most_similar("θεός")

[('λαμπρός', 0.677864670753479),
 ('παρθικοῦ', 0.6593357920646667),
 ('ἔνδοξος', 0.6470581889152527),
 ('αὐτοκράτωρ', 0.6440725922584534),
 ('ἁδριανοῦ', 0.6383450031280518),
 ('μέγας', 0.6350281834602356),
 ('ἐπιφανής', 0.6306490302085876),
 ('ἀρχιερεύς', 0.6289092898368835),
 ('σεβαστός', 0.6269497275352478),
 ('εὐχαριστέω', 0.6178341507911682)]

In [46]:
model = FastText(vector_size=150, window=3, negative=5, ns_exponent=1, sg=0, epochs=5, workers=8)
model.build_vocab_from_freq(word_freq=dict(freqs_tups[:N_types_covered]))
model.train(corpus, total_examples=corpus_len, epochs=model.epochs)

(1069143, 2062730)

In [47]:
model.wv.most_similar("θεός")

[('ἐτεός', 0.9733932018280029),
 ('θνητός', 0.9689429998397827),
 ('ἀετός', 0.9668442010879517),
 ('ἰτός', 0.9665151238441467),
 ('κιβωτός', 0.9646576642990112),
 ('ἰός', 0.9639209508895874),
 ('βροτός', 0.9637974500656128),
 ('σεμνός', 0.9629839062690735),
 ('ἁγνός', 0.9618391394615173),
 ('ἐμός', 0.9589904546737671)]

# Developing century excluding models

In [51]:
for cent in range(-8, 4):
    if cent < 0:
        cent_start, cent_end = cent * 100, cent * 100 + 99
    else:
        cent_start, cent_end = cent * 100 + 1, cent * 100 + 100
    subset = GIST[GIST["random_dates"].apply(lambda x: cent_start < x[0] < cent_end)]
    print(cent_start, cent_end, (len(subset)))


-800 -701 63
-700 -601 382
-600 -501 2623
-500 -401 6604
-400 -301 9668
-300 -201 14232
-200 -101 13448
-100 -1 8443
1 100 9893
101 200 15986
201 300 11913
301 400 4897


In [52]:
cent_ids_samples = {}
for cent in range(-8, 4):
    if cent < 0:
        cent_start, cent_end = cent * 100, cent * 100 + 99
    else:
        cent_start, cent_end = cent * 100 + 1, cent * 100 + 100
    subset_ids = GIST[GIST["random_dates"].apply(lambda x: cent_start < x[0] < cent_end)]["PHI_ID"]
    subset_sample_ids = subset_ids.sample(1000, replace=True)
    cent_ids_samples[cent] = subset_sample_ids

In [53]:
# full_model
ids_list = []
for cent in cent_ids_samples.keys():
    ids_list.extend(list(cent_ids_samples[cent]))

In [54]:
len(ids_list)

12000

In [58]:
ids_list = [str(id) for id in ids_list]

In [59]:
corpus = NgramCorpus()

In [60]:
%%time
model = FastText(vector_size=150, window=3, negative=5, ns_exponent=1, sg=0, epochs=5, workers=8)
model.build_vocab_from_freq(word_freq=dict(freqs_tups[:N_types_covered]))
model.train(corpus, total_examples=corpus_len, epochs=model.epochs)

CPU times: user 20.6 s, sys: 1.79 s, total: 22.4 s
Wall time: 11.1 s


(3930072, 6515085)

In [68]:
model.wv.most_similar("θεός", topn=100)

[('ἐτεός', 0.8429304957389832),
 ('τεός', 0.7971290349960327),
 ('κολωνός', 0.7763241529464722),
 ('Σιληνός', 0.7631608247756958),
 ('ληνός', 0.7523295879364014),
 ('ἁγνός', 0.7456226944923401),
 ('σεμνός', 0.7390210628509521),
 ('οὐρανός', 0.7284296751022339),
 ('μανός', 0.7266390919685364),
 ('ἰατρός', 0.7261731028556824),
 ('ἱκανός', 0.7149150371551514),
 ('δεινός', 0.7129787802696228),
 ('ποθεινός', 0.7086061835289001),
 ('υἱωνός', 0.7081373929977417),
 ('ἑός', 0.7078588604927063),
 ('ἱλαρός', 0.7071196436882019),
 ('κενός', 0.7067139148712158),
 ('ἀγλαός', 0.7063443660736084),
 ('θεαρός', 0.7061429023742676),
 ('ζοός', 0.7020440101623535),
 ('ὀρός', 0.7020096778869629),
 ('φανός', 0.7014251351356506),
 ('λυγρός', 0.7005560994148254),
 ('θοός', 0.6989668607711792),
 ('λαός', 0.6980156898498535),
 ('ζωός', 0.6941527128219604),
 ('κωμῳδός', 0.6938192844390869),
 ('ἀγρός', 0.6933711171150208),
 ('καινός', 0.6919623017311096),
 ('τυρός', 0.6855046153068542),
 ('οὐδός', 0.68480640649795

In [62]:
model.wv.save("../data/large_files/vectors_fasttext/model_full.wv")

In [64]:
for cent_to_exclude in cent_ids_samples.keys():
    ids_list = []
    for key in cent_ids_samples.keys():
        if key != cent_to_exclude:
            ids_list.extend(list(cent_ids_samples[key]))
    ids_list = [str(id) for id in ids_list]
    model = FastText(vector_size=150, window=3, negative=5, ns_exponent=1, sg=0, epochs=5, workers=8)
    model.build_vocab_from_freq(word_freq=dict(freqs_tups[:N_types_covered]))
    print("going to train model excluding {} century".format(str(cent_to_exclude)))
    model.train(corpus, total_examples=corpus_len, epochs=model.epochs)
    model.wv.save("../data/large_files/vectors_fasttext/model_without_{}.wv".format(str(cent_to_exclude)))

going to train model excluding -8 century
going to train model excluding -7 century
going to train model excluding -6 century
going to train model excluding -5 century
going to train model excluding -4 century
going to train model excluding -3 century
going to train model excluding -2 century
going to train model excluding -1 century
going to train model excluding 0 century
going to train model excluding 1 century
going to train model excluding 2 century
going to train model excluding 3 century
