In [77]:
import csv
import re
from csv import reader

import geopandas as gpd
import nltk
import pandas as pd

In [2]:
# Load the GIST dataset from a local parquet file with geopandas.
GIST = gpd.read_parquet("../data/large_files/GIST_geocontexts.parquet")

In [11]:
# Flatten the lemma lists stored under the "data" key of every "lemmata"
# entry into a single corpus-wide list, then report its length.
GIST_lemmata_merged = [
    lemma
    for inscr_lem in GIST["lemmata"].tolist()
    for lemma in inscr_lem["data"]
]
len(GIST_lemmata_merged)

2336432

In [18]:
# Count every lemma in the merged list; most_common() returns (lemma, count)
# pairs ordered from the most to the least frequent.
freqs_tups = nltk.FreqDist(GIST_lemmata_merged).most_common()
# Show the 100 most frequent lemmata.
freqs_tups[:100]

[('δῆμος', 22765),
 ('θεός', 18588),
 ('πόλις', 17330),
 ('ἄλλος', 13391),
 ('ἔτος', 13196),
 ('βουλή', 11093),
 ('χαίρω', 10126),
 ('υἱός', 9781),
 ('εἰμί', 8696),
 ('αὐτός', 8580),
 ('ἔχω', 8534),
 ('δίδωμι', 8222),
 ('ποιέω', 8091),
 ('οὗτος', 7890),
 ('δοκέω', 7760),
 ('μείς', 7724),
 ('πᾶς', 7696),
 ('ἀγαθός', 7680),
 ('ἀνήρ', 7435),
 ('γυνή', 7344),
 ('γίγνομαι', 7147),
 ('ἄρχω', 7028),
 ('χάρις', 6845),
 ('Ἀπόλλων', 6468),
 ('Ζεύς', 6073),
 ('ἀνατίθημι', 6008),
 ('ἱερεύς', 5849),
 ('δραχμή', 5779),
 ('ἱερόν', 5403),
 ('et', 5274),
 ('σεβαστός', 5134),
 ('βασιλεύς', 5118),
 ('ἴδιος', 5027),
 ('Ἀπολλώνιος', 5003),
 ('Διονύσιος', 4941),
 ('μέγας', 4934),
 ('αὐτοκράτωρ', 4663),
 ('τὶς', 4543),
 ('ἱερός', 4491),
 ('Καῖσαρ', 4422),
 ('πατήρ', 4393),
 ('ψήφισμα', 4384),
 ('μνήμη', 4360),
 ('νόμος', 4291),
 ('μήτηρ', 4270),
 ('εὐεργέτης', 4261),
 ('θυγάτηρ', 4133),
 ('τιμή', 4116),
 ('ἄρχων', 4067),
 ('λέγω', 4029),
 ('τόπος', 3771),
 ('τίθημι', 3663),
 ('χρηστός', 3652),
 ('τέκνον', 35

In [13]:
# Lemmata containing the stem δικ- followed by one of the listed vowels, where
# the stem either opens the word or directly follows one of the listed prefixes
# (the prefixed form may sit anywhere in the word because `search` is unanchored).
# The iota alternation is a non-capturing group: the earlier character class
# "[ι|ί]" also accepted a literal "|", since "|" has no special meaning inside
# a set. Group numbering of the remaining groups is unchanged.
match_pattern = "(κατα|ὑπό|ἐκ|ἀ|ἄ|ἀντί|^)δ(?:ι|ί)κ(α|ά|ο|ό|ί|έ|η)(.+)?"
dik_regex = re.compile(match_pattern)  # compiled once, reused for every lemma
dik_words = [word for word in GIST_lemmata_merged if dik_regex.search(word)]
# Frequency table of the matching lemmata, most frequent first.
dik_words_df = pd.DataFrame(nltk.FreqDist(dik_words).most_common(), columns=["lemma", "count"])
dik_words_df.head(5)

Unnamed: 0,lemma,count
0,δίκη,1658
1,δίκαιος,1200
2,δικαστής,963
3,δικαιοσύνη,537
4,δικαστήριον,324


In [15]:
# First 100 rows of the δικ- frequency table.
dik_words_df.head(100)

Unnamed: 0,lemma,count
0,δίκη,1658
1,δίκαιος,1200
2,δικαστής,963
3,δικαιοσύνη,537
4,δικαστήριον,324
...,...,...
95,δικαζομένωι,4
96,δικαιαγόρα,4
97,δικαιοσύνα,4
98,δικάσηι,4


In [43]:
def get_ngrams(lemmata_data):
    """Build the bigrams, trigrams and fivegrams of one lemmata entry.

    Parameters
    ----------
    lemmata_data : mapping
        Entry whose ``"data"`` item is the sequence of lemmata.

    Returns
    -------
    list of list
        Bigrams (unpadded), followed by trigrams and fivegrams generated with
        padding on both sides. Padding symbols (``None``) are stripped
        afterwards, so the padded windows also contribute shortened n-grams
        at both ends of the sequence; n-grams left with fewer than two items
        are dropped. Duplicates are kept.
        If the n-grams cannot be built (for example ``"data"`` is not
        iterable), ``[[]]`` is returned.
    """
    lemmata_list = lemmata_data["data"]
    try:
        windows = (
            list(nltk.ngrams(lemmata_list, 2))
            + list(nltk.ngrams(lemmata_list, 3, pad_right=True, pad_left=True))
            + list(nltk.ngrams(lemmata_list, 5, pad_right=True, pad_left=True))
        )
        # Remove the None padding, then discard what is no longer an n-gram.
        stripped = [[item for item in window if item is not None] for window in windows]
        ngrams_merged = [ngram for ngram in stripped if len(ngram) > 1]
    except Exception:
        # Best-effort fallback for unusable input. `Exception` (instead of a
        # bare `except`) lets KeyboardInterrupt / SystemExit propagate.
        ngrams_merged = [[]]
    return ngrams_merged


In [44]:
# Try get_ngrams on a toy four-lemma input.
get_ngrams({"data" : ["word1", "word2", "word3", "word4"]})

[['word1', 'word2'],
 ['word2', 'word3'],
 ['word3', 'word4'],
 ['word1', 'word2'],
 ['word1', 'word2', 'word3'],
 ['word2', 'word3', 'word4'],
 ['word3', 'word4'],
 ['word1', 'word2'],
 ['word1', 'word2', 'word3'],
 ['word1', 'word2', 'word3', 'word4'],
 ['word1', 'word2', 'word3', 'word4'],
 ['word2', 'word3', 'word4'],
 ['word3', 'word4']]

In [48]:
%%time
# Apply get_ngrams to every "lemmata" entry and store the result in a new "ngrams" column.
GIST["ngrams"] = GIST["lemmata"].apply(get_ngrams)

CPU times: user 6.35 s, sys: 176 ms, total: 6.53 s
Wall time: 6.82 s


In [49]:
# Preview the first five rows.
GIST.head(5)

Unnamed: 0,PHI_ID,reference,metadata,lines,text_raw,text_iphi,clean_text_conservative,clean_text_interpretive_word,lemmata,raw_date,...,TMgeo_name,geometry,name,polis_geo,polis_dist,polis?,duration,random_dates,greece?,ngrams
0,1,IG I³\n1,Att. — Ath.: Akr. — stoich. 35 — c. 510-500 a....,12.0,ἔδοχσεν το͂ι δέμοι· τ̣[ὸς ἐ Σ]αλαμ̣[ῖνι κλερόχ...,εδοχσεν τοι δεμοι τ[ος ε σ]αλαμ[ινι κλεροχ]ος ...,ἔδοχσεν το͂ι δέμοι ταλαμος οἰκε͂ν ἐᾶ Σαλαμῖνι ...,ἔδοχσεν το͂ι δέμοι τὸς ἐ Σαλαμῖνι κλερόχος οἰκ...,"{'data': ['ἔδοχσεν', 'δέμοι', 'Σαλαμίς', 'κλερ...",c. 510-500 a.,...,Athenai,POINT (23.72399 37.97275),361-Athenai,"[23.7278, 37.9778]",653.43,True,13.0,"[-510, -504, -510, -507, -506, -500, -508, -50...",True,"[[ἔδοχσεν, δέμοι], [δέμοι, Σαλαμίς], [Σαλαμίς,..."
1,4,IG I³\n4,Att. — stoich. 38 — 485/4 a.,56.0,[․․․․․․․․․․․․․․․․․․38․․․․․․․․․․․․․․․․․․]\n[․․․...,[--------------------------------------] [----...,δέ τις αν ἒ φρορὰν μ ντέκοντα δχμὰς τ ας ℎες π...,ἐὰν δέ τις αν ἒ φρορὰν μὲ πεντέκοντα δραχμὰς τ...,"{'data': ['τὶς', 'φρορὰν', 'πεντέκοντα', 'δραχ...",485/4 a.,...,Athenai,POINT (23.72399 37.97275),361-Athenai,"[23.7278, 37.9778]",653.43,True,2.0,"[-485, -485, -485, -484, -485, -484, -485, -48...",True,"[[τὶς, φρορὰν], [φρορὰν, πεντέκοντα], [πεντέκο..."
2,5,IG I³\n5,Att. — c. 500 a.,6.0,[ἔδοχσε]ν [⋮ τε͂ι βολε͂ι] ⋮ καὶ [τ]ο͂ι δέμοι ⋮...,[εδοχσε]ν [ τει βολει] και [τ]οι δεμοι οτε παρ...,ν καὶ ο͂ι δέμοι ℎότε Παραιβάτες λεια θν τὸς ℎι...,ἔδοχσεν τε͂ι βολε͂ι καὶ το͂ι δέμοι ℎότε Παραιβ...,"{'data': ['ἔδοχσεν', 'τει', 'βολει', 'δέμοι', ...",c. 500 a.,...,Eleusis,POINT (23.54150 38.04414),362-Eleusis,"[23.5423, 38.0394]",530.31,True,11.0,"[-498, -502, -504, -503, -496, -504, -496, -49...",True,"[[ἔδοχσεν, τει], [τει, βολει], [βολει, δέμοι],..."
3,6,IG I³\n6,Att. — stoich. 23/11 — ante 460 a.,160.0,— — — — — — — — — — — — —\n[․․․․․․15․․․․․․․] δ...,------------- [---------------] δραχμεισ[ι ---...,δραχμε͂ιστες τὸς ιο μενος δεμο το͂ν πόλεο οκε͂...,δραχμε͂ισι τες τὸς ιο μενος δεμο το͂ν πόλεον δ...,"{'data': ['δραχμεισι', 'τες', 'μένος', 'δεμο',...",ante 460 a.,...,Eleusis,POINT (23.54150 38.04414),362-Eleusis,"[23.5423, 38.0394]",530.31,True,,"[-461, -461, -461, -461, -461, -461, -461, -46...",True,"[[δραχμεισι, τες], [τες, μένος], [μένος, δεμο]..."
4,11,IG I³\n11,Att. — stoich. 48 — ante med. s. V a.,22.0,[χσυμμαχία καὶ ℎόρ]κ̣ο̣[ς] Ἀ̣[θ]ε̣ν̣α̣[ίον κα]...,[χσυμμαχια και ορ]κο[ς] α[θ]ενα[ιον κα]ι εγεστ...,κο Ἀεναὶ Ἐγεσταί ο͂ι ἐπρυτάνευε οΙ ον ἐ͂ρχε Ἀ...,χσυμμαχία καὶ ℎόρκος Ἀθεναίον καὶ Ἐγεσταίον ἔδ...,"{'data': ['χσυμμαχία', 'ὅρκος', 'ἀθεναίον', 'ἐ...",ante med. s. V a.,...,Athenai,POINT (23.72399 37.97275),361-Athenai,"[23.7278, 37.9778]",653.43,True,,"[-456, -456, -456, -456, -456, -456, -456, -45...",True,"[[χσυμμαχία, ὅρκος], [ὅρκος, ἀθεναίον], [ἀθενα..."


In [132]:
%%time
# Dump every n-gram to a two-column CSV: the space-joined n-gram, then the
# PHI_ID of the row it came from.
# - The `with` block guarantees the file is flushed and closed before it is
#   read back later in the notebook.
# - csv.writer quotes any field that contains a comma or a quote, so each row
#   always parses back into exactly two fields.
# - lineterminator="\n" keeps the line endings the file had before.
# - Rows with an empty n-gram list now write nothing (previously a blank line).
with open("../data/large_files/corpus_ngrams.csv", "w", encoding="utf-8", newline="") as ngrams_file:
    ngrams_writer = csv.writer(ngrams_file, lineterminator="\n")
    # Iterate the two columns directly instead of rebuilding a row with .iloc each time.
    for phi_id, inscr_ngrams in zip(GIST["PHI_ID"], GIST["ngrams"]):
        phi_id = str(phi_id)
        try:
            # Build all rows first so a failing entry does not write partial output.
            rows = [(" ".join(ngram), phi_id) for ngram in inscr_ngrams]
            ngrams_writer.writerows(rows)
        except Exception:
            # Best effort: report the ID that could not be exported and go on.
            print(phi_id)

CPU times: user 18.7 s, sys: 444 ms, total: 19.1 s
Wall time: 19.5 s


In [116]:
# Small hand-written list of ID strings; CSVCorpus reads the global `ids_list`
# to decide which CSV rows to yield.
ids_list = ["1","3","5","7"]

In [135]:
class CSVCorpus:
    """Stream the n-grams of selected IDs from the n-gram CSV file.

    Every CSV row is expected to hold the n-gram text in its first field and
    an ID in its second field. Iterating yields the first field of each row
    whose ID is selected; rows with fewer than two fields are skipped.
    The object can be iterated repeatedly: the file is reopened each time.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to the notebook's n-gram export.
    ids : iterable of str, optional
        IDs to keep. When omitted, the module-level ``ids_list`` is looked up
        each time an iteration starts.
    """

    def __init__(self, path="../data/large_files/corpus_ngrams.csv", ids=None):
        self.path = path
        self.ids = ids

    def __iter__(self):
        # Resolve the selection once per iteration; a set makes the per-row
        # membership test constant-time instead of a scan over a list.
        wanted = frozenset(ids_list if self.ids is None else self.ids)
        # The export is written as UTF-8, so read it back as UTF-8 rather than
        # with the platform default encoding.
        with open(self.path, "r", encoding="utf-8", newline="") as read_obj:
            for row in reader(read_obj):
                # Blank or malformed rows have no ID field: skip them.
                # No try/except around the yield, so closing the iterator
                # early (GeneratorExit) ends the generator cleanly.
                if len(row) < 2:
                    continue
                if row[1] in wanted:
                    yield row[0]

In [136]:
# Keep only the rows that have a "random_dates" value, then report how many remain.
GIST = GIST.loc[GIST["random_dates"].notna()]
len(GIST)

106898

In [141]:
# Select the PHI_IDs (as strings) of the rows whose first "random_dates" value
# lies strictly between -500 and -401.
# NOTE(review): both bounds are exclusive, so a first date of exactly -500 or
# -401 is left out; confirm this is intended.
in_period = GIST["random_dates"].apply(lambda dates: -500 < dates[0] < -401)
ids_list = GIST.loc[in_period, "PHI_ID"].astype("str").tolist()
len(ids_list)

6604

In [142]:
# Peek at the first ten selected IDs.
ids_list[:10]

['4', '5', '6', '11', '14', '15', '16', '17', '22', '29']

In [143]:
# Iterable over the CSV n-grams whose ID is listed in the global `ids_list`.
corpus = CSVCorpus()

In [144]:
# Count the n-grams the corpus yields for the selected IDs.
sum(1 for doc in corpus)

133348