In [1]:
from gensim.corpora import Dictionary
from gensim.models import Word2Vec
import pandas as pd
import pickle

In [2]:
# Load the per-document metadata table. Later cells read its "doc_id",
# "author_id", "subcorpus" and "lemmata_wordcount" columns.
cgl = pd.read_json("../data/large_data/cgl.json")

In [3]:
# Load the word-frequency table ("word" and "count" columns).
# NOTE(review): the rendered frame shows an "Unnamed: 0" column, so the CSV
# presumably carries a saved index; it is not used by the cells below.
word_freqs_vocabulary = pd.read_csv("../data/word_freqs_vocabulary.csv")

In [4]:
# Show the frequency table as the cell output.
word_freqs_vocabulary

Unnamed: 0,word,count
0,λέγω,29114
1,ἔχω,22584
2,γίγνομαι,21737
3,πολύς,20652
4,πᾶς,19062
...,...,...
3457,στραγγουρία,50
3458,ῥύγχος,50
3459,διάκρισις,50
3460,ἀρχός,50


In [5]:
# Map every vocabulary word to its frequency count; this mapping is what
# from_corpus_to_keyed_vectors passes to build_vocab_from_freq.
word_freq_dict = {
    word: count
    for word, count in zip(word_freqs_vocabulary["word"], word_freqs_vocabulary["count"])
}

In [24]:
# Number of distinct words in the word -> count mapping.
len(word_freq_dict)

3462

In [6]:
# Plain Python list of the vocabulary words, in table order.
vocabulary = list(word_freqs_vocabulary["word"])

In [9]:
# Load the doc_id -> line-range lookup used by NgramCorpus.
# The context manager closes the file handle deterministically; the previous
# bare open(...) left it to the garbage collector.
# NOTE(security): pickle.load can execute arbitrary code; only load this file
# if it was produced by a trusted step of this project.
with open("../data/ids_lines.pickle", "rb") as ids_lines_file:
    ids_lines = pickle.load(ids_lines_file)

In [10]:
class NgramCorpus(object):
    """Re-iterable stream of tokenized lines for a selection of documents.

    Every pass opens ``fname``, reads all of its lines and, for each id in
    ``ids_list``, walks the slice ``lines[start:end]`` where ``start`` and
    ``end`` are ``ids_lines[id][0]`` and ``ids_lines[id][1]``. Lines that are
    empty after whitespace splitting are skipped.

    Parameters
    ----------
    ids_list : list
        Document ids to include, in iteration order.
    ids_lines : mapping
        Maps a document id to a sequence whose first two items are the start
        (inclusive) and end (exclusive) line indices of that document in
        ``fname``.
    fname : str
        Path of the text file with one whitespace-separated token list per line.
    bow : bool, optional
        If true and ``dct`` is given, yield ``dct.doc2bow(tokens)`` instead of
        the raw token list.
    dct : object, optional
        Object exposing ``doc2bow(tokens)`` (presumably a gensim ``Dictionary``).

    Attributes
    ----------
    len : int
        Number of non-empty lines found by the most recent ``len()`` call
        (0 until ``len()`` has been called).
    """

    def __init__(self, ids_list, ids_lines, fname, bow=False, dct=None):
        self.ids_list = ids_list
        self.fname = fname
        self.bow = bow
        self.dct = dct
        self.ids_lines = ids_lines
        self.len = 0

    def __len__(self):
        """Return the number of non-empty lines in the selected documents.

        The count is recomputed from scratch on every call, so repeated
        ``len()`` calls and earlier iterations cannot inflate it (previously
        the counter accumulated across every pass over the corpus).
        """
        count = sum(1 for _ in self._iter_tokens())
        self.len = count
        return count

    def _iter_tokens(self):
        """Yield the token list of every non-empty line of the selected documents."""
        # Decode explicitly instead of relying on the platform default.
        # NOTE(review): assumes the corpus file is UTF-8 (the vocabulary is
        # Greek text) - confirm.
        with open(self.fname, "r", encoding="utf-8") as f:
            lines = f.readlines()
        # The file is already closed here; only the in-memory lines are used.
        for doc_id in self.ids_list:
            bounds = self.ids_lines[doc_id]
            for line in lines[bounds[0] : bounds[1]]:
                tokens = line.split()
                if tokens:
                    yield tokens

    def __iter__(self):
        """Yield one item per non-empty line: a token list, or its bag-of-words."""
        use_bow = bool(self.bow) and self.dct is not None
        for tokens in self._iter_tokens():
            if use_bow:
                yield self.dct.doc2bow(tokens)
            else:
                yield tokens

# Full model

In [11]:
# Corpus over every document listed in the metadata table.
corpus_full = NgramCorpus(
    ids_list=cgl["doc_id"].tolist(),
    ids_lines=ids_lines,
    fname="../data/large_data/corpus_ngrams_bydocid.txt",
)

In [13]:
# Sum of the "lemmata_wordcount" column over all documents.
cgl["lemmata_wordcount"].sum()

1828293

In [12]:
# Number of non-empty lines in the full corpus. NgramCorpus.__len__ walks the
# whole corpus file to produce this, so the call is not free.
len(corpus_full)

3204442

In [14]:
def from_corpus_to_keyed_vectors(corpus, word_freq=None):
    """Train a Word2Vec model on ``corpus`` and return its keyed vectors.

    The vocabulary is not scanned from the corpus; it is built from a
    word -> frequency mapping so that every model trained with this function
    shares the same vocabulary.

    Parameters
    ----------
    corpus : iterable of list of str
        Re-iterable collection of token lists (it is traversed once for
        counting and then again by ``model.train``).
    word_freq : dict, optional
        Word -> count mapping for ``build_vocab_from_freq``. Defaults to the
        notebook-level ``word_freq_dict``.

    Returns
    -------
    gensim KeyedVectors
        The ``wv`` attribute of the trained model.
    """
    if word_freq is None:
        word_freq = word_freq_dict
    # Fixed hyperparameters: 150-dim vectors, window of 3, sg=0 (CBOW),
    # 5 negative samples with ns_exponent=1, 5 epochs, 8 worker threads.
    model = Word2Vec(vector_size=150, window=3, negative=5, ns_exponent=1, sg=0, epochs=5, workers=8)
    model.build_vocab_from_freq(word_freq=word_freq)
    # Count the examples with a fresh pass instead of len(corpus):
    # NgramCorpus keeps a running counter, so len() is inflated whenever the
    # corpus was iterated or measured before (e.g. by an earlier len() cell).
    # gensim uses total_examples for learning-rate decay and progress logging.
    total_examples = sum(1 for _ in corpus)
    model.train(corpus, total_examples=total_examples, epochs=model.epochs)
    return model.wv

In [12]:
%%time
# Train on the full corpus. `keyed_vectors` is rebound by every training cell
# in this notebook, so it always refers to the most recently trained model.
keyed_vectors = from_corpus_to_keyed_vectors(corpus_full)

CPU times: user 1min 10s, sys: 2.08 s, total: 1min 12s
Wall time: 45.4 s


In [13]:
# Persist the vectors currently bound to `keyed_vectors`; run this right after
# the full-corpus training cell, before another model overwrites the name.
keyed_vectors.save("../data/keyed_vectors_full.wv")

# Model based only on subcorpora

In [15]:
# Keep only the documents whose author_id is one of tlg0086, tlg0627, tlg0059.
ids_list = cgl.loc[
    cgl["author_id"].isin(["tlg0086", "tlg0627", "tlg0059"]), "doc_id"
].tolist()
print(len(ids_list))
corpus_subcorpora = NgramCorpus(
    ids_list=ids_list,
    ids_lines=ids_lines,
    fname="../data/large_data/corpus_ngrams_bydocid.txt",
)

124


In [16]:
# Total lemmata word count of the documents by tlg0086, tlg0627 and tlg0059.
cgl.loc[
    cgl["author_id"].isin(["tlg0086", "tlg0627", "tlg0059"]), "lemmata_wordcount"
].sum()

862087

In [15]:
%%time
# Train on the subcorpora-only selection and save the vectors immediately,
# since `keyed_vectors` is reused by the other training cells.
keyed_vectors = from_corpus_to_keyed_vectors(corpus_subcorpora)
keyed_vectors.save("../data/keyed_vectors_subcorpora.wv")

CPU times: user 31.3 s, sys: 1.15 s, total: 32.4 s
Wall time: 19.4 s


# Model excluding all subcorpora

In [18]:
# Keep every document whose author_id is NOT tlg0086, tlg0627 or tlg0059.
ids_list = cgl.loc[
    ~cgl["author_id"].isin(["tlg0086", "tlg0627", "tlg0059"]), "doc_id"
].tolist()
print(len(ids_list))
corpus_excl_subcorpora = NgramCorpus(
    ids_list=ids_list,
    ids_lines=ids_lines,
    fname="../data/large_data/corpus_ngrams_bydocid.txt",
)

247


In [19]:
%%time
# Train on the documents outside the three author selections and save the
# vectors immediately, since `keyed_vectors` is reused by other training cells.
keyed_vectors = from_corpus_to_keyed_vectors(corpus_excl_subcorpora)
keyed_vectors.save("../data/keyed_vectors_excl_subcorpora.wv")

CPU times: user 36.2 s, sys: 1.32 s, total: 37.5 s
Wall time: 22 s


# Model excluding c_aristotelicum & c_platonicum

In [20]:
# Drop the documents by tlg0086 and tlg0059 (the c_aristotelicum and
# c_platonicum authors, per this section's heading); keep everything else.
ids_list = cgl.loc[
    ~cgl["author_id"].isin(["tlg0086", "tlg0059"]), "doc_id"
].tolist()
print(len(ids_list))
corpus_excl_aristplato = NgramCorpus(
    ids_list=ids_list,
    ids_lines=ids_lines,
    fname="../data/large_data/corpus_ngrams_bydocid.txt",
)

299


In [23]:
# Total lemmata word count of the documents not authored by tlg0086 or tlg0059.
cgl.loc[
    ~cgl["author_id"].isin(["tlg0086", "tlg0059"]), "lemmata_wordcount"
].sum()

1161783

In [21]:
%%time
# Train on the corpus without tlg0086/tlg0059 documents and save the vectors
# immediately, since `keyed_vectors` is reused by other training cells.
keyed_vectors = from_corpus_to_keyed_vectors(corpus_excl_aristplato)
keyed_vectors.save("../data/keyed_vectors_excl_aristplato.wv")

CPU times: user 41.3 s, sys: 1.28 s, total: 42.6 s
Wall time: 25 s


# Model excluding c_aristotelicum

In [14]:
# Distinct values of the "subcorpus" column (documents outside any subcorpus
# show up as None in the output).
set(cgl["subcorpus"])

{None, 'c_aristotelicum', 'c_hippocraticum', 'c_platonicum'}

In [10]:
# Drop the documents by tlg0086 (c_aristotelicum, per this section's heading).
# A commented-out earlier variant filtered on cgl["subcorpus"] instead.
ids_list = cgl.loc[cgl["author_id"] != "tlg0086", "doc_id"].tolist()
print(len(ids_list))
corpus_excl_arist = NgramCorpus(
    ids_list=ids_list,
    ids_lines=ids_lines,
    fname="../data/large_data/corpus_ngrams_bydocid.txt",
)

336


In [20]:
# Total lemmata word count of the documents not authored by tlg0086.
cgl.loc[cgl["author_id"] != "tlg0086", "lemmata_wordcount"].sum()

1446704

In [16]:
%%time
# Train on the corpus without tlg0086 documents and save the vectors
# immediately, since `keyed_vectors` is reused by other training cells.
keyed_vectors = from_corpus_to_keyed_vectors(corpus_excl_arist)
keyed_vectors.save("../data/keyed_vectors_excl_arist.wv")

CPU times: user 58 s, sys: 2.03 s, total: 1min
Wall time: 37.6 s


# Model excluding c_hippocraticum

In [17]:
# Drop the documents by tlg0627 (c_hippocraticum, per this section's heading).
# A commented-out earlier variant filtered on cgl["subcorpus"] instead.
ids_list = cgl.loc[cgl["author_id"] != "tlg0627", "doc_id"].tolist()
print(len(ids_list))
corpus_excl_hipp = NgramCorpus(
    ids_list=ids_list,
    ids_lines=ids_lines,
    fname="../data/large_data/corpus_ngrams_bydocid.txt",
)

319


In [19]:
# Total lemmata word count of the documents not authored by tlg0627.
cgl.loc[cgl["author_id"] != "tlg0627", "lemmata_wordcount"].sum()

1632716

In [18]:
%%time
# Train on the corpus without tlg0627 documents and save the vectors
# immediately, since `keyed_vectors` is reused by other training cells.
keyed_vectors = from_corpus_to_keyed_vectors(corpus_excl_hipp)
keyed_vectors.save("../data/keyed_vectors_excl_hipp.wv")

CPU times: user 1min, sys: 1.49 s, total: 1min 1s
Wall time: 37.7 s


# Model excluding c_platonicum

In [19]:
# Drop the documents by tlg0059 (c_platonicum, per this section's heading).
# A commented-out earlier variant filtered on cgl["subcorpus"] instead.
ids_list = cgl.loc[cgl["author_id"] != "tlg0059", "doc_id"].tolist()
print(len(ids_list))
corpus_excl_plato = NgramCorpus(
    ids_list=ids_list,
    ids_lines=ids_lines,
    fname="../data/large_data/corpus_ngrams_bydocid.txt",
)

334


In [21]:
# Total lemmata word count of the documents not authored by tlg0059.
cgl.loc[cgl["author_id"] != "tlg0059", "lemmata_wordcount"].sum()

1543372

In [20]:
%%time
# Train on the corpus without tlg0059 documents and save the vectors
# immediately, since `keyed_vectors` is reused by other training cells.
keyed_vectors = from_corpus_to_keyed_vectors(corpus_excl_plato)
keyed_vectors.save("../data/keyed_vectors_excl_plato.wv")

CPU times: user 53.9 s, sys: 1.17 s, total: 55.1 s
Wall time: 34.3 s
