# Greek Word2Vec Model

Version 2.0

## Preprocessing

In [2]:
from cltk.corpus.greek.beta_to_unicode import Replacer
from cltk.corpus.utils.formatter import tonos_oxia_converter
from cltk.stop.greek.stops import STOPS_LIST
from glob import glob
from xml.etree.ElementTree import parse

In [3]:
# Display the stop-word list that the corpus-building cell uses to filter lemmas.
print(STOPS_LIST)

['αὐτὸς', 'αὐτός', 'γε', 'γὰρ', 'γάρ', "δ'", 'δαὶ', 'δαὶς', 'δαί', 'δαίς', 'διὰ', 'διά', 'δὲ', 'δέ', 'δὴ', 'δή', 'εἰ', 'εἰμὶ', 'εἰμί', 'εἰς', 'εἴμι', 'κατὰ', 'κατά', 'καὶ', 'καί', 'μετὰ', 'μετά', 'μὲν', 'μέν', 'μὴ', 'μή', 'οἱ', 'οὐ', 'οὐδεὶς', 'οὐδείς', 'οὐδὲ', 'οὐδέ', 'οὐκ', 'οὔτε', 'οὕτως', 'οὖν', 'οὗτος', 'παρὰ', 'παρά', 'περὶ', 'περί', 'πρὸς', 'πρός', 'σὸς', 'σός', 'σὺ', 'σὺν', 'σύ', 'σύν', 'τε', 'τι', 'τις', 'τοιοῦτος', 'τοὶ', 'τοί', 'τοὺς', 'τούς', 'τοῦ', 'τὰ', 'τά', 'τὴν', 'τήν', 'τὶ', 'τὶς', 'τί', 'τίς', 'τὸ', 'τὸν', 'τό', 'τόν', 'τῆς', 'τῇ', 'τῶν', 'τῷ', "ἀλλ'", 'ἀλλὰ', 'ἀλλά', 'ἀπὸ', 'ἀπό', 'ἂν', 'ἄλλος', 'ἄν', 'ἄρα', 'ἐγὼ', 'ἐγώ', 'ἐκ', 'ἐξ', 'ἐμὸς', 'ἐμός', 'ἐν', 'ἐπὶ', 'ἐπί', 'ἐὰν', 'ἐάν', 'ἑαυτοῦ', 'ἔτι', 'ἡ', 'ἢ', 'ἤ', 'ὁ', 'ὃδε', 'ὃς', 'ὅδε', 'ὅς', 'ὅστις', 'ὅτι', 'ὑμὸς', 'ὑμός', 'ὑπὲρ', 'ὑπέρ', 'ὑπὸ', 'ὑπό', 'ὡς', 'ὥστε', 'ὦ', 'ξύν', 'ξὺν', 'σύν', 'σὺν', 'τοῖς', 'τᾶς']


In [None]:

##################################################
# Parsing XML.
##################################################
# Builds `corpus`: a list of sentences, each a list of lemma strings taken from
# the <sentence>/<word>/<lemma> elements of every XML file in the directory.

# Imported under a private alias so this cell still works when it is re-run
# after the later `import glob`, which rebinds the name `glob` to the module.
from glob import glob as _glob_files

# NOTE(review): absolute, machine-specific path; point this at your own copy
# of the XML files before running.
# Sorted so the sentence order (and therefore training) does not depend on
# the arbitrary order in which the filesystem lists the files.
xml_files = sorted(_glob_files('/Users/christinaoaks/Desktop/Nick/xml_files/*.xml'))
replacer = Replacer()

# Set membership is O(1); the list would be scanned once per lemma.
stop_words = set(STOPS_LIST)

corpus = []
for xml_path in xml_files:
    # Passing the path lets the XML parser open the file itself and honour
    # the encoding declared in the document instead of the locale default.
    root = parse(xml_path).getroot()
    for sentence in root.iter('sentence'):
        sentence_tokens = []
        for word in sentence.iter('word'):
            for lemma in word.iter('lemma'):
                entry = lemma.get('entry')
                if entry is None:
                    # No lemma entry: fall back to the word's surface form,
                    # converted from Beta Code.
                    # NOTE(review): this branch is not checked against the
                    # stop list, so converted stop words can enter the
                    # corpus. Confirm whether that is intended.
                    sentence_tokens.append(replacer.beta_code(word.get('form')))
                elif tonos_oxia_converter(entry) not in stop_words:
                    sentence_tokens.append(entry)
        # Sentences left empty after filtering are dropped.
        if sentence_tokens:
            corpus.append(sentence_tokens)

  
 ##################################################
# Writing to file.
##################################################
# One line per sentence, each written as the str() of its token list.
# UTF-8 is requested explicitly so the polytonic Greek is written the same way
# regardless of the platform's default encoding. The `with` block closes the
# file, so no separate close() call is needed.
with open('output.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(str(c) for c in corpus))


# Testing purposes: preview only the first few sentences rather than dumping
# the entire corpus into the notebook output.
print(corpus[:5])

## Word2Vec

In [6]:
# dependencies
from __future__ import absolute_import, division, print_function
import codecs
import glob
import multiprocessing
import os
import pprint
import re
import gensim.models.word2vec as w2v
import numpy as np
import sklearn.manifold
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import string

In [5]:
# Hyperparameters for the Word2Vec model.

# Dimensionality of the resulting word vectors.
# More dimensions are more computationally expensive to train, but can be more accurate.
num_features = 300

# Minimum word count threshold, passed to the model as `min_count`.
min_word_count = 3

# Number of threads to run in parallel (one per CPU reported by the OS).
num_workers = multiprocessing.cpu_count()

# Context window length. Note that Munson (2017: 17) says context_size is optimized at 12 for Greek;
# this notebook uses 7.
context_size = 7

# Downsampling threshold for frequent words, passed to the model as `sample`.
# The original note here gives 0 to 1e-5 as the range; presumably that is the
# useful range quoted in the gensim documentation (TODO confirm for the
# installed version). The value used below is 1e-3.
downsampling = 1e-3

# Seed for the random number generator, intended to make results reproducible.
# NOTE(review): the gensim documentation states that a fully deterministic run
# also needs a single worker thread and a fixed PYTHONHASHSEED; with
# num_workers > 1 results may still vary between runs. Confirm.
seed = 1

In [6]:
# Create the (still untrained) Greek Word2Vec model from the hyperparameters
# defined in the previous cell.
# sg=1 selects the skip-gram training algorithm (per the gensim documentation).
# NOTE(review): the `size` keyword was renamed `vector_size` in gensim 4.0, so
# this call appears to require gensim < 4. Confirm the installed version.
greek2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [7]:
# Build the model's vocabulary from the parsed corpus; this must run before training.
greek2vec.build_vocab(corpus)

In [9]:
# Total number of tokens, summed over the length of every sentence in the corpus.
token_count = sum(map(len, corpus))
print('The corpus contains {0:,} tokens'.format(token_count))

The corpus contains 3,532,909 tokens


In [10]:
%%time
# Train the model on the sentences; this may take a while to process
# (the recorded run below took roughly 32 minutes of wall time).
# total_examples is the number of sentences; epochs=100 makes 100 passes over the corpus.
greek2vec.train(corpus, total_examples=len(corpus), epochs=100)

CPU times: user 2h 1min 10s, sys: 21.1 s, total: 2h 1min 31s
Wall time: 32min 23s


(330118452, 353290900)

## Save and Load Model

In [109]:
# Save model: make sure the output directory exists.
# exist_ok=True makes this safe to re-run and avoids the gap between checking
# for the directory and creating it.
os.makedirs("trained", exist_ok=True)

In [110]:
# Persist the trained model to trained/greek2vec.w2v so it can be reloaded without retraining.
greek2vec.save(os.path.join("trained", "greek2vec.w2v"))

In [8]:
# Load the previously saved model from trained/greek2vec.w2v, replacing any model in memory.
# NOTE(review): gensim's load presumably unpickles the file, so only load model
# files from a trusted source. Confirm.
greek2vec = w2v.Word2Vec.load(os.path.join("trained", "greek2vec.w2v"))

## Functions

In [47]:
# Vector array for a token. Subscripting is the idiomatic spelling of the
# lookup; calling the dunder method directly is equivalent but non-standard.
# A token that is not in the model's vocabulary is expected to raise KeyError
# (per the gensim KeyedVectors documentation).
greek2vec.wv['ἁγίζω']

array([-0.14877418, -0.09053723,  0.10568244,  0.10703711,  0.28395575,
        0.2265677 , -0.41538203,  0.05155652,  0.3880053 ,  0.48325354,
       -0.08635101, -0.09870056, -0.32086176, -0.0160188 , -0.11727368,
        0.26950556, -0.49433595, -0.25243756, -0.24489333, -0.3929521 ,
        0.1100295 , -0.77711046, -0.02062984, -0.29723948, -0.42253914,
       -0.34211728, -0.31976104,  0.4749692 , -0.18027478, -0.51577914,
       -0.77116185,  0.08532273,  0.38493007,  0.05441218, -0.07791433,
       -0.01799494, -0.11053267, -0.10018267, -0.19086589, -0.19001968,
       -0.1677784 , -0.02457975,  0.27660665, -0.33869603,  0.26779702,
       -0.09660979, -0.34620368,  0.5081821 , -0.00322678,  0.15015791,
        0.05227649, -0.03333354,  0.16551217,  0.44792825,  0.69342715,
       -0.85942787,  0.42797232, -0.6791218 , -0.22768825, -0.2740795 ,
       -0.255957  , -0.05556495, -0.35466513, -0.1267815 , -0.30018747,
       -0.5588178 ,  0.33104426, -0.3986837 , -0.19272874,  0.25