# Word2Vec Corpus Analysis and Cosine Similarity

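This script takes the original Diorisis XML files, performs the preprocessing tasks for corpus analysis, trains a Word2Vec model on the data, and performs a dimensionality reduction using t-SNE. Download the master folder (ijl_greek_kinship_terms-master) and drag the folder onto your desktop. The code uses relative paths beginning with `Desktop/`, so run the notebook from the directory that contains your Desktop folder (normally your home directory).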


In [None]:
from cltk.corpus.greek.beta_to_unicode import Replacer
from cltk.corpus.utils.formatter import tonos_oxia_converter

from glob import glob
from xml.etree.ElementTree import parse

import re
import os
import os.path
import pandas as pd

## Step 1: Construct the Corpus
This code reads the Koine Greek texts sourced from the Diorisis Corpus, extracts the lemma entry of each word, and collects the lemmas of every sentence into a list. The relevant XML files can be found at https://figshare.com/articles/dataset/The_Diorisis_Ancient_Greek_Corpus/6187256. A list of the Diorisis texts used in this corpus analysis can be found at https://github.com/lisni946/ijl_greek_kinship_terms/blob/main/greek_corpus_list.xlsx.

### Stopwords

This CSV file lists all the stopwords that we wish to exclude from the corpus. The file can be found at https://github.com/lisni946/ijl_greek_kinship_terms/blob/main/new_stops.csv.

In [None]:
# Load the additional stop words from the project's CSV file into a plain list.
new_stops = os.path.join("Desktop/ijl_greek_kinship_terms-master", "new_stops.csv")

# The context manager closes the file handle once pandas has read it.
# The encoding is given explicitly so the Greek text is not decoded with the
# platform default (assumes the CSV is UTF-8 encoded - TODO confirm).
with open(new_stops, encoding="utf-8") as f:
    X = pd.read_csv(f, delimiter=",")

# Keep only the 'Add Stops' column and turn it into a Python list.
df = pd.DataFrame(X, columns=['Add Stops'])
new_list = df['Add Stops'].values.tolist()

## for testing purposes ##
# print(new_list)

### XML Parser

In [None]:
# Parsing XML
#
# Builds `corpus`: a list of sentences, each sentence being a list of lemma
# strings taken from the <lemma entry="..."> attributes of the Diorisis files.

# `glob` is imported again here because the dependencies cell further down
# runs `import glob`, which rebinds the name to the module; without this line
# the cell fails with a TypeError when it is re-run afterwards.
from glob import glob

# You will have to create this corpus folder yourself from the Diorisis dataset.
# glob returns files in an unspecified order, so the list is sorted to keep the
# order of the corpus the same from run to run.
xml_files = sorted(glob('Desktop/greek_corpus/*.xml'))
replacer = Replacer()
# A set gives constant-time stop-word checks instead of scanning the list for
# every lemma in the corpus.
stop_words = set(new_list)
corpus = []
for xml in xml_files:
    # Passing the path lets ElementTree open the file in binary mode, decode it
    # according to the XML document itself, and close it when parsing is done.
    tree = parse(xml)
    root = tree.getroot()
    for sentence in root.iter('sentence'):
        sentences = []
        for word in sentence.iter('word'):
            for lemma in word.iter('lemma'):
                entry = lemma.get('entry')
                if entry is None:
                    # No lemma entry: fall back to the word form, converted
                    # with the Beta Code replacer. This fallback is not
                    # filtered against the stop words.
                    entry = replacer.beta_code(word.get('form'))
                    sentences.append(entry)
                elif tonos_oxia_converter(entry) not in stop_words:
                    # The converted entry is compared with the stop words, but
                    # the unconverted entry is what gets stored.
                    sentences.append(entry)
        # Sentences that end up empty are skipped.
        if sentences:
            corpus.append(sentences)


## print(corpus) ## Testing purposes ##

## Step 2: Run the Word2Vec Model
The following script takes the preprocessed corpus and trains a Word2Vec model on it.

In [None]:
# dependencies
from __future__ import absolute_import, division, print_function
import codecs
import glob
import multiprocessing
import gensim.models.word2vec as w2v
import sklearn.manifold

In [None]:
# Hyperparameters for the Word2Vec model.

# Seed for the random number generator, intended to make runs repeatable.
# NOTE(review): gensim's documentation states that a fully deterministic run
# also needs a single worker thread - confirm whether that matters here.
seed = 1

# Number of threads to run in parallel: one per CPU reported by the system.
num_workers = multiprocessing.cpu_count()

# Dimensionality of the resulting word vectors.
num_features = 500

# Context window length.
context_size = 2

# Minimum word count threshold.
min_word_count = 10

# Downsample setting for frequent words.
# The original note mentions rates of 0 and 1e-5 as well.
downsampling = 0.001

In [None]:
# Create the (still untrained) Word2Vec model from the hyperparameters above.
# sg=1 selects the skip-gram training algorithm according to gensim's docs.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` - confirm which gensim version is installed.
model_settings = dict(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling,
)
greek2vec = w2v.Word2Vec(**model_settings)

In [None]:
# Build the model's vocabulary from the sentences in `corpus`
# (a list of lemma lists produced by the XML parsing cell).
greek2vec.build_vocab(corpus)

In [None]:
# Count every lemma across all sentences and report the total.
token_count = sum(len(lemmas) for lemmas in corpus)
print('The corpus contains {0:,} tokens'.format(token_count))

In [None]:
%%time
# Train the model on the sentences in `corpus` for 100 epochs; this may take
# a while to process. The %%time cell magic reports how long the cell took.
greek2vec.train(corpus, total_examples=len(corpus), epochs=100)

In [None]:
# Make sure the folder the model is saved into exists.
# exist_ok=True creates the folder when it is missing and does nothing when it
# is already there, without a separate existence check beforehand.
os.makedirs("Desktop/ijl_greek_kinship_terms-master", exist_ok=True)

In [None]:
# Write the trained model to the project folder.
model_file = os.path.join("Desktop/ijl_greek_kinship_terms-master", "greek2vec.w2v")
greek2vec.save(model_file)

In [None]:
# Read the saved model back from the project folder.
model_file = os.path.join("Desktop/ijl_greek_kinship_terms-master", "greek2vec.w2v")
greek2vec = w2v.Word2Vec.load(model_file)

## Step 3: Similarity Metric

In [None]:
# The 20 words most similar to 'θυγάτηρ'.
# The query goes through the model's word vectors (`.wv`): calling
# most_similar directly on the model is deprecated in gensim 3.x and no longer
# available in gensim 4.
greek2vec.wv.most_similar('θυγάτηρ', topn=20)

In [None]:
# The 20 words most similar to 'πατήρ'.
# The query goes through the model's word vectors (`.wv`): calling
# most_similar directly on the model is deprecated in gensim 3.x and no longer
# available in gensim 4.
greek2vec.wv.most_similar('πατήρ', topn=20)

In [None]:
# The 20 words most similar to 'μήτηρ'.
# The query goes through the model's word vectors (`.wv`): calling
# most_similar directly on the model is deprecated in gensim 3.x and no longer
# available in gensim 4.
greek2vec.wv.most_similar('μήτηρ', topn=20)