# Doc2Vec

TF-IDF is limited by exact spelling. If our orthography is not too unpredictable, or else if our corpus is sufficiently large to gather it into patterns, perhaps word embeddings, scaled up to document embeddings, can offer a deeper insight into document similarity by learning about the similar contexts of different terms.

In [21]:
import os,glob,json,multiprocessing
import pandas as pd
import numpy as np
from gensim.models.doc2vec import TaggedDocument,Doc2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
def normalize(target):
    # This dict limits orthographical variation beyond the rule sets
    # of stjorn-extract.ipynb and menota-extract.ipynb:
    matrix = {
        'j': 'i',
        'v': 'u',
        # For embeddings we may be better off keeping the d-ð distinction:
        #'ð': 'þ',
        'ð': 'd',
        'þ': 'd',
        'á': 'a',
        'ǽ': 'æ',
        'ę': 'æ',
        'é': 'e',
        'í': 'i',
        'ó': 'o',
        'ú': 'u',
        'ý': 'y',
        'ǿ': 'ø',
        'k': 'c', # rather than vice versa, because of Latin (e.g. Lucifer)
        '[': '',
        ']': ''
        }
    for k,v in matrix.items():
        target = target.replace(k, v)
    return target

titles = ['prologue', 'introduction', 'gn', 'ex', 'lv', 'nm', 'dt', 'ios', 'idc', 'rt', '1sm', '2sm', '3rg', '4rg']
tokens = []
for title in titles:
    with open(f"nlp/{title}.txt") as raw:
        document = normalize(raw.read().replace('\n', ' '))
        tokens.extend(document.split())

work_indices = {
    'stjorn1': (650,124417),
    'stjorn2': (124417,147678),
    'stjorn3': (147678,156943,160719),
    'stjorn4': (156943,160719)
}

stjorn = dict()
for _work, _range in work_indices.items():
    if len(_range) == 2:
        stjorn[_work] = (tokens[_range[0]:_range[1]])
    else:
        stjorn[_work] = tokens[_range[0]:_range[1]] + tokens[_range[2]:]

menota = dict()
for text in glob.glob('../menota/dipl/*txt'):
    ref = os.path.basename(text).replace('.txt', '')
    with open(text) as doc:
        # We'll subject Menota to the same normalization standard as Stjórn:
        menota[ref] = normalize(doc.read().replace('\n', '')).split()

num_cores = multiprocessing.cpu_count()


In [23]:
data = stjorn | menota
tagged_data = [TaggedDocument(words=v, tags=[k]) for k,v in data.items()]
model = Doc2Vec(
    vector_size=20, 
    min_count=2, 
    epochs=50, 
    workers=num_cores, 
    dm=1,               # DM is more context-sensitive, PV-DBOW is more efficient
    dbow_words=0)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [24]:
for work in work_indices.keys():
    inferred_vector = model.infer_vector(data[work])
    sims = model.dv.most_similar([inferred_vector], topn=36)
    print(f"{work}:")
    print("------------------------")
    for i in sims[1:]:
        print(f"{i[0]}    {str(round(i[1], 5))}")
    print("\n")

stjorn1:
------------------------
nraNorrFragm60A_stjorn    0.86706
nraNorrFragm58B_konungs_skuggsja    0.78795
nraNorrFragm79_mariu_saga    0.70608
nraNorrFragm66_thomass_saga    0.70143
nraNorrFragm80_pals_saga    0.68697
stjorn4    0.68555
nraNorrFragm78_mariu_saga    0.65351
nraNorrFragm67_thomass_saga    0.65231
stjorn2    0.64601
am383I_thorlaks_saga    0.63137
nraNorrFragm59_rimbegla    0.60485
nraNorrFragm60C_stjorn    0.60119
nraNorrFragm77_dialogar    0.58918
stjorn3    0.57158
nraNorrFragm75_kross_saga    0.56235
nraNorrFragm71_gregors_saga_pafa    0.56202
nraNorrFragm63_karlamagnuss_saga    0.53433
nraNorrFragm72x76_dialogar    0.50511
nraNorrFragm57_jons_saga_helga    0.49262
holmPerg17_thomass_saga    0.49109
nraNorrFragm69_nikulass_saga    0.49066
nraNorrFragm64_barlaams_saga    0.47487
nraNorrFragm55A_hakonar_saga    0.42853
nraNorrFragm62_karlamagnuss_saga    0.42285
holmPerg6_barlaams_saga    0.41972
nraNorrFragm54_sverris_saga    0.41771
am178_thidreks_saga    0.4131