# TF-IDF

We'll rely on term frequency times inverted document frequency to measure meaningful similarity between documents. Let's start by generating a matrix for the separate constituent parts of _Stjórn_.

In [41]:
import os,glob,json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
titles = ['prologue', 'introduction', 'gn', 'ex', 'lv', 'nm', 'dt', 'ios', 'idc', 'rt', '1sm', '2sm', '3rg', '4rg']
tokens = []
for title in titles:
    with open(f"nlp/{title}.txt") as raw:
        document = raw.read().replace('\n', ' ')
        tokens.extend(document.split())

work_indices = {
    'stjorn1': (650,124417),
    'stjorn2': (124417,147678),
    'stjorn3': (147678,156943,160719),
    'stjorn4': (156943,160719)
}

stjorn = dict()
for _work, _range in work_indices.items():
    if len(_range) == 2:
        stjorn[_work] = ' '.join(tokens[_range[0]:_range[1]])
    else:
        stjorn[_work] = ' '.join(tokens[_range[0]:_range[1]] + tokens[_range[2]:])

menota = dict()
for text in glob.glob('../menota/dipl/*txt'):
    ref = os.path.basename(text).replace('.txt', '')
    with open(text) as doc:
        menota[ref] = doc.read().replace('\n', '')

In [55]:
vectorizer = TfidfVectorizer(min_df=1)
model = vectorizer.fit_transform(stjorn.values())
df = pd.DataFrame(cosine_similarity(model), stjorn.keys(), stjorn.keys())
df

Unnamed: 0,stjorn1,stjorn2,stjorn3,stjorn4
stjorn1,1.0,0.811713,0.508459,0.842683
stjorn2,0.811713,1.0,0.394915,0.833537
stjorn3,0.508459,0.394915,1.0,0.46975
stjorn4,0.842683,0.833537,0.46975,1.0


This tells us _Stjórn III_ is the most distinct of the three. Remarkably, compared to _Stjórn IV_, which covers some of the same ground, it is the least similar constituent text. Perhaps further analysis can tell us how.

First let's add _Konungs skuggsjá_ from Menota, as well as Unger's own edition of the _Norwegian Homily Book_. Fingers crossed that we have got the normalization standard of the former to approach Unger's methods reasonably well.

In [56]:
nhb = ''
for text in glob.glob('../nhb/nlp/*txt'):
    with open(text) as doc:
        nhb = nhb + doc.read().replace('\n', '')
stjorn_plus = []
for v in stjorn.values():
    stjorn_plus.append(v)
stjorn_plus.extend([menota['nks235g_konungs_skuggsja'], nhb])
model = vectorizer.fit_transform(stjorn_plus)
df = pd.DataFrame(cosine_similarity(model), list(stjorn.keys()) + ['ks', 'nhb'], list(stjorn.keys()) + ['ks', 'nhb'])
df

Unnamed: 0,stjorn1,stjorn2,stjorn3,stjorn4,ks,nhb
stjorn1,1.0,0.786536,0.523933,0.812035,0.242526,0.387017
stjorn2,0.786536,1.0,0.402757,0.798033,0.228456,0.330674
stjorn3,0.523933,0.402757,1.0,0.474972,0.696958,0.828841
stjorn4,0.812035,0.798033,0.474972,1.0,0.24424,0.38524
ks,0.242526,0.228456,0.696958,0.24424,1.0,0.711172
nhb,0.387017,0.330674,0.828841,0.38524,0.711172,1.0


Next, let's model all of Menota along with Stjórn. Perhaps we'll leave Unger's _Homily Book_ in alongside the Menota edition, just for comparison's sake.

In [57]:
corpus = []
titles = []
for k,v in stjorn.items():
    titles.append(k)
    corpus.append(v)
titles.append('nhb')
corpus.append(nhb)
for k,v in menota.items():
    titles.append(k)
    corpus.append(v)
model = vectorizer.fit_transform(corpus)
df = pd.DataFrame(cosine_similarity(model), titles, titles)
df

Unnamed: 0,stjorn1,stjorn2,stjorn3,stjorn4,nhb,am132_egils_saga,am162btheta_njals_saga,holmPerg30_langslog,am1056IX_konungs_skuggsja_fragment,am78_kristinrettir,...,am132_hallfredar_saga,am35_heimskringla1,am242_codex_wormianus,dg8II_olafs_saga,am28_codex_runicus,holmPerg34_boejarlog,dg8I_landslog,nraNorrFragm55A_hakonar_saga,nraNorrFragm52_olafs_saga_helga_hin_elzta,holmPerg6_barlaams_saga
stjorn1,1.0,0.73548,0.438539,0.707385,0.294787,0.708617,0.272811,0.446772,0.436763,0.176707,...,0.625671,0.244285,0.749087,0.2295,0.297472,0.587034,0.55046,0.407644,0.18501,0.241481
stjorn2,0.73548,1.0,0.336237,0.679995,0.240394,0.655126,0.21432,0.455732,0.394065,0.176005,...,0.593818,0.202023,0.667435,0.207135,0.266576,0.606945,0.543774,0.345333,0.166041,0.215892
stjorn3,0.438539,0.336237,1.0,0.377355,0.784795,0.422176,0.725551,0.250823,0.225803,0.56488,...,0.357697,0.718933,0.429172,0.750518,0.115969,0.329024,0.319228,0.551919,0.613433,0.811443
stjorn4,0.707385,0.679995,0.377355,1.0,0.259327,0.590141,0.235221,0.383132,0.336647,0.162133,...,0.511993,0.214786,0.602725,0.201968,0.228585,0.486446,0.465006,0.352068,0.163619,0.193018
nhb,0.294787,0.240394,0.784795,0.259327,1.0,0.347112,0.64049,0.313746,0.22571,0.64642,...,0.27243,0.67725,0.394676,0.77859,0.080608,0.365497,0.375388,0.430705,0.61373,0.807097
am132_egils_saga,0.708617,0.655126,0.422176,0.590141,0.347112,1.0,0.342283,0.508724,0.435276,0.261079,...,0.761098,0.361849,0.777119,0.351744,0.269242,0.681395,0.580578,0.547821,0.274086,0.318465
am162btheta_njals_saga,0.272811,0.21432,0.725551,0.235221,0.64049,0.342283,1.0,0.174959,0.136584,0.473842,...,0.290386,0.634953,0.277926,0.636015,0.067668,0.268394,0.201655,0.459958,0.555417,0.693643
holmPerg30_langslog,0.446772,0.455732,0.250823,0.383132,0.313746,0.508724,0.174959,1.0,0.355087,0.460249,...,0.429523,0.204895,0.555129,0.279685,0.303609,0.541531,0.600896,0.260426,0.165773,0.236864
am1056IX_konungs_skuggsja_fragment,0.436763,0.394065,0.225803,0.336647,0.22571,0.435276,0.136584,0.355087,1.0,0.190931,...,0.378282,0.160909,0.511174,0.177149,0.166063,0.435982,0.38334,0.202544,0.142472,0.185208
am78_kristinrettir,0.176707,0.176005,0.56488,0.162133,0.64642,0.261079,0.473842,0.460249,0.190931,1.0,...,0.197623,0.542809,0.304146,0.638254,0.181734,0.390656,0.566068,0.345317,0.490918,0.658486


That's not a good sign, two editions of the _Norwegian Homily Book_ doing no better than `0.54` similarity...