# TF-IDF

We'll rely on term frequency times inverted document frequency to measure meaningful similarity between documents. Let's start by generating a matrix for the separate constituent parts of _Stjórn_.

In [1]:
import os,glob,json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def normalize(target):
    # This dict further standardizes the text beyond the rule set of stjorn-extract.ipynb:
    matrix = {
        'j': 'i',
        'v': 'u',
        'ð': 'þ',
        'á': 'a',
        'ǽ': 'æ',
        'é': 'e',
        'í': 'i',
        'ó': 'o',
        'ú': 'u',
        'ý': 'y',
        'ǿ': 'ø',
        'k': 'c',
        '[': '',
        ']': ''
        }
    for k,v in matrix.items():
        target = target.replace(k, v)
    return target

titles = ['prologue', 'introduction', 'gn', 'ex', 'lv', 'nm', 'dt', 'ios', 'idc', 'rt', '1sm', '2sm', '3rg', '4rg']
tokens = []
for title in titles:
    with open(f"nlp/{title}.txt") as raw:
        document = raw.read().replace('\n', ' ')
        tokens.extend(document.split())

work_indices = {
    'stjorn1': (650,124417),
    'stjorn2': (124417,147678),
    'stjorn3': (147678,156943,160719),
    'stjorn4': (156943,160719)
}

stjorn = dict()
for _work, _range in work_indices.items():
    if len(_range) == 2:
        stjorn[_work] = normalize(' '.join(tokens[_range[0]:_range[1]]))
    else:
        stjorn[_work] = normalize(' '.join(tokens[_range[0]:_range[1]] + tokens[_range[2]:]))

menota = dict()
for text in glob.glob('../menota/dipl/*txt'):
    ref = os.path.basename(text).replace('.txt', '')
    with open(text) as doc:
        # We'll subject Menota to the same normalization standard as Stjórn:
        menota[ref] = normalize(doc.read().replace('\n', ''))

In [3]:
vectorizer = TfidfVectorizer(min_df=1)
model = vectorizer.fit_transform(stjorn.values())
df = pd.DataFrame(cosine_similarity(model), stjorn.keys(), stjorn.keys())
df

Unnamed: 0,stjorn1,stjorn2,stjorn3,stjorn4
stjorn1,1.0,0.811733,0.90377,0.842752
stjorn2,0.811733,1.0,0.797618,0.833766
stjorn3,0.90377,0.797618,1.0,0.835339
stjorn4,0.842752,0.833766,0.835339,1.0


After eliminating vowel length and the þ/ð distinction, these are now all pretty similar to one another, with the biggest difference between _Stjórn II_ and _III_.

Now let's first add _Konungs skuggsjá_ from Menota, as well as Unger's own edition of the _Norwegian Homily Book_. Fingers crossed that we have got the normalization standard of the former to approach Unger's methods reasonably well.

In [4]:
# We want only those parts of Unger's NHB matched in Menota:
nhb_titles = ['alcuin', 'hom', 'olafr', 'visio', 'paternoster', 'anhang1']
nhb = ''
for title in nhb_titles:
    filepath = f'../nhb/nlp/{title}.txt'
    with open(filepath) as doc:
        nhb = nhb + normalize(doc.read().replace('\n', ''))
stjorn_plus = []
for v in stjorn.values():
    stjorn_plus.append(v)
stjorn_plus.extend([menota['nks235g_konungs_skuggsja'], nhb])
model = vectorizer.fit_transform(stjorn_plus)
df = pd.DataFrame(cosine_similarity(model), list(stjorn.keys()) + ['ks', 'nhb'], list(stjorn.keys()) + ['ks', 'nhb'])
df

Unnamed: 0,stjorn1,stjorn2,stjorn3,stjorn4,ks,nhb
stjorn1,1.0,0.794809,0.889323,0.812473,0.714273,0.755032
stjorn2,0.794809,1.0,0.779844,0.803534,0.700278,0.713765
stjorn3,0.889323,0.779844,1.0,0.803896,0.741964,0.830661
stjorn4,0.812473,0.803534,0.803896,1.0,0.665862,0.72318
ks,0.714273,0.700278,0.741964,0.665862,1.0,0.754185
nhb,0.755032,0.713765,0.830661,0.72318,0.754185,1.0


_Stjórn III_ and _Konungs skuggsjá_ share material cognate within the vernacular, so this connection standing out as the strongest between _Konungs skuggsjá_ and the constituent parts of _Stjórn_ comes as no surprise; if anything, the difference in match with the other parts is rather small. In fact, the _Norwegian Homily Book_ has a higher match with _Stjórn III_ than _Konungs skuggsjá_ does.

Next, let's model all of Menota along with Stjórn. Perhaps we'll leave Unger's _Homily Book_ in alongside the Menota edition, just for comparison's sake.

In [5]:
corpus = []
titles = []
for k,v in stjorn.items():
    titles.append(k)
    corpus.append(v)
titles.append('nhb')
corpus.append(nhb)
for k,v in menota.items():
    titles.append(k)
    corpus.append(v)
model = vectorizer.fit_transform(corpus)
df = pd.DataFrame(cosine_similarity(model), titles, titles)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
df

Unnamed: 0,stjorn1,stjorn2,stjorn3,stjorn4,nhb,nraNorrFragm75_kross_saga,am132_egils_saga,am162btheta_njals_saga,nraNorrFragm64_barlaams_saga,nraNorrFragm81A_benedikts_regla,am1056IX_konungs_skuggsja_fragment,am78_kristinrettir,am63_heimskringla3,dg4-7_strengleikar,am132_droplaugasona_saga,am132_kormaks_saga,nraNorrFragm72x76_dialogar,nraNorrFragm53_haralds_saga_hardrada,am132_finnboga_saga,nraNorrFragm70_agotu_saga,nraNorrFragm62_karlamagnuss_saga,nraNorrFragm60A_stjorn,am132_fostbraedra_saga,lbsFragm82_olafs_saga_helga,nraNorrFragm58B_konungs_skuggsja,nraNorrFragm60C_stjorn,holmPerg30_landslog,am619_norwegian_homily_book,nhb_am619,nraNorrFragm57_jons_saga_helga,nraNorrFragm69_nikulass_saga,am56_landslog,wolfAug9-10_egils_saga,nraNorrFragm66_thomass_saga,holmPerg17_thomass_saga,am383I_thorlaks_saga,holmPerg4_thidreks_saga,am132_njals_saga,am36_heimskringla2,am544_voluspa,am162bkappa_njals_saga,am305_landslog,nraNorrFragm58C_konungs_skuggsja,am132_olkofra_thattr,konungs_skuggsja_am243ba,nraNorrFragm54_sverris_saga,nraNorrFragm55B_hakonar_saga,nraNorrFragm79_mariu_saga,gks2365_voluspa,am243balpha_konungs_skuggsja,nraNorrFragm51_fagrskinna,am132_viga-glums_saga,am279a_gragas,am677_gregory,am132_laxdoela_saga,am302_landslog,am178_thidreks_saga,nraNorrFragm81B_benedikts_regla,am132_bandamanna_saga,nraNorrFragm71_gregors_saga_pafa,gregory_am677,am655_laeknisbok,am519a_alexanders_saga,holmPerg34_landslog,am162balpha_njals_saga,am113_islendingabok,nraNorrFragm7_landslog,nraNorrFragm67_thomass_saga,nraNorrFragm56_thorgils_saga,nks235g_konungs_skuggsja,am132_hallfredar_saga,am35_heimskringla1,am242_codex_wormianus,nraNorrFragm78_mariu_saga,konungs_skuggsja_fragment_am1056xi,dg8II_olafs_saga,nraNorrFragm80_pals_saga,nraNorrFragm63_karlamagnuss_saga,nraNorrFragm77_dialogar,am28_codex_runicus,holmPerg34_boejarlog,dg8I_landslog,nraNorrFragm60B_stjorn,nraNorrFragm55A_hakonar_saga,skbA120_marys_complaint,nraNorrFragm59_rimbegla,nraNorrFragm65_floress_saga,nraNorrFragm52_olafs_saga_helga_hin_elzta,holmPerg6_barlaams_saga,nraNorrFragm68_brendanuss_saga,nraNorrFragm61_karlamagnuss_saga,nraNorrFragm58A_konungs_skuggsja
stjorn1,1.0,0.675387,0.783339,0.649742,0.643941,0.257274,0.672527,0.571604,0.500302,0.222923,0.368637,0.421182,0.65111,0.728837,0.581742,0.47661,0.494489,0.391262,0.575325,0.233495,0.598066,0.357409,0.627533,0.573677,0.473834,0.484881,0.36204,0.632119,0.632119,0.605251,0.623642,0.424192,0.652068,0.602263,0.628994,0.424009,0.690261,0.676023,0.671307,0.310597,0.519099,0.413216,0.561085,0.506557,0.555367,0.668246,0.431973,0.444954,0.328881,0.555367,0.38615,0.65545,0.34124,0.5121,0.661238,0.401987,0.149346,0.356428,0.600877,0.36789,0.5121,0.456324,0.663204,0.554184,0.497871,0.36719,0.425358,0.669585,0.465424,0.545661,0.55214,0.664571,0.710394,0.638211,0.368637,0.618251,0.610497,0.64138,0.59338,0.227246,0.520304,0.470037,0.293207,0.512047,0.195796,0.432415,0.453895,0.606203,0.726365,0.488334,0.485173,0.583016
stjorn2,0.675387,1.0,0.6543,0.628096,0.573106,0.252889,0.604286,0.493187,0.458971,0.198467,0.325867,0.413019,0.56886,0.627843,0.540389,0.438563,0.442589,0.343132,0.500694,0.214664,0.544635,0.280767,0.564768,0.507675,0.431108,0.391072,0.373915,0.575543,0.575543,0.529738,0.518355,0.411399,0.592197,0.488766,0.463867,0.341887,0.642173,0.622512,0.578432,0.285286,0.485023,0.399338,0.527731,0.482388,0.504161,0.616213,0.404941,0.391499,0.312157,0.504161,0.358301,0.596529,0.31337,0.478295,0.598553,0.389952,0.171318,0.325514,0.558256,0.347516,0.478295,0.425943,0.578361,0.563611,0.478745,0.309535,0.412559,0.579681,0.382934,0.507153,0.515963,0.574245,0.608711,0.559493,0.325867,0.557188,0.541039,0.603547,0.533864,0.197462,0.541053,0.464794,0.280207,0.434213,0.165643,0.412782,0.42181,0.557772,0.66852,0.444856,0.457296,0.539522
stjorn3,0.783339,0.6543,1.0,0.617894,0.762637,0.283704,0.78307,0.650753,0.558589,0.266439,0.404821,0.492708,0.739716,0.817489,0.672118,0.550083,0.56312,0.476378,0.649573,0.274033,0.686795,0.348944,0.700605,0.692158,0.50459,0.590691,0.437722,0.745902,0.745902,0.68561,0.685983,0.500629,0.769761,0.642806,0.666143,0.465779,0.810757,0.778768,0.769229,0.343631,0.560584,0.462857,0.619328,0.606608,0.667085,0.743357,0.4876,0.50879,0.366986,0.667085,0.415298,0.751165,0.385216,0.564045,0.764573,0.453937,0.174459,0.43438,0.694104,0.439287,0.564045,0.491543,0.763251,0.610338,0.524106,0.411699,0.513517,0.753302,0.533901,0.596078,0.649902,0.748713,0.789271,0.706605,0.404821,0.735791,0.676145,0.68045,0.671427,0.240641,0.57151,0.533768,0.360625,0.592871,0.20886,0.432503,0.501551,0.694161,0.81222,0.567782,0.571314,0.657534
stjorn4,0.649742,0.628096,0.617894,1.0,0.513805,0.230179,0.559022,0.447134,0.375959,0.180933,0.272762,0.350643,0.520726,0.544924,0.484031,0.383179,0.400121,0.320327,0.439719,0.173545,0.466182,0.233793,0.508432,0.466889,0.356017,0.343311,0.31987,0.506916,0.506916,0.462655,0.45902,0.351501,0.5523,0.421557,0.408693,0.288173,0.536631,0.544482,0.534256,0.253099,0.440725,0.328221,0.457387,0.432524,0.45068,0.546167,0.36421,0.329521,0.274447,0.45068,0.296533,0.511,0.309426,0.403167,0.528178,0.324086,0.132745,0.285242,0.470404,0.325235,0.403167,0.348707,0.511536,0.439583,0.41039,0.281953,0.350018,0.504008,0.347152,0.416932,0.449526,0.525228,0.552263,0.488445,0.272762,0.482879,0.476938,0.513023,0.477009,0.164386,0.418617,0.396528,0.226389,0.421402,0.139558,0.339122,0.338029,0.480266,0.541918,0.411,0.37631,0.462472
nhb,0.643941,0.573106,0.762637,0.513805,1.0,0.337192,0.717739,0.565319,0.567094,0.335459,0.429224,0.624957,0.662927,0.78086,0.605047,0.503233,0.603109,0.382517,0.582185,0.294388,0.596494,0.323369,0.642046,0.598868,0.440414,0.464147,0.550581,0.976362,0.976362,0.657703,0.642461,0.635986,0.7083,0.622191,0.677756,0.444275,0.754626,0.701947,0.703404,0.292023,0.519597,0.572603,0.694329,0.582506,0.717411,0.661768,0.459571,0.533513,0.344703,0.717411,0.418853,0.701659,0.423092,0.590444,0.700561,0.559537,0.158522,0.534792,0.6653,0.497157,0.590444,0.488505,0.770285,0.700997,0.456216,0.406845,0.620311,0.760174,0.466316,0.614846,0.575012,0.683246,0.785245,0.691355,0.429224,0.793009,0.699008,0.620641,0.724561,0.214507,0.647117,0.640856,0.277292,0.471814,0.18168,0.447906,0.497818,0.664136,0.813248,0.511812,0.513196,0.736913
nraNorrFragm75_kross_saga,0.257274,0.252889,0.283704,0.230179,0.337192,1.0,0.303395,0.217684,0.207121,0.145317,0.158555,0.243036,0.278221,0.329998,0.270399,0.209976,0.255597,0.161348,0.22408,0.122503,0.231285,0.115639,0.278876,0.253248,0.177433,0.171697,0.217371,0.337262,0.337262,0.266531,0.234292,0.248753,0.288901,0.256341,0.233358,0.15206,0.287578,0.274895,0.2912,0.148608,0.202797,0.22911,0.308023,0.243013,0.262678,0.270271,0.19907,0.262447,0.190406,0.262678,0.166349,0.274071,0.188244,0.255247,0.297355,0.232066,0.080048,0.186419,0.263019,0.228101,0.255247,0.209847,0.328862,0.312086,0.175479,0.152391,0.248734,0.307281,0.191395,0.242928,0.225824,0.288116,0.343728,0.271625,0.158555,0.277908,0.313268,0.250082,0.330743,0.077812,0.280113,0.252454,0.108307,0.189158,0.060265,0.189785,0.164957,0.272054,0.312152,0.215444,0.199015,0.328909
am132_egils_saga,0.672527,0.604286,0.78307,0.559022,0.717739,0.303395,1.0,0.659926,0.504258,0.260971,0.372673,0.477493,0.740505,0.770168,0.73656,0.590667,0.600836,0.520166,0.678709,0.240849,0.620717,0.295586,0.756495,0.708119,0.405574,0.530092,0.427132,0.706959,0.706959,0.693072,0.676812,0.482081,0.920556,0.596984,0.597241,0.479174,0.754465,0.823766,0.770477,0.316994,0.583153,0.458753,0.651591,0.66577,0.644661,0.720877,0.529309,0.493764,0.3558,0.644661,0.414497,0.798609,0.423639,0.574108,0.835385,0.456084,0.166072,0.415009,0.734686,0.468204,0.574108,0.473281,0.767733,0.649038,0.516524,0.448177,0.496204,0.715521,0.599653,0.567967,0.685057,0.769043,0.746788,0.682716,0.372673,0.707301,0.665821,0.667545,0.719042,0.217371,0.62651,0.506553,0.311239,0.634887,0.18698,0.413854,0.501912,0.712861,0.758938,0.541945,0.559731,0.675319
am162btheta_njals_saga,0.571604,0.493187,0.650753,0.447134,0.565319,0.217684,0.659926,1.0,0.431538,0.196624,0.294788,0.36202,0.594079,0.631542,0.58563,0.472729,0.451051,0.373424,0.546365,0.204987,0.519341,0.244019,0.616835,0.530477,0.354123,0.439781,0.327951,0.548664,0.548664,0.560925,0.572489,0.363122,0.636319,0.484036,0.47515,0.393803,0.627671,0.729837,0.625303,0.251909,0.526471,0.346085,0.502687,0.528123,0.522353,0.580505,0.400427,0.385118,0.28729,0.522353,0.318631,0.653924,0.325676,0.453689,0.650699,0.343751,0.126909,0.318347,0.600036,0.347458,0.453689,0.383646,0.593684,0.50782,0.445972,0.347968,0.369912,0.58573,0.431647,0.454094,0.549712,0.602513,0.57933,0.546508,0.294788,0.569385,0.526001,0.542723,0.563981,0.183715,0.486009,0.38504,0.264808,0.476735,0.158732,0.33387,0.429765,0.564401,0.631994,0.456919,0.472682,0.529422
nraNorrFragm64_barlaams_saga,0.500302,0.458971,0.558589,0.375959,0.567094,0.207121,0.504258,0.431538,1.0,0.19022,0.280044,0.444167,0.466231,0.574427,0.45685,0.371386,0.368303,0.282153,0.422545,0.203316,0.474658,0.272693,0.458758,0.40316,0.283228,0.320174,0.369267,0.570566,0.570566,0.463398,0.431274,0.424512,0.513807,0.413439,0.448576,0.288733,0.588541,0.513846,0.473317,0.233336,0.392127,0.388822,0.469147,0.38818,0.476759,0.485109,0.339432,0.3537,0.26481,0.476759,0.307408,0.500916,0.264473,0.392734,0.495879,0.394485,0.087209,0.332258,0.474077,0.301512,0.392734,0.384263,0.492498,0.451806,0.355317,0.289182,0.39631,0.51132,0.285449,0.477045,0.423082,0.487696,0.529796,0.468191,0.280044,0.541109,0.415477,0.515704,0.440841,0.201747,0.434626,0.440683,0.220814,0.386019,0.165685,0.359146,0.393161,0.473056,0.606634,0.367074,0.418251,0.477546
nraNorrFragm81A_benedikts_regla,0.222923,0.198467,0.266439,0.180933,0.335459,0.145317,0.260971,0.196624,0.19022,1.0,0.184795,0.39497,0.23359,0.268211,0.219772,0.181035,0.228651,0.135542,0.206326,0.196138,0.208505,0.109421,0.2275,0.218226,0.16807,0.161384,0.258695,0.319068,0.319068,0.242467,0.227699,0.417568,0.247479,0.227193,0.238423,0.159465,0.274921,0.251557,0.251909,0.113796,0.18757,0.414716,0.291774,0.227377,0.292814,0.253144,0.169326,0.211064,0.139452,0.292814,0.143537,0.256295,0.166758,0.20502,0.259097,0.40817,0.054705,0.24204,0.256478,0.188814,0.20502,0.188203,0.299075,0.280734,0.156384,0.153123,0.399371,0.275029,0.181096,0.245569,0.202257,0.237594,0.311519,0.235191,0.184795,0.294523,0.276979,0.220331,0.277716,0.099354,0.258985,0.39067,0.092735,0.154009,0.072372,0.173703,0.170414,0.238773,0.292489,0.163146,0.175593,0.311465


The score for the two editions of the _Norwegian Homily Book_ may serve as our proof of method: these come to 0.97 similarity. As this compares an edition of Unger's with a Menota transcription, as does our comparison of _Stjórn_ with the remainder of the Menota corpus, we may be confident that the scores give a fair indication of lexical similarity.