# TF-IDF

We'll rely on term frequency times inverted document frequency to measure meaningful similarity between documents. Let's start by generating a matrix for the separate constituent parts of _Stjórn_.

In [1]:
import os,glob,json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def normalize(target):
    # This dict further standardizes the text beyond the rule set of stjorn-extract.ipynb:
    matrix = {
        'j': 'i',
        'v': 'u',
        'ð': 'þ',
        'á': 'a',
        'ǽ': 'æ',
        'é': 'e',
        'í': 'i',
        'ó': 'o',
        'ú': 'u',
        'ý': 'y',
        'ǿ': 'ø',
        'k': 'c',
        '[': '',
        ']': ''
        }
    for k,v in matrix.items():
        target = target.replace(k, v)
    return target

titles = ['prologue', 'introduction', 'gn', 'ex', 'lv', 'nm', 'dt', 'ios', 'idc', 'rt', '1sm', '2sm', '3rg', '4rg']
tokens = []
for title in titles:
    with open(f"nlp/{title}.txt") as raw:
        document = raw.read().replace('\n', ' ')
        tokens.extend(document.split())

work_indices = {
    'stjorn1': (650,124417),
    'stjorn2': (124417,147678),
    'stjorn3': (147678,156943,160719),
    'stjorn4': (156943,160719)
}

stjorn = dict()
for _work, _range in work_indices.items():
    if len(_range) == 2:
        stjorn[_work] = normalize(' '.join(tokens[_range[0]:_range[1]]))
    else:
        stjorn[_work] = normalize(' '.join(tokens[_range[0]:_range[1]] + tokens[_range[2]:]))

menota = dict()
for text in glob.glob('../menota/dipl/*txt'):
    ref = os.path.basename(text).replace('.txt', '')
    with open(text) as doc:
        # NB for present purposes I'm subjecting Menota to the same normalization standard as Stjórn:
        menota[ref] = normalize(doc.read().replace('\n', ''))

In [3]:
vectorizer = TfidfVectorizer(min_df=1)
model = vectorizer.fit_transform(stjorn.values())
df = pd.DataFrame(cosine_similarity(model), stjorn.keys(), stjorn.keys())
df

Unnamed: 0,stjorn1,stjorn2,stjorn3,stjorn4
stjorn1,1.0,0.811733,0.90377,0.842752
stjorn2,0.811733,1.0,0.797618,0.833766
stjorn3,0.90377,0.797618,1.0,0.835339
stjorn4,0.842752,0.833766,0.835339,1.0


After eliminating vowel length and the þ/ð distinction, these are now all pretty similar to one another, with the biggest difference between _Stjórn II_ and _III_.

Now let's first add _Konungs skuggsjá_ from Menota, as well as Unger's own edition of the _Norwegian Homily Book_. Fingers crossed that we have got the normalization standard of the former to approach Unger's methods reasonably well.

In [4]:
nhb_titles = ['alcuin', 'hom', 'olafr', 'visio', 'paternoster', 'anhang1'] # this is the sequence matched in Menota
nhb = ''
for title in nhb_titles:
    filepath = f'../nhb/nlp/{title}.txt'
    with open(filepath) as doc:
        nhb = nhb + doc.read().replace('\n', '')
stjorn_plus = []
for v in stjorn.values():
    stjorn_plus.append(v)
stjorn_plus.extend([menota['nks235g_konungs_skuggsja'], nhb])
model = vectorizer.fit_transform(stjorn_plus)
df = pd.DataFrame(cosine_similarity(model), list(stjorn.keys()) + ['ks', 'nhb'], list(stjorn.keys()) + ['ks', 'nhb'])
df

Unnamed: 0,stjorn1,stjorn2,stjorn3,stjorn4,ks,nhb
stjorn1,1.0,0.794566,0.883015,0.812126,0.705122,0.711161
stjorn2,0.794566,1.0,0.77382,0.803064,0.690873,0.671303
stjorn3,0.883015,0.77382,1.0,0.797769,0.739994,0.740641
stjorn4,0.812126,0.803064,0.797769,1.0,0.656851,0.677968
ks,0.705122,0.690873,0.739994,0.656851,1.0,0.664858
nhb,0.711161,0.671303,0.740641,0.677968,0.664858,1.0


_Stjórn III_ and _Konungs skuggsjá_ share material cognate within the vernacular, so this connection standing out as the strongest between _Konungs skuggsjá_ and the constituent parts of _Stjórn_ comes as no surprise; if anything, the difference in match with the other parts is rather small. In fact, the _Norwegian Homily Book_ has a higher match with _Stjórn III_ than _Konungs skuggsjá_ does.

Next, let's model all of Menota along with Stjórn. Perhaps we'll leave Unger's _Homily Book_ in alongside the Menota edition, just for comparison's sake.

In [5]:
corpus = []
titles = []
for k,v in stjorn.items():
    titles.append(k)
    corpus.append(v)
titles.append('nhb')
corpus.append(nhb)
for k,v in menota.items():
    titles.append(k)
    corpus.append(v)
model = vectorizer.fit_transform(corpus)
df = pd.DataFrame(cosine_similarity(model), titles, titles)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
df

Unnamed: 0,stjorn1,stjorn2,stjorn3,stjorn4,nhb,nraNorrFragm75_kross_saga,am132_egils_saga,am162btheta_njals_saga,nraNorrFragm64_barlaams_saga,nraNorrFragm81A_benedikts_regla,am1056IX_konungs_skuggsja_fragment,am78_kristinrettir,am63_heimskringla3,dg4-7_strengleikar,am132_droplaugasona_saga,am132_kormaks_saga,nraNorrFragm72x76_dialogar,nraNorrFragm53_haralds_saga_hardrada,am132_finnboga_saga,nraNorrFragm70_agotu_saga,nraNorrFragm62_karlamagnuss_saga,nraNorrFragm60A_stjorn,am132_fostbraedra_saga,lbsFragm82_olafs_saga_helga,nraNorrFragm58B_konungs_skuggsja,nraNorrFragm60C_stjorn,holmPerg30_landslog,am619_norwegian_homily_book,nhb_am619,nraNorrFragm57_jons_saga_helga,nraNorrFragm69_nikulass_saga,am56_landslog,wolfAug9-10_egils_saga,nraNorrFragm66_thomass_saga,holmPerg17_thomass_saga,am383I_thorlaks_saga,holmPerg4_thidreks_saga,am132_njals_saga,am36_heimskringla2,am544_voluspa,am162bkappa_njals_saga,am305_landslog,nraNorrFragm58C_konungs_skuggsja,am132_olkofra_thattr,konungs_skuggsja_am243ba,nraNorrFragm54_sverris_saga,nraNorrFragm55B_hakonar_saga,nraNorrFragm79_mariu_saga,gks2365_voluspa,am243balpha_konungs_skuggsja,nraNorrFragm51_fagrskinna,am132_viga-glums_saga,am279a_gragas,am677_gregory,am132_laxdoela_saga,am302_landslog,am178_thidreks_saga,nraNorrFragm81B_benedikts_regla,am132_bandamanna_saga,nraNorrFragm71_gregors_saga_pafa,gregory_am677,am655_laeknisbok,am519a_alexanders_saga,holmPerg34_landslog,am162balpha_njals_saga,am113_islendingabok,nraNorrFragm7_landslog,nraNorrFragm67_thomass_saga,nraNorrFragm56_thorgils_saga,nks235g_konungs_skuggsja,am132_hallfredar_saga,am35_heimskringla1,am242_codex_wormianus,nraNorrFragm78_mariu_saga,konungs_skuggsja_fragment_am1056xi,dg8II_olafs_saga,nraNorrFragm80_pals_saga,nraNorrFragm63_karlamagnuss_saga,nraNorrFragm77_dialogar,am28_codex_runicus,holmPerg34_boejarlog,dg8I_landslog,nraNorrFragm60B_stjorn,nraNorrFragm55A_hakonar_saga,skbA120_marys_complaint,nraNorrFragm59_rimbegla,nraNorrFragm65_floress_saga,nraNorrFragm52_olafs_saga_helga_hin_elzta,holmPerg6_barlaams_saga,nraNorrFragm68_brendanuss_saga,nraNorrFragm61_karlamagnuss_saga,nraNorrFragm58A_konungs_skuggsja
stjorn1,1.0,0.675326,0.782961,0.649681,0.492555,0.25713,0.672272,0.571469,0.499756,0.222804,0.368045,0.420957,0.649526,0.728539,0.58155,0.47637,0.493932,0.391121,0.575159,0.23319,0.597929,0.356609,0.627396,0.573349,0.473806,0.484693,0.361864,0.631456,0.631456,0.605015,0.623539,0.423872,0.651883,0.601887,0.628798,0.423834,0.690073,0.675837,0.669838,0.310528,0.519057,0.41294,0.560838,0.506388,0.554455,0.668081,0.431882,0.444717,0.328786,0.554455,0.385541,0.655222,0.341152,0.512041,0.661015,0.401738,0.149348,0.356142,0.600724,0.367715,0.512041,0.455764,0.66286,0.553683,0.497843,0.366937,0.425181,0.669123,0.465248,0.545201,0.55181,0.662937,0.71005,0.637883,0.368045,0.617452,0.610302,0.641282,0.59319,0.227007,0.519909,0.46997,0.293146,0.511849,0.195335,0.432353,0.453836,0.605947,0.725827,0.48811,0.485022,0.582661
stjorn2,0.675326,1.0,0.653911,0.628009,0.437318,0.252721,0.603981,0.493013,0.458417,0.198335,0.325308,0.412751,0.56741,0.627519,0.540143,0.438286,0.442036,0.342971,0.50049,0.214354,0.54442,0.280095,0.564592,0.507353,0.431038,0.390873,0.373688,0.574865,0.574865,0.529469,0.518215,0.411041,0.591957,0.488402,0.463665,0.341695,0.641929,0.622266,0.577102,0.285181,0.484935,0.399024,0.527441,0.48217,0.503279,0.615997,0.404818,0.3913,0.312032,0.503279,0.357702,0.596252,0.313254,0.478181,0.598276,0.389665,0.171301,0.325215,0.558047,0.347309,0.478181,0.425372,0.577992,0.563036,0.478668,0.309285,0.412338,0.579205,0.382742,0.506668,0.515591,0.572766,0.608345,0.559135,0.325308,0.556403,0.540826,0.603391,0.533631,0.197242,0.54058,0.464683,0.28011,0.433989,0.165237,0.412673,0.421713,0.557471,0.667936,0.444594,0.457095,0.539131
stjorn3,0.782961,0.653911,1.0,0.617558,0.558765,0.283557,0.782888,0.650656,0.557949,0.26646,0.404185,0.492523,0.738019,0.817113,0.672016,0.549874,0.562407,0.476364,0.649489,0.273746,0.686655,0.348187,0.700427,0.691777,0.504335,0.590596,0.437523,0.745472,0.745472,0.685471,0.685769,0.500364,0.769609,0.642496,0.665807,0.465623,0.810513,0.778592,0.767622,0.343643,0.560275,0.46266,0.619054,0.606494,0.66589,0.743155,0.487555,0.508651,0.366842,0.66589,0.414449,0.750953,0.385062,0.563897,0.764406,0.453766,0.174372,0.434065,0.693949,0.439126,0.563897,0.49125,0.762975,0.609893,0.523808,0.411347,0.513291,0.752965,0.533745,0.595866,0.649572,0.747001,0.788836,0.706231,0.404185,0.735,0.675925,0.680056,0.671156,0.240285,0.571143,0.533519,0.360541,0.592709,0.208276,0.432328,0.501336,0.69396,0.811898,0.567626,0.571121,0.657205
stjorn4,0.649681,0.628009,0.617558,1.0,0.390793,0.230024,0.558791,0.447011,0.375514,0.180816,0.272291,0.350413,0.519426,0.544644,0.483863,0.382978,0.399617,0.320178,0.439559,0.173297,0.466017,0.233238,0.508276,0.466593,0.355954,0.343148,0.319678,0.506332,0.506332,0.46246,0.458896,0.351196,0.552121,0.421244,0.408523,0.288019,0.536428,0.544297,0.533058,0.253038,0.440678,0.327965,0.457137,0.432359,0.449888,0.545974,0.3641,0.32932,0.274337,0.449888,0.296037,0.510786,0.309309,0.403086,0.527954,0.323849,0.132732,0.284976,0.470232,0.325042,0.403086,0.348236,0.511213,0.439136,0.410324,0.281738,0.349832,0.503598,0.346979,0.416537,0.449239,0.5239,0.551937,0.488134,0.272291,0.482204,0.476737,0.51289,0.476818,0.164197,0.418249,0.396431,0.226308,0.421251,0.139214,0.339031,0.33795,0.480006,0.541448,0.410791,0.37618,0.462136
nhb,0.492555,0.437318,0.558765,0.390793,1.0,0.25037,0.529563,0.419653,0.422384,0.242638,0.316659,0.458792,0.481598,0.580994,0.445391,0.370206,0.448593,0.276967,0.429292,0.213858,0.443486,0.23721,0.474513,0.439375,0.333576,0.341982,0.406192,0.69059,0.69059,0.484981,0.482316,0.466477,0.526099,0.452259,0.505967,0.323892,0.563169,0.518305,0.511633,0.215545,0.394203,0.418424,0.512864,0.428579,0.531002,0.490732,0.340248,0.392438,0.255002,0.531002,0.315468,0.519681,0.316827,0.436901,0.516215,0.408762,0.121408,0.400005,0.494907,0.36807,0.436901,0.354081,0.56779,0.512687,0.348563,0.303465,0.460969,0.546691,0.343488,0.448801,0.423848,0.497096,0.584292,0.504861,0.316659,0.576935,0.51689,0.471516,0.540531,0.162906,0.475238,0.486189,0.205181,0.347648,0.13651,0.340093,0.375886,0.491818,0.588247,0.371526,0.378979,0.541331
nraNorrFragm75_kross_saga,0.25713,0.252721,0.283557,0.230024,0.25037,1.0,0.303282,0.217655,0.206812,0.145269,0.158253,0.242856,0.277544,0.329805,0.270306,0.209865,0.255272,0.161322,0.224027,0.122291,0.231194,0.115309,0.278778,0.253076,0.177322,0.171645,0.217199,0.336862,0.336862,0.266421,0.234196,0.248525,0.288795,0.256137,0.233212,0.151978,0.287453,0.274811,0.290538,0.148519,0.202691,0.228923,0.307838,0.242924,0.262155,0.270159,0.199027,0.26225,0.190286,0.262155,0.16608,0.273967,0.188115,0.255125,0.297257,0.23189,0.080014,0.186243,0.262946,0.227942,0.255125,0.209693,0.328672,0.311736,0.17541,0.152251,0.24855,0.306993,0.191313,0.242692,0.225686,0.287406,0.343457,0.271376,0.158253,0.277595,0.313046,0.249971,0.330551,0.077687,0.279851,0.252306,0.108267,0.189114,0.060095,0.189686,0.164869,0.271931,0.311882,0.215398,0.198951,0.32864
am132_egils_saga,0.672272,0.603981,0.782888,0.558791,0.529563,0.303282,1.0,0.65998,0.503681,0.261062,0.372197,0.477452,0.73902,0.769926,0.736655,0.590662,0.600116,0.520293,0.678839,0.240613,0.620659,0.294814,0.756492,0.707896,0.405397,0.5301,0.427044,0.706392,0.706392,0.692988,0.676718,0.481969,0.920563,0.596722,0.597011,0.47923,0.75436,0.823803,0.769062,0.316958,0.582949,0.458679,0.651446,0.665868,0.643542,0.720851,0.529272,0.493646,0.35576,0.643542,0.413727,0.798636,0.423542,0.574002,0.835433,0.456027,0.166017,0.414778,0.734677,0.468057,0.574002,0.473232,0.767571,0.648709,0.516319,0.447904,0.496093,0.715192,0.599626,0.567698,0.684922,0.767451,0.746594,0.682335,0.372197,0.70668,0.665528,0.667307,0.718844,0.217061,0.626272,0.506378,0.311251,0.635005,0.186482,0.413734,0.501741,0.712803,0.758606,0.54186,0.559642,0.675131
am162btheta_njals_saga,0.571469,0.493013,0.650656,0.447011,0.419653,0.217655,0.65998,1.0,0.431071,0.196653,0.294424,0.362027,0.592878,0.631393,0.585718,0.472721,0.45055,0.373613,0.546452,0.204761,0.519276,0.243402,0.61687,0.53034,0.354018,0.439778,0.327893,0.548293,0.548293,0.56093,0.572465,0.363074,0.63643,0.48387,0.474993,0.393855,0.627655,0.729871,0.624168,0.251901,0.526338,0.346061,0.50261,0.528245,0.52148,0.580575,0.400476,0.385057,0.287277,0.52148,0.318073,0.653963,0.325631,0.453661,0.650781,0.34374,0.126887,0.318141,0.600105,0.347382,0.453661,0.383825,0.593619,0.507621,0.445859,0.347751,0.369888,0.585551,0.431796,0.453915,0.549632,0.601277,0.579202,0.546256,0.294424,0.56891,0.525806,0.542516,0.563899,0.183477,0.485885,0.384965,0.264859,0.476788,0.158335,0.333822,0.429674,0.564422,0.631728,0.456856,0.472638,0.529305
nraNorrFragm64_barlaams_saga,0.499756,0.458417,0.557949,0.375514,0.422384,0.206812,0.503681,0.431071,1.0,0.190056,0.279519,0.443677,0.464759,0.573755,0.45635,0.370886,0.367645,0.281864,0.422122,0.203221,0.474148,0.271939,0.458281,0.402595,0.282921,0.319866,0.368883,0.569717,0.569717,0.462881,0.430806,0.423985,0.513249,0.41294,0.448081,0.288484,0.587888,0.513285,0.471944,0.233104,0.391669,0.38839,0.468639,0.387784,0.475672,0.484603,0.339122,0.353323,0.264592,0.475672,0.306593,0.500336,0.264233,0.392349,0.495322,0.394058,0.087115,0.331735,0.473583,0.301133,0.392349,0.383496,0.491899,0.451208,0.354904,0.288748,0.395834,0.510744,0.285193,0.476401,0.422493,0.486132,0.529193,0.467667,0.279519,0.540005,0.415125,0.515098,0.440329,0.201334,0.434084,0.440181,0.220573,0.385561,0.165122,0.358752,0.392722,0.472484,0.605972,0.366629,0.417735,0.477016
nraNorrFragm81A_benedikts_regla,0.222804,0.198335,0.26646,0.180816,0.242638,0.145269,0.261062,0.196653,0.190056,1.0,0.184843,0.39511,0.2332,0.268187,0.219859,0.181037,0.228347,0.135612,0.206423,0.19594,0.208547,0.109181,0.227492,0.21815,0.167977,0.161468,0.258852,0.318941,0.318941,0.242492,0.227697,0.417537,0.247488,0.227214,0.23845,0.159525,0.274941,0.251616,0.251508,0.1138,0.187491,0.41472,0.291895,0.227418,0.292566,0.253154,0.169353,0.211064,0.13951,0.292566,0.143247,0.25633,0.166777,0.20501,0.259156,0.408202,0.054675,0.242004,0.256496,0.188809,0.20501,0.188173,0.299104,0.28087,0.156302,0.153089,0.399235,0.275021,0.181093,0.245722,0.202252,0.237185,0.311528,0.235099,0.184843,0.294384,0.27693,0.220363,0.27762,0.099217,0.259121,0.390469,0.092743,0.154124,0.072166,0.173682,0.170374,0.238817,0.292575,0.16314,0.17559,0.311544


After substantial normalization, the two editions of NHB come to 0.69 similarity, which is still concerning, even if it is 15 percentage points better than before collapsing vowel length and þ/ð. There's probably Menota elements to disable yet, along the lines of `<sic>` and `<note>`. Really we expect a match in the 0.9-range.