# _Stjórn_ Plaintext Corpus Generation

The source HTML is downloaded from [heimskringla.no](https://heimskringla.no/wiki/Stj%C3%B3rn), whose text follows the edition of [Unger 1862](https://archive.org/details/stjorngammelnors00unge/page/n5/mode/2up); Unger in turn primarily followed the manuscript AM 226.

In [1]:
import os,re,json,copy
from urllib.request import urlretrieve
from pathlib import Path
from bs4 import BeautifulSoup, Comment

# Create the output directories used by the later cells. Existing
# directories are left untouched (exist_ok=True).
for stage_dir in ("raw", "clean", "plaintext", "nlp"):
    Path(stage_dir).mkdir(exist_ok=True)

# The first split directory is created with parents=True so that the
# enclosing "split" directory comes into existence along with it; the
# remaining two can then be created directly inside it.
split_root = Path("split")
(split_root / "commentary").mkdir(parents=True, exist_ok=True)
for split_kind in ("bible", "unmarked"):
    (split_root / split_kind).mkdir(exist_ok=True)

In [2]:
# Short book identifier -> wiki page holding that part of the text.
# The identifier doubles as the base name of the downloaded file.
remote = {
    'prologue': 'https://heimskringla.no/wiki/Prolog_(Stj%C3%B3rn)',
    'introduction': 'https://heimskringla.no/wiki/Indledning_(Stj%C3%B3rn)',
    'gn': 'https://heimskringla.no/wiki/I._Mosebog',
    'ex': 'https://heimskringla.no/wiki/II._Mosebog',
    'lv': 'https://heimskringla.no/wiki/III._Mosebog',
    'nm': 'https://heimskringla.no/wiki/IV._Mosebog',
    'dt': 'https://heimskringla.no/wiki/V._Mosebog',
    'ios': 'https://heimskringla.no/wiki/Josv%C3%A6_Bog',
    'idc': 'https://heimskringla.no/wiki/Dommernes_Bog',
    'rt': 'https://heimskringla.no/wiki/Ruths_Bog',
    '1sm': 'https://heimskringla.no/wiki/I._Samuels_Bog',
    '2sm': 'https://heimskringla.no/wiki/II._Samuels_Bog',
    '3rg': 'https://heimskringla.no/wiki/I._Kongernes_Bog',
    '4rg': 'https://heimskringla.no/wiki/II._Kongernes_Bog',
}

# Download each page into raw/<identifier>.html, skipping any page for
# which a non-empty local copy already exists.
path = 'raw/'
for title, url in remote.items():
    local = os.path.join(path, title) + '.html'
    already_cached = os.path.exists(local) and os.path.getsize(local) > 0
    if already_cached:
        continue
    urlretrieve(url, local)

In [3]:
# TODO: normalize remaining long vowels?

# Book identifiers in processing order; these are the same identifiers
# (and the same order) as the keys of `remote`.
documents = [
    'prologue', 'introduction',
    'gn', 'ex', 'lv', 'nm', 'dt',
    'ios', 'idc', 'rt',
    '1sm', '2sm', '3rg', '4rg',
]

def normalize(target):
    """Return *target* with the orthographic substitutions below applied.

    Every occurrence of each left-hand string is replaced by its
    right-hand counterpart. The substitutions are applied one after
    another, in the order listed.
    """
    substitutions = (
        ('j', 'i'),
        ('v', 'u'),
        ('ę', 'æ'),
        ('ẻ', 'æ'),
        ('ỏ', 'ǫ'),
        ('đ', 'ð'),
        ('afþeirri', 'af þeirri'),  # an error in the HTML of 1 Samuel
    )
    result = target
    for old, new in substitutions:
        result = result.replace(old, new)
    return result

# Load the rubric lists used later to sort bracketed passages into
# "bible" and "commentary". JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's locale encoding.
with open('rubrics.json', encoding='utf-8') as json_data:
    rubrics = json.load(json_data)

In [4]:
# ---------------------------------------------------------------------------
# Per-book pipeline: raw HTML -> cleaned HTML -> plaintext -> NLP text, and
# (for Genesis and Exodus only) a split into Bible / commentary / unmarked.
#
# All files are read and written as UTF-8 explicitly, so the result does not
# depend on the platform's locale encoding.
# ---------------------------------------------------------------------------

# Since heimskringla.no lacks a class name for the main text that we could
# select for, we'll just nuke all the unwanted nodes instead.
unwanted_elements = ['title', 'script', 'meta', 'link', 'center', 'a', 'sup', 'table']
# Multi-class values such as "thumb tright" cannot be written as a single
# ".a b" selector (that is a descendant selector), so those are handled by
# the class-prefix selectors below.
unwanted_classes = ['.mw-references-wrap', '.printfooter', '.catlinks', '.visualClear', '.toccolours', '.thumbinner', '.thumbcaption', '.magnify']
unwanted_class_prefixes = ['toclimit', 'thumb', 'mw-indicators']
unwanted_ids = ['mw-page-base', 'mw-head-base', 'mw-navigation', 'toc', 'siteSub', 'contentSub', 'jump-to-nav', 'mw-parser-output', 'firstHeading', 'footer', 'footer-info-lastmod']
# (tag name, exact text) pairs for editorial leftovers that carry no class or id.
unwanted_text_nodes = [
    ('b', 'Fotnoter'),
    ('b', 'Fotnoter:'),
    ('i', 'Innskudd fra Andre Krønikebok, kapittel 20.'),
    ('p', '()'),
]
# Rubrics and chapter numbers, removed to leave only the text for NLP
# evaluation. Drop 'span' from this list to reintroduce chapter headings!
last_unwanted_elements = ['b', 'span']

# One-pass character table for the NLP text: punctuation is deleted, and the
# curly braces wrapped around <i> content become square brackets. Doing this
# in a single pass keeps the brackets that come from the braces while still
# deleting any square brackets that were already in the text.
unwanted_chars = ['.', ':', ';', '?', '!', '[', ']', '(', ')']
punctuation_table = str.maketrans('{}', '[]', ''.join(unwanted_chars))

# A bracketed source reference at the very start of a line.
source_ref_pattern = re.compile(r'\[([^]]*)\]')

for book in documents:
    raw_html = 'raw/' + book + '.html'
    clean_html = 'clean/' + book + '.html'
    text_file = 'plaintext/' + book + '.txt'
    nlp_file = 'nlp/' + book + '.txt'
    commentary_file = 'split/commentary/' + book + '.txt'
    bible_file = 'split/bible/' + book + '.txt'
    unmarked_file = 'split/unmarked/' + book + '.txt'
    commentary = []
    bible = []
    unmarked = []

    # The document is parsed while the file is open; nothing below needs the
    # handle any more.
    with open(raw_html, encoding='utf-8') as html_doc:
        soup = BeautifulSoup(html_doc, 'html.parser')

    # --- Strip everything that is not main text -----------------------------
    for element in unwanted_elements:
        for match in soup.find_all(element):
            match.decompose()
    for attr_class in unwanted_classes:
        for match in soup.select(attr_class):
            match.decompose()
    for prefix in unwanted_class_prefixes:
        for match in soup.select('div[class^=' + prefix + ']'):
            match.decompose()
    for identifier in unwanted_ids:
        for match in soup.find_all(id=identifier):
            match.decompose()
    for tag_name, text in unwanted_text_nodes:
        for match in soup.find_all(tag_name, string=text):
            match.decompose()

    # Wrap the content of every remaining <i> in braces so that it is still
    # recognisable after the markup is gone; the braces become the square
    # brackets that the Bible/commentary split below keys on.
    for match in soup.find_all('i'):
        match.insert(0, '{')
        match.append('}')

    # Remove HTML comments found inside <div> elements.
    for div in soup.find_all('div'):
        for element in div(string=lambda string: isinstance(string, Comment)):
            element.extract()

    # Guarded so that a document without a <head> does not abort the run.
    if soup.head is not None:
        soup.head.clear()

    with open(clean_html, 'w', encoding='utf-8') as file:
        file.write(str(soup))

    # --- Plaintext ----------------------------------------------------------
    for element in last_unwanted_elements:
        for match in soup.find_all(element):
            match.decompose()
    with open(text_file, 'w', encoding='utf-8') as file:
        file.write(soup.get_text())

    # --- NLP text: no punctuation, lower case, normalised, no blank lines ---
    # Now reopen as plaintext to work line by line.
    with open(text_file, encoding='utf-8') as file:
        plaintext = file.readlines()
    flattened = []
    for line in plaintext:
        line = line.translate(punctuation_table)
        # split()/join collapses runs of whitespace and trims both ends.
        line = normalize(' '.join(line.lower().split()))
        if line:
            flattened.append(line + '\n')
    with open(nlp_file, 'w', encoding='utf-8') as f:
        f.writelines(flattened)

    # --- Split into Bible, commentary, unmarked -----------------------------
    # (only Gn and Ex have such references)
    if book in ('gn', 'ex'):
        # Start a new line at every opening bracket so that each reference
        # sits at the beginning of its own line.
        split_document = ''.join(flattened).replace('[', '\n[')
        for line in split_document.split('\n'):
            if not line:
                continue
            source_ref = source_ref_pattern.match(line)
            if source_ref is None:
                unmarked.append(line + '\n')
                continue
            reference = source_ref.group(1)
            # "[reference] text" becomes "reference: text".
            labelled = line.replace(']', ':').strip('[') + '\n'
            if reference in rubrics['bible']:
                bible.append(labelled)
            elif reference in rubrics['commentary']:
                commentary.append(labelled)
            # NOTE(review): a bracketed line whose reference is in neither
            # rubric set is dropped from all three outputs -- confirm that
            # this is intended.
        with open(bible_file, 'w', encoding='utf-8') as f:
            f.writelines(bible)
        with open(commentary_file, 'w', encoding='utf-8') as f:
            f.writelines(commentary)
        with open(unmarked_file, 'w', encoding='utf-8') as f:
            f.writelines(unmarked)