# _Norwegian Homily Book_ Plaintext Corpus Generation

The source HTML is taken from [heimskringla.no](https://heimskringla.no/wiki/Gammel_norsk_Homiliebog), which follows the edition of [Unger 1864](https://archive.org/details/gammelnorskhomi00ungegoog). A recent transcription of this manuscript is available in [Menota](https://www.menota.org).

In [1]:
import os,re,json,copy
from urllib.request import urlretrieve
from pathlib import Path
from bs4 import BeautifulSoup, Comment

# One working directory per processing stage; existing directories are kept.
for directory in ("nhb/raw", "nhb/clean", "nhb/plaintext", "nhb/nlp"):
    Path(directory).mkdir(parents=True, exist_ok=True)

In [2]:
# Short names of the texts to process; each one doubles as the file stem
# used under the nhb/ working directories.
documents = [
    'alcuin',
    'hom',
    'olafr',
    'visio',
    'paternoster',
    'anhang1',
    'anhang2',
]

# TODO: normalize remaining long vowels?
# TODO: normalize ꜵ? Problem is it sometimes represents a, sometimes á, sometimes ǫ; so turn into a?
def normalize(target):
    """Return *target* with a fixed set of letter variants replaced.

    Each pair below is (character to find, character to put in its place).
    Only the lowercase forms listed here are touched; every other character
    is passed through unchanged.
    """
    substitutions = (
        ('j', 'i'),
        ('v', 'u'),
        ('ę', 'æ'),
        ('ẻ', 'æ'),
        ('ỏ', 'ǫ'),
        ('đ', 'ð'),
    )
    for variant, replacement in substitutions:
        target = target.replace(variant, replacement)
    return target

In [3]:
# Wiki page holding each text, keyed by the file stem it is saved under.
remote = dict(
    alcuin='https://heimskringla.no/wiki/Cve%C3%B0iusending_Alquini_diaconi',
    hom='https://heimskringla.no/wiki/Homilier',
    olafr='https://heimskringla.no/wiki/In_die_sancti_Olaui_regis_et_martiris',
    visio='https://heimskringla.no/wiki/Visio_sancti_Pauli_apostoli',
    paternoster='https://heimskringla.no/wiki/Fa%C3%B0er_var',
    anhang1='https://heimskringla.no/wiki/Anhang_I_(Gammel_norsk_Homiliebog)',
    anhang2='https://heimskringla.no/wiki/Anhang_II_(Gammel_norsk_Homiliebog)',
)

path = 'nhb/raw'
for title, url in remote.items():
    local = os.path.join(path, title + '.html')
    # A non-empty local copy counts as already downloaded; a missing or
    # zero-byte file is fetched (again).
    already_cached = os.path.exists(local) and os.path.getsize(local) > 0
    if already_cached:
        continue
    urlretrieve(url, local)

In [4]:
# Per-document pipeline: raw HTML -> cleaned HTML -> plaintext -> NLP text.
#
# Since heimskringla.no lacks a class name for the main text that we could
# select for, we just remove all the unwanted nodes instead.  The lists below
# do not change between documents, so they are built once up front.
unwanted_elements = ['title', 'script', 'meta', 'link', 'center', 'a', 'sup', 'table']
# NOTE: an entry containing a space is a CSS descendant selector, so it does
# not match an element that carries both classes.  The prefix selectors in
# unwanted_class_prefixes are what catch those divs.
unwanted_classes = ['.mw-references-wrap', '.printfooter', '.catlinks', '.visualClear', '.mw-indicators mw-body-content', '.toccolours', '.thumb tright', '.thumbinner', '.thumbcaption', '.magnify']
# Divs whose class attribute starts with one of these strings are removed.
unwanted_class_prefixes = ['toclimit', 'thumb', 'mw-indicators']
unwanted_ids = ['mw-page-base', 'mw-head-base', 'mw-navigation', 'toc', 'siteSub', 'contentSub', 'jump-to-nav', 'mw-parser-output', 'firstHeading', 'footer', 'footer-info-lastmod']
# Editorial labels and leftovers, matched on tag name plus exact text content.
unwanted_labels = [
    ('b', 'Fotnoter'),
    ('b', 'Fotnoter:'),
    ('i', 'Innskudd fra Andre Krønikebok, kapittel 20.'),
    ('p', '()'),
]
# Here is where rubrics and chapter numbers get deleted to leave only the
# text for NLP evaluation.  Drop 'span' to reintroduce chapter headings!
last_unwanted_elements = ['b', 'span']
# Punctuation stripped from the NLP text ('| ' is removed as a two-character
# sequence, after the single characters).
unwanted_chars = ['.', ':', ';', '?', '!', '[', ']', '(', ')', '| ']

for book in documents:
    raw_html = f'nhb/raw/{book}.html'
    clean_html = f'nhb/clean/{book}.html'
    text_file = f'nhb/plaintext/{book}.txt'
    nlp_file = f'nhb/nlp/{book}.txt'

    # The encoding is given explicitly on every open(): the text is full of
    # non-ASCII letters and must not depend on the platform's default codec.
    with open(raw_html, encoding='utf-8') as html_doc:
        soup = BeautifulSoup(html_doc, 'html.parser')

    # --- Stage 1: strip site chrome, notes and other non-text markup ---
    for element in unwanted_elements:
        for match in soup.find_all(element):
            match.decompose()
    for attr_class in unwanted_classes:
        for match in soup.select(attr_class):
            match.decompose()
    for prefix in unwanted_class_prefixes:
        for match in soup.select(f'div[class^={prefix}]'):
            match.decompose()
    for identifier in unwanted_ids:
        for match in soup.find_all(id=identifier):
            match.decompose()
    for tag_name, text in unwanted_labels:
        for match in soup.find_all(tag_name, string=text):
            match.decompose()
    # Wrap the remaining italic passages in braces so they stay recognisable
    # once the markup is gone; stage 3 turns the braces into square brackets.
    for match in soup.find_all("i"):
        match.insert(0, '{')
        match.append('}')
    # Remove HTML comments found inside any div.
    for div in soup.find_all('div'):
        for comment in div.find_all(string=lambda string: isinstance(string, Comment)):
            comment.extract()
    # A document without a <head> has nothing to clear.
    if soup.head is not None:
        soup.head.clear()
    with open(clean_html, 'w', encoding='utf-8') as file:
        file.write(str(soup))

    # --- Stage 2: drop rubrics and chapter numbers, keep the bare text ---
    for element in last_unwanted_elements:
        for match in soup.find_all(element):
            match.decompose()
    with open(text_file, 'w', encoding='utf-8') as file:
        file.write(soup.get_text())

    # --- Stage 3: one normalized, punctuation-free line per text line ---
    # The plaintext file is read back line by line, blank lines are dropped.
    flattened = []
    with open(text_file, encoding='utf-8') as file:
        for line in file:
            for character in unwanted_chars:
                line = line.replace(character, '')
            # Only now, with the original brackets removed, do the italic
            # markers become square brackets.
            line = line.replace('{', '[').replace('}', ']')
            # Lowercase, collapse every run of whitespace to a single space
            # (which also trims both ends), then normalize spelling variants.
            line = normalize(' '.join(line.lower().split()))
            if line:
                flattened.append(line + '\n')
    with open(nlp_file, 'w', encoding='utf-8') as f:
        f.writelines(flattened)