# Pre-processing Michel de Montaigne's Words

## Purpose

The aim here is to download Michel de Montaigne's essays from Project Gutenberg, clean the text, and save the result as a JSON file so that it can be used elsewhere.

## Protocol

1. Fetch the Web Page: Use the `requests` library to fetch the content of the webpage.
2. Parse the HTML: Use `BeautifulSoup` to parse the HTML and extract the relevant content (essay titles and paragraphs).
3. Clean the Extracted Text: Remove extra whitespace, bracketed footnote references, non-ASCII characters, and other artifacts.
4. Save the Result: Write the cleaned essays, keyed by title, to a JSON file.


In [119]:
# Imports
import requests
from bs4 import BeautifulSoup
import re
import json
import os

In [122]:
# Source page on gutenberg.org, plus the name and relative location of the JSON output
URL = "https://www.gutenberg.org/files/3600/3600-h/3600-h.htm"
FINAL_JSON_NAME = "montaigne.json"
RESULT_PATH = os.path.join("..", "data", "processed", FINAL_JSON_NAME)

In [4]:
# Fetch the web page.
# requests applies no timeout unless one is given, so a stalled connection would
# block this cell forever; fail after 30 seconds instead.
response = requests.get(URL, timeout=30)
# Raise an HTTPError on 4xx/5xx responses instead of parsing an error page
response.raise_for_status()

In [5]:
# Build the parse tree from the downloaded page using Python's built-in HTML parser
soup = BeautifulSoup(markup=response.text, features='html.parser')

In [103]:
# Remove square brackets and all intervening substrings in string
def remove_brackets(string):
    """Return ``string`` with every ``[...]`` span removed.

    Each span runs from a ``[`` to the first ``]`` that follows it, so nested
    brackets are not balanced (``"[a [b] c]"`` becomes ``" c]"``). A ``[``
    with no later ``]`` and a ``]`` with no earlier ``[`` are left untouched.

    Args:
        string: The text to clean.

    Returns:
        The text with all bracketed spans (brackets included) removed.
    """
    pieces = []
    cursor = 0
    while True:
        start = string.find('[', cursor)
        if start == -1:
            break
        # Search for the closer only *after* the opener. Searching from the
        # start of the string would pick up a stray earlier ']' and, with
        # end < start, the slice-and-rejoin would never shrink the string.
        end = string.find(']', start + 1)
        if end == -1:
            break
        pieces.append(string[cursor:start])
        cursor = end + 1
    # Keep whatever follows the last removed span (or the whole string)
    pieces.append(string[cursor:])
    return ''.join(pieces)

# Process the body text
def preprocess_body(s: str) -> str:
    """Normalize one paragraph of body text.

    Steps, in order: drop every non-ASCII character, remove bracketed spans
    via ``remove_brackets``, collapse all runs of whitespace (including
    newlines and carriage returns) to single spaces, and lowercase.

    Args:
        s: Raw paragraph text.

    Returns:
        The cleaned, lowercased paragraph; may be an empty string.
    """
    # Characters that cannot be encoded as ASCII are silently discarded
    ascii_only = s.encode('ascii', 'ignore').decode('ascii')
    without_brackets = remove_brackets(ascii_only)
    # split() with no arguments splits on any whitespace run and ignores
    # leading/trailing whitespace, so no separate strip or newline pass is needed
    words = without_brackets.split()
    return ' '.join(words).lower()

# Process the headings
def preprocess_title(s: str) -> str:
    """Normalize a heading into a single-line ASCII title.

    A double em-dash (``——``) is turned into a space, every remaining
    non-ASCII character is dropped, and all runs of whitespace (including
    newlines and carriage returns) are collapsed to single spaces. Case is
    preserved.

    Args:
        s: Raw heading text.

    Returns:
        The cleaned title; may be an empty string.
    """
    # Replace the double em-dash first: once non-ASCII characters are dropped
    # it would vanish and glue the neighbouring words together.
    s = s.replace('——', ' ')
    # process encoding
    s = s.encode('ascii', 'ignore').decode('ascii')
    # Collapse whitespace last, so that gaps left by dropped characters
    # (e.g. a lone dash between two spaces) do not remain as double spaces.
    return ' '.join(s.split())


# Upper-case Roman numeral. The leading lookahead requires at least one numeral
# character, because every other part of the pattern is optional and would
# otherwise match the empty string.
_ROMAN_NUMERAL_RE = re.compile(
    r"(?=[MDCLXVI])M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})"
)


def has_roman_numeral(s: str) -> bool:
    """Return True if ``s``, with periods removed, is an upper-case Roman numeral.

    The whole token must be a well-formed numeral, so ``"XIV."`` matches while
    ``"CHAPTER"``, ``"IIII"`` and lower-case ``"xiv"`` do not. Empty input, or
    input made only of periods, is not a numeral.

    Args:
        s: A single token, e.g. one word of a heading.

    Returns:
        Whether the token is a Roman numeral.
    """
    return _ROMAN_NUMERAL_RE.fullmatch(s.replace('.', '')) is not None


In [116]:
# Extract the main content. Adjust accordingly if the page structure is different.
# This is based on the assumption that Montaigne's essay titles are within <h2> tags
# and that the essay text is within <p> tags that follow them as siblings.
# The outcome is a nested object: {title: {"text": essay text}}.

target = soup.find('h2')
paragraphs_by_title = {}
current_title = None
# Walk the siblings that come after the first <h2>; every later <h2> starts a new
# entry, and each <p> is filed under the most recent title.
for node in target.find_next_siblings():
    if node.name == 'h2':
        current_title = preprocess_title(node.text)
        if current_title not in paragraphs_by_title:
            paragraphs_by_title[current_title] = []
        continue
    # Skip anything that is not a paragraph, and paragraphs seen before a
    # non-empty title has been found
    if not current_title or node.name != 'p':
        continue
    paragraph = preprocess_body(node.text)
    if paragraph:
        paragraphs_by_title[current_title].append(paragraph)

# Keep only headings in which at least one space-separated word is a Roman
# numeral, and join each essay's paragraphs into one lowercase string.
result = {
    title: {"text": " ".join(paragraphs).lower()}
    for title, paragraphs in paragraphs_by_title.items()
    if any(has_roman_numeral(word) for word in title.split(' '))
}

In [123]:
# Save result to json
# Create the output directory first; open() does not create missing parent
# directories and would raise FileNotFoundError on a fresh checkout.
output_dir = os.path.dirname(RESULT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
# Explicit encoding so the file does not depend on the platform default
with open(RESULT_PATH, "w", encoding="utf-8") as f:
    json.dump(result, f)