# Crawl, load, and split Encyclopedia of Mormonism

In [None]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

In [None]:
from datetime import datetime
import json
import os
import re
from typing import Iterator
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup, Tag
from tqdm import tqdm

from models.crawl_utils import get_page, save_page
from models.load_encyclopedia import load_encyclopedia
from models.load_utils import Loader, load_docs_from_jsonl, save_docs_to_jsonl
from models.split_model import MarkdownSyntacticEmbeddingSplitter

## Crawl

In [None]:
# config
source = "encyclopedia"
# Root URL used to resolve the relative links found on the index pages.
base = 'https://eom.byu.edu/'
# First "All pages" index page; the crawl starts here.
# (No leading whitespace: the URL is handed to get_page as-is.)
host = 'https://eom.byu.edu/index.php?title=Special:AllPages'
# Directory where each crawled page is saved.
crawl_dir = f'../data/raw/{source}'
bs_parser = 'html.parser'
# Delay value passed to get_page with every request.
delay_seconds = 15

# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(crawl_dir, exist_ok=True)
    

In [None]:
def get_path(url):
    """Return the JSON file path inside ``crawl_dir`` used to store ``url``.

    The file name is the text after the last ``=`` of the URL's query string
    (the page title for ``index.php?title=...`` links), lower-cased, with every
    run of non-word characters collapsed into a single ``-``.

    When the URL has no query string, the last path segment is used instead.
    Without this fallback every such URL would map to the same ``.json`` file
    and all but the first would be treated as already downloaded.
    """
    parsed = urlparse(url)
    if parsed.query:
        name = parsed.query.split('=')[-1]
    else:
        # e.g. ".../index.php/Some_Title" -> "Some_Title"
        name = parsed.path.rstrip('/').rsplit('/', 1)[-1]
    name = re.sub(r'\W+', '-', name.lower())
    return os.path.join(crawl_dir, f"{name}.json")

In [None]:
def extract_hrefs_from_div(soup, base):
    """Collect absolute URLs of the non-redirect links on an index page.

    Looks inside the ``div`` with class ``mw-allpages-body``. Anchors whose
    ``class`` attribute contains ``mw-redirect`` are skipped, as are anchors
    without a usable ``href``. Each remaining ``href`` is resolved against
    ``base``. Returns an empty list when the div is absent.
    """
    listing = soup.find('div', class_='mw-allpages-body')
    if not listing:
        return []

    def is_redirect(link):
        # Redirect entries carry the "mw-redirect" CSS class.
        return "class" in link.attrs and "mw-redirect" in link["class"]

    targets = (
        link.get('href')
        for link in listing.find_all('a')
        if not is_redirect(link)
    )
    return [urljoin(base, target) for target in targets if target]

In [None]:
def extract_next_page_href(soup, base):
    """Return the absolute URL of the "Next page" link, or ``None``.

    Searches the anchors inside the ``div`` with class ``mw-allpages-nav``
    for the first one whose text starts with ``"Next page"`` and that has a
    non-empty ``href``; that ``href`` is resolved against ``base``. ``None``
    is returned when the div or such a link is missing.
    """
    nav = soup.find('div', class_='mw-allpages-nav')
    if not nav:
        return None

    # hrefs of every "Next page ..." anchor, in document order.
    candidates = (
        link.get('href')
        for link in nav.find_all('a')
        if link.text.startswith("Next page")
    )
    return next(
        (urljoin(base, target) for target in candidates if target),
        None,
    )

In [None]:
# Starting URL
start_url = host

# Every article URL found across the index pages, in listing order.
all_hrefs = []

# Walk the index one page at a time, following the "Next page" link until
# a page has none (or a fetch fails).
while start_url:
    print('fetch', start_url)
    status_code, html = get_page(start_url, delay_seconds)
    if status_code != 200:
        print(f"Failed to fetch {start_url}")
        break

    # Use the parser chosen in the config cell instead of a second
    # hard-coded copy of its name.
    soup = BeautifulSoup(html, bs_parser)

    # Article links listed on this index page.
    hrefs = extract_hrefs_from_div(soup, base)
    print('found', len(hrefs))
    all_hrefs.extend(hrefs)

    # None when there is no further page, which ends the loop.
    next_page_url = extract_next_page_href(soup, base)
    start_url = next_page_url

# Notebook cell output: total number of article URLs collected.
len(all_hrefs)

In [None]:
# Download every collected article URL that has no saved file yet.
for href in all_hrefs:
    path_file = get_path(href)
    print(href, path_file)
    # An existing file means this page was fetched on an earlier run.
    if not os.path.exists(path_file):
        status_code, html = get_page(href, delay_seconds)
        if status_code == 200:
            save_page(path_file, href, html)
        else:
            # Report the failure and carry on with the next URL.
            print("Error!", status_code, href)

## Load

In [None]:
# config
# The load step reads the crawled pages from crawl_dir and writes its result
# to load_filename, a file inside load_dir named after today's date.
load_dir = f'../data/load/{source}/'
today = datetime.today().strftime('%Y-%m-%d')
load_filename = os.path.join(load_dir, f"{today}.jsonl")

# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(load_dir, exist_ok=True)

In [None]:
# Build a Loader from the load_encyclopedia function and the crawl directory,
# then load the documents.
# NOTE(review): presumably Loader applies load_encyclopedia to each saved page
# in crawl_dir — confirm in models.load_utils.
loader = Loader(load_encyclopedia, crawl_dir)
docs = loader.load(verbose=True)
# Notebook cell output: number of loaded documents.
len(docs)

In [None]:
# Preview the first loaded document as a sanity check. Guarded so an empty
# result prints a message instead of raising IndexError on docs[0].
if len(docs) > 0:
    print("metadata: ", docs[0].metadata)
    print()
    print("content: ", docs[0].page_content)
else:
    print("No documents were loaded from", crawl_dir)

In [None]:
# Write the loaded documents to load_filename (the dated .jsonl path).
save_docs_to_jsonl(docs, load_filename)

## Split

In [None]:
# configure
# The split step reads load_filename and writes its result to split_filename,
# a file inside split_dir named after today's date.
split_dir = f'../data/split/{source}/'
today = datetime.today().strftime('%Y-%m-%d')
split_filename = os.path.join(split_dir, f"{today}.jsonl")

# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(split_dir, exist_ok=True)

In [None]:
# Re-read the documents from load_filename, replacing the in-memory `docs`.
docs = load_docs_from_jsonl(load_filename)
# Notebook cell output: number of documents read back.
len(docs)

In [None]:
# Splitter constructed with its default arguments.
text_splitter = MarkdownSyntacticEmbeddingSplitter()

In [None]:
# Split the loaded documents with the splitter.
splits = text_splitter.split_documents(docs, verbose=True)
# Notebook cell output: number of resulting splits.
len(splits)

In [None]:
# Print the first 25 splits (index, metadata, content) for manual inspection,
# with a visible marker line after each one.
for ix, split in enumerate(splits[:25]):
    print(ix, split.metadata)
    print(split.page_content)
    print("\n!!! SPLIT !!!\n")

In [None]:
# Write the splits to split_filename (the dated .jsonl path).
save_docs_to_jsonl(splits, split_filename)