# Import Necessary Libraries

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
from urllib.parse import urljoin, urlparse
from datetime import datetime
from models.load_bgbmm import BGBMMLoader
from bs4 import BeautifulSoup, Tag
from models.crawl_utils import get_page, save_page


import json
from typing import Iterator
from langchain.document_loaders.base import BaseLoader
from langchain.schema.document import Document
from tqdm import tqdm

from models.load_utils import clean
from models.load_utils import to_markdown


from models.load_utils import save_docs_to_jsonl
from models.split_markdown import RecursiveMarkdownTextSplitter
from models.load_utils import  load_docs_from_jsonl

# Crawl Brant Gardner Book of Mormon Minutes from the Church of Jesus Christ of Latter-day Saints

In [None]:
# Crawl configuration.
host = 'https://archive.bookofmormoncentral.org/content/book-mormon-minute'  # index page whose links are crawled
base_dir = '../data/raw/bgbmm'  # directory the fetched pages are saved under
bs_parser = 'html.parser'  # BeautifulSoup parser backend name
delay_seconds = 5  # handed to get_page on every request; presumably a politeness delay

# exist_ok=True makes the creation idempotent and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(base_dir, exist_ok=True)
    

In [None]:
def extract_anchors(host, html, parser='html.parser'):
    """Collect absolute link URLs from the ``field-content`` spans of a page.

    Args:
        host: Base URL that each ``href`` is resolved against with ``urljoin``.
        html: HTML markup of the page to scan.
        parser: Name of the BeautifulSoup parser backend. Defaults to
            ``'html.parser'``, the value that used to be hard-coded here.

    Returns:
        A list of URLs in document order, one for each
        ``<span class="field-content">`` that contains an anchor with an
        ``href`` attribute. Spans without such an anchor are skipped.
    """
    soup = BeautifulSoup(html, parser)
    hrefs = []
    for span_tag in soup.find_all('span', class_='field-content'):
        # Only the first anchor carrying an href is taken from each span.
        anchor_tag = span_tag.find('a', href=True)
        if not isinstance(anchor_tag, Tag):
            continue
        hrefs.append(urljoin(host, anchor_tag.get('href')))
    return hrefs

In [None]:
def get_path(url):
    """Return the local JSON file path under ``base_dir`` for a page URL.

    The file name is the last non-empty component of the URL's path with a
    ``.json`` suffix, so ``.../content/foo`` and ``.../content/foo/`` both map
    to ``<base_dir>/foo.json``.

    Args:
        url: URL of the page being stored.

    Returns:
        The path of the file that page is stored in.
    """
    # Parse before trimming: appending "/" to the raw URL would land inside
    # the query string or fragment when one is present and yield an empty name.
    page_path = urlparse(url).path.rstrip('/')
    slug = page_path.rsplit('/', 1)[-1]
    return os.path.join(base_dir, f"{slug}.json")

In [None]:
# Fetch the index page and gather the links to the individual pages.
bgbmm_links = []

status_code, html = get_page(host, delay_seconds)
if status_code == 200:
    # extend() with an empty list is a no-op, so no emptiness check is needed.
    bgbmm_links.extend(extract_anchors(host, html))
else:
    # Report the failure and leave the link list empty instead of trying to
    # parse whatever body came back with an unsuccessful response.
    print("Error!", status_code)

print(bgbmm_links[:5])
# Bare expression: the notebook displays the number of links found.
len(bgbmm_links)

In [None]:
# Download each linked page unless its target file already exists, so the
# cell can be re-run without fetching pages saved by an earlier run.
for url in bgbmm_links:
    path_file = get_path(url)
    print(path_file)
    if not os.path.exists(path_file):
        status_code, html = get_page(url, delay_seconds)
        if status_code == 200:
            save_page(path_file, url, html)
        else:
            # Failed pages are reported and skipped; a later run retries them
            # because no file was written.
            print("Error!", status_code, url)

print("End")

# Load Brant Gardner Book of Mormon Minutes from the Church of Jesus Christ of Latter-day Saints

In [None]:
# Load configuration.
input_dir = '../data/raw/bgbmm/'  # directory the crawl step saved pages into
output_dir = '../data/load/bgbmm/'  # directory for the loaded documents

# Date stamp (YYYY-MM-DD) used to name this run's output file.
today = datetime.today().strftime('%Y-%m-%d')

# exist_ok=True makes the creation idempotent and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Load the raw pages under input_dir through the project's BGBMMLoader
# (its parsing behaviour is defined in models.load_bgbmm).
loader = BGBMMLoader(input_dir)
docs = loader.load(verbose=True)
# Bare expression: the notebook displays the number of loaded documents.
len(docs)

In [None]:
# Spot-check the first loaded document. Guarded so an empty load produces a
# readable message instead of an IndexError on docs[0].
if docs:
    print("metadata: ", docs[0].metadata)
    print()
    print("content: ", docs[0].page_content)
else:
    print("No documents loaded from", input_dir)

In [None]:
# Write the loaded documents to a JSONL file named after today's date.
jsonl_name = f"{today}.jsonl"
output_filename = os.path.join(output_dir, jsonl_name)
save_docs_to_jsonl(docs, output_filename)


# Split Brant Gardner Book of Mormon Minutes from the Church of Jesus Christ of Latter-day Saints

In [None]:
# Split configuration.
# input_path names one dated file from the load step's output directory;
# point it at the run that should be split.
input_path = '../data/load/bgbmm/2023-11-16.jsonl'
chunk_size = 2000  # forwarded to the text splitter as chunk_size
chunk_overlap = 200  # forwarded to the text splitter as chunk_overlap
output_dir = '../data/split/bgbmm/'  # directory for the split documents
# exist_ok=True makes the creation idempotent and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(output_dir, exist_ok=True)
# Date stamp (YYYY-MM-DD) used to name this run's output file.
today = datetime.today().strftime('%Y-%m-%d')

In [None]:
# Read the previously saved documents back from the JSONL file at input_path.
docs = load_docs_from_jsonl(input_path)
# Bare expression: the notebook displays the number of documents read.
len(docs)

In [None]:
# Build the markdown-aware splitter from the configured chunk settings.
text_splitter = RecursiveMarkdownTextSplitter(
    # The keyword was misspelled "title_header_sbeparator". This assumes the
    # splitter's parameter is named title_header_separator -- confirm against
    # models.split_markdown.
    title_header_separator=" / ",
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [None]:
# Run the splitter over the loaded documents to produce the chunks.
splits = text_splitter.split_documents(docs, verbose=True)
# Bare expression: the notebook displays the number of chunks produced.
len(splits)

In [None]:
# Eyeball the first 25 chunks: index and metadata on one line, then the chunk
# text followed by a visible divider.
for ix, split in enumerate(splits[:25]):
    print(ix, split.metadata)
    # sep="\n" reproduces the newline that separate print() calls would emit
    # between the chunk text and the divider.
    print(split.page_content, "\n!!! SPLIT !!!\n", sep="\n")

In [None]:
# Write the chunks to a JSONL file named after today's date.
jsonl_name = f"{today}.jsonl"
filename = os.path.join(output_dir, jsonl_name)
save_docs_to_jsonl(splits, filename)