# Crawl, load, and split D&C Study Historical Context

In [None]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

In [None]:
from datetime import datetime
import json
import os
from typing import Iterator
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup, Tag
from tqdm import tqdm

from models.crawl_utils import get_page, save_page
from models.load_dc_historical_context import load_dc_historical_context
from models.load_utils import Loader, load_docs_from_jsonl, save_docs_to_jsonl
from models.split_model import MarkdownSyntacticEmbeddingSplitter

## Crawl

In [None]:
# config
# Index page that lists the sections; also the base for resolving relative links.
host = 'https://doctrineandcovenantscentral.org/sections/'
source = "dc_historical_context" # use this name in the directories
# Directory where the raw crawled pages are stored.
crawl_dir = f'../data/raw/{source}'
bs_parser = 'html.parser'
# Passed as the second argument to get_page on every fetch.
delay_seconds = 5
# exist_ok avoids the check-then-create race and makes re-running the cell safe.
os.makedirs(crawl_dir, exist_ok=True)
    

In [None]:
def extract_hrefs_from_elementor_heading_title(html, base_url=None):
    """Collect absolute link URLs found inside Elementor heading-title divs.

    Args:
        html: HTML markup of the page to scan. ``None`` or an empty string
            yields an empty list.
        base_url: URL used to resolve relative hrefs. When omitted, the
            notebook-level ``host`` setting is used.

    Returns:
        A list of absolute URLs, one per anchor with a non-empty ``href``,
        in the order BeautifulSoup returns the matching elements.
    """
    if not html:
        # Nothing to parse (e.g. the fetch failed); avoid a parser TypeError.
        return []
    if base_url is None:
        base_url = host

    soup = BeautifulSoup(html, 'html.parser')

    hrefs = []
    # Only divs carrying the 'elementor-heading-title' class are considered.
    for div in soup.find_all('div', class_='elementor-heading-title'):
        for anchor in div.find_all('a'):
            href = anchor.get('href')
            if href:
                # urljoin leaves absolute hrefs untouched and resolves relative ones.
                hrefs.append(urljoin(base_url, href))

    return hrefs

In [None]:
def get_path(url):
    """Return the JSON file path under ``crawl_dir`` used to store ``url``.

    The file name is the last non-empty segment of the URL's path, so
    ``.../sections/foo`` and ``.../sections/foo/`` both map to ``foo.json``.
    A URL whose path is empty or ``/`` maps to ``.json``.

    Args:
        url: The URL of the page being saved.

    Returns:
        ``crawl_dir`` joined with ``<last-path-segment>.json``.
    """
    # Parse before trimming: appending '/' to the raw URL would land inside a
    # query string or fragment and select the wrong path segment.
    url_path = urlparse(url).path
    name = url_path.rstrip('/').split('/')[-1]
    return os.path.join(crawl_dir, f"{name}.json")

In [None]:
study_links = []

# Fetch the sections index page (the configured host) and collect the study links.
status_code, html = get_page(host, delay_seconds)
hrefs = []
if status_code != 200:
    # Do not try to parse the body of a failed request.
    print("Error!", status_code)
else:
    hrefs = extract_hrefs_from_elementor_heading_title(html)
study_links.extend(hrefs)
print(len(study_links))
# Notebook display: preview of the first few links.
study_links[:5]

In [None]:
def extract_hrefs(html, base_url):
    """Find the link that follows the "Context Helps" div on a page.

    Scans every div whose stripped text starts with "Context Helps" and, for
    the first one whose next sibling element contains an anchor with an
    ``href``, returns that href (query string removed) resolved against
    ``base_url``.

    Args:
        html: HTML markup of the page to scan.
        base_url: URL used to resolve a relative href.

    Returns:
        The absolute URL as a string, or ``None`` when no such link exists.
    """
    marker = "Context Helps"
    document = BeautifulSoup(html, 'html.parser')

    for candidate in document.find_all('div'):
        if not candidate.get_text(strip=True).startswith(marker):
            continue

        sibling = candidate.find_next_sibling()
        if not sibling:
            continue

        anchor = sibling.find('a', href=True)
        if not anchor:
            continue

        # Keep only the part of the href before any query string.
        link, _, _ = anchor['href'].partition('?')
        return urljoin(base_url, link)

    return None

In [None]:
# Visit every study page and collect its historical-context link.
historical_links = []
for link in study_links:

    status_code, html_content = get_page(link, delay_seconds)
    if status_code != 200:
        # Skip failed fetches instead of parsing their body.
        print("Error!", status_code, link)
        continue
    href_value = extract_hrefs(html_content, link)
    print(href_value)
    if href_value is None:
        # extract_hrefs returns None when the page has no matching link;
        # storing it would break the later download step.
        continue
    historical_links.append(href_value)
print(len(historical_links))

In [None]:
# Download each historical-context page unless it is already saved on disk.
for url in historical_links:
    if not url:
        # Entries can be None when no link was found on a study page.
        continue
    path_file = get_path(url)
    print(path_file)
    if os.path.exists(path_file):
        # Already saved by a previous run; do not fetch it again.
        continue
    status_code, html = get_page(url, delay_seconds)
    if status_code != 200:
        print("Error!", status_code, url)
        continue
    save_page(path_file, url, html)

print("End")

## Load

In [None]:
# config
# input_dir is now crawl_dir, and output_dir is now load_dir, and output_filename is now load_filename
load_dir = f'../data/load/{source}/'
# The output file is named after the current date (YYYY-MM-DD).
today = datetime.today().strftime('%Y-%m-%d')
load_filename = os.path.join(load_dir, f"{today}.jsonl")

# exist_ok avoids the check-then-create race and makes re-running the cell safe.
os.makedirs(load_dir, exist_ok=True)

In [None]:
# Build a Loader from the historical-context load function and the crawl
# directory, then load the documents.
# NOTE(review): presumably this parses every saved page in crawl_dir —
# confirm against models.load_utils.Loader.
loader = Loader(load_dc_historical_context, crawl_dir)
docs = loader.load(verbose=True)
# Notebook display: number of loaded documents.
len(docs)

In [None]:
# Preview the first loaded document; guard against an empty result so the
# cell does not fail with an IndexError.
if docs:
    print("metadata: ", docs[0].metadata)
    print()
    print("content: ", docs[0].page_content)
else:
    print("No documents loaded")

In [None]:
# Write the loaded documents to the dated JSONL file configured above.
save_docs_to_jsonl(docs, load_filename)

## Split

In [None]:
# configure
# input_path is now load_filename, output_dir is now split_dir, and output filename is now split_filename
split_dir = f'../data/split/{source}/'
# The output file is named after the current date (YYYY-MM-DD).
today = datetime.today().strftime('%Y-%m-%d')
# output filename is now split_filename
split_filename = os.path.join(split_dir, f"{today}.jsonl")

# exist_ok avoids the check-then-create race and makes re-running the cell safe.
os.makedirs(split_dir, exist_ok=True)

In [None]:
# Re-read the documents from the JSONL file written in the load step.
docs = load_docs_from_jsonl(load_filename)
# Notebook display: number of documents read back.
len(docs)

In [None]:
# Create the splitter with its default settings.
text_splitter = MarkdownSyntacticEmbeddingSplitter()

In [None]:
# Split the loaded documents.
splits = text_splitter.split_documents(docs, verbose=True)
# Notebook display: number of resulting splits.
len(splits)

In [None]:
# Print the metadata and content of the first ten splits for a visual check,
# with a marker line between consecutive splits.
for ix, split in enumerate(splits[:10]):
    print(ix, split.metadata)
    print(split.page_content)
    print("\n!!! SPLIT !!!\n")

In [None]:
# Write the splits to the dated JSONL file configured for the split step.
save_docs_to_jsonl(splits, split_filename)