# Crawl, load, and split FAIR (fairlatterdaysaints.org) pages

In [None]:
# IPython autoreload extension, mode 2: re-import edited modules before each
# cell runs, so changes to the project's `models` package are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from datetime import datetime
import json
import os
from typing import Iterator
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup, Tag
from tqdm import tqdm

from models.crawl_utils import get_page, save_page
from models.load_fairs import load_fairs
from models.load_utils import Loader, load_docs_from_jsonl, save_docs_to_jsonl
from models.split_model import MarkdownSyntacticEmbeddingSplitter

## Crawl

In [None]:
# config
# NOTE(review): `host` is not referenced by any other cell visible in this notebook.
host = 'https://www.fairlatterdaysaints.org/answers/Special:AllPages'
crawl_dir = '../data/raw/fair'  # directory the crawled pages are saved into
bs_parser = 'html.parser'  # parser name handed to BeautifulSoup
source = 'fair'  # used to build the load/split output directories
delay_seconds = 15  # passed to get_page on every request
# Index pages that are downloaded themselves and also mined for further links.
approved_links = [
    'https://www.fairlatterdaysaints.org/answers/Criticism_of_Mormonism/Online_documents/Letter_to_a_CES_Director',
    'https://www.fairlatterdaysaints.org/answers/Sarah_Allen_CES_Response_Posts',
    'https://www.fairlatterdaysaints.org/answers/Sarah_Allen%27s_Response_to_Letter_For_My_Wife'
    ]

# Create the crawl directory if it is missing. exist_ok=True replaces the
# racy "check, then create" pair with a single call.
os.makedirs(crawl_dir, exist_ok=True)

In [None]:
def get_path(url):
    """Map a page URL to the JSON file path it is stored at under ``crawl_dir``.

    The file name is the last non-empty segment of the URL's path with a
    ``.json`` suffix. Query strings and fragments do not affect the result.

    Args:
        url: URL of the page.

    Returns:
        ``os.path.join(crawl_dir, "<last path segment>.json")``.
    """
    # Strip trailing slashes from the parsed path, not from the raw URL, so
    # that ".../Page/?x=1", ".../Page/#top" and ".../Page//" all resolve to
    # "Page.json" instead of an empty file name.
    page_path = urlparse(url).path.rstrip('/')
    page_name = page_path.split('/')[-1]
    return os.path.join(crawl_dir, f"{page_name}.json")
    

In [None]:
def get_links(soup, base_url):
    """Return the distinct URLs linked from the lists of a parsed page.

    Every ``<a href=...>`` found inside an ``<ol>`` or ``<ul>`` element is
    resolved against ``base_url`` and has its ``#fragment`` removed.

    Args:
        soup: Parsed page exposing ``find_all`` (a BeautifulSoup document).
        base_url: URL the page was fetched from, used to resolve relative hrefs.

    Returns:
        A list of unique URLs; the order is whatever the underlying set yields.
    """
    unique_urls = {
        urljoin(base_url, anchor['href']).split('#')[0]
        for list_tag in soup.find_all(['ol', 'ul'])
        for anchor in list_tag.find_all('a', href=True)
    }
    return list(unique_urls)

In [None]:
def extract_urls_from_approved_links(approved_links, delay_seconds=5):
    """Fetch each approved page and gather the list links found on it.

    Args:
        approved_links: URLs of the pages to fetch.
        delay_seconds: Forwarded unchanged to ``get_page``.

    Returns:
        A list of the distinct URLs that ``get_links`` found across all pages
        answering with status 200. Pages with any other status are skipped
        silently.
    """
    collected = set()

    for page_url in approved_links:
        status, body = get_page(page_url, delay_seconds)
        if status != 200:
            continue
        parsed_page = BeautifulSoup(body, bs_parser)
        collected |= set(get_links(parsed_page, page_url))

    return list(collected)

In [None]:
# Fetch every approved index page and collect the links found in its lists.
extracted_links = extract_urls_from_approved_links(approved_links, delay_seconds)

In [None]:
def filtered_extracted_urls(extracted_links):
    """Keep only the links that point at FAIR answer or blog pages.

    A link is kept when its host is ``fairlatterdaysaints.org`` or one of its
    subdomains, its path starts with ``/answers`` or ``/blog``, and its path
    does not contain ``Category:``. Each kept link is printed.

    Args:
        extracted_links: Iterable of URL strings.

    Returns:
        The kept links, in input order.
    """
    allowed_domain = "fairlatterdaysaints.org"
    filtered_urls = []
    for link in extracted_links:
        try:
            url = urlparse(link)
        except ValueError:
            # Malformed URL (e.g. a broken IPv6 literal): skip it rather than
            # abort the whole filter.
            continue
        # hostname is None for links with no network location, such as
        # "mailto:" or "javascript:" hrefs.
        hostname = url.hostname or ""
        # Require the exact domain or a real subdomain; a bare suffix check
        # would also accept look-alikes such as "notfairlatterdaysaints.org".
        if hostname != allowed_domain and not hostname.endswith("." + allowed_domain):
            continue
        if not url.path.startswith(("/answers", "/blog")):
            continue
        if "Category:" in url.path:
            continue
        filtered_urls.append(link)
        print(link)
    return filtered_urls

In [None]:
# Drop off-site, non-article, and category links; each kept link is printed.
filtered_urls = filtered_extracted_urls(extracted_links)

In [None]:
# Download each approved or filtered page that has no saved file yet.
for href in approved_links + filtered_urls:
    path_file = get_path(href)
    print(href, path_file)
    # An existing file means the page is already saved; do not fetch it again.
    if not os.path.exists(path_file):
        status_code, html = get_page(href, delay_seconds)
        if status_code == 200:
            save_page(path_file, href, html)
        else:
            # Report the failure and move on to the next page.
            print("Error!", status_code , href)

## Load

In [None]:
# config
# Input is the crawl directory; output is one dated JSONL file in load_dir.
crawl_dir = '../data/raw/fair/'
load_dir = f'../data/load/{source}/'

# The output file is named after today's date (YYYY-MM-DD).
today = datetime.today().strftime('%Y-%m-%d')
load_filename = os.path.join(load_dir, f"{today}.jsonl")

# Create the output directory if it is missing. exist_ok=True replaces the
# racy "check, then create" pair with a single call.
os.makedirs(load_dir, exist_ok=True)

In [None]:
# Loader is built from the load_fairs function, the crawl directory, and the
# BeautifulSoup parser name; load() returns the documents.
loader = Loader(load_fairs, crawl_dir, bs_parser)
docs = loader.load(verbose=True)
# Last expression of the cell: the notebook displays the document count.
len(docs)

In [None]:
# Show the first loaded document: metadata, a blank line, then the content.
print("metadata: ", docs[0].metadata, end="\n\n")
print("content: ", docs[0].page_content)

In [None]:
# Save the loaded documents to the dated JSONL path built in the config cell.
save_docs_to_jsonl(docs, load_filename)

## Split

In [None]:
# configure
# Input is load_filename from the load step; output is one dated JSONL file in split_dir.
split_dir = f'../data/split/{source}/'
# The output file is named after today's date (YYYY-MM-DD).
today = datetime.today().strftime('%Y-%m-%d')
split_filename = os.path.join(split_dir, f"{today}.jsonl")

# Create the output directory if it is missing. exist_ok=True replaces the
# racy "check, then create" pair with a single call.
os.makedirs(split_dir, exist_ok=True)

In [None]:
# Read the documents back from the JSONL file written by the load step.
docs = load_docs_from_jsonl(load_filename)
# Last expression of the cell: the notebook displays the document count.
len(docs)

In [None]:
# Splitter from models.split_model, constructed with its default arguments.
text_splitter = MarkdownSyntacticEmbeddingSplitter()

In [None]:
# Split the loaded documents with the splitter built in the previous cell.
splits = text_splitter.split_documents(docs, verbose=True)
# Last expression of the cell: the notebook displays the number of splits.
len(splits)

In [None]:
# Print the first 25 splits for inspection, each followed by a separator banner.
for ix, split in enumerate(splits[:25]):
    print(ix, split.metadata)
    # sep="\n" puts the content and the banner on separate lines.
    print(split.page_content, "\n!!! SPLIT !!!\n", sep="\n")

In [None]:
# Save the splits to the dated JSONL path built in the split config cell.
save_docs_to_jsonl(splits, split_filename)