# Crawl, load, and split PDFs

In [None]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

In [None]:
from datetime import datetime
import os
import re

from langchain.document_loaders import UnstructuredFileLoader
from langchain.schema.document import Document
import urllib.request

from models.load_utils import create_pages_from_unstructured_elements, save_docs_to_jsonl
from models.split_model import SyntacticEmbeddingSplitter

In [None]:
# configure
# One (url, title, first page) tuple per PDF to process.
pdfs = [
    (
        "https://archive.bookofmormoncentral.org/sites/default/files/archive-files/pdf/smoot/2021-12-25/pgp_study_edition_stephen_o_smoot.pdf",
        "The Pearl of Great Price: A Study Edition for Latter-day Saints",
        7,
    ),
    (
        "https://archive.bookofmormoncentral.org/sites/default/files/archive-files/pdf/bennett/2019-03-04/jim_bennett_a_faithful_reply_to_the_ces_letter_from_a_former_ces_employee_2018.pdf",
        "A CES Letter Reply: Faithful Answers For Those Who Doubt",
        9,
    ),
    (
        "https://archive.bookofmormoncentral.org/sites/default/files/archive-files/pdf/ash/2016-08-10/bamboozled-by-the-ces-letter-final1.pdf",
        "Bamboozled by the CES Letter",
        12,
    ),
    (
        "https://archive.bookofmormoncentral.org/sites/default/files/archive-files/pdf/welch/2020-01-10/jww_notes_combined.pdf",
        "John W. Welch Notes - Come Follow Me",
        1,
    ),
]

# Where downloaded PDFs are cached, and where the split output is written.
path_dir = '../data/raw/pdfs/'
output_dir = '../data/split/pdfs/'

# Date stamp (YYYY-MM-DD, local time) used to name the output file.
today = datetime.today().strftime('%Y-%m-%d')

# exist_ok=True creates the directories (including parents) when missing and
# is a no-op when they already exist, avoiding a check-then-create race.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(path_dir, exist_ok=True)

In [None]:
def get_path(dir, title):
    """Return the local PDF path for *title* inside directory *dir*.

    Every character of *title* that is not an ASCII letter or digit is
    replaced by ``-``; the result is lower-cased and given a ``.pdf``
    extension before being joined onto *dir*.
    """
    slug = re.sub('[^a-zA-Z0-9]', '-', title).lower()
    file_name = slug + '.pdf'
    return os.path.join(dir, file_name)

In [None]:
# Single splitter instance, reused for every PDF in the processing loop.
text_splitter = SyntacticEmbeddingSplitter()

In [None]:
# Accumulates the splits produced from every configured PDF.
docs = []
for url, title, first_page in pdfs:
    path = get_path(path_dir, title)
    print(f"Processing {url} {path}")
    # Download only when no cached copy exists.
    if not os.path.exists(path):
        # Fetch into a temporary sibling file and move it into place only
        # after the download succeeds. Writing directly to `path` would leave
        # a truncated file behind on failure or interruption, and the
        # existence check above would then treat it as a valid cached PDF.
        tmp_path = path + ".part"
        try:
            response = urllib.request.urlretrieve(url, tmp_path)
            os.replace(tmp_path, path)
        finally:
            # After a successful replace the temp file is gone, so this only
            # cleans up after a failed or interrupted download.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        print(response)
        print(f"Downloaded {url}")
    # load file
    loader = UnstructuredFileLoader(path, mode="elements")
    elements = loader.load()
    # create pages
    pages = create_pages_from_unstructured_elements(elements, title, first_page, url)
    print(f"Processed {len(pages)} pages")
    # split pages
    splits = text_splitter.split_documents(pages, verbose=True)
    print(f"Created {len(splits)} splits")
    # Preview at most the first ten splits for a quick visual sanity check.
    for ix, split in enumerate(splits[:10]):
        print(ix, split.metadata["url"], split.metadata["title"])
        print(split.page_content)
        print("\n!!! SPLIT !!!\n")
    docs.extend(splits)

In [None]:
# Total number of splits collected across all PDFs.
len(docs)

In [None]:
# Write every collected split to a single file named after today's date.
filename = os.path.join(output_dir, today + ".jsonl")
save_docs_to_jsonl(docs, filename)