In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from langchain.document_loaders import PyPDFLoader
from datetime import datetime
from langchain.schema.document import Document
from models.load_utils import save_docs_to_jsonl
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
import os
import urllib.request

In [None]:
# Configuration: source PDFs, working directories and chunking parameters.

# (url, title) pairs. The download cell turns each title into a local file name
# by lower-casing it and replacing spaces with dashes.
pdfs = [
("https://archive.bookofmormoncentral.org/sites/default/files/archive-files/pdf/smoot/2021-12-25/pgp_study_edition_stephen_o_smoot.pdf", "The Pearl of Great Price: A Study Edition for Latter-day Saints"),
]

path_dir  = '../data/raw/pdfs/'       # downloaded PDFs are stored (and later read) here
output_dir = '../data/split/pdfs/'    # the split documents are written here

# Passed to the text splitter; sizes are measured with `length_function`.
chunk_size = 2000
chunk_overlap = 200
length_function = len
# Used as the base name of the output .jsonl file.
today = datetime.today().strftime('%Y-%m-%d')

# exist_ok=True keeps this cell idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(path_dir, exist_ok=True)

In [None]:
# Download every configured PDF into path_dir, naming each file after its title
# (lower-cased, spaces replaced by dashes).
for url, title in pdfs:
    print(url)
    # Build the destination once so the download target and the log line below
    # can never drift apart; os.path.join also copes with a path_dir that has
    # no trailing separator.
    destination = os.path.join(path_dir, title.replace(' ','-').lower() + '.pdf')
    urllib.request.urlretrieve(url, destination)

    print(f"Downloaded {url} to {destination}")

In [None]:
# Load every PDF found in path_dir into one flat list of documents.
print("Files in '% s':" % path_dir)
all_pages = []

# The context manager closes the directory iterator even if something raises,
# instead of relying on reaching an explicit close() call.
with os.scandir(path_dir) as file_list:
    # Keep only regular *.pdf files (anything else would make the PDF loader fail)
    # and sort by name because scandir yields entries in arbitrary order.
    pdf_entries = sorted(
        (
            entry
            for entry in file_list
            if entry.is_file() and entry.name.lower().endswith('.pdf')
        ),
        key=lambda entry: entry.name,
    )

for entry in pdf_entries:
    # entry.path is os.path.join(path_dir, entry.name).
    print(entry.path)
    loader = PyPDFLoader(entry.path)
    all_pages.extend(loader.load_and_split())

In [None]:
# Show only the first few loaded documents; echoing the whole list floods the notebook output.
all_pages[:3]

In [None]:
def load_pages(pages, verbose: bool = False, offset: int = 6) -> "list[Document]":
    """Turn loaded PDF pages into titled documents, dropping the leading pages.

    Pages whose ``metadata["page"]`` is below ``offset`` are dropped. Every other
    page with non-empty content becomes a ``Document`` whose title is
    ``"Pearl of Great Price Study Edition page <n>"`` with
    ``<n> = metadata["page"] - offset``, and whose ``source`` is copied from the
    input page.

    Args:
        pages: Iterable of objects exposing ``page_content`` and a ``metadata``
            mapping with ``"page"`` and ``"source"`` keys.
        verbose: When true, report every page skipped for having no content and
            print how many documents were kept.
        offset: Number of leading page indices to drop; it is also subtracted
            from the page index to form the number used in the title.

    Returns:
        The kept documents, in input order.
    """
    title_prefix = "Pearl of Great Price Study Edition page "

    docs = []
    for page in pages:
        page_index = page.metadata["page"]
        if page_index < offset:
            # Leading pages are not part of the numbered content.
            continue

        content = page.page_content
        source = page.metadata["source"]
        if not content:
            # Always skip empty pages; `verbose` only controls whether we say so.
            if verbose:
                print("Missing title or content - skipping", source)
            continue

        # NOTE(review): if the loader's page index is zero-based, the first kept
        # page is titled "... page 0" - confirm this matches the printed numbering.
        metadata = {
            "title": title_prefix + str(page_index - offset),
            "source": source,
        }
        docs.append(Document(page_content=content, metadata=metadata))

    if verbose:
        # A short summary instead of dumping every document to the output.
        print(f"Kept {len(docs)} documents")
    return docs

In [None]:
# Convert the loaded pages into titled documents (verbose, so skipped pages are
# reported) and display how many were produced.
docs = load_pages(all_pages, verbose=True)
len(docs)

In [None]:
# Recursive splitter using the Markdown separator set, sized by the values from
# the configuration cell.
text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN,
    length_function=length_function,
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

In [None]:
# Break the documents into chunks and display how many chunks resulted.
splits = text_splitter.split_documents(documents=docs)
len(splits)

In [None]:
# Eyeball the first ten chunks: position, source file and title, then the text.
preview = splits[:10]
for position, chunk in enumerate(preview):
    meta = chunk.metadata
    print(position, meta["source"], meta["title"])
    print(chunk.page_content)
    print("\n!!! SPLIT !!!\n")

In [None]:
# Hand the chunks to save_docs_to_jsonl, targeting <output_dir>/<today>.jsonl.
filename = os.path.join(output_dir, today + ".jsonl")
save_docs_to_jsonl(splits, filename)