In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `models` helpers) are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from langchain.document_loaders import PyPDFLoader
from datetime import datetime
from langchain.schema.document import Document
from models.load_utils import load_docs_from_jsonl, save_docs_to_jsonl
from models.split_markdown import RecursiveMarkdownTextSplitter
import os

In [None]:
# Source PDF and the directory that receives the loaded-pages JSONL snapshot.
path  = '../data/raw/pearl_study/pgp_study_edition_stephen_o_smoot.pdf'
output_dir = '../data/load/pearl_study/'

# Run date (YYYY-MM-DD); used below as the snapshot file name.
today = datetime.today().strftime('%Y-%m-%d')

# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)



In [None]:
# Open the study-edition PDF configured in `path`.
loader = PyPDFLoader(path)
# NOTE(review): load_and_split() presumably also runs LangChain's default text
# splitter, so `pages` may contain several chunks per PDF page rather than
# exactly one document per page - confirm; loader.load() would be the
# one-document-per-page alternative.
pages = loader.load_and_split()

In [None]:
# Preview only the first few documents; displaying the whole list floods the
# cell output and bloats the saved notebook.
pages[:3]



In [None]:
# The snapshot file is named after the run date held in `today`.
output_filename = os.path.join(output_dir, f"{today}.jsonl")

# Write the loaded documents to that JSONL file via the project helper.
save_docs_to_jsonl(pages, output_filename)

In [None]:
# configure
# NOTE(review): input_path is pinned to a dated snapshot, while the load cell
# above writes "<today>.jsonl". A fresh Run All on another date therefore reads
# this older file (or fails if it is absent) - confirm the pin is intended.
input_path = '../data/load/pearl_study/2023-11-08.jsonl'
# Splitter settings passed to RecursiveMarkdownTextSplitter below.
chunk_size = 2000
chunk_overlap = 200
output_dir_split = '../data/split/pearl_study/'
# Recomputed here so this stage can be run without the load cells above.
today = datetime.today().strftime('%Y-%m-%d')
# exist_ok=True keeps the cell idempotent and avoids a check-then-create race.
os.makedirs(output_dir_split, exist_ok=True)

In [None]:
def load_pages(pages, verbose: bool = False, offset: int = 6) -> "list[Document]":
    """Turn loaded PDF pages into titled documents ready for splitting.

    Pages whose ``metadata["page"]`` index is below ``offset`` are dropped
    (presumably front matter - confirm against the PDF). Each remaining page
    becomes a new ``Document`` whose title is
    ``"Pearl of Great Price Study Edition page <index - offset>"`` and whose
    ``source`` is copied from the input page. Pages with empty content are
    skipped regardless of ``verbose``.

    Args:
        pages: Iterable of objects exposing ``page_content`` and a ``metadata``
            mapping with ``"page"`` (integer index) and ``"source"`` keys.
        verbose: When True, print a message for every page skipped because its
            content is empty.
        offset: Number of leading page indices to drop; it is also subtracted
            from the page index to build the title. Defaults to 6.

    Returns:
        A list of ``Document`` objects, in input order, one per kept page.
    """
    title_prefix = "Pearl of Great Price Study Edition page "

    docs = []
    for page in pages:
        page_index = page.metadata["page"]
        if page_index < offset:
            # Leading pages are not emitted at all.
            continue

        source = page.metadata["source"]
        content = page.page_content
        if not content:
            if verbose:
                print("Missing title or content - skipping", source)
            # Skip unconditionally: previously the skip only happened when
            # verbose was True, so empty pages leaked through otherwise.
            continue

        metadata = {
            "title": title_prefix + str(page_index - offset),
            "source": source,
        }
        docs.append(Document(page_content=content, metadata=metadata))

    return docs

In [None]:
# Read the saved snapshot and convert it to titled documents in one step
# (verbose=True reports skipped pages), then show how many were kept.
docs = load_pages(load_docs_from_jsonl(input_path), True)
len(docs)

In [None]:
# Build the project's markdown-aware splitter from the configured chunk
# settings; " / " is passed as the title/header separator.
text_splitter = RecursiveMarkdownTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    title_header_separator=" / ",
)

In [None]:
# Split the prepared documents into chunks; verbose=True is forwarded to the
# project splitter.
splits = text_splitter.split_documents(docs, verbose=True)
# Display the number of chunks produced.
len(splits)

In [None]:
# Eyeball the first ten chunks: index, source and title on one line, then the
# chunk text followed by a visual separator.
for ix, split in enumerate(splits[:10]):
    print(f'{ix} {split.metadata["source"]} {split.metadata["title"]}')
    print(split.page_content, "\n!!! SPLIT !!!\n", sep="\n")

In [None]:
# The split output is named after the run date held in `today`.
filename = os.path.join(output_dir_split, f"{today}.jsonl")
# Write the chunks to that JSONL file via the project helper.
save_docs_to_jsonl(splits, filename)