In [86]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [87]:
from langchain.document_loaders import PyPDFLoader
from datetime import datetime
from langchain.schema.document import Document
from models.load_utils import save_docs_to_jsonl
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
import os
import urllib.request

In [88]:
# Configuration: everything a reader might want to tune lives in this cell.

# (download URL, human-readable title) pairs. The title is also used to derive
# the local file name when the PDF is downloaded.
pdfs = [
("https://archive.bookofmormoncentral.org/sites/default/files/archive-files/pdf/smoot/2021-12-25/pgp_study_edition_stephen_o_smoot.pdf", "The Pearl of Great Price: A Study Edition for Latter-day Saints"),
]

# Where the raw PDFs are stored and where the split output is written.
# Both keep their trailing slash because other cells build paths by plain
# string concatenation (path_dir + name).
path_dir  = '../data/raw/pdfs/'
output_dir = '../data/split/pdfs/'

# Text-splitter settings: target chunk length, overlap between neighbouring
# chunks, and the function used to measure length (characters, via len).
chunk_size = 2000
chunk_overlap = 200
length_function = len

# Date stamp (YYYY-MM-DD) used to name the output file.
today = datetime.today().strftime('%Y-%m-%d')

# exist_ok=True makes this cell idempotent and avoids the race between an
# explicit existence check and the directory creation.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(path_dir, exist_ok=True)

In [89]:
# Download each configured PDF into `path_dir`, skipping files that are already
# on disk so that re-running the notebook does not fetch them again.
for url, title in pdfs:
    # The local file name is the lower-cased title with spaces replaced by
    # hyphens. NOTE(review): any other punctuation in the title (e.g. ':') is
    # kept as-is, and ':' is not a valid file-name character on Windows.
    pdf_path = path_dir + title.replace(' ','-').lower() + '.pdf'

    if os.path.exists(pdf_path):
        print(f"Already have {pdf_path}; skipping download of {url}")
        continue

    print(url)
    try:
        urllib.request.urlretrieve(url, pdf_path)
    except BaseException:
        # Do not leave a truncated file behind: the existence check above
        # would otherwise treat it as a finished download on the next run.
        if os.path.exists(pdf_path):
            os.remove(pdf_path)
        raise

    print(f"Downloaded {url} to {pdf_path}")

https://archive.bookofmormoncentral.org/sites/default/files/archive-files/pdf/smoot/2021-12-25/pgp_study_edition_stephen_o_smoot.pdf
Downloaded https://archive.bookofmormoncentral.org/sites/default/files/archive-files/pdf/smoot/2021-12-25/pgp_study_edition_stephen_o_smoot.pdf to ../data/raw/pdfs/the-pearl-of-great-price:-a-study-edition-for-latter-day-saints.pdf


In [90]:
# Load every PDF found in `path_dir` and collect the resulting page documents
# into one flat list, `all_pages`.
print(f"Files in '{path_dir}':")
all_pages = []

# The `with` block closes the directory handle even if something below raises.
with os.scandir(path_dir) as file_list:
    # scandir yields entries in arbitrary order, so sort by name to keep the
    # order of `all_pages` reproducible. Only regular files ending in .pdf are
    # kept, so stray files in the directory are not handed to the PDF loader.
    pdf_entries = sorted(
        (
            entry for entry in file_list
            if entry.is_file() and entry.name.lower().endswith('.pdf')
        ),
        key=lambda entry: entry.name,
    )

for entry in pdf_entries:
    pdf_path = path_dir + entry.name
    print(pdf_path)
    loader = PyPDFLoader(pdf_path)
    pages = loader.load_and_split()
    all_pages.extend(pages)

Files in '../data/raw/pdfs/':
../data/raw/pdfs/the-pearl-of-great-price:-a-study-edition-for-latter-day-saints.pdf


In [91]:
# Show only the first few page documents; displaying the whole list floods the
# notebook output.
all_pages[:3]

[Document(page_content='The Pearl of \nGreat Price\nA Study Edition for Latter-day Saints\nStephen O. Smoot', metadata={'source': '../data/raw/pdfs/the-pearl-of-great-price:-a-study-edition-for-latter-day-saints.pdf', 'page': 0}),
 Document(page_content='The Pearl of \nGreat Price\nA Study Edition for Latter-day Saints\nStephen O. Smoot\nA publication of Book of Mormon Central, Pearl of Great Price Central, \nScripturePlus, and Bible Central', metadata={'source': '../data/raw/pdfs/the-pearl-of-great-price:-a-study-edition-for-latter-day-saints.pdf', 'page': 2}),
 Document(page_content='Cover design by Jasmin Gimenez Rappleye.\nPrepared for publication by Book of Mormon Central\nP .O. Box 1538\nAmerican Fork, UT 84003—6406\n© 2022 ScripturePlus and Book of Mormon Central.\nAll rights reserved. Published 2022.\nPrinted in the United States of America.', metadata={'source': '../data/raw/pdfs/the-pearl-of-great-price:-a-study-edition-for-latter-day-saints.pdf', 'page': 3}),
 Document(page_

In [92]:
def load_pages(pages, verbose: bool = False, offset: int = 6) -> "list[Document]":
    """Turn loaded PDF pages into titled documents, dropping the front matter.

    Args:
        pages: Iterable of page documents. Each must expose ``page_content``
            and a ``metadata`` mapping with ``"page"`` (integer page index)
            and ``"source"`` entries.
        verbose: When True, print a message for every page that is skipped
            because it has no content.
        offset: Number of leading pages to treat as front matter. Pages whose
            index is below this value are dropped, and the remaining pages are
            renumbered so that page index ``offset`` gets number 0.

    Returns:
        A list of ``Document`` objects, one per kept page, whose metadata holds
        ``"title"`` ("Pearl of Great Price Study Edition page <n>") and the
        original ``"source"``.
    """
    docs = []
    for page in pages:
        page_index = page.metadata["page"]
        if page_index < offset:
            # Front matter is dropped without a message.
            continue

        content = page.page_content
        source = page.metadata["source"]

        # The title built below is never empty, so content is the only thing
        # that can be missing. Skip such pages regardless of `verbose`;
        # `verbose` only controls whether the skip is reported.
        if not content:
            if verbose:
                print("Missing title or content - skipping", source)
            continue

        title = "Pearl of Great Price Study Edition page " + str(page_index - offset)
        docs.append(
            Document(page_content=content, metadata={"title": title, "source": source})
        )

    return docs

In [93]:
# Convert the loaded pages into titled documents, reporting any skipped pages,
# then display how many documents were produced.
docs = load_pages(all_pages, verbose=True)
len(docs)



182

In [94]:
# Build the recursive character splitter from the values set in the
# configuration cell, using the splitter's Markdown language preset.
# NOTE(review): the input here is text extracted from PDFs; confirm that the
# Markdown preset is the intended choice for it.
text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=length_function,
)

In [95]:
# Run the configured splitter over the page-level documents, then display how
# many chunks it produced.
splits = text_splitter.split_documents(docs)
len(splits)

338

In [96]:
# Eyeball the first ten chunks: index, source file and title on one line,
# followed by the chunk text and a visual separator.
for chunk_no, chunk in enumerate(splits[:10]):
    chunk_meta = chunk.metadata
    print(chunk_no, chunk_meta["source"], chunk_meta["title"])
    print(chunk.page_content, "\n!!! SPLIT !!!\n", sep="\n")

0 ../data/raw/pdfs/the-pearl-of-great-price:-a-study-edition-for-latter-day-saints.pdf Pearl of Great Price Study Edition page 0
Introduction   |   i 
Introduction
On July 15, 1851, Elder Franklin D. Richards of the Quorum of the Twelve 
Apostles of The Church of Jesus Christ of Latter-day Saints published a short 
notice in the Church’s European newspaper, The Latter-day Saints’ Millennial 
Star, announcing the publication of “a new work which will soon be ready for 
sale. ” This work was expected to be “a source of much instruction and edifi -
cation to many thousands of the Saints. ” The name Elder Richards gave this 
new volume was the Pearl of Great Price, taking his inspiration from Jesus’s 
parable in Matthew 13:45–46. 
Writing from Liverpool, England, Elder Richards informed readers in 
the preface to the first edition of the Pearl of Great Price that he felt the vol -
ume was necessary because of “repeated solicitations of several friends of 
the publisher, who are desirous to

In [97]:
# Write the chunks to a JSONL file in `output_dir`. The file is named after
# today's date only, so re-running on the same day targets the same path.
filename = os.path.join(output_dir, today + ".jsonl")
save_docs_to_jsonl(splits, filename)