In [2]:
import os
from docx import Document as docxDocument
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.document_loaders.llmsherpa import LLMSherpaFileLoader
from langchain_community.vectorstores import Neo4jVector
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
config={"ollama_base_url": "http://localhost:11434",
        "llm_name": "llama3",
        "neo4j_url": "bolt://localhost:7687",
        "neo4j_username": "neo4j",
        "neo4j_password": "password",		
        }

In [4]:
# load embedding model
embeddings = OllamaEmbeddings(
    base_url=config["ollama_base_url"],	
    model=config["llm_name"]
)

In [5]:
def docParser(file_path):
    """
    Parses a document file and returns a split up version of the document.
    Requires file type to be reflected in the file extension.

    Args:
        file_path (str): The path to the document file

    Returns:
        List of LangchainDocument objects
    """

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=750, chunk_overlap=20)

    # This assumes that we can tell file type from file extension
    # May not work for linux based systems
    if file_path.endswith('.docx'):
        doc_splits = []
        doc = docxDocument(file_path)
        para_num = 0
        for para in doc.paragraphs:
            if not para.text:
                continue

            langchain_doc_splits = LangchainDocument(
                page_content=para.text,
                metadata={
                    "source": file_path,
                    "chunk_number": para_num,
                    "chunk_type": "para"
                }
            )
            doc_splits.append(langchain_doc_splits)
            para_num += 1

        return doc_splits

        # # Use WordDocumentLoader for docx files
        # loader = UnstructuredWordDocumentLoader(
        #     file_path=file_path,
        #     mode="elements",	
        # )

    else:
        # Use LLMSherpa for all other file types
        # If exception occurs, raise it

        try:
            # LLMSherpa loader (requires container nlm-ingest to be running)
            # nlm-ingest is port forward to 5010
            loader = LLMSherpaFileLoader(
                file_path=file_path,
                new_indent_parser=True,
                apply_ocr=True,
                strategy="text", # this can be "chunks" or "html".
                llmsherpa_api_url="http://localhost:5010/api/parseDocument?renderFormat=all",
            )

            docs = loader.load()

            doc_splits = text_splitter.split_documents(docs)

            return doc_splits
            
        except Exception as e:
            print(f"Error: {e}")




In [6]:
# Parsing word document
pprint(docParser("documents/word.docx")[:5])

PackageNotFoundError: Package not found at 'documents/word.docx'

In [160]:
# Parsing html / xml document
docParser("documents/website.html")[:5]

[Document(page_content='Get the Reddit app\n        \nScan this QR code to download the app now\n \n \n \n        Go to MachineLearning\n \n \n \n \nr/MachineLearning\nr/MachineLearning\n \n \n \n        ml.\nBeginners please see learnmachinelearning\n \n \n \n \nMembers\nOnline\n•\nross_prager\n \n \n  ADMIN\n \n \n  MOD\n\n [R] LLM for Word Document Parsing - optimal approach', metadata={'source': 'documents/website.html'}),
 Document(page_content='Research\nHoping to get some help from the community here!\n \n \nI want to automatically and accurately identify headings for a document and the text contained in each heading.\nThe structure of these documents varies, and the headings may even very between documents slightly.\nThe overall goal is to create a high level structure of the document that could then be re-arranged automatically.\n \n \nI am thinking that LLMs might be a good way to do this, but wondering if any one has specific recommendations about how best to approach this.\

In [171]:
# Parsing pdf document
docParser("documents/mcbook-user-guide.pdf")[:5]

Loaded 1 documents
doc: page_content='Congratulations, you and your MacBook Air were made for each other.\nBuilt-in iSight camera\nVideo chat with up to three friends anywhere in the world at the same time.\nwww.apple.com/macbookair Mac Help isight\n | Finder | Time Machine\n | --- | ---\n | Browse the contents of your computer using Cover Flow. | Automatically back up your files to an extra hard drive.\n\nwww.apple.com/macosx Mac Help finder www.apple.com/macosx Mac Help time machine\nGarageBand\nCreate music by adding musicians to a virtual stage.\nEnhance your song to sound like a pro.\nwww.apple.com/ilife/garageband GarageBand Help record iPhoto Help photo\nwww.apple.com/ilife/iphoto\niWeb\nCreate beautiful websites with photos, movies, blogs, podcasts, and dynamic web widgets.\nwww.apple.com/ilife/iweb iWeb Help website\niPhoto\nOrganize all your photos with Events.\nPublish to a Web Gallery with a click.\niMovie\nCollect all your video in one library.\nCreate and share movies in 

[Document(page_content='Congratulations, you and your MacBook Air were made for each other.\nBuilt-in iSight camera\nVideo chat with up to three friends anywhere in the world at the same time.\nwww.apple.com/macbookair Mac Help isight\n | Finder | Time Machine\n | --- | ---\n | Browse the contents of your computer using Cover Flow. | Automatically back up your files to an extra hard drive.', metadata={'source': 'documents/mcbook-user-guide.pdf'}),
 Document(page_content='www.apple.com/macosx Mac Help finder www.apple.com/macosx Mac Help time machine\nGarageBand\nCreate music by adding musicians to a virtual stage.\nEnhance your song to sound like a pro.\nwww.apple.com/ilife/garageband GarageBand Help record iPhoto Help photo\nwww.apple.com/ilife/iphoto\niWeb\nCreate beautiful websites with photos, movies, blogs, podcasts, and dynamic web widgets.\nwww.apple.com/ilife/iweb iWeb Help website\niPhoto\nOrganize all your photos with Events.\nPublish to a Web Gallery with a click.\niMovie\nC

#### Using doc_splits as retriever

In [7]:
# loop through each document in the data folder
# and run each document through the docParser function

combined_doc_splits = []

for doc in os.listdir("documents"):
    doc_path = os.path.join("documents", doc)
    print(f"Processing {doc_path}")
    doc_splits = docParser(doc_path)
    print(f"Number of splits: {len(doc_splits)}")
    print("\n")
    combined_doc_splits.extend(doc_splits)

combined_doc_splits[:5]

Processing documents/TREC-2017-LiveQA-Medical-Train-1.xml
Number of splits: 4088


Processing documents/website.html
Number of splits: 4


Processing documents/mcbook-user-guide.pdf
Number of splits: 258




[Document(page_content='Nl Question Q1 11373\nMessage\nLiterature on Cardiac amyloidosis.\nPlease let me know where I can get literature on Cardiac amyloidosis.\nMy uncle died yesterday from this disorder.\nSince this is such a rare disorder, and to honor his memory, I would like to distribute literature at his funeral service.\nI am a retired NIH employee, so I am familiar with the campus in case you have literature at NIH that I can come and pick up.\nThank you\nSu Questions\nSu Question Q1-S1\nAnnotations\nFocus\ncardiac amyloidosis\nType\ninformation\nAnswers\nAnswer Q1-S1-A1 1\nCardiac amyloidosis is a disorder caused by deposits of an abnormal protein (amyloid) in the heart tissue.\nThese deposits make it hard for the heart to work properly.\nAnswer Q1-S1-A2 2', metadata={'source': 'documents/TREC-2017-LiveQA-Medical-Train-1.xml'}),
 Document(page_content='Answer Q1-S1-A2 2\nThe term "amyloidosis" refers not to a single disease but to a collection of diseases in which a protein-b

In [8]:
# stores the parsed documents in the Neo4j database
vectorstore = Neo4jVector.from_documents(
    documents=combined_doc_splits,
    url=config["neo4j_url"],
    username=config["neo4j_username"],
    password=config["neo4j_password"],
    embedding=embeddings,
    index_name="parsers_trial_2",
    node_label="parsersTrial2",
    pre_delete_collection=True,
)

In [114]:
docParser("documents/website.html")

[Document(page_content='Get the Reddit app\n        \nScan this QR code to download the app now\n \n \n \n        Go to MachineLearning\n \n \n \n \nr/MachineLearning\nr/MachineLearning\n \n \n \n        ml.\nBeginners please see learnmachinelearning\n \n \n \n \nMembers\nOnline\n•\nross_prager\n \n \n  ADMIN\n \n \n  MOD\n\n [R] LLM for Word Document Parsing - optimal approach', metadata={'source': 'documents/website.html'}),
 Document(page_content='Research\nHoping to get some help from the community here!\n \n \nI want to automatically and accurately identify headings for a document and the text contained in each heading.\nThe structure of these documents varies, and the headings may even very between documents slightly.\nThe overall goal is to create a high level structure of the document that could then be re-arranged automatically.\n \n \nI am thinking that LLMs might be a good way to do this, but wondering if any one has specific recommendations about how best to approach this.\