In [6]:
# 1. Ingest PDF files
# 2. Extract text from the PDF files and split it into small chunks
# 3. Send the chunks to the embedding model
# 4. Save the embeddings to a vector database
# 5. Perform similarity search on the vector database to find similar documents
# 6. Retrieve the similar documents and present them to the user
# Run `pip install -r requirements.txt` to install the required packages

from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader

# Path of the local PDF to ingest, and the name of the Ollama chat model
# that is handed to ChatOllama further down in the notebook.
doc_path = "./data/BOI.pdf"
model = "llama3.2"

# Local PDF file uploads
if not doc_path:
    # Fail fast with an actionable message. Previously this branch only printed
    # the hint and the preview below then crashed with a NameError because
    # `data` had never been assigned.
    raise ValueError("Upload a PDF file")

loader = UnstructuredPDFLoader(file_path=doc_path)
data = loader.load()
print("done loading....")

# Preview the first 100 characters of the first loaded document
content = data[0].page_content
print(content[:100])

done loading....
Beneficial Ownership Information Report

Filing Instructions

Financial Crimes Enforcement Network




In [7]:
# ==== Extract Text from PDF Files and Split into Small Chunks ====

from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

# Split the loaded documents into overlapping chunks
# (chunk_size=1200, chunk_overlap=300) ready to be embedded.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=300,
    chunk_size=1200,
)
chunks = text_splitter.split_documents(data)
print("done splitting....")

# Uncomment to inspect the result of the split:
# print(f"Number of chunks: {len(chunks)}")
# print(f"Example chunk: {chunks[0]}")

done splitting....


In [8]:
# Add to vector database
import ollama

# Name of the Ollama embedding model used to vectorise the chunks.
embedding_model = "mxbai-embed-large"

# Build the embedding function first, then embed every chunk and store the
# result in a Chroma collection called "simple-rag".
embeddings = OllamaEmbeddings(model=embedding_model)
vector_data = Chroma.from_documents(
    collection_name="simple-rag",
    embedding=embeddings,
    documents=chunks,
)

print("done embedding....")

done embedding....


In [9]:
# Retrieval
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser

from langchain_ollama import ChatOllama

from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

# Chat model used both to rewrite the user question and to answer it.
llm = ChatOllama(model=model)

# Multi-query retrieval: the LLM is asked to rephrase the user's question in
# several ways, and documents are retrieved for each rephrasing. The prompt
# below is the instruction given to the LLM for that rewriting step.
QUERY_PROMPT = PromptTemplate(
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
    input_variables=["question"],
)

# Wrap the vector store's plain retriever with the multi-query behaviour.
base_retriever = vector_data.as_retriever()
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)


In [10]:
# RAG prompt: the model must answer using only the retrieved context.
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# The incoming question is fanned out to two places: the retriever (which fills
# {context} with the documents it finds) and, unchanged, to {question}.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Other questions to try:
#   "what is the document about?"
#   "how to report BOI?"
question = "what are the main points as a business owner I should be aware of?"

# Pass the question as a plain string. The previous call wrapped it in a
# one-element tuple, so the tuple's repr -- ('...',) -- was what got formatted
# into both the multi-query prompt and the answer prompt.
res = chain.invoke(question)

print(res)


Based on the provided text, here are some key points that a business owner should be aware of:

1. **Identifying Document Requirements**: Businesses must attach identifying document images to their BOIRs (Beneficial Ownership Information Reports).
2. **Prohibited Words and Phrases**: Certain words and phrases, such as "AKA", "DBA", "NMN", "NONE", and "NOT APPLICABLE", should not be used in text fields of the BOIR.
3. **Business Address vs. Residential Address**: Business owners must select either "24a" (business address) or "24b" (residential address) when reporting their current address on the BOIR.
4. **Date of Birth Format**: The date of birth should be reported in the format MM/DD/YYYY, with month, day, and year provided.
5. **Middle Name and Suffix Requirements**: Business owners must report their middle name and suffix (if applicable) for the company applicant's legal name.
6. **Last Name and First Name Requirements**: Business owners must report the company applicant's last name