In [65]:
from langchain_community.document_loaders import PyPDFLoader
# Load the PDF as page-level Documents without chunking them here.
# Chunking is left to the RecursiveCharacterTextSplitter configured in the
# following cell; load_and_split() would first chunk the pages with the
# loader's default splitter, so the text would end up being split twice.
loaded_pdf = PyPDFLoader('output.pdf').load()

In [66]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Chunk the loaded pages: chunk_size=2500 with chunk_overlap=1000, trying the
# separators in the listed order (blank line, newline, space, then "").
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=1000,
    chunk_size=2500,
)

documents = text_splitter.split_documents(loaded_pdf)
documents

[Document(metadata={'producer': 'PDFKit', 'creator': 'PDFKit', 'creationdate': '2025-05-19T12:55:43+00:00', 'source': 'output.pdf', 'total_pages': 69, 'page': 0, 'page_label': '1'}, page_content='Comprehensive Order Report\nThis report provides a detailed overview of selected orders, including statuses, client \ninformation, and key shipment milestones. Each subsequent page contains information \nfor an individual order to facilitate review and record-keeping.'),
 Document(metadata={'producer': 'PDFKit', 'creator': 'PDFKit', 'creationdate': '2025-05-19T12:55:43+00:00', 'source': 'output.pdf', 'total_pages': 69, 'page': 1, 'page_label': '2'}, page_content='The order Information for Order ID (65cdf655acb24b003b673350)\n- Order ID: 65cdf655acb24b003b673350\n- Status: CANCELLED\n- Contract Type: undefined\n- Contract Approved: No\n- Created By: Joseph Stazzone from Cafe Kreyol, LLC (Client ID: \n65c22c47b062b6003c932497)\n- Order item of the order id (65cdf655acb24b003b673350) are the foll

In [67]:
from langchain_ollama import OllamaEmbeddings
# Ollama-backed embedding client (model: mxbai-embed-large); it is handed to
# the vector store to embed the document chunks.
embeddings = OllamaEmbeddings(
    model="mxbai-embed-large",
)

In [68]:
from pathlib import Path

# `langchain.vectorstores` is a deprecated import path; the community package
# (already used for PyPDFLoader in this notebook) is the maintained location.
from langchain_community.vectorstores import Chroma

# On-disk location of the persisted Chroma index.
persist_path = Path("chroma_db")

if persist_path.is_dir() and any(persist_path.iterdir()):
    # Reuse the persisted index. Calling from_documents() again on an existing
    # directory would append the same chunks a second time, and the duplicates
    # would then occupy several of the retriever's result slots.
    # Delete the directory to force a rebuild (e.g. after the PDF changes).
    db = Chroma(
        persist_directory=str(persist_path),
        embedding_function=embeddings,
    )
else:
    # First run: embed every chunk and persist the new index.
    db = Chroma.from_documents(
        documents,
        embeddings,
        persist_directory=str(persist_path),
    )


In [81]:
# Retriever over the vector store using the "mmr" search type;
# k=8 means the top 8 chunks are returned to the LLM.
doc_retriever = db.as_retriever(search_type="mmr", search_kwargs=dict(k=8))

In [82]:
from langchain.prompts import ChatPromptTemplate
# Prompt template with two placeholders: {context} (filled with the retrieved
# documents) and {input} (the user's question).
prompt = ChatPromptTemplate.from_template(
    """
    You are a helpful assistant. Answer the following question based on the provided context.
    The context revolves on an order data
    <context>
    {context}
    </context>
    Question: {input} 
    """
)

In [77]:
from langchain_ollama import OllamaLLM
# LLM client for the deepseek-r1:7b model, used to generate the answers.
ollama = OllamaLLM(
    model="deepseek-r1:7b",
)

In [83]:
from langchain.chains.combine_documents import create_stuff_documents_chain
# Combine the LLM and the prompt into a single documents chain.
document_chains = create_stuff_documents_chain(llm=ollama, prompt=prompt)

In [85]:
from langchain.chains import create_retrieval_chain
# Wire the retriever in front of the documents chain, then ask one question.
retrieval_chain = create_retrieval_chain(
    retriever=doc_retriever,
    combine_docs_chain=document_chains,
)
result = retrieval_chain.invoke(
    {
        "input": "Can you tell me who is the owner of the order id number of 67ff3968ffa3e23536247a15"
    }
)
result

{'input': 'Can you tell me who is the owner of the order id number of 67ff3968ffa3e23536247a15',
 'context': [Document(metadata={'creator': 'PDFKit', 'page_label': '58', 'producer': 'PDFKit', 'source': 'output.pdf', 'total_pages': 69, 'creationdate': '2025-05-19T12:55:43+00:00', 'page': 57}, page_content='The order Information for Order ID (67fcbb158c01970d0b2427af)\n- Order ID: 67fcbb158c01970d0b2427af\n- Status: PENDING\n- Contract Type: undefined\n- Contract Approved: No\n- Created By: Leul Habte from Addissoftware (Client ID: 65a6288ddf1668003bbe3354)\n- Order item of the order id (67fcbb158c01970d0b2427af) are the following: [\n1. Order Item ID: 67fcbb158c01970d0b2427bd\n - Sample ID: 67c02e033ad54c001c8ed57b\n - Status: PENDING\n - Client Desired Amount: undefined\n - Client Desired Price: undefined\n - Price: undefined\n - Total Amount: undefined\n - Key Dates:\n   - ETA to Mill: undefined\n   - ETA at Mill Process: undefined\n   - ETA from Mill to Exporting Partner: undefined\n

In [90]:
# Sample orders; each entry carries the IDs of its order items.
orders = [
    {"orderItems": [101, 102]},
    {"orderItems": [103]},
    {"orderItems": [104, 105]},
]

# Flatten every order's item IDs into one list, preserving order.
# (The earlier explicit loop built the same list and was immediately
# overwritten by this comprehension, so only the comprehension is kept.)
item_ids = [item_id for order in orders for item_id in order["orderItems"]]
item_ids

[101, 102, 103, 104, 105]