In [72]:
import os

import streamlit as st
from langchain.chains import create_retrieval_chain
from PyPDF2 import PdfReader
from langchain.callbacks.base import BaseCallbackHandler
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Neo4jVector
from streamlit.logger import get_logger
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from typing import Dict, List

In [110]:
config={"ollama_base_url": "http://localhost:11434",
        "llm_name": "llama3",
        "neo4j_url": "neo4j://localhost:7687",
        "neo4j_username": "neo4j",
        "neo4j_password": "password",
        "pdf_path": "data/mcbook-user-guide.pdf",		
        }

In [111]:
# load embedding model
embeddings = OllamaEmbeddings(
    base_url=config["ollama_base_url"],	
    model=config["llm_name"]
)

In [112]:
# load the llm
llm = ChatOllama(
            temperature=0,
            base_url=config["ollama_base_url"],
            model=config["llm_name"],
            streaming=True,
            # seed=2,
            top_k=10,  # A higher value (100) will give more diverse answers, while a lower value (10) will be more conservative.
            top_p=0.3,  # Higher value (0.95) will lead to more diverse text, while a lower value (0.5) will generate more focused text.
            num_ctx=3072,  # Sets the size of the context window used to generate the next token.
        )

## Version 1 begins here.

##### TODO: This functionality can be generalised for different file types (i.e., do a check on the file extension perhaps but that may not work in i.e., Linux)

In [113]:
# load the pdf
pdf_path = config["pdf_path"]
pdf_reader = PdfReader(pdf_path)
print(len(pdf_reader.pages))

76


In [114]:
# prepare textual data from pdf for the model
text = ""
for page in pdf_reader.pages:
    text += page.extract_text()

# langchain_textspliter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, length_function=len
)


In [115]:
# Splitting into chunks and saving metadata for easier reference / verification of information

# prepare textual data from pdf for the model
text_chunks_with_metadata: List[Dict] = []
for page_num, page in enumerate(pdf_reader.pages, start=1):  # Start from page 1
    page_text = page.extract_text()
    page_chunks = text_splitter.split_text(text=page_text)
    for chunk_num, chunk in enumerate(page_chunks):
        text_chunks_with_metadata.append({
            "text": chunk,
            "page_num": page_num,
            "chunk_num": chunk_num,
            "start_pos": page_text.find(chunk),  # Find starting position within page
            "pdf_path": pdf_path
        })

In [116]:
# Store the chunks part in db (vector)
vectorstore = Neo4jVector.from_texts(
    # Texts represents the new data and will be supplemented with the initial data
    texts=[chunk_data["text"] for chunk_data in text_chunks_with_metadata],
    url=config["neo4j_url"], #database url
    username=config["neo4j_username"], #neo4j username
    password=config["neo4j_password"], #neo4j password
    embedding=embeddings, #embedding model from load_embedding_model
    index_name="pdf_reader", #index name
    node_label="pdfTrial", #node label
    # TODO: REMOVE THIS LINE to retain the vector database
    pre_delete_collection=True,  # Delete existing PDF data
    metadatas=text_chunks_with_metadata  # Pass the metadata list directly
)

In [117]:
retriever = vectorstore.as_retriever()

In [118]:
retriever.invoke("Wifi")

[Document(page_content='3\n3 Problem, Meet Solution\nwww.apple.com/support\nMac Help help', metadata={'pdf_path': 'data/mcbook-user-guide.pdf', 'page_num': 39, 'chunk_num': 0, 'start_pos': 0}),
 Document(page_content='1 \n1  \nReady, Set Up, Go\nwww.apple.com/macbookair\nMac Help Migration Assistant', metadata={'pdf_path': 'data/mcbook-user-guide.pdf', 'page_num': 7, 'chunk_num': 0, 'start_pos': 2}),
 Document(page_content='56 Chapter 3   Problem, Meet SolutionLocating  Your Product  Serial Number\nUse one of these methods to find your computer’s serial number:\nÂTurn your MacBook Air over. The serial number is etched into the case, near the \nhinge. \nÂChoose Apple ( \uf8ff) > About This Mac and then click the version number beneath the \nwords “Mac OS X.” Clicking cycles between the Mac OS X version number, the build version, and the serial number. \nÂOpen System Profiler (in /Applications/Utilities/) and click Hardware.Serial number', metadata={'pdf_path': 'data/mcbook-user-guide.pd

#### Direct querying using a basic RAG workflow (without agents and LangGraph)

In [119]:
prompt = ChatPromptTemplate.from_template("""Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}""")

# chain takes in context and input and returns the output
document_chain = create_stuff_documents_chain(llm, prompt)
retriever = vectorstore.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [120]:
response = retrieval_chain.invoke({"input": "What is the Macbook Command Center?"})
print(response)

{'input': 'What is the Macbook Command Center?', 'context': [Document(page_content='To get Mac Help:\n1Click the Finder icon in the Dock (the bar of icons along the edge of the screen).\n2Click the Help menu in the menu bar and do one of the following:\naType a question or term in the Search field, and select a topic from the returned list \nor select Show All Results to see all topics.\nbChoose Mac Help to open the Mac Help window, where you can click links or type a \nsearch question.', metadata={'pdf_path': 'data/mcbook-user-guide.pdf', 'page_num': 35, 'chunk_num': 1, 'start_pos': 814}), Document(page_content='ÂFor the latest information about Mac OS X, go to www.apple.com/macosx.\nLearning  More, Service,  and Support\nYour MacBook Air does not have any user-serviceable or user-replaceable parts. If you \nneed service, contact Apple or take your MacBook Air to an Apple Authorized Service Provider. You can find more information about the MacBook Air through online resources, onscree

In [121]:
print(response["answer"])

There is no mention of a "MacBook Command Center" in the provided context. The context only discusses various ways to access help and support for the MacBook Air, including using the Finder icon, the Help menu, and online resources.


In [122]:
print(response.keys())

dict_keys(['input', 'context', 'answer'])


In [123]:
# obtain the documents referenced
print(">> Context")
context = response["context"]
print("Context:", context)
print("Context length:", len(context))
print()
print(">> Documents fetched (context, broken down)")
for doc in context:
    print("Document:", doc)
    print("Metadata:", doc.metadata)
    print()

>> Context
Context: [Document(page_content='To get Mac Help:\n1Click the Finder icon in the Dock (the bar of icons along the edge of the screen).\n2Click the Help menu in the menu bar and do one of the following:\naType a question or term in the Search field, and select a topic from the returned list \nor select Show All Results to see all topics.\nbChoose Mac Help to open the Mac Help window, where you can click links or type a \nsearch question.', metadata={'pdf_path': 'data/mcbook-user-guide.pdf', 'page_num': 35, 'chunk_num': 1, 'start_pos': 814}), Document(page_content='ÂFor the latest information about Mac OS X, go to www.apple.com/macosx.\nLearning  More, Service,  and Support\nYour MacBook Air does not have any user-serviceable or user-replaceable parts. If you \nneed service, contact Apple or take your MacBook Air to an Apple Authorized Service Provider. You can find more information about the MacBook Air through online resources, onscreen help, System Profiler, or Apple Hardwa

## Version 2 begins here.

### Determining a good parser instead of PyPDF2 used earlier ^^

##### !! LLM Sherpa is good for PDF, XML, and HTML files.

In [124]:
# testing llm sherpa
from langchain_community.document_loaders.llmsherpa import LLMSherpaFileLoader

In [161]:
loader = LLMSherpaFileLoader(
    file_path="data/website.html",
    new_indent_parser=True,
    apply_ocr=True,
    strategy="chunks",
    llmsherpa_api_url="http://localhost:5010/api/parseDocument?renderFormat=all",
)
docs = loader.load()

In [162]:
from pprint import pprint

In [163]:
pprint(docs)

[Document(page_content='\nSkip to main content', metadata={'source': 'data/website.html', 'chunk_number': 0, 'chunk_type': 'para'}),
 Document(page_content='\nOpen navigation', metadata={'source': 'data/website.html', 'chunk_number': 1, 'chunk_type': 'para'}),
 Document(page_content='\nGo to Reddit Home', metadata={'source': 'data/website.html', 'chunk_number': 2, 'chunk_type': 'para'}),
 Document(page_content='\nr/MachineLearning\n \n \n \n    \n \n \nA chip\nA close button', metadata={'source': 'data/website.html', 'chunk_number': 3, 'chunk_type': 'para'}),
 Document(page_content='\nGet app\nGet the Reddit app', metadata={'source': 'data/website.html', 'chunk_number': 4, 'chunk_type': 'para'}),
 Document(page_content='\nLog In\nLog in to Reddit', metadata={'source': 'data/website.html', 'chunk_number': 5, 'chunk_type': 'para'}),
 Document(page_content='\nOpen settings menu\nLog In / Sign Up\nAdvertise on Reddit\nShop Collectible Avatars', metadata={'source': 'data/website.html', 'chu

In [173]:
pprint(docs[0])

Document(page_content='\nSkip to main content', metadata={'source': 'data/website.html', 'chunk_number': 0, 'chunk_type': 'para'})


##### !! However, for DOCX files, LLMSherpa somehow unable to read it. Instead, we use python-docx 

In [177]:
from docx import Document as wordDocument
from langchain.docstore.document import Document as LangchainDocument

file_name = "data/word.docx"
doc = wordDocument(file_name)
doc_splits = []
para_num = 0
for para in doc.paragraphs:
    # Check if para.text is empty
    if not para.text:
        continue
    
    # convert into langchain document
    # langchain document only has page_content and metadata
    # within metadata, we store the source, chunk_number, and chunk_type
    # para and chunk are used interchangeably here. can be improved.
    langchain_doc = LangchainDocument(page_content=para.text, metadata={"source": file_name, "chunk_number": para_num, "chunk_type": "para"})
    doc_splits.append(langchain_doc)
    para_num += 1

pprint(doc_splits)

[Document(page_content='[Congressional Record Volume 170, Number 41 (Thursday, March 7, 2024)]', metadata={'source': 'data/word.docx', 'chunk_number': 0, 'chunk_type': 'para'}),
 Document(page_content='[Senate]', metadata={'source': 'data/word.docx', 'chunk_number': 1, 'chunk_type': 'para'}),
 Document(page_content='[Pages S2272-S2277]', metadata={'source': 'data/word.docx', 'chunk_number': 2, 'chunk_type': 'para'}),
 Document(page_content='From the Congressional Record Online through the Government Publishing Office [www.gpo.gov]', metadata={'source': 'data/word.docx', 'chunk_number': 3, 'chunk_type': 'para'}),
 Document(page_content='                          PRESIDENTIAL MESSAGE', metadata={'source': 'data/word.docx', 'chunk_number': 4, 'chunk_type': 'para'}),
 Document(page_content='                                 ______', metadata={'source': 'data/word.docx', 'chunk_number': 5, 'chunk_type': 'para'}),
 Document(page_content='                                 ', metadata={'source':

In [179]:
# combine the output of different parsers
# each element is of type langchain document
combined_doc = docs + doc_splits

637


In [180]:
# create a vector store as before
vectorstore = Neo4jVector.from_documents(
    documents=combined_doc,
    url=config["neo4j_url"],
    username=config["neo4j_username"],
    password=config["neo4j_password"],
    embedding=embeddings,
    index_name="parsers_trial",
    node_label="parsersTrial",
    pre_delete_collection=False,
)

In [182]:
retriever = vectorstore.as_retriever()
retriever.invoke("God")

[Document(page_content='\nr/MachineLearning\n \n \n \n    \n \n \nA chip\nA close button', metadata={'chunk_number': 3, 'source': 'data/website.html', 'chunk_type': 'para'}),
 Document(page_content='predecessor to do that. We want competition', metadata={'chunk_number': 552, 'source': 'data/word.docx', 'chunk_type': 'para'}),
 Document(page_content='[Senate]', metadata={'chunk_number': 1, 'source': 'data/word.docx', 'chunk_type': 'para'}),
 Document(page_content='make it fair!', metadata={'chunk_number': 310, 'source': 'data/word.docx', 'chunk_type': 'para'})]

In [189]:
# this uses the document_chain from above
retrieval_chain = create_retrieval_chain(retriever, document_chain)
response = retrieval_chain.invoke({"input": "What does it mean to survive in society?"})

In [191]:
print("Response:", response)
print()
print("Answer:", response["answer"])
print()
# We have a list of references to documents that were used to generate the answer
print("Context:", response["context"])

Response: {'input': 'What does it mean to survive in society?', 'context': [Document(page_content="reason I've never been more optimistic about our future!", metadata={'chunk_number': 614, 'source': 'data/word.docx', 'chunk_type': 'para'}), Document(page_content='peril--ban A.I. voice impersonation--and more!', metadata={'chunk_number': 561, 'source': 'data/word.docx', 'chunk_type': 'para'}), Document(page_content='cancer as we know it!', metadata={'chunk_number': 570, 'source': 'data/word.docx', 'chunk_type': 'para'}), Document(page_content='Federal income taxes. Not anymore!', metadata={'chunk_number': 319, 'source': 'data/word.docx', 'chunk_type': 'para'})], 'answer': 'Based on the provided context, it seems that "survive" refers to living with minimal financial burdens, specifically mentioning Federal income taxes. The speaker is expressing optimism about their future and implies that they will no longer have to worry about these financial constraints.'}

Answer: Based on the provi

## Following this, we can define our own data retrieval process using the LangGraph workflow.