In [None]:
# # #1 Installs
# %pip install langchain_community
# %pip install langchain_experimental
# %pip install chromadb
# %pip install langchain
# %pip install python-dotenv
# %pip install gradio
# %pip uninstall uvloop -y
# %pip install PyPDF2 -q --user
# %pip install rank_bm25

# # VERTEX SPECIFIC:
# # Install Vertex AI SDK for Python
# %pip install --quiet --upgrade google-cloud-aiplatform
# %pip install --upgrade --quiet  langchain-google-genai

# # OPEN AI SPECIFIC:
# %pip install langchain-openai

In [None]:
# # Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

In [1]:
# Imports
import os
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnableParallel
from langchain_core.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
import chromadb
from dotenv import load_dotenv, find_dotenv
import asyncio
import nest_asyncio
asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
nest_asyncio.apply()
import gradio as gr
from PyPDF2 import PdfReader

In [4]:
# # #3 Google API
# get an API key: https://ai.google.dev/gemini-api/docs/api-key
# add it to env.txt

# If using API key for LLM
# # If you cannot use .env, save the file as env and use this code to access:
_ = load_dotenv(dotenv_path='env.txt')

In [5]:
# #4
# VERTEX SPECIFIC
from google.colab import auth
from langchain.llms import VertexAI
from langchain.embeddings import VertexAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings

os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')

# @markdown Replace the required placeholder text below.
# LLMs/Embeddings models:
embedding_model = "text-embedding-preview-0409" # @param ["text-embedding-004", "text-multilingual-embedding-002","text-embedding-preview-0409", "text-multilingual-embedding-preview-0409", "textembedding-gecko@003", "textembedding-gecko-multilingual@001"]
model = "gemini-pro"  # @param ["text-bison", "gemini-pro"]
embedding_function = VertexAIEmbeddings(model_name=embedding_model)
llm = ChatGoogleGenerativeAI(model=model, temperature=0, max_output_tokens=4096, top_k=1)

  warn_deprecated(


In [None]:
# # OpenAI SPECIFIC
# import openai
# from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
# openai.api_key = os.environ['OPENAI_API_KEY']

# # LLMs/Embeddings
# # models:
# embedding_model = "text-embedding-ada-002"
# model = "gpt-3.5-turbo-0125"
# model = "gpt-4-turbo-2024-04-09"
# model = "gpt-4o"

# embedding_function = OpenAIEmbeddings(model=embedding_model, openai_api_key=openai.api_key)
# llm = ChatOpenAI(model=model, openai_api_key=openai.api_key, temperature=0.0)

In [None]:
#### INDEXING ####

In [8]:
# #5 Document ingest
# # Load the PDF and extract text
# file at: https://dhbc.ky.gov/Documents/KHBC_BuildingCode.pdf
pdf_path = "KHBC_BuildingCode.pdf"
pdf_reader = PdfReader(pdf_path)

text = ""
for page in pdf_reader.pages:
    text += page.extract_text()

In [9]:
# Split
character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=2000,
    chunk_overlap=500
)
splits = character_splitter.split_text(text)

In [11]:
chroma_client = chromadb.Client()
collection_name = "KentuckyBuildingCode"
dense_documents = [Document(page_content=text, metadata={"id": str(i), "source": "dense"}) for i, text in enumerate(splits)]
sparse_documents = [Document(page_content=text, metadata={"id": str(i), "source": "sparse"}) for i, text in enumerate(splits)]

vectorstore = Chroma.from_documents(
    documents=dense_documents,
    embedding=embedding_function,
    collection_name=collection_name,
    client=chroma_client
)

# Create dense retriever
dense_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

# Create sparse retriever
sparse_retriever = BM25Retriever.from_documents(sparse_documents, k=10)

# initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(retrievers=[dense_retriever, sparse_retriever], weights=[0.5, 0.5], c=0, k=10)

In [None]:
#### RETRIEVAL and GENERATION ####

In [12]:
# 6 retrieval and generation
# Prompt
prompt_template = PromptTemplate.from_template(
    """
    You are an assistant for question-answering tasks.

    Use the following pieces of retrieved context to provide a thorough answer to the question.\n

    Question: {question}
    Retrieved Context: {retrieved_context}

    Answer:"""
)

In [13]:
# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [14]:
# Chain it all together with LangChain
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=lambda x: format_docs(x["context"]))
    | (lambda x: prompt_template.format(question=x['question'], retrieved_context=x['context']))
    | llm
    | StrOutputParser()
)

In [15]:
rag_chain_similarity = RunnableParallel(
    {"context": dense_retriever,
     "question": RunnablePassthrough()
}).assign(answer=rag_chain_from_docs)

In [16]:
rag_chain_hybrid = RunnableParallel(
    {"context": ensemble_retriever,
     "question": RunnablePassthrough()
}).assign(answer=rag_chain_from_docs, context=lambda x: x["context"][:10])

In [17]:
# user_query = "How does SECTION 104 apply to churches?"# "What are Google's environmental initiatives?"
# user_query = "What is the contruction fee schedule cost per square foot for each occupancy type?"
user_query = "Explain section 104."

In [18]:
# Question - Submitted to the similarity / dense vector search
result_sim = rag_chain_similarity.invoke(user_query)
retrieved_docs_sim = result_sim['context']
print(f"Original Question to Similarity Search: {user_query}\n")
print(f"Final Answer:\n{result_sim['answer']}\n")
print("Doc IDs in order of retrieval:")
doc_ids_sim = [doc.metadata['id'] for doc in retrieved_docs_sim]
print(", ".join(doc_ids_sim))
print("Sources in order of retrieval:")
doc_sources_sim = [doc.metadata['source'] for doc in retrieved_docs_sim]
print(", ".join(doc_sources_sim))
print("\nRetrieved Documents:")
for i, doc in enumerate(retrieved_docs_sim, start=1):
    print(f"Document {i}: Document ID: {doc.metadata['id']} source: {doc.metadata['source']}")
    print(f"Content:\n{doc.page_content}\n")

Original Question to Similarity Search: Explain section 104.

Final Answer:


Doc IDs in order of retrieval:
4, 20, 17, 29, 8, 5, 14, 19, 40, 31
Sources in order of retrieval:
dense, dense, dense, dense, dense, dense, dense, dense, dense, dense

Retrieved Documents:
Document 1: Document ID: 4 source: dense
Content:
Section   
101 General  ................................ ................................ ...... 1 
102 Applicability  ................................ ..............................  2 
103 Department of Housing, Buildings and Construction  2 
104 Duties and Powers of Building Official  ....................  3 
105 Permits ................................ ................................ ....... 4 
107 Submittal Documents  ................................ ................  5 
108 Temporary Structures and uses  ................................ . 6 
109 Fees  ................................ ................................ ...........  6 
110 Inspections  ...................

In [19]:
# Question - Submitted to the hybrid / dense+sparse vector search
result_hy = rag_chain_hybrid.invoke(user_query)
retrieved_docs_hy = result_hy['context']
print(f"Original question to hybrid search: {user_query}\n")
print(f"Final Answer:\n{result_hy['answer']}\n")
print("Doc IDs in order of retrieval:")
doc_ids_hy = [doc.metadata['id'] for doc in retrieved_docs_hy]
print(", ".join(doc_ids_hy))
print("Sources in order of retrieval:")
doc_sources_hy = [doc.metadata['source'] for doc in retrieved_docs_hy]
print(", ".join(doc_sources_hy))
print("\nRetrieved Documents:")
for i, doc in enumerate(retrieved_docs_hy, start=1):
    print(f"Document {i}: Document ID: {doc.metadata['id']} source: {doc.metadata['source']}")
    print(f"Content:\n{doc.page_content}\n")

Original question to hybrid search: Explain section 104.

Final Answer:
Section 104 of the Kentucky Building Code explains the duties and powers of the building official. The building official is responsible for receiving applications, reviewing construction documents, and issuing permits for the erection, alteration, and moving of buildings and structures. They also inspect the premises for which such permits have been issued and enforce compliance with the provisions of the code. The building official is also responsible for keeping official records of applications received, permits and certificates issued, fees collected, reports of inspections, and notices and orders issued.

Doc IDs in order of retrieval:
4, 229, 20, 232, 17, 228, 29, 85, 8, 233
Sources in order of retrieval:
dense, sparse, dense, sparse, dense, sparse, dense, sparse, dense, sparse

Retrieved Documents:
Document 1: Document ID: 4 source: dense
Content:
Section   
101 General  ................................ .....

#### GOOGLE SIMILARITY SEARCH ONLY
No response.

Doc IDs in order of retrieval:
4, 20, 21, 22, 17, 29, 5, 23, 14, 24

Sources in order of retrieval:
dense, dense, dense, dense, dense, dense, dense, dense, dense, dense



#### GOOGLE HYBRID SEARCH
Section 104 of the Kentucky Building Code outlines the duties and powers of the building official. These duties include enforcing compliance with the provisions of the code, issuing permits, and inspecting premises for which permits have been issued. The building official also has the authority to grant modifications for individual cases, provided that the building official finds that special individual reason makes the strict letter of the code impractical, the modification is in compliance with the intent and purpose of the code, and that such modification does not lessen health, accessibility, life and fire safety or structural requirements.

Doc IDs in order of retrieval:
4, 229, 20, 232, 21, 228, 22, 85, 17, 233

Sources in order of retrieval:
dense, sparse, dense, sparse, dense, sparse, dense, sparse, dense, sparse



In [22]:
# Gradio Interface
def process_question(question):
    result_hy = rag_chain_hybrid.invoke(question)
    final_answer = result_hy['answer']
    sources = [doc.metadata['source'] for doc in result_hy['context']]
    source_list = ", ".join(sources)
    return final_answer, source_list

demo = gr.Interface(
    fn=process_question,
    inputs=gr.Textbox(label="Enter your question", value=user_query),
    outputs=[
        gr.Textbox(label="Final Answer"),
        gr.Textbox(label="Search Source")
    ],
    title="Kentucky Building Code Q&A",
    description="Enter a question about the Kentucky building code and get an answer and the retriever sources."
)

In [23]:
demo.launch(share=True, debug=True) # to add credentials: , auth=("admin", "pass1234")

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://6c2ac5b19fd5ba5302.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://6c2ac5b19fd5ba5302.gradio.live




In [None]:
gr.close_all()