In [44]:
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFaceHub
import os
import numpy as np
from dotenv import load_dotenv
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

In [45]:
# Load the contents of ./us_census with PyPDFDirectoryLoader, then split the
# loaded documents using chunk_size=1000 and chunk_overlap=200.
loader = PyPDFDirectoryLoader("./us_census")
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
final_documents = text_splitter.split_documents(documents)

# Report how many chunks were produced and preview the first 200 characters
# of the first one.
first_chunk = final_documents[0]
print(
    f"Loaded {len(final_documents)} document chunks",
    "First document preview:",
    f"{first_chunk.page_content[:200]}...",
    sep="\n",
)

Loaded 316 document chunks
First document preview:
Occupation, Earnings, and Job 
Characteristics
July 2022
P70-178
Clayton Gumber and Briana Sullivan
Current Population Reports
INTRODUCTION
Work is a critical component of our lives and provides 
a wa...


In [46]:
# Display the number of chunks in final_documents as this cell's output.
len(final_documents)

316

In [47]:
# Configure the BAAI/bge-base-en-v1.5 embedding model: device is set to "cpu"
# and normalize_embeddings is switched on in the encode options.
bge_model_kwargs = {"device": "cpu"}
bge_encode_kwargs = {"normalize_embeddings": True}

huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs=bge_model_kwargs,
    encode_kwargs=bge_encode_kwargs,
)

In [48]:
# Embed the first chunk's text and display the resulting vector as a NumPy array.
# NOTE(review): embed_query is the query-side entry point — confirm whether
# embed_documents would better reflect how chunks are embedded for the index.
first_chunk_text = final_documents[0].page_content
first_chunk_vector = huggingface_embeddings.embed_query(first_chunk_text)
np.array(first_chunk_vector)

array([ 4.36537340e-02,  2.47138254e-02, -3.53545486e-03, -3.35805677e-02,
        2.86745783e-02,  3.59407738e-02,  5.79464138e-02,  1.38196014e-02,
       -1.00746090e-02, -5.16765751e-02, -2.07865424e-02, -2.06089001e-02,
       -5.45908734e-02,  1.48892449e-02, -5.10704294e-02,  1.23965926e-02,
        3.92401628e-02,  2.22907718e-02, -1.73835177e-02, -2.38548368e-02,
       -1.27717750e-02,  1.69856735e-02,  4.96503785e-02,  5.67325428e-02,
        8.19455683e-02, -4.70340177e-02, -2.89776810e-02,  8.35138280e-03,
       -3.31679806e-02,  3.59652713e-02, -2.63564549e-02, -9.66333173e-05,
        3.29935588e-02,  2.79442091e-02, -1.86275765e-02, -1.60721615e-02,
       -2.26744115e-02, -4.19455916e-02,  2.85237171e-02,  1.36059579e-02,
       -3.96715254e-02, -3.56036536e-02, -1.16034662e-02, -8.01337324e-03,
       -3.26603316e-02, -1.68721266e-02, -2.85104830e-02, -7.60240434e-03,
       -2.21429486e-02,  2.34827437e-02, -4.21315767e-02, -5.30382793e-04,
        2.46101599e-02, -

In [49]:
# Build the FAISS vector store from the first 120 chunks only.
# NOTE(review): the notebook reports 316 chunks in final_documents, so chunks
# beyond index 119 are never indexed and cannot be retrieved — confirm that
# this cap is intentional.
indexed_chunks = final_documents[:120]
vectorstore = FAISS.from_documents(indexed_chunks, huggingface_embeddings)

In [50]:
# Smoke-test the vector store: run one similarity search and preview the
# first 300 characters of the top hit.
query = "WHAT IS HEALTH INSURANCE COVERAGE?"
relevant_documents = vectorstore.similarity_search(query)

top_hit = relevant_documents[0]
print(
    "Relevant document found:",
    f"{top_hit.page_content[:300]}...",
    sep="\n",
)

Relevant document found:
18 U.S. Census Bureau
the percentage of workers with 
employer-provided health insur-
ance coverage varied between 
70 and 94 percent depending 
on their reported occupation. 
In addition, selected occupa -
tions had especially high rates of 
coverage through another person. 
Food preparation and se...


In [51]:
# Expose the vector store as a retriever using similarity search with k=3.
retriever_search_kwargs = {"k": 3}
retriever = vectorstore.as_retriever(
    search_kwargs=retriever_search_kwargs,
    search_type="similarity",
)

In [52]:
# Pull settings from a local .env file (if any) into the process environment.
load_dotenv()

# Tracing is optional: switch it on only when a LangChain API key is present.
# Assigning os.getenv(...) straight into os.environ raises
# "TypeError: str expected, not NoneType" whenever the variable is unset, and
# re-assigning LANGCHAIN_API_KEY to its own value is a no-op otherwise.
if os.getenv("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_TRACING_V2"] = "true"

# The Hugging Face token is read from HUGGINGFACE_API_KEY; fall back to an
# already-exported HUGGINGFACEHUB_API_TOKEN so either name works.
huggingface_api_token = os.getenv("HUGGINGFACE_API_KEY") or os.getenv(
    "HUGGINGFACEHUB_API_TOKEN"
)
if not huggingface_api_token:
    # Fail here with an actionable message instead of a cryptic TypeError.
    raise RuntimeError(
        "No Hugging Face token found: set HUGGINGFACE_API_KEY "
        "(or HUGGINGFACEHUB_API_TOKEN) in the environment or in a .env file."
    )
os.environ["HUGGINGFACEHUB_API_TOKEN"] = huggingface_api_token

In [53]:
# Hosted LLM handle for mistralai/Mistral-7B-Instruct-v0.1 with the generation
# settings below passed through as model_kwargs.
generation_kwargs = dict(
    temperature=0.1,
    max_new_tokens=500,
    return_full_text=False,
)
hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-Instruct-v0.1",
    model_kwargs=generation_kwargs,
)

In [54]:
# Raw prompt text; {context} and {question} are the two placeholders in it.
prompt_template = (
    "\n"
    "Use the following piece of context to answer the question asked.\n"
    "Please try to provide the answer only based on the context\n"
    "\n"
    "{context}\n"
    "Question:{question}\n"
    "\n"
    "Helpful Answers:\n"
)

In [55]:
# Wrap the template text in a PromptTemplate, declaring its two input variables.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [56]:
# Assemble the retrieval QA chain: "stuff" chain type, the custom prompt, the
# retriever, and source documents returned alongside the answer.
qa_chain_kwargs = {"prompt": prompt}
retrievalQA = RetrievalQA.from_chain_type(
    llm=hf,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=qa_chain_kwargs,
    return_source_documents=True,
)

In [57]:
# Question for the QA chain; the text spans three lines separated by newlines.
query = "DIFFERENCES IN THE\nUNINSURED RATE BY STATE\nIN 2022"

In [None]:
# Run the chain on the current query and print the "result" entry of its output.
chain_inputs = {"query": query}
result = retrievalQA.invoke(chain_inputs)

answer_text = result["result"]
print(answer_text)