In [5]:
from langchain_community.embeddings import OllamaEmbeddings,HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate
from langchain_community.document_loaders import WebBaseLoader,PyPDFLoader,PyPDFDirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import Ollama

import time
import json
from pathlib import Path



In [3]:
# Load the PDFs found under ./us_census.
loader = PyPDFDirectoryLoader("./us_census")
documents = loader.load()

# Break the loaded documents into smaller chunks
# (chunk_size=500, chunk_overlap=50) for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
document_final = text_splitter.split_documents(documents)

# Display the first chunk as the cell output.
document_final[0]

Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.2 (Windows)', 'creationdate': '2023-09-09T07:52:17-04:00', 'author': 'U.S. Census Bureau', 'keywords': 'acsbr-015', 'moddate': '2023-09-12T14:44:47+01:00', 'title': 'Health Insurance Coverage Status and Type by Geography: 2021 and 2022', 'trapped': '/false', 'source': 'us_census/acsbr-015.pdf', 'total_pages': 18, 'page': 0, 'page_label': '1'}, page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015\nIssued September 2023\nDouglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to \nhealth coverage. For example, between 2021 and 2022, \nthe labor market continued to improve, which may \nhave affected private coverage in the United States \nduring that time.\n1 Public policy changes included')

In [4]:
# Number of chunks produced by the text splitter.
len(document_final)

566

In [8]:
# Sentence-transformers embedding model, run on CPU with L2-normalised vectors.
#
# HuggingFaceBgeEmbeddings prepends a BGE-specific instruction to every text
# passed to embed_query(). all-MiniLM-L6-v2 is not a BGE model and the indexed
# documents are embedded without that prefix, so the default would make query
# vectors inconsistent with document vectors. An empty query_instruction
# embeds queries exactly like documents.
hugging_face_emb = HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
    query_instruction="",
)

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Sanity check: embed the first chunk on its own.
# embed_documents() expects a list of strings; a bare string would be iterated
# character by character (one vector per character), so the text is wrapped
# in a one-element list.
emb_docs = hugging_face_emb.embed_documents([document_final[0].page_content])

# Embed every chunk and index the vectors in a FAISS vector store.
vector_store = FAISS.from_documents(document_final, hugging_face_emb)

In [32]:
# Question to ask; it is reused by the retrieval-QA cell further down.
query = (
    "median household income of New Jersey from 2021 to 2022 "
    "was it increased or decreased by how much. "
    "give me the percentage change and good explanation "
    "like you are explaining to child"
)

# Fetch the three most similar chunks from the vector store and print
# the text of the best match.
docs = vector_store.similarity_search(query, k=3)
print(docs[0].page_content)

for Arizona, Oregon, and Vermont 
were not statistically different from 
the U.S. median.
From 2021 to 2022, five states—
Alabama, Alaska, Delaware, Florida, 
and Utah—showed a statistically 
significant increase in real median 
household income; 17 states 
showed a decrease. Real median 
household income in 2022 was not 
statistically different from that in 
2021 for 28 states, the District of 
Columbia, and Puerto Rico  
(Table 1).
Median Household Income in the Past 12 Months


In [33]:
# Expose the FAISS store as a retriever using its default search settings.
retrieval = vector_store.as_retriever()

# Ollama server reached through the Docker host gateway.
llm = Ollama(
    model="Gemma3:1b",
    base_url="http://host.docker.internal:11434",
    verbose=True,
)

# "stuff" chain: the retrieved chunks are placed into a single prompt for the LLM.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retrieval,
)

# Calling the chain object directly (Chain.__call__) is deprecated;
# invoke() is the supported entry point and returns the same output dict.
result = qa_chain.invoke({"query": query})
print(result["result"])

Okay, let's break down the change in median household income for New Jersey from 2021 to 2022.

The median household income in New Jersey decreased by about 0.8% from 2021 to 2022.

It’s like if you take 0.8 of a point off a number and you’re looking at the whole thing, it means the income is a little bit lower.

**Explanation for a Child:**

Imagine you have a group of kids and you're all counting how much each kid earns.  In 2021, some kids earned a little more than in 2022.  Because there were fewer kids earning more, the total amount of money earned in New Jersey was a little bit less in 2022 than in 2021.  That's what the percentage change means.

**Important Note:** The text also tells us that there were *no* significant differences in median household income between New Jersey and the other states, the District of Columbia, and Puerto Rico.  So, the change was pretty small in New Jersey.

Do you have any other questions about this information?
