In [6]:
!pip install langchain-text-splitters



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [10]:
import os 
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, OpenAIEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_classic.chains import RetrievalQA
from langchain_classic.prompts import PromptTemplate
     

In [11]:
from urllib.request import urlretrieve

In [12]:
files = [
    "https://www.census.gov/content/dam/Census/library/publications/2022/demo/p70-178.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-017.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-016.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-015.pdf",
]
os.makedirs("us_census", exist_ok=True)

In [15]:
import os
import urllib.request

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}

for url in files:
    file_path = os.path.join("us_census", url.rpartition("/")[2])
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as response, open(file_path, 'wb') as out_file:
        out_file.write(response.read())

In [16]:
loader = PyPDFDirectoryLoader("./us_census/")

In [18]:
docs_before_split = loader.load()

In [19]:
docs_before_split[0]

Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.2 (Windows)', 'creationdate': '2023-09-09T07:52:17-04:00', 'author': 'U.S. Census Bureau', 'keywords': 'acsbr-015', 'moddate': '2023-09-12T14:44:47+01:00', 'title': 'Health Insurance Coverage Status and Type by Geography: 2021 and 2022', 'trapped': '/false', 'source': 'us_census/acsbr-015.pdf', 'total_pages': 18, 'page': 0, 'page_label': '1'}, page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015\nIssued September 2023\nDouglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to \nhealth coverage. For example, between 2021 and 2022, \nthe labor market continued to improve, which may \nhave affected private coverage in the United States \nduring that time.\n1 Public policy changes included \nthe renewal of the Public Health Emergency, w

In [20]:
text_splitter =  RecursiveCharacterTextSplitter(
    chunk_size =700,
    chunk_overlap = 50
)
docs_after_split = text_splitter.split_documents(docs_before_split)


In [21]:
len(docs_after_split[0].page_content)

696

In [22]:
avg_doc_length = lambda docs: sum([len(doc.page_content) for doc in docs])//len(docs)

In [23]:
avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

In [24]:
print(f'before split: {avg_char_before_split}')
print(f'after split: {avg_char_after_split}')

before split: 3840
after split: 624


In [27]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name= "sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs = {'device' : 'cpu'},
    encode_kwargs = {'normalize_embeddings' : True}
)

In [28]:
huggingface_embeddings.embed_query("Hello world!")

[0.004552517086267471,
 0.17278987169265747,
 0.03477637469768524,
 0.005749532952904701,
 -0.02632882632315159,
 -0.04085705056786537,
 0.02265790104866028,
 -0.04465893656015396,
 -0.01880367286503315,
 0.008832558058202267,
 0.040252573788166046,
 -0.03470901399850845,
 0.015151005238294601,
 -0.01465737447142601,
 0.07468860596418381,
 -0.043932680040597916,
 -0.05603860691189766,
 0.020306672900915146,
 -0.05812905728816986,
 -0.04611074551939964,
 0.08254799991846085,
 0.10951980948448181,
 0.014446157030761242,
 0.025379348546266556,
 -0.08089369535446167,
 0.01504391711205244,
 -0.003515387186780572,
 0.01302985567599535,
 0.09713467210531235,
 -0.061469580978155136,
 -0.027697794139385223,
 0.0014511918416246772,
 0.08353216201066971,
 0.017021698877215385,
 -0.010031245648860931,
 0.08817801624536514,
 0.05356217548251152,
 -0.039075642824172974,
 0.02845214121043682,
 -0.05871637910604477,
 0.0240711010992527,
 -0.03864147886633873,
 -0.04744642972946167,
 0.0030063327867537

In [30]:
vectorstore = FAISS.from_documents(docs_after_split, huggingface_embeddings)

In [31]:
import os 
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, OpenAIEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_classic.chains import RetrievalQA
from langchain_classic.prompts import PromptTemplate
     

In [32]:
import os 
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, OpenAIEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_classic.chains import RetrievalQA
from langchain_classic.prompts import PromptTemplate
     

In [58]:
query = "What are the trends in median household income across different states in the united states between 2021 and 2022?"


In [34]:
relevant_documents = vectorstore.similarity_search(query)

In [35]:
retriever = vectorstore.as_retriever(search_type="similarity" , search_kwargs={"k" : 3})

In [None]:
access_token = "xxxx"

In [38]:
from langchain_community.llms import HuggingFaceHub

hf = HuggingFaceHub(
    repo_id='mistralai/Mistral-7B-v0.1',
    model_kwargs = {"temperature" : 0.1 , "max_length" : 500},huggingfacehub_api_token=access_token
)

In [None]:
import getpass
import os

if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = "xxxxxxxxxxxxxx"
    from langchain_google_genai import ChatGoogleGenerativeAI

model = ChatGoogleGenerativeAI(
    model="gemini-3-flash-preview",
    temperature=1.0,  # Gemini 3.0+ defaults to 1.0
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)



In [52]:
messages = [
    (
        "system",
        "You are a helpful assistant that translates English to French. Translate the user sentence.",
    ),
    ("human", "I love programming."),
]
ai_msg = model.invoke(messages)
ai_msg

AIMessage(content=[{'type': 'text', 'text': "J'adore la programmation.", 'extras': {'signature': 'EqcGCqQGAXLI2nwVdnqZgRhT2Ialx+JZhVVIUkr9VtIk421BLED3E30x/dcJ6cTVszyC5/KIAMN439wTCLgCbb9t5t8EvlIvu41HuiULGYJ18ja7GNCtao1NYvaTOP6XKHMIlJt2I82xHp6DkmhTo+fD1XFLiXllLcj1UB8Yb7vQKWARLaetWuf+enCnM9q1w98Muz5F/vgOpB39yEjV87bMxyQ/xjAYgSIBXYenIFn0AB7DRjHW8gMWq7yBFOvtvxCxQpPTlrD5+xxj6QhygRBXFB7HQ7+inOl0hOy/9hLJe4/0NRtzslIUEw4XGZnINGvRtyVQjyNFogUbUunMZOpqeMS6u18NBNDyNCgazGChaIcIcDhuo+IgZxXuVzSO6HNnHIapV8uQKbEJl4jknOc6zrNHtlXCns9nIxdivYzzMC7EXMjm+1NC9iEQAYgTp6IRMoVw6WrriVT7n9Fqd5kO3iEQv4k9kiaIrPXrIHYZV0hLCXZ635rKhzru7x6s1ATE41pcct4FJ5Hddw0K50TaOlLQCh6pyAv3GWBbTzQFIszWSAQDnOxidqc5eJWbEo2CI8z243dz2HhNJpQC6B4UXaAhfDSbGdTyNbpc7RC7ZKKL2PUFcopyGb4+9wQznm7Y2lx5ewJ4BeVWXmINVG+ZUWwqJrOcgtYdOjBJKoHHmsa+c5Mbu6DxgLdjpunnLnSOocrDLs9XBfkrnPNnt++B6jHUWdl1Lz69dUnUFsRrGgql1XT2PfvDbGhjDsKDxB42lZd1Cmlk4FfGOqk8W8kpWWxRlikPRALEV3dosn1EtCTf1O+ny+6lg6hwCBWBijMhOOM3XrZ0ePcUxhSLwZ/joboNdRyzFJvWeYQ8V5nK/yi13YscIuwiVWir5fLi1pQIW5

In [59]:
output = model.invoke(query)
print(output)

content=[{'type': 'text', 'text': 'The transition between 2021 and 2022 was a unique period in the U.S. economy, characterized by a strong labor market and rising wages, but heavily offset by the highest inflation rates seen in 40 years.\n\nBased on data from the **U.S. Census Bureau (American Community Survey and Current Population Survey)**, here are the key trends in median household income across the states during that period.\n\n### 1. The "Real" vs. "Nominal" Gap\nThe most significant trend during this period was the gap between **nominal income** (the actual dollar amount on a paycheck) and **real income** (purchasing power adjusted for inflation).\n*   **Nominal Growth:** In almost every state, median household incomes rose in nominal terms as employers hiked wages to attract workers during the "Great Resignation."\n*   **Real Decline:** However, when adjusted for the **7.8%–8% inflation rate** seen in 2022, the national real median household income actually **fell by about 2.3

In [53]:
prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

{context}

Question: {question}

Helpful Answer:
"""

PROMPT = PromptTemplate(
 template=prompt_template, input_variables=["context", "question"]
)
     

In [54]:
retrievalQA = RetrievalQA.from_chain_type(
    llm=model, 
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs = {"prompt" : PROMPT}
)

### USE THE RAG !!! 

In [55]:
result = retrievalQA.invoke({"query" : query})
print(result)

{'query': 'What are the trends in median household income across different states in the united states between 2021 and 2022?', 'result': 'Between 2021 and 2022, five states—Alabama, Alaska, Delaware, Florida, and Utah—showed a statistically significant increase in real median household income. In contrast, 17 states experienced a decrease in real median household income during this period. For 28 states, the District of Columbia, and Puerto Rico, the median household income in 2022 was not statistically different from 2021. Nationally, the U.S. median household income was $74,755 in 2022, representing a 0.8 percent decline from the previous year after inflation adjustments.', 'source_documents': [Document(id='17b1554b-bd95-4d8a-af78-87ebf063b643', metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.5 (Windows)', 'creationdate': '2023-10-19T11:35:38-04:00', 'author': 'U.S. Census Bureau', 'keywords': 'household income in states and metropolitan areas 2022', 'm

In [56]:
print(result.keys())

dict_keys(['query', 'result', 'source_documents'])


In [57]:
relevant_docs = result['source_documents']
print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')
print("*" * 100)
for i, doc in enumerate(relevant_docs):
    print(f"Relevant Document #{i+1}:\nSource file: {doc.metadata['source']}, Page: {doc.metadata['page']}\nContent: {doc.page_content}")
    print("-"*100)
    print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')

There are 3 documents retrieved which are relevant to the query.
****************************************************************************************************
Relevant Document #1:
Source file: us_census/acsbr-017.pdf, Page: 3
Content: hold income in 2022 was $24,112 
(Table 1 and Figure 2). Median 
household income was lower than 
the U.S. median in 30 states and 
Puerto Rico. It was higher than the 
U.S. median in 17 states and the 
District of Columbia. The medians 
for Arizona, Oregon, and Vermont 
were not statistically different from 
the U.S. median.
From 2021 to 2022, five states—
Alabama, Alaska, Delaware, Florida, 
and Utah—showed a statistically 
significant increase in real median 
household income; 17 states 
showed a decrease. Real median 
household income in 2022 was not 
statistically different from that in 
2021 for 28 states, the District of 
Columbia, and Puerto Rico  
(Table 1).
---------------------------------------------------------------------------------