In [None]:
import os 
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, OpenAIEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
     

In [12]:
from urllib.request import urlretrieve

In [13]:
files = [
    "https://www.census.gov/content/dam/Census/library/publications/2022/demo/p70-178.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-017.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-016.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-015.pdf",
]
os.makedirs("us_census", exist_ok=True)

In [14]:
for url in files:
    file_path = os.path.join("us_census", url.rpartition("/")[2])
    urlretrieve(url, file_path)

In [15]:
loader = PyPDFDirectoryLoader("./us_census/")

In [16]:
docs_before_split = loader.load()

In [22]:
docs_before_split[0]

Document(metadata={'source': 'us_census/acsbr-015.pdf', 'page': 0}, page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015\nIssued September 2023\nDouglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to \nhealth coverage. For example, between 2021 and 2022, \nthe labor market continued to improve, which may \nhave affected private coverage in the United States \nduring that time.1 Public policy changes included \nthe renewal of the Public Health Emergency, which \nallowed Medicaid enrollees to remain covered under \nthe Continuous Enrollment Provision.2 The American \nRescue Plan (ARP) enhanced Marketplace premium \nsubsidies for those with incomes above 400 percent \nof the poverty level as well as for unemployed people.3\nIn addition to national policies, individual states and \nthe District of Columbia can affect h

In [23]:
text_splitter =  RecursiveCharacterTextSplitter(
    chunk_size =700,
    chunk_overlap = 50
)
docs_after_split = text_splitter.split_documents(docs_before_split)


In [25]:
len(docs_after_split[0].page_content)

694

In [26]:
avg_doc_length = lambda docs: sum([len(doc.page_content) for doc in docs])//len(docs)

In [28]:
avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

In [29]:
print(f'before split: {avg_char_before_split}')
print(f'after split: {avg_char_after_split}')

before split: 3840
after split: 624


In [31]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name= "sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs = {'device' : 'cpu'},
    encode_kwargs = {'normalize_embeddings' : True}
)

In [33]:
huggingface_embeddings.embed_query("Hello world!")

[0.004552533384412527,
 0.17278996109962463,
 0.03477635607123375,
 0.0057495031505823135,
 -0.026328837499022484,
 -0.04085700958967209,
 0.022657906636595726,
 -0.04465891048312187,
 -0.018803752958774567,
 0.008832539431750774,
 0.040252480655908585,
 -0.034709054976701736,
 0.01515104528516531,
 -0.014657329767942429,
 0.07468857616186142,
 -0.04393269494175911,
 -0.056038498878479004,
 0.020306657999753952,
 -0.058129020035266876,
 -0.04611072316765785,
 0.08254802972078323,
 0.10951979458332062,
 0.014446097426116467,
 0.02537935972213745,
 -0.08089366555213928,
 0.015043871477246284,
 -0.003515329211950302,
 0.013029907830059528,
 0.09713467210531235,
 -0.061469580978155136,
 -0.027697762474417686,
 0.001451222226023674,
 0.0835321918129921,
 0.01702164113521576,
 -0.010031119920313358,
 0.08817797899246216,
 0.053562235087156296,
 -0.03907563164830208,
 0.028452137485146523,
 -0.05871637165546417,
 0.024071093648672104,
 -0.03864138200879097,
 -0.04744638875126839,
 0.003006309

In [34]:
vectorstore = FAISS.from_documents(docs_after_split, huggingface_embeddings)

In [69]:
query = "What are the trends in median household income across different states in the united states between 2021 and 2022?"


In [36]:
relevant_documents = vectorstore.similarity_search(query)

In [53]:
retriever = vectorstore.as_retriever(search_type="similarity" , search_kwargs={"k" : 3})

In [None]:
access_token = "****"

In [94]:
from langchain_community.llms import HuggingFaceHub

hf = HuggingFaceHub(
    repo_id='mistralai/Mistral-7B-v0.1',
    model_kwargs = {"temperature" : 0.1 , "max_length" : 500}
)

In [95]:
query 

'What were the trends in median household income across different states in the United States between 2021 and 2022.'

In [97]:
output = hf.invoke(query)
print(output)

What were the trends in median household income across different states in the United States between 2021 and 2022.

## Introduction

The median household income in the United States was $67,521 in 2021, up from $65,712 in 2020. The median household income in the United States was $67,521 in 2021, up from $65,712 in 2020. The median household income in the United States was $67,521


In [98]:
prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

{context}

Question: {question}

Helpful Answer:
"""

PROMPT = PromptTemplate(
 template=prompt_template, input_variables=["context", "question"]
)
     

In [99]:
retrievalQA = RetrievalQA.from_chain_type(
    llm=hf, 
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs = {"prompt" : PROMPT}
)

### USE THE RAG !!! 

In [100]:
result = retrievalQA.invoke({"query" : query})
print(result)

{'query': 'What were the trends in median household income across different states in the United States between 2021 and 2022.', 'result': 'Use the following pieces of context to answer the question at the end. Please follow the following rules:\n1. If you don\'t know the answer, don\'t try to make up an answer. Just say "I can\'t find the final answer but you may want to check the following links".\n2. If you find the answer, write the answer in a concise way with five sentences maximum.\n\nhold income in 2022 was $24,112 \n(Table 1 and Figure 2). Median \nhousehold income was lower than \nthe U.S. median in 30 states and \nPuerto Rico. It was higher than the \nU.S. median in 17 states and the \nDistrict of Columbia. The medians \nfor Arizona, Oregon, and Vermont \nwere not statistically different from \nthe U.S. median.\nFrom 2021 to 2022, five states—\nAlabama, Alaska, Delaware, Florida, \nand Utah—showed a statistically \nsignificant increase in real median \nhousehold income; 17 s

In [104]:
print(result.keys())

dict_keys(['query', 'result', 'source_documents'])


In [102]:
relevant_docs = result['source_documents']
print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')
print("*" * 100)
for i, doc in enumerate(relevant_docs):
    print(f"Relevant Document #{i+1}:\nSource file: {doc.metadata['source']}, Page: {doc.metadata['page']}\nContent: {doc.page_content}")
    print("-"*100)
    print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')

There are 3 documents retrieved which are relevant to the query.
****************************************************************************************************
Relevant Document #1:
Source file: us_census/acsbr-017.pdf, Page: 3
Content: hold income in 2022 was $24,112 
(Table 1 and Figure 2). Median 
household income was lower than 
the U.S. median in 30 states and 
Puerto Rico. It was higher than the 
U.S. median in 17 states and the 
District of Columbia. The medians 
for Arizona, Oregon, and Vermont 
were not statistically different from 
the U.S. median.
From 2021 to 2022, five states—
Alabama, Alaska, Delaware, Florida, 
and Utah—showed a statistically 
significant increase in real median 
household income; 17 states 
showed a decrease. Real median 
household income in 2022 was not 
statistically different from that in 
2021 for 28 states, the District of 
Columbia, and Puerto Rico  
(Table 1).
---------------------------------------------------------------------------------