In [1]:
import os
from urllib.request import urlretrieve
import numpy as np

Prepare the document

In [2]:
# Download documents from U.S. Census Bureau to local directory.
# Files that are already present are skipped, so re-running the notebook
# does not fetch the same PDFs again.
os.makedirs("us_census", exist_ok=True)
files = [
    "https://www.census.gov/content/dam/Census/library/publications/2022/demo/p70-178.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-017.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-016.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-015.pdf",
]
for url in files:
    # The local file name is the last path segment of the URL.
    file_path = os.path.join("us_census", url.rpartition("/")[2])
    if os.path.exists(file_path):
        continue
    # Download under a temporary name and rename once complete, so an
    # interrupted run never leaves a truncated file under the final *.pdf name.
    partial_path = file_path + ".part"
    urlretrieve(url, partial_path)
    os.replace(partial_path, file_path)

Split the document into chunks

In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load every PDF found under ./us_census/.
loader = PyPDFDirectoryLoader("./us_census/")
docs_before_split = loader.load()

# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=100).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
docs_after_split = text_splitter.split_documents(docs_before_split)

# Display the first chunk as a quick sanity check.
docs_after_split[0]



Document(metadata={'source': 'us_census/acsbr-015.pdf', 'page': 0}, page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015Issued September 2023Douglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to health coverage. For example, between 2021 and 2022, the labor market continued to improve, which may have affected private coverage in the United States \nduring that time.\n1 Public policy changes included \nthe renewal of the Public Health Emergency, which \nallowed Medicaid enrollees to remain covered under the Continuous Enrollment Provision.\n2 The American \nRescue Plan (ARP) enhanced Marketplace premium subsidies for those with incomes above 400 percent of the poverty level as well as for unemployed people.\n3')

Using the LocalAI server, we can leverage its embedding model to generate embeddings for our documents.

In [4]:
import requests

# Base URL of the LocalAI server. Set the LOCAL_AI_URL environment variable to
# point the notebook at a different host; the default is the original address.
LOCAL_AI_URL = os.environ.get("LOCAL_AI_URL", "http://100.64.227.106:8080")
# Name of the embedding model requested from the server.
EMBEDDING_MODEL = "text-embedding-ada-002"

def get_embedding(text: str, timeout: float = 60.0):
    """Request an embedding for ``text`` from the embedding endpoint.

    Sends a POST to ``{LOCAL_AI_URL}/embeddings`` with ``text`` as the input
    and ``EMBEDDING_MODEL`` as the model.

    Args:
        text: The text to embed.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``, so
            an unreachable server cannot hang the notebook indefinitely.

    Returns:
        The value found at ``data[0]["embedding"]`` in the JSON response, or
        an empty list when the response does not have that shape (the parsed
        payload is printed in that case to aid debugging).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
        ValueError: If the response body is not valid JSON.
    """
    # make http POST request to the embedding endpoint
    response = requests.post(
        f"{LOCAL_AI_URL}/embeddings",
        json={"input": text, "model": EMBEDDING_MODEL},
        timeout=timeout,
    )
    # raise on HTTP error status codes (4xx/5xx)
    response.raise_for_status()
    # Parse the body exactly once, so the fallback branch below can print the
    # payload without re-parsing (and possibly re-raising) inside the handler.
    payload = response.json()
    try:
        return payload["data"][0]["embedding"]
    except (KeyError, IndexError, TypeError):
        # Unexpected response shape: show what came back, return "no embedding".
        print(payload)
        return []

# Embed the text of the first chunk as a small end-to-end check of the endpoint.
sample_text = docs_after_split[0].page_content
data_embedding = get_embedding(sample_text)

In [5]:
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import Chroma


# Point the OpenAI-compatible LangChain clients at the LocalAI server.
os.environ["OPENAI_API_BASE"] = LOCAL_AI_URL
# Placeholder value rather than a real secret; presumably the local server
# does not validate it -- TODO confirm.
os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"

# Directory in which Chroma persists the collection.
CHROMA_DIR = "./chroma"
os.makedirs(CHROMA_DIR, exist_ok=True)

# Reuse EMBEDDING_MODEL so this client and get_embedding() cannot drift apart.
embedding = OpenAIEmbeddings(model=EMBEDDING_MODEL)
# NOTE(review): re-running this cell against an existing CHROMA_DIR may add the
# same chunks again -- verify before relying on result counts.
vectordb = Chroma.from_documents(
    documents=docs_after_split,
    embedding=embedding,
    persist_directory=CHROMA_DIR,
)

  warn_deprecated(


In [6]:
from langchain.chains import VectorDBQA

from IPython.display import Markdown, display
from langchain.chains import RetrievalQA

# Re-open the collection stored under CHROMA_DIR with the same embedding function.
vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embedding)
llm = OpenAI(temperature=0, model_name="gpt-4", openai_api_base=LOCAL_AI_URL)
# Use RetrievalQA (as the FAISS and pgvector sections do) instead of the
# deprecated VectorDBQA, so all three stores are queried through the same chain.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
)

query = "What is the survey about?"

# display as markdown
display(Markdown(qa.run(query)))


  warn_deprecated(
  warn_deprecated(


The survey is about examining changes in public coverage and uninsured rates in the United States from 2021 to 2022, focusing on the differences between Medicaid expansion and non-expansion states. The data is based on the American Community Survey (ACS) samples from January 2021 to December 2021 and January 2022 to December 2022. The survey aims to analyze the impact of Medicaid expansion on public coverage rates and the uninsured rate in different states.

Compare the result using the FAISS approach

In [7]:
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter

# Directory the FAISS index is saved to.
FAISS_DIR = "./faiss_index"
os.makedirs(FAISS_DIR, exist_ok=True)

# Split the loaded documents on newlines (chunk_size=1000, chunk_overlap=30).
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=30,
)
docs = text_splitter.split_documents(documents=docs_before_split)

# Create vectors
vectorstore = FAISS.from_documents(docs, embedding)
# Persist the vectors locally on disk
vectorstore.save_local(FAISS_DIR)

In [8]:
from langchain.chains import RetrievalQA
# Load from local storage
# Load the index back from local storage.
# NOTE: allow_dangerous_deserialization=True opts in to unsafe (pickle-based)
# loading; only use it on index files this notebook wrote itself.
persisted_vectorstore = FAISS.load_local(
    FAISS_DIR,
    embedding,
    allow_dangerous_deserialization=True,
)

# Use RetrievalQA chain for orchestration
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=persisted_vectorstore.as_retriever(),
)

query2 = "give me a brief summary of the paper, and note key take aways and key findings"
result = qa.run(query2)

from IPython.display import Markdown
# display as markdown
display(Markdown(result))

This paper examines health insurance coverage status and types in the United States during 2021 and 2022, using the American Community Survey (ACS) 1-year estimates. It focuses on the differences in coverage status and select subtypes (Medicaid, direct purchase, and employer-based) for the 50 states, the District of Columbia, and the 25 most populous metropolitan areas. The paper highlights the impact of national policies, such as the renewal of the Public Health Emergency and the American Rescue Plan (ARP), as well as state-level policies on health insurance coverage. Key takeaways and findings include:

1. The labor market's improvement between 2021 and 2022 may have affected private coverage in the United States.
2. Public policy changes, such as the renewal of the Public Health Emergency and the ARP, enhanced Marketplace premium subsidies.
3. Missouri and Oklahoma expanded Medicaid eligibility under the Patient Protection and Affordable Care Act (ACA) in 2022, leaving only twelve states without expanded Medicaid eligibility.
4. Kentucky, Maine, and New Mexico created state-based health insurance marketplaces in November 2021.
5. State and federal policies designed to increase public coverage may affect the supply and demand for private coverage, leading to various changes in coverage rates.

Store the embeddings in pgvector for later use

In [9]:
from langchain_core.documents import Document
from langchain.vectorstores.pgvector import PGVector
from langchain.vectorstores.pgvector import DistanceStrategy

from urllib.parse import quote_plus

# Expects a reachable Postgres instance with the pgvector extension enabled.
# Each setting can be overridden through an environment variable of the same
# name, so real credentials never have to be written into the notebook; the
# defaults are the original local-development values.
PG_USER = os.environ.get("PG_USER", "pguser")
PG_PASS = os.environ.get("PG_PASS", "pgpass")
PG_HOST = os.environ.get("PG_HOST", "localhost")
PG_PORT = os.environ.get("PG_PORT", "5432")
PG_DB_NAME = os.environ.get("PG_DB_NAME", "langchain")
# Percent-encode user and password so characters such as "@", ":" or "/" in a
# real credential cannot corrupt the URL (a no-op for the defaults).
CONNECTION_STRING = (
    f"postgresql+psycopg://{quote_plus(PG_USER)}:{quote_plus(PG_PASS)}"
    f"@{PG_HOST}:{PG_PORT}/{PG_DB_NAME}"
)  # Uses psycopg3!
collection_name = "rag_us_census"

# Embed the chunks and store them in the named collection using cosine distance.
db = PGVector.from_documents(
    documents=docs_after_split,
    embedding=embedding,
    collection_name=collection_name,
    distance_strategy=DistanceStrategy.COSINE,
    connection_string=CONNECTION_STRING,
)

  warn_deprecated(


In [10]:
from langchain.schema import Document

# Fetch the k=3 most similar documents
docs = db.similarity_search(query, k=3)

# Take the first returned document and unpack its text and metadata.
doc = docs[0]
doc_content, doc_metadata = doc.page_content, doc.metadata

# Show at most the first 500 characters of the text.
print(f"Content snippet:{doc_content[:500]}")
    

Content snippet:3
In addition to national policies, individual states and the District of Columbia can affect health insurance coverage by making Marketplace or Medicaid more accessible and affordable. This variation may be more or less pronounced across states. Missouri and 
Oklahoma expanded Medicaid eligibility under the 
Patient Protection and Affordable Care Act (ACA) in 2022, leaving only twelve states without expanded Medicaid eligibility, primarily in the South and parts 
1 The Bureau of Labor Statistic


In [11]:
from langchain.chains import RetrievalQA
from IPython.display import Markdown, display

# Retriever over the pgvector store, configured with k=3.
retriever = db.as_retriever(search_kwargs={"k": 3})

qa_stuff = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True,
)

# Ask both questions in turn and render each answer as markdown.
for question in (query, query2):
    response = qa_stuff.run(question)
    display(Markdown(response))



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


The survey is about the changes in health insurance coverage status and type in the United States from 2021 to 2022, focusing on how demographic shifts, economic factors, and government policies have affected access to health coverage during that time period.



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


This paper examines the impact of demographic shifts, economic changes, and government policies on health insurance coverage in the United States between 2021 and 2022. Key takeaways and findings include:

1. National policies, as well as individual state policies, can affect health insurance coverage through measures like expanding Medicaid eligibility and creating state-based health insurance marketplaces.
2. The labor market's improvement between 2021 and 2022 may have influenced private coverage in the United States during that time.
3. Public policy changes, such as the renewal of the Public Health Emergency and the American Rescue Plan (ARP) enhancing Marketplace premium subsidies, have also affected health insurance coverage.
4. There is a variety of changes in coverage rates due to these factors, and further research and analysis are needed to understand the full impact of these changes on the U.S. healthcare system.