In [1]:
import os
from urllib.request import urlretrieve
import numpy as np

Prepare the documents

In [2]:
# Fetch the U.S. Census Bureau reports used in this notebook and store them
# in a local "us_census" folder (created if it does not exist yet).
os.makedirs("us_census", exist_ok=True)
files = [
    "https://www.census.gov/content/dam/Census/library/publications/2022/demo/p70-178.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-017.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-016.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-015.pdf",
]
for url in files:
    # The local file name is whatever follows the last "/" of the URL.
    pdf_name = url.rsplit("/", 1)[-1]
    file_path = os.path.join("us_census", pdf_name)
    urlretrieve(url, file_path)

Split the documents into chunks

In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load every PDF found under ./us_census/ into LangChain documents.
loader = PyPDFDirectoryLoader("./us_census/")
docs_before_split = loader.load()

# Cut the loaded documents into chunks (chunk_size=1000, chunk_overlap=100)
# so each piece can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
docs_after_split = text_splitter.split_documents(docs_before_split)

# Display the first chunk as a quick sanity check of the split.
docs_after_split[0]



Document(metadata={'source': 'us_census/acsbr-015.pdf', 'page': 0}, page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015Issued September 2023Douglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to health coverage. For example, between 2021 and 2022, the labor market continued to improve, which may have affected private coverage in the United States \nduring that time.\n1 Public policy changes included \nthe renewal of the Public Health Emergency, which \nallowed Medicaid enrollees to remain covered under the Continuous Enrollment Provision.\n2 The American \nRescue Plan (ARP) enhanced Marketplace premium subsidies for those with incomes above 400 percent of the poverty level as well as for unemployed people.\n3')

Using the LocalAI server, we can leverage its embedding model to generate embeddings for our documents.

In [4]:
import requests

# Base URL of the LocalAI server. Set the LOCAL_AI_URL environment variable to
# point the notebook at another server; the address below is only the fallback
# used when that variable is not set.
LOCAL_AI_URL = os.environ.get("LOCAL_AI_URL", "http://100.64.227.106:8080")
# Name of the embedding model requested from the server.
EMBEDDING_MODEL = "text-embedding-ada-002"

def get_embedding(text: str) -> list:
    """Return the embedding of ``text`` computed by the LocalAI server.

    Sends a POST request to ``{LOCAL_AI_URL}/embeddings`` asking
    ``EMBEDDING_MODEL`` to embed ``text``.

    Args:
        text: The text to embed.

    Returns:
        The value found at ``["data"][0]["embedding"]`` in the JSON response.
        If the body is not valid JSON or does not have that shape, the
        offending payload is printed and an empty list is returned.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.post(
        f"{LOCAL_AI_URL}/embeddings",
        json={"input": text, "model": EMBEDDING_MODEL},
    )
    # Raises on 4xx/5xx responses; other status codes fall through.
    response.raise_for_status()

    # Decode the body exactly once. A non-JSON body is reported through its raw
    # text, so the fallback path cannot itself fail on a second decode attempt.
    try:
        payload = response.json()
    except ValueError:
        print(response.text)
        return []

    # Catch only the errors a malformed payload can produce, so unrelated
    # exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
    try:
        return payload["data"][0]["embedding"]
    except (KeyError, IndexError, TypeError):
        print(payload)
        return []

# Embed the first chunk as a small sample of what the server returns.
sample_text = docs_after_split[0].page_content
data_embedding = get_embedding(sample_text)

In [5]:
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import Chroma


# Point the OpenAI-compatible LangChain clients at the LocalAI server.
os.environ["OPENAI_API_BASE"] = LOCAL_AI_URL
# Literal placeholder string, not a real secret.
# NOTE(review): presumably the local server does not validate the key — confirm.
os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"

# Directory in which Chroma persists the vector store.
CHROMA_DIR = os.path.join("./chroma")
os.makedirs(CHROMA_DIR, exist_ok=True)

# Reuse the shared EMBEDDING_MODEL constant so the vector store is built with
# the same model that get_embedding() queries, instead of a duplicated literal.
embedding = OpenAIEmbeddings(model=EMBEDDING_MODEL)
# NOTE(review): re-running this cell against an already populated CHROMA_DIR
# presumably inserts the same chunks again — confirm, and clear the directory
# before a fresh run if so.
vectordb = Chroma.from_documents(
    documents=docs_after_split,
    embedding=embedding,
    persist_directory=CHROMA_DIR,
)

  warn_deprecated(


In [21]:
from langchain.chains import VectorDBQA

# Imported here so this cell runs on a fresh kernel without relying on the
# import made by the later FAISS cell.
from langchain.chains import RetrievalQA
from IPython.display import Markdown

# Re-open the Chroma store persisted in CHROMA_DIR.
vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embedding)
# LLM served by the LocalAI server through its OpenAI-compatible API.
llm = OpenAI(temperature=0, model_name="gpt-4", openai_api_base=LOCAL_AI_URL)
# Use RetrievalQA (rather than the legacy VectorDBQA) so this run and the FAISS
# run below go through the same chain and differ only in the vector store.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
)

query = "What is the survey about?"

# display as markdown
display(Markdown(qa.run(query)))




The survey is about examining changes in public coverage and uninsured rates in the United States from 2021 to 2022, focusing on the differences between Medicaid expansion and non-expansion states. The data is based on the American Community Survey (ACS) samples from January 2021 to December 2021 and January 2022 to December 2022. The survey aims to analyze the impact of Medicaid expansion on public coverage rates and the uninsured rate in different states.

Compare the result using the FAISS approach

In [22]:
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter

# Folder in which the FAISS index is stored on disk.
FAISS_DIR = "./faiss_index"
os.makedirs(FAISS_DIR, exist_ok=True)

# Re-split the original documents on newlines for the FAISS run.
# NOTE(review): the splitter and overlap differ from the Chroma run
# (RecursiveCharacterTextSplitter, overlap 100), so the two answers are not
# compared on identical chunks — confirm this is intended.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=30,
)
docs = text_splitter.split_documents(docs_before_split)

# Embed the chunks into a FAISS index, then persist it locally on disk.
vectorstore = FAISS.from_documents(docs, embedding)
vectorstore.save_local(FAISS_DIR)

In [24]:
from langchain.chains import RetrievalQA
# Load from local storage
# Reload the FAISS index that was saved to FAISS_DIR.
# NOTE(review): allow_dangerous_deserialization=True is an explicit opt-in; as
# the flag name warns, only load index folders you created yourself.
persisted_vectorstore = FAISS.load_local(
    FAISS_DIR,
    embedding,
    allow_dangerous_deserialization=True,
)

# Answer the same query through a RetrievalQA "stuff" chain backed by FAISS.
faiss_retriever = persisted_vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=faiss_retriever,
)
result = qa.run(query)

from IPython.display import Markdown
# Render the answer as markdown.
display(Markdown(result))

The survey is about examining health insurance coverage status and select subtypes (Medicaid, direct purchase, and employer-based) for the 50 states, the District of Columbia, and the 25 most populous metropolitan areas in the United States. It uses the 2021 and 2022 American Community Survey (ACS) 1-year estimates to analyze differences in health insurance coverage and year-to-year changes (2021 to 2022) across these geographies. The survey also considers the impact of demographic shifts, economic changes, and government policy changes on people's access to health coverage.