In [1]:
import os
from urllib.request import urlretrieve
import numpy as np

Prepare the documents

In [2]:
# Download documents from the U.S. Census Bureau into the local "us_census" directory.
# The cell is safe to re-run: PDFs that are already on disk are not fetched again.
os.makedirs("us_census", exist_ok=True)
files = [
    "https://www.census.gov/content/dam/Census/library/publications/2022/demo/p70-178.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-017.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-016.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-015.pdf",
]
for url in files:
    # The local file name is the last path segment of the URL (e.g. "acsbr-015.pdf").
    file_path = os.path.join("us_census", url.rpartition("/")[2])
    if os.path.exists(file_path):
        # Already downloaded by a previous run; avoid hitting the network again.
        continue
    # Download under a temporary name and rename once complete, so an interrupted
    # transfer never leaves a truncated file at the final ".pdf" path.
    partial_path = file_path + ".part"
    urlretrieve(url, partial_path)
    os.replace(partial_path, file_path)

Split the documents into chunks

In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the PDFs found in the download directory.
loader = PyPDFDirectoryLoader("./us_census/")
docs_before_split = loader.load()

# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=100)
# that are later embedded and indexed.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
docs_after_split = text_splitter.split_documents(docs_before_split)

# Display the first chunk as a quick sanity check.
docs_after_split[0]



Document(metadata={'source': 'us_census/acsbr-015.pdf', 'page': 0}, page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015Issued September 2023Douglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to health coverage. For example, between 2021 and 2022, the labor market continued to improve, which may have affected private coverage in the United States \nduring that time.\n1 Public policy changes included \nthe renewal of the Public Health Emergency, which \nallowed Medicaid enrollees to remain covered under the Continuous Enrollment Provision.\n2 The American \nRescue Plan (ARP) enhanced Marketplace premium subsidies for those with incomes above 400 percent of the poverty level as well as for unemployed people.\n3')

Using the LocalAI server, we can leverage its embedding model to generate embeddings for our documents.

In [4]:
import requests

# Base URL of the LocalAI server. Set the LOCAL_AI_URL environment variable to point the
# notebook at a different host without editing this cell; the default is the original address.
# A trailing slash is stripped so that f"{LOCAL_AI_URL}/embeddings" never contains "//".
LOCAL_AI_URL = os.environ.get("LOCAL_AI_URL", "http://100.64.227.106:8080").rstrip("/")
# Model name sent with every embedding request.
EMBEDDING_MODEL = "text-embedding-ada-002"

def get_embedding(text: str, timeout: float = 60.0) -> list:
    """Request an embedding vector for ``text`` from the LocalAI server.

    Sends a POST to ``{LOCAL_AI_URL}/embeddings`` with ``EMBEDDING_MODEL`` as the model.

    Args:
        text: The text to embed.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout an unreachable server would block the cell forever.

    Returns:
        The value found at ``["data"][0]["embedding"]`` in the JSON response, or an
        empty list when the response body does not have that shape (the body is
        printed in that case to help with debugging).

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        requests.RequestException: On connection problems or when ``timeout`` expires.
    """
    # make http POST request to the embedding endpoint
    response = requests.post(
        f"{LOCAL_AI_URL}/embeddings",
        json={"input": text, "model": EMBEDDING_MODEL},
        timeout=timeout,
    )
    # check for HTTP codes other than 200
    response.raise_for_status()

    # Parse the body once; decoding errors raised by response.json() derive from ValueError.
    try:
        payload = response.json()
    except ValueError:
        # Not JSON at all: show the raw text instead of calling .json() again,
        # which would only raise the same error from inside the handler.
        print(response.text)
        return []

    # extract embeddings from response
    try:
        return payload["data"][0]["embedding"]
    except (KeyError, IndexError, TypeError):
        # Valid JSON but not the expected layout (missing key, empty "data", wrong type).
        print(payload)
        return []

# Embed the text of the first chunk as a small sample of what the server returns.
data_embedding = get_embedding(text=docs_after_split[0].page_content)

In [9]:
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import Chroma


# Point LangChain's OpenAI-compatible clients at the LocalAI server.
os.environ["OPENAI_API_BASE"] = LOCAL_AI_URL
# Placeholder value, not a real secret.
# NOTE(review): presumably the client only needs the variable to be set and LocalAI
# ignores its value; confirm against the server configuration.
os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"

# Directory in which Chroma persists the vector index.
CHROMA_DIR = "./chroma"
os.makedirs(CHROMA_DIR, exist_ok=True)

# Use the shared EMBEDDING_MODEL constant so this cell cannot drift from get_embedding.
embedding = OpenAIEmbeddings(model=EMBEDDING_MODEL)

# Open whatever is already persisted. Only embed and index the chunks when the store is
# empty; calling from_documents on every run would append the same chunks again and fill
# the index with duplicates. Delete CHROMA_DIR to force a rebuild after the PDFs change.
vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embedding)
if not vectordb.get(limit=1)["ids"]:
    vectordb = Chroma.from_documents(
        documents=docs_after_split,
        embedding=embedding,
        persist_directory=CHROMA_DIR,
    )

In [12]:
from langchain.chains import VectorDBQA

# Re-open the persisted Chroma index from CHROMA_DIR with the same embedding function.
vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embedding)
# The LLM is served by LocalAI as well (openai_api_base=LOCAL_AI_URL).
# NOTE(review): "gpt-4" is presumably a model alias configured on the LocalAI server; confirm.
llm = OpenAI(temperature=0, model_name="gpt-4", openai_api_base=LOCAL_AI_URL)
# NOTE(review): VectorDBQA is deprecated in newer LangChain releases in favour of
# RetrievalQA; kept here so the cell matches its existing import.
qa = VectorDBQA.from_chain_type(llm, chain_type="stuff", vectorstore=vectordb)

# Question answered from the indexed Census PDFs (typo "whay" and stray "?" fixed,
# since this exact string is what gets sent to the model).
query = "How many people live in the US with low income, and what is categorized as low income?"
print(qa.run(query))




The provided context does not contain information about the number of people living in the US with high income.
