* Preparations

In [1]:
# Load the `dotenv` IPython extension, which provides the %dotenv line magic used below.
%load_ext dotenv
# %reload_ext dotenv
# Read environment variables from the two dotenv files located one directory above
# the kernel's working directory.
# NOTE(review): later cells read API_GATEWAY_KEY and
# ASSIGNMENT_2__SERVICE_2__CHROMA_DB_COLLECTION_NAME via os.getenv; presumably they
# are defined in these files - confirm. Those cells also call load_dotenv('.env') /
# load_dotenv('.secrets') relative to the working directory itself, not its parent.
%dotenv ../.env
%dotenv ../.secrets

* Load PDF Document: 'A Guide to Air Quality and Your Health'

In [2]:
from langchain_community.document_loaders import PyPDFLoader

# Source document: the AirNow air-quality brochure, fetched by URL.
# This file is also provided in the ./service_2/data folder
file_url = "https://www.airnow.gov/sites/default/files/2018-04/aqi_brochure_02_14_0.pdf"

# Load the PDF; `docs` holds one entry per page (see the page count printed below).
loader = PyPDFLoader(file_url)
docs = loader.load()

# Full text of the brochure: every page's text followed by a newline, in page order.
page_texts = [page.page_content + "\n" for page in docs]
article_text = "".join(page_texts)

In [3]:
# Sanity check: report how many page-level documents the loader produced.
page_count = len(docs)
print(f"Number of pages: {page_count}")
# Uncomment to inspect the first page in full:
# print(f"Page 1:")
# print(docs[0].model_dump())

Number of pages: 12


* Split 'A Guide to Air Quality and Your Health' into chunks

In [4]:
from langchain_text_splitters  import RecursiveCharacterTextSplitter

# Chunking parameters. Sizes are counted in characters because the splitter's
# length_function is the built-in len.
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 200

splitter_options = dict(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    # Per the LangChain docs this records each chunk's start offset in its
    # metadata - confirm against the installed version.
    add_start_index=True,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split every page-level document into overlapping chunks and report the counts.
chunks = text_splitter.split_documents(docs)
print(f'Split {len(docs)} documents (pages) into {len(chunks)} chunks.')

Split 12 documents (pages) into 21 chunks.


In [5]:
# Plain text of every chunk, in chunk order. This list is the input to the
# embeddings request and the `documents` argument when the collection is filled.
chunk_docs = [chunk.page_content for chunk in chunks]

# Last expression of the cell: displayed as the cell output.
chunk_docs

['A Guide to Air Quality and \nYour Health\nRecycled/recyclable. Printed with vegetable oil-based \ninks on 100% postconsumer process, chlorine-free \nrecycled paper.\nU.S. Environmental Protection Agency\nOffice of Air Quality Planning and Standards\nOutreach and Information Division\nResearch Triangle Park, NC\nFebruary 2014\nEPA-456/F-14-002',
 '“Local air quality is very \nunhealthy today.” \n“It’s a code red day \nfor ozone.”\n“Particle pollution levels are \nforecast to be unhealthy \nfor sensitive groups.”\nYou may hear these alerts on radio \nor TV or read them in the newspaper. \nBut what do they mean if you: \n\x84\tAre active outdoors?\n\x84\tHave children who play outdoors?\n\x84\tAre an older adult?\n\x84\tHave heart or lung disease?\nThis booklet will help you understand \nhow to find out about air quality in \nyour area and protect your health.\n1',
 'Air Quality Index\nWhy is air quality \nimportant?\nLocal air quality affects how \nyou live and breathe. Like \nthe weat

* Create Embeddings:

In [None]:
from openai import OpenAI
import numpy as np

from dotenv import load_dotenv
import os

# Load variables from the .secrets file (path is relative to the working directory).
load_dotenv('.secrets')

# Fail fast with a clear message: without this check a missing key would be passed
# as None into the request headers and only surface later as an obscure error.
api_gateway_key = os.getenv('API_GATEWAY_KEY')
if not api_gateway_key:
    raise RuntimeError(
        "API_GATEWAY_KEY is not set; define it in .secrets or export it before running this cell."
    )

# OpenAI-compatible client pointed at the API gateway.
# NOTE(review): the placeholder api_key suggests the gateway authenticates through
# the x-api-key header only - confirm.
client = OpenAI(
    base_url='https://k7uffyg03f.execute-api.us-east-1.amazonaws.com/prod/openai/v1',
    api_key='any value',
    default_headers={"x-api-key": api_gateway_key},
)

# Embed all chunk texts in a single request.
response = client.embeddings.create(
    input=chunk_docs,
    model="text-embedding-3-small",
)

# One embedding vector and one synthetic id ("id0", "id1", ...) per chunk.
embeddings = [item.embedding for item in response.data]
ids = [f"id{i}" for i in range(len(chunk_docs))]

# ids, embeddings and chunk_docs are paired positionally when stored, so their
# lengths must agree.
assert len(embeddings) == len(chunk_docs), (
    f"expected {len(chunk_docs)} embeddings, got {len(embeddings)}"
)

In [7]:
# Number of embedding vectors returned.
vector_count = len(embeddings)
print(vector_count)

# Dimensionality of a single vector (taken from the first one).
first_vector = embeddings[0]
print(len(first_vector))

# Ids generated for the chunks.
print(ids)

# Stack the vectors into a NumPy array (one row per vector) and print it.
embeddings_array = np.array(embeddings)
print(embeddings_array)


21
1536
['id0', 'id1', 'id2', 'id3', 'id4', 'id5', 'id6', 'id7', 'id8', 'id9', 'id10', 'id11', 'id12', 'id13', 'id14', 'id15', 'id16', 'id17', 'id18', 'id19', 'id20']
[[ 0.02584506  0.0141626   0.01622262 ... -0.00199056 -0.0203562
  -0.01673762]
 [ 0.04857565  0.024065   -0.01668395 ... -0.01253385 -0.01714353
   0.00559497]
 [ 0.01589119 -0.0115022   0.00813478 ... -0.0052813  -0.02109997
   0.0411153 ]
 ...
 [ 0.01006549 -0.00708064  0.00279258 ... -0.00648245 -0.00417819
   0.02419625]
 [ 0.03147809 -0.0247582   0.01532846 ... -0.00229072 -0.01272125
   0.03328466]
 [ 0.02785994 -0.01708201  0.0160398  ... -0.01362493 -0.0232844
   0.0057512 ]]


* Create Collection in ChromaDB with File Persistence:

In [20]:
import chromadb
from chromadb.config import Settings
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

from dotenv import load_dotenv
import os

# Directory where ChromaDB persists the collection on disk.
PERSIST_DIR = "./data/chroma_db"

# Load configuration and secrets (paths are relative to the working directory).
load_dotenv(".env")
load_dotenv(".secrets")
ASSIGNMENT_2__SERVICE_2__CHROMA_DB_COLLECTION_NAME = os.getenv("ASSIGNMENT_2__SERVICE_2__CHROMA_DB_COLLECTION_NAME")

# Fail fast if required settings are missing, before touching the database.
missing_settings = [
    setting
    for setting in ("ASSIGNMENT_2__SERVICE_2__CHROMA_DB_COLLECTION_NAME", "API_GATEWAY_KEY")
    if not os.getenv(setting)
]
if missing_settings:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_settings)}"
    )

# Create a persistent client (disk)
chroma_client = chromadb.PersistentClient(path=PERSIST_DIR)

print(f"Creating collection: '{ASSIGNMENT_2__SERVICE_2__CHROMA_DB_COLLECTION_NAME}'")

# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when the collection already exists in PERSIST_DIR from an earlier run.
collection = chroma_client.get_or_create_collection(
    name=ASSIGNMENT_2__SERVICE_2__CHROMA_DB_COLLECTION_NAME,
    embedding_function=OpenAIEmbeddingFunction(
        api_key="any value",
        model_name="text-embedding-3-small",
        api_base='https://k7uffyg03f.execute-api.us-east-1.amazonaws.com/prod/openai/v1',
        default_headers={"x-api-key": os.getenv('API_GATEWAY_KEY')},
    ),
)

# upsert inserts new ids and overwrites existing ones, so a re-run does not fail on
# duplicate ids. Ids left over from an earlier run with more chunks are NOT removed;
# call chroma_client.delete_collection(...) first when a clean rebuild is needed.
collection.upsert(
    embeddings=embeddings,
    documents=chunk_docs,
    ids=ids,
)

# Last expression of the cell: show the collections now present in the store.
chroma_client.list_collections()


Creating collection: 'guide_to_air_quality_and_health'


[Collection(name=guide_to_air_quality_and_health)]