In [1]:
!pip install openai chromadb



In [27]:
from openai import OpenAI
from chromadb.utils import embedding_functions
from dotenv import load_dotenv
import os 
load_dotenv()
import chromadb

In [29]:
#creating the embedding function
# will help us embed the chopped up chunks 
 openai_ef = embedding_functions.OpenAIEmbeddingFunction(api_key = secret, model_name = "text-embedding-3-small")

In [31]:
#now we want to intialize the chroma client, chrome is the vector datbase 
#we do this so that we can actually save the database
#intializing the client with persistence 
chroma_client = chromadb.PersistentClient(path="chroma_persistent_storage")

#we are setting the name for the collection
#a collection is just the place where all these embeddings go 
collection_name = "document_qa_collection"
#now we are actually creating the collection
collection = chroma_client.get_or_create_collection(name = collection_name, embedding_function = openai_ef)

In [33]:
#method for now we are loading in all the documents from a particular directory 
#this function was previously provided by the instructor 
# Function to load documents from a directory
def load_documents_from_directory(directory_path):
    print("==== Loading documents from directory ====")
    documents = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".txt"):
            with open(
                os.path.join(directory_path, filename), "r", encoding="utf-8"
            ) as file:
                documents.append({"id": filename, "text": file.read()})
    #will load the list of documents
    return documents

In [35]:
#method for nownow splitting it into chunks 
#again this was provided by the instructor unfortunately
#overlap says  the amount of overlaps we have 
#the more overlap we have, the more context will be kept 
def split_text(text, chunk_size=1000, chunk_overlap=20):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - chunk_overlap
    return chunks

In [37]:
#actually loading in the documents 
directory_path = "./news_articles"
#loading it in
documents = load_documents_from_directory(directory_path)
#Making sure it loaded in
print(len(documents))

==== Loading documents from directory ====
21


In [39]:
#now we are splitting each document into chunks 
#this was also provided by the instructor - but there are libraries
chunked_documents = []
for doc in documents:
    chunks = split_text(doc["text"])
    print("==== Splitting docs into chunks ====")
    for i, chunk in enumerate(chunks):
        chunked_documents.append({"id": f"{doc['id']}_chunk{i+1}", "text": chunk})

==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====


In [41]:
#the function for creating the embeddings 
def get_openai_embedding(text):
    #this step actually embeds each chunk
    #the input/text is that particular chunk, and we are specifiying a certain embedding mode 
    response = client.embeddings.create(input=text, model="text-embedding-3-small")
    #getting the actual embeddings/vectors for each text chunk
    embedding = response.data[0].embedding
    print("==== Generating embeddings... ====")
    #returning the embedding/vector for sertthat particular chunk 
    return embedding

In [43]:
#now we are actually going to the embedding use the above function for the chunks we have previously made 
for doc in chunked_documents:
    #now creating a embedding field for each text chunk 
    doc["embedding"] = get_openai_embedding(doc["text"])

==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embe

In [45]:
#now let us insert each one of these embeddings into a vector database 
for doc in chunked_documents: 
    #adding it to the database 
    collection.upsert(doc["id"], documents = [doc["text"]], embeddings = [doc["embedding"]])

In [47]:
#now creating a function to query the documents that are relevant to the question
#passing in the question, and we want to the two 2 relevant documents 
def query_documents(question, n_results=2):
    #the results will store the relevant documents 
    #the query_texts field takes in the question, the n_results spcifies the amount of documents we want 
    results = collection.query(query_texts=question, n_results=n_results)
    # Extract the relevant chunks
    #results is currently a list of lists, each sublist being the list that contains all the chunks in one document 
    #we want to combine them all into one array, instead of having an array of arrays 
    relevant_chunks = [doc for sublist in results["documents"] for doc in sublist]
    return relevant_chunks


In [53]:
#now we want to generate an answer using the question and the relevant documents 
def generate_response(question, relevant_chunks): 
    #combining all the chunks to form a combined context
    #the \n\n is used to seperate the chunks when combined, so that it is easier to read for llms
    context = "\n\n".join(relevant_chunks)
    #this prompt was given by the instructor 
    #again, the \n\n provides clear structure 
    #the prompt can be modified and rewritten
    prompt = (
        "You are an assistant for question-answering tasks. Use the following pieces of "
        "retrieved context to answer the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the answer concise."
        "\n\nContext:\n" + context + "\n\nQuestion:\n" + question
    )
    #creating the response by making the client for the OpenAI chatgpt bot 
    response = client.chat.completions.create(
        model = "gpt-4o-mini",
        #this list structures the conversation with the llm
        messages = [
        #sets overall behavior and instructions for the LLM
        {
            "role": "system",
            "content": prompt},
        {
            #represents the user's actual question being asked 
            "role": "user",
            "content": question}])
    #response is the full response object returned.
    #response.choices is the list of all possible completions 
    #response[0] will get the first generation. usually there will only be one completion unless you specify otherwise 
    #.message will refer to the actual message content retrieved 
    answer = response.choices[0].message
    return answer

In [55]:
question = "tell me about databricks"
relevant_chunks = query_documents(question) 
answer = generate_response(question, relevant_chunks)
print(answer)

ChatCompletionMessage(content='Databricks is a cloud-based data analytics platform that provides solutions for big data processing and machine learning. It integrates data engineering, data science, and data analytics into a unified workspace, enabling collaborative data workflows. Recently, Databricks acquired Okera to enhance its data governance capabilities with AI-powered solutions.', refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None)
