# Manual ChromaDB Setup

In [1]:
import chromadb

# Create a persistent Chroma client whose data lives in the local
# ./chroma_db directory (relative to the notebook's working directory).
# Later cells use this `client` to create and query collections.
client = chromadb.PersistentClient(path="./chroma_db")

In [2]:
# Helpers: load a PDF into documents and split them into chunks
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_pdf(pdf_path):
    """
    Read a PDF with PyPDFLoader and return the loaded documents.

    Args:
        pdf_path (str): Location of the PDF file to read

    Returns:
        list: The document pages produced by the loader's load() call
    """
    return PyPDFLoader(pdf_path).load()

def create_chunks(documents, chunk_size=800, chunk_overlap=200):
    """
    Break documents into overlapping pieces with RecursiveCharacterTextSplitter.

    Args:
        documents (list): Documents to be split
        chunk_size (int): Chunk size passed to the splitter; lengths are
            measured with len(), i.e. in characters
        chunk_overlap (int): Overlap between consecutive chunks, in characters

    Returns:
        list: The chunks returned by the splitter's split_documents() call
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Separators are treated as literal strings, not regular expressions
        "is_separator_regex": False,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

In [3]:
# Load and Chunk the PDF
import os

from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from dotenv import load_dotenv

pdf_path = "./testing/ft_guide.pdf"
# Keep the loaded pages under their own name so `documents` (below) only ever
# refers to the list of chunk texts sent to Chroma.
pages = load_pdf(pdf_path)
chunks = create_chunks(pages)

# API key from the environment (values from a local .env file are loaded first)
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Embed chunk texts with OpenAI's text-embedding-3-small model
embedding_function = OpenAIEmbeddingFunction(
    api_key=OPENAI_API_KEY,
    model_name="text-embedding-3-small"
)

# `client` is persistent, so "pdf_collection" is still on disk after a kernel
# restart. create_collection() raises when the name already exists, which made
# this cell fail on every re-run; get_or_create_collection() reuses it instead.
collection = client.get_or_create_collection(
    name="pdf_collection",
    embedding_function=embedding_function
)

# Split each chunk into the parallel lists Chroma expects
documents = [chunk.page_content for chunk in chunks]
metadatas = [chunk.metadata for chunk in chunks]
ids = [str(i) for i in range(len(chunks))]  # positional ids: "0", "1", ...

# upsert() rather than add(): the ids are positional, so a re-run overwrites the
# existing entries instead of colliding with them.
# NOTE(review): if the PDF later yields fewer chunks, entries with higher ids
# from an earlier run stay in the collection — delete it first in that case.
collection.upsert(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

In [4]:
# Show how many items the collection currently holds
# (left as the cell's last expression so the notebook displays the value)
collection.count()


555

In [6]:
# Semantic search: ask the collection for the 5 chunks nearest to the query text
query_text = "google cloud platform virtual machine"
results = collection.query(query_texts=[query_text], n_results=5)

# [0] selects the hits for the first (and only) query text that was sent
top_docs = results['documents'][0]
top_distances = results['distances'][0]

# Print the hits, each followed by a 50-dash rule
rule = "-" * 50
print("\nQuery Results:")
print(rule)

for i, hit in enumerate(zip(top_docs, top_distances)):
    doc, distance = hit
    print(f"\nResult {i+1}")
    print(f"Distance Score: {distance:.4f}")  # raw distance, 4 decimal places
    print(f"Document Content: {doc}")  # full chunk text, not truncated
    print(rule)




Query Results:
--------------------------------------------------

Result 1
Distance Score: 0.9746
Document Content: – Cloud AI API: Offers APIs for NLP tasks such as translation, sentiment analysis, and
entity recognition. These APIs are backed by Google’s powerful infrastructure, ensuring high
performance and reliability.
– Tutorial: This document contains a tutorial for training and deploying an LLM in GCP.
• Hugging Face
– Inference API: This service allows users to deploy and manage LLMs hosted on Hugging
Face’s infrastructure. It supports various models from the Transformers library and provides
an easy-to-use API for integrating these models into applications.
– Spaces: A collaborative environment where users can deploy and share models using Hugging
Face’s hosting platform. It supports deploying custom models and interactive demos.
--------------------------------------------------

Result 2
Distance Score: 1.0330
Document Content: – Azure Machine Learning: Supports the deploy