## Install and Import Dependencies

In [None]:
%pip install chromadb
%pip install pypdf
%pip install langchain
%pip install sentence-transformers

import chromadb
import json
import pypdf

from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer


## Extract text from PDF

In [8]:
def extract_text_from_pdf(pdf_path, page_separator=""):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Passed straight to ``pypdf.PdfReader``.
        page_separator: String placed between the text of consecutive pages.
            The default ``""`` reproduces plain concatenation; pass a newline
            to keep the last word of one page from fusing with the first
            word of the next.

    Returns:
        str: The text of all pages, in page order, joined by
        ``page_separator``. An empty string if the reader has no pages.
    """
    reader = pypdf.PdfReader(pdf_path)

    # `or ""` keeps a page whose extraction yields nothing (None or an empty
    # string) from breaking the join; a single join also avoids repeatedly
    # re-building a growing string with `+=`.
    return page_separator.join(page.extract_text() or "" for page in reader.pages)

In [None]:
# NOTE(review): ENTER_PATH_TO_PDF is not defined in any visible cell, so this
# looks like a fill-in token -- replace `[ENTER_PATH_TO_PDF]` (brackets
# included) with the path to the PDF before running this cell.
pdf_extraction_results = extract_text_from_pdf([ENTER_PATH_TO_PDF])

# Prints the complete extracted text, not a preview.
print(pdf_extraction_results)

## Chunk extracted text

In [None]:
def split_text_into_chunks(text, chunk_size=1000, chunk_overlap=50):
    """Break ``text`` into chunks using ``RecursiveCharacterTextSplitter``.

    ``chunk_size`` and ``chunk_overlap`` are forwarded to the splitter
    unchanged, and the built-in ``len`` is used as its length function.

    Returns:
        Whatever the splitter's ``split_text(text)`` call produces.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

In [None]:
# Split the extracted PDF text using the helper's default chunk_size=1000 and
# chunk_overlap=50.
chunks_ = split_text_into_chunks(pdf_extraction_results)

# Prints every chunk, not a preview.
print(chunks_)

## Generate embeddings

In [None]:
def generate_embeddings(texts, model_name='all-MiniLM-L6-v2'):
    """Encode ``texts`` with a ``SentenceTransformer`` and return plain lists.

    A model is constructed from ``model_name`` on every call. The result of
    its ``encode(texts)`` call is converted with ``.tolist()`` before being
    returned.
    """
    encoder = SentenceTransformer(model_name)
    encoded = encoder.encode(texts)
    return encoded.tolist()

In [None]:
# One embedding per chunk, produced with the helper's default model_name
# ('all-MiniLM-L6-v2').
embeddings_ = generate_embeddings(chunks_)

## Add extracted chunks with embeddings to vector database

In [26]:
# HTTP client pointed at localhost:8000; populate_chroma_db() and the query
# cell below read this module-level name.
# NOTE(review): presumably a Chroma server must already be listening there -- confirm.
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

def populate_chroma_db(chunks, embeddings, pdf_name="my_document"):
    """Rebuild the ``{pdf_name}_collection`` collection from chunks and embeddings.

    Any existing collection of that name is deleted first, so every call
    starts from a fresh collection. Uses the module-level ``chroma_client``.

    Args:
        chunks: Sequence of chunk texts, stored as the collection's documents.
        embeddings: One embedding per chunk, in the same order as ``chunks``.
        pdf_name: Prefix for the collection name and the chunk ids; also
            stored as each chunk's ``source`` metadata.

    Returns:
        The collection object the chunks were added to.

    Raises:
        ValueError: If ``chunks`` and ``embeddings`` differ in length. This is
            checked before anything is deleted, so bad input cannot wipe an
            existing collection.
    """
    chunk_count = len(chunks)
    if len(embeddings) != chunk_count:
        raise ValueError(
            f"Expected one embedding per chunk, got {chunk_count} chunks "
            f"and {len(embeddings)} embeddings."
        )

    collection_name = f"{pdf_name}_collection"

    # Delete existing collection if it exists to ensure a clean start.
    try:
        chroma_client.delete_collection(name=collection_name)
    except Exception:
        # Best-effort: the collection might not exist yet. Unlike a bare
        # `except:`, this still lets KeyboardInterrupt / SystemExit propagate.
        pass

    collection = chroma_client.create_collection(name=collection_name, get_or_create=True)

    # Ids and metadata are derived from each chunk's position in `chunks`.
    ids = [f"{pdf_name}_chunk_{i}" for i in range(chunk_count)]
    metadatas = [{"source": pdf_name, "chunk_index": i} for i in range(chunk_count)]

    collection.add(
        documents=chunks,
        embeddings=embeddings,
        metadatas=metadatas,
        ids=ids
    )
    print(f"ChromaDB collection '{collection_name}' populated successfully.")

    return collection

In [27]:
# pdf_name is left at its default, so the collection is named
# "my_document_collection"; the returned collection is shown as the cell output.
populate_chroma_db(chunks_, embeddings_)

ChromaDB collection 'my_document_collection' populated successfully.


Collection(name=my_document_collection)

In [None]:
# Fetch the collection populated above (get_or_create=True is passed, so the
# call does not fail if it is already there).
collection = chroma_client.create_collection(name="my_document_collection", get_or_create=True)

# Embed the query with generate_embeddings() -- the same helper and default
# model used for the stored chunks -- rather than passing query_texts and
# relying on the collection's own embedding function. This keeps query and
# document vectors in the same embedding space even if the model is changed.
query_text = "USRHDLR"

results = collection.query(
    query_embeddings=generate_embeddings([query_text]),
    n_results=10
)

json_results = json.dumps(results, indent=2)

print(json_results)