### Import Modules

In [None]:
from PyPDF2 import PdfReader
import io
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
import re
from typing import List, Dict

### Initialize Chroma

In [None]:
# On-disk Chroma client; its data lives under ./data.
chroma_client = chromadb.PersistentClient(path="./data")

# Sentence-transformer model used to embed both the indexed chunks and queries.
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2",
)

# Open the "products" collection, creating it on first use.
collection = chroma_client.get_or_create_collection(
    name="products",
    embedding_function=embedding_function,
)

### Helper functions

#### Load PDF

In [None]:
def load_pdf(pdf_file: str, word: int) -> Dict[int, List[str]]:
    """Read a PDF and split the text of every page into chunks.

    Args:
        pdf_file: Path of the PDF file handed to ``PdfReader``.
        word: Size limit forwarded unchanged to ``get_text_chunks``.

    Returns:
        A dict mapping each zero-based page index to the list of chunks
        produced from that page's extracted text.
    """
    pdf = PdfReader(pdf_file)
    return {
        index: get_text_chunks(page.extract_text(), word)
        for index, page in enumerate(pdf.pages)
    }

#### Split the text into sentence-based chunks

In [None]:
def get_text_chunks(text: str, word_limit: int) -> List[str]:
    """Group the sentences of *text* into size-bounded chunks.

    The text is split at whitespace that follows a ``.`` or ``?`` (with
    look-behinds that skip patterns such as ``e.g.`` or ``Mr.``). Sentences
    are then packed greedily: a sentence joins the current chunk while the
    space-joined result stays within the limit, otherwise a new chunk starts.

    Args:
        text: Raw text to split.
        word_limit: Maximum chunk size. NOTE: despite the name, the limit is
            compared against the *character* length of the space-joined
            chunk, not its number of words.

    Returns:
        The list of chunks in reading order, with whitespace runs collapsed
        to single spaces. No chunk is empty. A single sentence longer than
        the limit is kept whole as its own chunk rather than being cut.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    chunks = []
    current_chunk = []

    for sentence in sentences:
        words = sentence.split()
        if len(" ".join(current_chunk + words)) <= word_limit:
            current_chunk.extend(words)
            continue

        # Flush only when something is buffered; an over-long sentence that
        # arrives while the buffer is empty must not emit an empty chunk.
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        current_chunk = words

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

#### Add the chunks as vectors to ChromaDB

In [None]:
def index_document(pdf_file: str, word: int = 300) -> None:
    """Chunk a PDF and add the chunks to the module-level Chroma collection.

    Each chunk is stored with metadata recording its zero-based page index
    (``page_no``) and the source file (``pdf_name``).

    Args:
        pdf_file: Path of the PDF to index.
        word: Chunk size limit forwarded to ``load_pdf``.
    """
    docs = load_pdf(pdf_file, word)

    docs_strings = []
    ids = []
    metadatas = []

    for page_no, page_chunks in docs.items():
        for chunk_no, doc in enumerate(page_chunks):
            docs_strings.append(doc)
            metadatas.append({'page_no': page_no, "pdf_name": pdf_file})
            # Namespace IDs by file/page/chunk so that indexing several PDFs
            # into the same collection never reuses an ID.
            ids.append(f"{pdf_file}:{page_no}:{chunk_no}")

    # Nothing extracted (e.g. a PDF without a text layer): skip the call
    # rather than handing the collection empty lists.
    if not docs_strings:
        return

    collection.add(
        ids=ids,
        documents=docs_strings,
        metadatas=metadatas,
    )

### Index the PDF

In [None]:
# Relative path of the PDF to index.
DOC="data/datasheet.pdf"
# Chunk the PDF (default size limit) and add the chunks to the collection.
index_document(DOC)

### Perform semantic search

In [None]:
def search(query: str, n_results: int = 5) -> str:
    """Return the indexed chunks most similar to *query*.

    The query is embedded with the module-level ``embedding_function`` and
    looked up in the module-level ``collection``.

    Args:
        query: Free-text search query.
        n_results: Number of chunks to request from the collection
            (defaults to 5).

    Returns:
        The text of the matching chunks joined with ``" \\n"``.
    """
    vector = embedding_function([query])
    results = collection.query(
        query_embeddings=vector,
        n_results=n_results,
        include=["documents"],
    )
    # One result list is returned per query embedding; only one was sent.
    return " \n".join(str(item) for item in results['documents'][0])

In [None]:
# Run a sample query; `res` holds the matching chunks joined into one string.
res=search("Available Memory in Nimbus Book")

In [None]:
# Display the retrieved chunks.
print(res)