### Import Modules

In [1]:
from PyPDF2 import PdfReader
import io
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
import re
from typing import List, Dict

### Initialize Chroma

In [2]:
chroma_client = chromadb.PersistentClient(path="../data")
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
collection = chroma_client.get_or_create_collection(name="products", embedding_function=embedding_function)



### Helper functions

#### Load PDF

In [3]:
def load_pdf(pdf_file: str, word: int) -> Dict[int, List[str]]:
    reader = PdfReader(pdf_file)    
    documents = {}
    for page_no in range(len(reader.pages)):        
        page = reader.pages[page_no]
        text = page.extract_text() 
        text_chunks = get_text_chunks(text, word)
        documents[page_no] = text_chunks
    return documents

#### Generate Chunks based on each sentence

In [4]:
def get_text_chunks(text: str, word_limit: int) -> List[str]:
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    chunks = []
    current_chunk = []

    for sentence in sentences:
        words = sentence.split()
        if len(" ".join(current_chunk + words)) <= word_limit:
            current_chunk.extend(words)
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = words

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

#### Add the chunks as vectors to ChromaDB

In [5]:
def index_document(pdf_file: str, word: int = 300) -> None:    
    docs = load_pdf(pdf_file, word)    

    docs_strings = []  
    ids = []  
    metadatas = []  
    id = 0  
        
    for page_no in docs.keys():
        for doc in docs[page_no]:        
            docs_strings.append(doc)                        
            metadatas.append({'page_no': page_no,"pdf_name": pdf_file})
            ids.append(id)
            id += 1

    collection.add(
        ids=[str(id) for id in ids],  
        documents=docs_strings,  
        metadatas=metadatas,  
    )    

### Index the PDF

In [6]:
DOC="../data/datasheet.pdf"
index_document(DOC)

Add of existing embedding ID: 0
Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 10
Add of existing embedding ID: 11
Add of existing embedding ID: 12
Add of existing embedding ID: 13
Add of existing embedding ID: 14
Add of existing embedding ID: 15
Add of existing embedding ID: 16
Add of existing embedding ID: 17
Add of existing embedding ID: 18
Add of existing embedding ID: 19
Add of existing embedding ID: 20
Add of existing embedding ID: 21
Add of existing embedding ID: 22
Add of existing embedding ID: 23
Insert of existing embedding ID: 0
Insert of existing embedding ID: 1
Insert of existing embedding ID: 2
Insert of existing embedding ID: 3
Insert of existing embedding ID: 4
Insert of existing embedding ID: 5
Insert o

### Perform semantic search

In [7]:
def search(query):
    vector = embedding_function([query])
    results = collection.query(    
        query_embeddings=vector,
        n_results=5,
        include=["documents"]
    )
    res = " \n".join(str(item) for item in results['documents'][0])
    return res

In [8]:
res=search("Aurora X Pro display")

In [9]:
print(res)

Featuring a sleek and modern design, the Aurora X Pro boasts a stunning 6.7 -inch OLED display with a 120Hz refresh rate, providing an immersive viewing experience for videos, games, and more. 
Designed with the user in mind, the Aurora X Pro offers a seamless and intuitive user experience. The device runs on the latest version of the AuroraOS, offering a clean and customizable interface. 
The device is powered by the latest Octa -core processor, ensuring smooth multitasking and exceptional performance, even with the most demanding apps. Beyond its powerful hardware, the Aurora X Pro excels in photography, equipped with a state - of-the-art triple -camera system. 
With advanced AI features, including night mode and portrait enhancements, the Aurora X Pro is your perfect companion for capturing life’s moments. 
Aurora X Pro The Aurora X Pro is a cutting -edge smartphone designed for tech enthusiasts and professionals who demand the best in mobile technology.
