### Import modules

In [1]:
from PyPDF2 import PdfReader
import io
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
import re
from typing import List, Dict

### Define endpoints

In [2]:
# Host and port of the HTTP embedding server (its /embed route is used below).
EMBEDDINGS_IP = "34.142.174.68"
EMBEDDINGS_PORT = "8001"

# Host and port of the ChromaDB vector database server.
VECTORDB_IP = "34.142.174.68"
VECTORDB_PORT = "8002"

### Initialize ChromaDB

In [3]:
# Connect to the remote ChromaDB server, with reset allowed and anonymized
# telemetry switched off.
client = chromadb.HttpClient(
    host=VECTORDB_IP,
    port=VECTORDB_PORT,
    settings=Settings(allow_reset=True, anonymized_telemetry=False),
)

# Embedding function that delegates to the embedding server's /embed route.
default_ef = embedding_functions.HuggingFaceEmbeddingServer(
    url=f"http://{EMBEDDINGS_IP}:{EMBEDDINGS_PORT}/embed"
)

# Open the "baggage_pol" collection, creating it if it does not exist yet.
collection = client.get_or_create_collection(
    name="baggage_pol",
    embedding_function=default_ef,
)

### Helper functions

##### Load PDF and create chunks

In [4]:
def load_pdf(pdf_file: str, word: int) -> Dict[int, List[str]]:
    """Read a PDF and split the extracted text of every page into chunks.

    Args:
        pdf_file: Path of the PDF file to read.
        word: Size limit passed to ``get_text_chunks`` for each page.

    Returns:
        A dict mapping each zero-based page index to the list of text chunks
        produced from that page.
    """
    pages = PdfReader(pdf_file).pages
    return {
        page_index: get_text_chunks(page.extract_text(), word)
        for page_index, page in enumerate(pages)
    }

##### Create chunks from text

In [5]:
def get_text_chunks(text: str, word_limit: int) -> List[str]:
    """Split text into sentence-aligned chunks bounded by ``word_limit``.

    The text is split into sentences at whitespace that follows ``.`` or
    ``?`` (two negative lookbehinds skip some abbreviation-like patterns).
    The sentences are then packed greedily, in order, into chunks.

    Note: despite its name, ``word_limit`` is compared against the
    *character* length of the space-joined chunk, not its word count.

    Args:
        text: Text to split. Empty (or ``None``) input yields no chunks.
        word_limit: Maximum length, in characters, of a chunk. A single
            sentence longer than the limit is kept whole as its own chunk.

    Returns:
        The chunks in original order, with runs of whitespace collapsed to
        single spaces. The list never contains empty strings.
    """
    if not text:
        return []

    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    chunks: List[str] = []
    current_chunk: List[str] = []

    for sentence in sentences:
        words = sentence.split()
        if len(" ".join(current_chunk + words)) <= word_limit:
            current_chunk.extend(words)
            continue

        # The sentence does not fit: flush what has been collected so far.
        # The buffer is empty when an over-long sentence arrives first;
        # flushing it unconditionally would emit an empty-string chunk.
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        current_chunk = words

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

##### Index document

In [6]:
def index_document(pdf_file: str, word: int = 300) -> None:
    """Chunk a PDF and add the chunks to the module-level ``collection``.

    Every chunk is stored together with metadata recording its zero-based
    page index (``page_no``) and the source file (``pdf_name``).

    Args:
        pdf_file: Path of the PDF to index.
        word: Chunk size limit forwarded to ``load_pdf``.
    """
    docs = load_pdf(pdf_file, word)

    ids: List[str] = []
    docs_strings: List[str] = []
    metadatas: List[Dict] = []

    for page_no, page_chunks in docs.items():
        for chunk_no, doc in enumerate(page_chunks):
            # IDs embed the file name and page so that indexing a second PDF
            # into the same collection cannot collide with the first one
            # (a plain running counter restarts at 0 for every call).
            # NOTE(review): a collection already populated with the old
            # counter-style IDs will not be deduplicated against these.
            ids.append(f"{pdf_file}:{page_no}:{chunk_no}")
            docs_strings.append(doc)
            metadatas.append({'page_no': page_no, "pdf_name": pdf_file})

    if not ids:
        # No text could be extracted; avoid sending an empty batch.
        return

    collection.add(
        ids=ids,
        documents=docs_strings,
        metadatas=metadatas,
    )

### Index PDF

In [7]:
# Path of the PDF to index.
DOC = "data/baggage_v1.pdf"

# Chunk the PDF and add it to the collection, using the default size limit.
index_document(DOC)

### Perform semantic search

In [8]:
def search(query: str, n_results: int = 2) -> str:
    """Return the stored chunks that are most similar to ``query``.

    The query is embedded with ``default_ef`` and the resulting vector is
    used to query the module-level ``collection``.

    Args:
        query: Free-text search query.
        n_results: Number of nearest chunks to request (defaults to 2).

    Returns:
        The matching documents joined into one string, separated by
        ``" \\n"``.
    """
    query_vectors = default_ef([query])
    results = collection.query(
        query_embeddings=query_vectors,
        n_results=n_results,
        include=["documents"],
    )
    # A single query embedding was submitted, so its matches sit at index 0.
    matches = results['documents'][0]
    return " \n".join(str(item) for item in matches)

In [11]:
# NOTE(review): "execess" looks like a typo for "excess"; the literal is kept
# unchanged so the recorded output still corresponds to this exact query.
res = search("execess baggage fee")

In [12]:
# Show the chunks returned by the search above.
print(res)

Excess baggage fees apply for any baggage that exceeds the weight or size limits. The excess baggage fee is 25 EUR per kg for domes8c ﬂights and 20 EUR per kg for interna8onal ﬂights. Addi8onally, oversized baggage fees apply for any baggage that exceeds 203 cm in total dimensions. 
The oversized baggage fee is 45 EUR per bag. Passengers are advised to check the baggage policy and fees on the RedJet website before ﬂying and to pack accordingly. RedJet reserves the right to refuse or charge extra for any baggage that does not comply with the policy.
