In [1]:
import os
import openai
from openai import OpenAI
from dotenv import load_dotenv
from qdrant_client import QdrantClient
import pymupdf4llm

In [2]:
load_dotenv()

client = OpenAI()
client.api_key = os.getenv("OPENAI_API_KEY")

In [7]:
def split_text_into_chunks(text, max_tokens=1024):
    chunks = []
    words = text.split()
    for i in range(0, len(words), max_tokens):
        chunk = " ".join(words[i:i+max_tokens])
        chunks.append(chunk)
    return chunks

In [9]:
pdf_paths = ['./knowledge_base/one.pdf', './knowledge_base/two.pdf', './knowledge_base/three.pdf']
for pdf_path in pdf_paths:
    extracted_text = pymupdf4llm.to_markdown(pdf_path)
    chunks = split_text_into_chunks(extracted_text)

Processing ./knowledge_base/one.pdf...
Processing ./knowledge_base/two.pdf...
Processing ./knowledge_base/three.pdf...


In [19]:
extracted_text[-1000:]

'ious respiratory illness caused by influenza viruses. It attacks the respiratory system which includes the nose, the throat and the lungs.|\n|Oxygen|It is a colourless, odourless, tasteless gas essential to living organisms. It is vital for respiration (breathing), which is the process that transfers energy from glucose to cells.|\n|Pneumonia|Bacterial chest infection which affects the tiny air sacs in your lungs, called alveoli.|\n|Pleural effusion|Water on the lungs - especially in the pleura (skin covering the lungs).|\n|Vaccination|Medicine given to reduce risks of getting a disease by working with your bodyâ€™s natural defenses to build protection.|\n|Ventilators|Ventilators are machines that blow air - or air with extra oxygen - into your airways and your lungs. They are often referred to as life support machines and used in intensive care units for patients who cannot breathe on their own.|\n\n\n-----\n\n### Health Promoting Village Project\n\n Japan International Cooperation A

In [27]:
def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

In [28]:
chunk_embeddings = [{"text": chunk, "embedding": get_embedding(chunk)} for chunk in chunks]

In [3]:
vectordb_client = QdrantClient(
    url=os.getenv("QDRANT_URL"), 
    api_key=os.getenv("QDRANT_API_KEY")
)

# vectordb_client.recreate_collection(
#     collection_name="medication_guide",
#     vectors_config={"size": 1536, "distance": "Cosine"}
# )

### before adding 3 new pdf

In [4]:
def check_number_of_points(vectordb_client, collection_name):
    count = vectordb_client.count(collection_name=collection_name).count
    return count

# Example usage
collection_name = "respiratory_disease_guide"
number_of_points = check_number_of_points(vectordb_client, collection_name)
print(f"The collection '{collection_name}' contains {number_of_points} points.")

The collection 'respiratory_disease_guide' contains 5 points.


In [36]:
# Upload 3 new PDF information to the vectordb
for idx, chunk in enumerate(chunk_embeddings):
    vectordb_client.upsert(
        collection_name="respiratory_disease_guide",
        points=[{
            "id": idx,  # Use the integer directly
            "vector": chunk["embedding"],
            "payload": {"text": chunk["text"]}
        }]
    )

### AFTER adding 3 new pdf

In [40]:
def check_number_of_points(vectordb_client, collection_name):
    count = vectordb_client.count(collection_name=collection_name).count
    return count

# Example usage
collection_name = "respiratory_disease_guide"
number_of_points = check_number_of_points(vectordb_client, collection_name)
print(f"The collection '{collection_name}' contains {number_of_points} points.")

The collection 'respiratory_disease_guide' contains 213 points.


In [None]:
def query_qdrant(query_text):
    query_embedding = get_embedding(query_text)
    search_result = client.search(
        collection_name="medical_documents",
        query_vector=query_embedding,
        limit=5
    )
    return [hit.payload["text"] for hit in search_result]

# Example query
results = query_qdrant("What is the recommended dosage for medication A?")
for result in results:
    print(result)