In [2]:
import os
import openai
from openai import OpenAI
from dotenv import load_dotenv
from qdrant_client import QdrantClient
import fitz

In [3]:
load_dotenv()

client = OpenAI()
client.api_key = os.getenv("OPENAI_API_KEY")

In [5]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            text += page.get_text()
    return text

def split_text_into_chunks(text, max_tokens=500):
    chunks = []
    words = text.split()
    for i in range(0, len(words), max_tokens):
        chunk = " ".join(words[i:i+max_tokens])
        chunks.append(chunk)
    return chunks

In [8]:
pdf_path = 'C:/Users/user/AIIDE/src/utils/martindale-the-complete-drug-reference.pdf'
extracted_text = extract_text_from_pdf(pdf_path)
chunks = split_text_into_chunks(extracted_text)

In [12]:
extracted_text[:1000]

"·\x02 Martindale \nThe Complete Drug Reference \nThirty-eighth Edition \nVolume 1: 1-1120 \nVolume 2: 1121-2314 \nVolume 3: 2315-3444 \nVolume 4: 3445-4596 \n(RP) \nPharmaceutical Press \n·\x02 Martindale \nThe Complete Drug Reference \nThirty-eighth Edition \n(RP) \nPharmaceutical Press \nPublished by Pharmaceutical Press \n1 Lambeth High Street, London SE1 7JN, UK \n©Pharmaceutical Press 2014 \n(RP) is a trade mark of Pharmaceutical Press \nPharmaceutical Press is the publishing division of the Royal Pharmaceutical \nSociety of Great Britain \nFirst edition of Martindale: The Extra Pharmacopoeia was published in 1883. \nSquire's Companion was incorporated in the twenty-third edition in 1952. \nThirty-eighth edition published 2014 \nTypeset by Data Standards Ltd \nPrinted in Italy by LEGO S.p.A. \nISBN 978 0 85711 139 5 \nISSN 0263-5364 \nAll rights reserved. No part of this publication may be \nreproduced, stored in a retrieval system, or transmitted in any \nform or by any means, w

In [10]:
with open("output.txt", "w") as text_file:
    text_file.write(extracted_text)

UnicodeEncodeError: 'charmap' codec can't encode characters in position 62686-62687: character maps to <undefined>

In [None]:
def get_embedding(text):
    response = client.Embedding.create(
        input=text,
        model='text-embedding-ada-002'
    )
    return response['data'][0]['embedding']

In [None]:
chunk_embeddings = [{"text": chunk, "embedding": get_embedding(chunk)} for chunk in chunks]

In [None]:
vectordb_client = QdrantClient(
    url=os.getenv("QDRANT_URL"), 
    api_key=os.getenv("QDRANT_API_KEY")
)

vectordb_client.recreate_collection(
    collection_name="medication_guide",
    vectors_config={"size": 1536, "distance": "Cosine"}
)

In [None]:
# Upload book information to the vectordb

for idx, chunk in enumerate(chunk_embeddings):
    client.upsert(
        collection_name="medical_documents",
        points=[{
            "id": str(idx),
            "vector": chunk["embedding"],
            "payload": {"text": chunk["text"]}
        }]
    )

In [None]:
def query_qdrant(query_text):
    query_embedding = get_embedding(query_text)
    search_result = client.search(
        collection_name="medical_documents",
        query_vector=query_embedding,
        limit=5
    )
    return [hit.payload["text"] for hit in search_result]

# Example query
results = query_qdrant("What is the recommended dosage for medication A?")
for result in results:
    print(result)